Merge branch 'x86-cpu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
author: Linus Torvalds <torvalds@linux-foundation.org>
Tue, 2 May 2017 04:15:50 +0000 (21:15 -0700)
committer: Linus Torvalds <torvalds@linux-foundation.org>
Tue, 2 May 2017 04:15:50 +0000 (21:15 -0700)
Pull x86 cpu updates from Ingo Molnar:
 "The biggest change is an extension of the Intel RDT code with
  Intel Memory Bandwidth Allocation CPU support: MBA allows
  bandwidth allocation between cores, while CBM (already upstream)
  allows CPU cache partitioning.

  There are also misc smaller fixes and updates"

* 'x86-cpu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (23 commits)
  x86/intel_rdt: Return error for incorrect resource names in schemata
  x86/intel_rdt: Trim whitespace while parsing schemata input
  x86/intel_rdt: Fix padding when resource is enabled via mount
  x86/intel_rdt: Get rid of anon union
  x86/cpu: Keep model defines sorted by model number
  x86/intel_rdt/mba: Add schemata file support for MBA
  x86/intel_rdt: Make schemata file parsers resource specific
  x86/intel_rdt/mba: Add info directory files for Memory Bandwidth Allocation
  x86/intel_rdt: Make information files resource specific
  x86/intel_rdt/mba: Add primary support for Memory Bandwidth Allocation (MBA)
  x86/intel_rdt/mba: Memory bandwith allocation feature detect
  x86/intel_rdt: Add resource specific msr update function
  x86/intel_rdt: Move CBM specific data into a struct
  x86/intel_rdt: Cleanup namespace to support multiple resource types
  Documentation, x86: Intel Memory bandwidth allocation
  x86/intel_rdt: Organize code properly
  x86/intel_rdt: Init padding only if a device exists
  x86/intel_rdt: Add cpus_list rdtgroup file
  x86/intel_rdt: Cleanup kernel-doc
  x86/intel_rdt: Update schemata read to show data in tabular format
  ...

2129 files changed:
.mailmap
CREDITS
Documentation/ABI/obsolete/sysfs-firmware-acpi [new file with mode: 0644]
Documentation/ABI/testing/sysfs-block
Documentation/ABI/testing/sysfs-firmware-acpi
Documentation/acpi/dsd/graph.txt [new file with mode: 0644]
Documentation/acpi/linuxized-acpica.txt
Documentation/admin-guide/README.rst
Documentation/admin-guide/kernel-parameters.rst
Documentation/admin-guide/kernel-parameters.txt
Documentation/arm64/silicon-errata.txt
Documentation/block/00-INDEX
Documentation/block/bfq-iosched.txt [new file with mode: 0644]
Documentation/block/kyber-iosched.txt [new file with mode: 0644]
Documentation/block/queue-sysfs.txt
Documentation/blockdev/mflash.txt [deleted file]
Documentation/devicetree/bindings/ata/ahci-dm816.txt [new file with mode: 0644]
Documentation/devicetree/bindings/hwmon/ads7828.txt [new file with mode: 0644]
Documentation/devicetree/bindings/hwmon/aspeed-pwm-tacho.txt [new file with mode: 0644]
Documentation/devicetree/bindings/hwmon/lm87.txt [new file with mode: 0644]
Documentation/devicetree/bindings/interrupt-controller/faraday,ftintc010.txt [moved from Documentation/devicetree/bindings/interrupt-controller/cortina,gemini-interrupt-controller.txt with 63% similarity]
Documentation/devicetree/bindings/interrupt-controller/mediatek,cirq.txt [new file with mode: 0644]
Documentation/devicetree/bindings/interrupt-controller/mediatek,sysirq.txt
Documentation/devicetree/bindings/ipmi/aspeed,ast2400-ibt-bmc.txt
Documentation/devicetree/bindings/leds/leds-cpcap.txt [new file with mode: 0644]
Documentation/devicetree/bindings/leds/leds-mt6323.txt [new file with mode: 0644]
Documentation/devicetree/bindings/leds/leds-pca9532.txt
Documentation/devicetree/bindings/mailbox/brcm,iproc-flexrm-mbox.txt [new file with mode: 0644]
Documentation/devicetree/bindings/mailbox/brcm,iproc-pdc-mbox.txt
Documentation/devicetree/bindings/pci/hisilicon-pcie.txt
Documentation/devicetree/bindings/power/power_domain.txt
Documentation/devicetree/bindings/power/reset/gemini-poweroff.txt [new file with mode: 0644]
Documentation/devicetree/bindings/power/reset/syscon-poweroff.txt
Documentation/devicetree/bindings/power/rockchip-io-domain.txt
Documentation/devicetree/bindings/power/supply/cpcap-charger.txt [new file with mode: 0644]
Documentation/devicetree/bindings/power/supply/lego_ev3_battery.txt [new file with mode: 0644]
Documentation/devicetree/bindings/power/supply/ltc2941.txt
Documentation/devicetree/bindings/power/supply/max8925_battery.txt [moved from Documentation/devicetree/bindings/power/supply/max8925_batter.txt with 100% similarity]
Documentation/devicetree/bindings/timer/cortina,gemini-timer.txt [deleted file]
Documentation/devicetree/bindings/timer/faraday,fttmr010.txt [new file with mode: 0644]
Documentation/devicetree/bindings/timer/rockchip,rk-timer.txt
Documentation/devicetree/bindings/vendor-prefixes.txt
Documentation/features/core/BPF-JIT/arch-support.txt
Documentation/features/core/generic-idle-thread/arch-support.txt
Documentation/features/core/jump-labels/arch-support.txt
Documentation/features/core/tracehook/arch-support.txt
Documentation/features/debug/KASAN/arch-support.txt
Documentation/features/debug/gcov-profile-all/arch-support.txt
Documentation/features/debug/kgdb/arch-support.txt
Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
Documentation/features/debug/kprobes/arch-support.txt
Documentation/features/debug/kretprobes/arch-support.txt
Documentation/features/debug/optprobes/arch-support.txt
Documentation/features/debug/stackprotector/arch-support.txt
Documentation/features/debug/uprobes/arch-support.txt
Documentation/features/debug/user-ret-profiler/arch-support.txt
Documentation/features/io/dma-api-debug/arch-support.txt
Documentation/features/io/dma-contiguous/arch-support.txt
Documentation/features/io/sg-chain/arch-support.txt
Documentation/features/lib/strncasecmp/arch-support.txt
Documentation/features/locking/cmpxchg-local/arch-support.txt
Documentation/features/locking/lockdep/arch-support.txt
Documentation/features/locking/queued-rwlocks/arch-support.txt
Documentation/features/locking/queued-spinlocks/arch-support.txt
Documentation/features/locking/rwsem-optimized/arch-support.txt
Documentation/features/perf/kprobes-event/arch-support.txt
Documentation/features/perf/perf-regs/arch-support.txt
Documentation/features/perf/perf-stackdump/arch-support.txt
Documentation/features/sched/numa-balancing/arch-support.txt
Documentation/features/seccomp/seccomp-filter/arch-support.txt
Documentation/features/time/arch-tick-broadcast/arch-support.txt
Documentation/features/time/clockevents/arch-support.txt
Documentation/features/time/context-tracking/arch-support.txt
Documentation/features/time/irq-time-acct/arch-support.txt
Documentation/features/time/modern-timekeeping/arch-support.txt
Documentation/features/time/virt-cpuacct/arch-support.txt
Documentation/features/vm/ELF-ASLR/arch-support.txt
Documentation/features/vm/PG_uncached/arch-support.txt
Documentation/features/vm/THP/arch-support.txt
Documentation/features/vm/TLB/arch-support.txt
Documentation/features/vm/huge-vmap/arch-support.txt
Documentation/features/vm/ioremap_prot/arch-support.txt
Documentation/features/vm/numa-memblock/arch-support.txt
Documentation/features/vm/pte_special/arch-support.txt
Documentation/filesystems/Locking
Documentation/filesystems/porting
Documentation/filesystems/vfs.txt
Documentation/hwmon/aspeed-pwm-tacho [new file with mode: 0644]
Documentation/hwmon/tc654
Documentation/lightnvm/pblk.txt [new file with mode: 0644]
Documentation/media/v4l-drivers/soc-camera.rst
Documentation/pinctrl.txt
Documentation/power/runtime_pm.txt
Documentation/process/stable-kernel-rules.rst
Documentation/scheduler/sched-pelt.c [new file with mode: 0644]
Documentation/trace/kprobetrace.txt
Documentation/virtual/kvm/devices/arm-vgic.txt
Documentation/x86/zero-page.txt
MAINTAINERS
Makefile
arch/alpha/include/asm/extable.h [new file with mode: 0644]
arch/alpha/include/asm/futex.h
arch/alpha/include/asm/uaccess.h
arch/alpha/kernel/osf_sys.c
arch/alpha/kernel/traps.c
arch/alpha/lib/clear_user.S
arch/alpha/lib/copy_user.S
arch/alpha/lib/csum_partial_copy.c
arch/alpha/lib/ev6-clear_user.S
arch/alpha/lib/ev6-copy_user.S
arch/arc/Kconfig
arch/arc/include/asm/Kbuild
arch/arc/include/asm/atomic.h
arch/arc/include/asm/entry-arcv2.h
arch/arc/include/asm/ptrace.h
arch/arc/include/asm/uaccess.h
arch/arc/kernel/setup.c
arch/arc/mm/extable.c
arch/arm/Kconfig
arch/arm/boot/dts/am335x-baltos.dtsi
arch/arm/boot/dts/am335x-evmsk.dts
arch/arm/boot/dts/dra7.dtsi
arch/arm/boot/dts/logicpd-torpedo-som.dtsi
arch/arm/boot/dts/rk3188.dtsi
arch/arm/boot/dts/rk322x.dtsi
arch/arm/boot/dts/ste-dbx5x0.dtsi
arch/arm/boot/dts/sun8i-a33.dtsi
arch/arm/configs/multi_v7_defconfig
arch/arm/configs/pxa_defconfig
arch/arm/include/asm/Kbuild
arch/arm/include/asm/efi.h
arch/arm/include/asm/uaccess.h
arch/arm/kvm/arm.c
arch/arm/kvm/mmu.c
arch/arm/lib/uaccess_with_memcpy.c
arch/arm/mach-moxart/Kconfig
arch/arm/mach-omap2/common.h
arch/arm/mach-omap2/omap-hotplug.c
arch/arm/mach-omap2/omap-mpuss-lowpower.c
arch/arm/mach-omap2/omap-smc.S
arch/arm/mach-omap2/omap-smp.c
arch/arm/mach-omap2/omap_device.c
arch/arm/mach-orion5x/Kconfig
arch/arm/mm/dma-mapping.c
arch/arm/mm/nommu.c
arch/arm/plat-orion/common.c
arch/arm/probes/kprobes/core.c
arch/arm/probes/kprobes/test-core.c
arch/arm64/Kconfig
arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
arch/arm64/include/asm/arch_timer.h
arch/arm64/include/asm/cpucaps.h
arch/arm64/include/asm/cputype.h
arch/arm64/include/asm/efi.h
arch/arm64/include/asm/esr.h
arch/arm64/include/asm/extable.h [new file with mode: 0644]
arch/arm64/include/asm/uaccess.h
arch/arm64/kernel/acpi.c
arch/arm64/kernel/arm64ksyms.c
arch/arm64/kernel/cpu_errata.c
arch/arm64/kernel/cpufeature.c
arch/arm64/kernel/traps.c
arch/arm64/lib/copy_in_user.S
arch/arm64/mm/fault.c
arch/arm64/mm/hugetlbpage.c
arch/avr32/Kconfig [deleted file]
arch/avr32/Kconfig.debug [deleted file]
arch/avr32/Makefile [deleted file]
arch/avr32/boards/atngw100/Kconfig [deleted file]
arch/avr32/boards/atngw100/Kconfig_mrmt [deleted file]
arch/avr32/boards/atngw100/Makefile [deleted file]
arch/avr32/boards/atngw100/evklcd10x.c [deleted file]
arch/avr32/boards/atngw100/flash.c [deleted file]
arch/avr32/boards/atngw100/mrmt.c [deleted file]
arch/avr32/boards/atngw100/setup.c [deleted file]
arch/avr32/boards/atstk1000/Kconfig [deleted file]
arch/avr32/boards/atstk1000/Makefile [deleted file]
arch/avr32/boards/atstk1000/atstk1000.h [deleted file]
arch/avr32/boards/atstk1000/atstk1002.c [deleted file]
arch/avr32/boards/atstk1000/atstk1003.c [deleted file]
arch/avr32/boards/atstk1000/atstk1004.c [deleted file]
arch/avr32/boards/atstk1000/flash.c [deleted file]
arch/avr32/boards/atstk1000/setup.c [deleted file]
arch/avr32/boards/favr-32/Kconfig [deleted file]
arch/avr32/boards/favr-32/Makefile [deleted file]
arch/avr32/boards/favr-32/flash.c [deleted file]
arch/avr32/boards/favr-32/setup.c [deleted file]
arch/avr32/boards/hammerhead/Kconfig [deleted file]
arch/avr32/boards/hammerhead/Makefile [deleted file]
arch/avr32/boards/hammerhead/flash.c [deleted file]
arch/avr32/boards/hammerhead/flash.h [deleted file]
arch/avr32/boards/hammerhead/setup.c [deleted file]
arch/avr32/boards/merisc/Kconfig [deleted file]
arch/avr32/boards/merisc/Makefile [deleted file]
arch/avr32/boards/merisc/display.c [deleted file]
arch/avr32/boards/merisc/flash.c [deleted file]
arch/avr32/boards/merisc/merisc.h [deleted file]
arch/avr32/boards/merisc/merisc_sysfs.c [deleted file]
arch/avr32/boards/merisc/setup.c [deleted file]
arch/avr32/boards/mimc200/Makefile [deleted file]
arch/avr32/boards/mimc200/flash.c [deleted file]
arch/avr32/boards/mimc200/setup.c [deleted file]
arch/avr32/boot/images/.gitignore [deleted file]
arch/avr32/boot/images/Makefile [deleted file]
arch/avr32/boot/u-boot/Makefile [deleted file]
arch/avr32/boot/u-boot/empty.S [deleted file]
arch/avr32/boot/u-boot/head.S [deleted file]
arch/avr32/configs/atngw100_defconfig [deleted file]
arch/avr32/configs/atngw100_evklcd100_defconfig [deleted file]
arch/avr32/configs/atngw100_evklcd101_defconfig [deleted file]
arch/avr32/configs/atngw100_mrmt_defconfig [deleted file]
arch/avr32/configs/atngw100mkii_defconfig [deleted file]
arch/avr32/configs/atngw100mkii_evklcd100_defconfig [deleted file]
arch/avr32/configs/atngw100mkii_evklcd101_defconfig [deleted file]
arch/avr32/configs/atstk1002_defconfig [deleted file]
arch/avr32/configs/atstk1003_defconfig [deleted file]
arch/avr32/configs/atstk1004_defconfig [deleted file]
arch/avr32/configs/atstk1006_defconfig [deleted file]
arch/avr32/configs/favr-32_defconfig [deleted file]
arch/avr32/configs/hammerhead_defconfig [deleted file]
arch/avr32/configs/merisc_defconfig [deleted file]
arch/avr32/configs/mimc200_defconfig [deleted file]
arch/avr32/include/asm/Kbuild [deleted file]
arch/avr32/include/asm/addrspace.h [deleted file]
arch/avr32/include/asm/asm-offsets.h [deleted file]
arch/avr32/include/asm/asm.h [deleted file]
arch/avr32/include/asm/atomic.h [deleted file]
arch/avr32/include/asm/barrier.h [deleted file]
arch/avr32/include/asm/bitops.h [deleted file]
arch/avr32/include/asm/bug.h [deleted file]
arch/avr32/include/asm/bugs.h [deleted file]
arch/avr32/include/asm/cache.h [deleted file]
arch/avr32/include/asm/cacheflush.h [deleted file]
arch/avr32/include/asm/checksum.h [deleted file]
arch/avr32/include/asm/cmpxchg.h [deleted file]
arch/avr32/include/asm/current.h [deleted file]
arch/avr32/include/asm/dma-mapping.h [deleted file]
arch/avr32/include/asm/dma.h [deleted file]
arch/avr32/include/asm/elf.h [deleted file]
arch/avr32/include/asm/fb.h [deleted file]
arch/avr32/include/asm/ftrace.h [deleted file]
arch/avr32/include/asm/gpio.h [deleted file]
arch/avr32/include/asm/hardirq.h [deleted file]
arch/avr32/include/asm/hw_irq.h [deleted file]
arch/avr32/include/asm/io.h [deleted file]
arch/avr32/include/asm/irq.h [deleted file]
arch/avr32/include/asm/irqflags.h [deleted file]
arch/avr32/include/asm/kdebug.h [deleted file]
arch/avr32/include/asm/kmap_types.h [deleted file]
arch/avr32/include/asm/kprobes.h [deleted file]
arch/avr32/include/asm/linkage.h [deleted file]
arch/avr32/include/asm/mmu.h [deleted file]
arch/avr32/include/asm/mmu_context.h [deleted file]
arch/avr32/include/asm/module.h [deleted file]
arch/avr32/include/asm/ocd.h [deleted file]
arch/avr32/include/asm/page.h [deleted file]
arch/avr32/include/asm/pci.h [deleted file]
arch/avr32/include/asm/pgalloc.h [deleted file]
arch/avr32/include/asm/pgtable-2level.h [deleted file]
arch/avr32/include/asm/pgtable.h [deleted file]
arch/avr32/include/asm/processor.h [deleted file]
arch/avr32/include/asm/ptrace.h [deleted file]
arch/avr32/include/asm/serial.h [deleted file]
arch/avr32/include/asm/setup.h [deleted file]
arch/avr32/include/asm/shmparam.h [deleted file]
arch/avr32/include/asm/signal.h [deleted file]
arch/avr32/include/asm/string.h [deleted file]
arch/avr32/include/asm/switch_to.h [deleted file]
arch/avr32/include/asm/syscalls.h [deleted file]
arch/avr32/include/asm/sysreg.h [deleted file]
arch/avr32/include/asm/termios.h [deleted file]
arch/avr32/include/asm/thread_info.h [deleted file]
arch/avr32/include/asm/timex.h [deleted file]
arch/avr32/include/asm/tlb.h [deleted file]
arch/avr32/include/asm/tlbflush.h [deleted file]
arch/avr32/include/asm/traps.h [deleted file]
arch/avr32/include/asm/types.h [deleted file]
arch/avr32/include/asm/uaccess.h [deleted file]
arch/avr32/include/asm/ucontext.h [deleted file]
arch/avr32/include/asm/unaligned.h [deleted file]
arch/avr32/include/asm/unistd.h [deleted file]
arch/avr32/include/asm/user.h [deleted file]
arch/avr32/include/uapi/asm/Kbuild [deleted file]
arch/avr32/include/uapi/asm/auxvec.h [deleted file]
arch/avr32/include/uapi/asm/byteorder.h [deleted file]
arch/avr32/include/uapi/asm/cachectl.h [deleted file]
arch/avr32/include/uapi/asm/msgbuf.h [deleted file]
arch/avr32/include/uapi/asm/posix_types.h [deleted file]
arch/avr32/include/uapi/asm/ptrace.h [deleted file]
arch/avr32/include/uapi/asm/sembuf.h [deleted file]
arch/avr32/include/uapi/asm/setup.h [deleted file]
arch/avr32/include/uapi/asm/shmbuf.h [deleted file]
arch/avr32/include/uapi/asm/sigcontext.h [deleted file]
arch/avr32/include/uapi/asm/signal.h [deleted file]
arch/avr32/include/uapi/asm/socket.h [deleted file]
arch/avr32/include/uapi/asm/sockios.h [deleted file]
arch/avr32/include/uapi/asm/stat.h [deleted file]
arch/avr32/include/uapi/asm/swab.h [deleted file]
arch/avr32/include/uapi/asm/termbits.h [deleted file]
arch/avr32/include/uapi/asm/termios.h [deleted file]
arch/avr32/include/uapi/asm/types.h [deleted file]
arch/avr32/include/uapi/asm/unistd.h [deleted file]
arch/avr32/kernel/.gitignore [deleted file]
arch/avr32/kernel/Makefile [deleted file]
arch/avr32/kernel/asm-offsets.c [deleted file]
arch/avr32/kernel/avr32_ksyms.c [deleted file]
arch/avr32/kernel/cpu.c [deleted file]
arch/avr32/kernel/entry-avr32b.S [deleted file]
arch/avr32/kernel/head.S [deleted file]
arch/avr32/kernel/irq.c [deleted file]
arch/avr32/kernel/kprobes.c [deleted file]
arch/avr32/kernel/module.c [deleted file]
arch/avr32/kernel/nmi_debug.c [deleted file]
arch/avr32/kernel/ocd.c [deleted file]
arch/avr32/kernel/process.c [deleted file]
arch/avr32/kernel/ptrace.c [deleted file]
arch/avr32/kernel/setup.c [deleted file]
arch/avr32/kernel/signal.c [deleted file]
arch/avr32/kernel/stacktrace.c [deleted file]
arch/avr32/kernel/switch_to.S [deleted file]
arch/avr32/kernel/syscall-stubs.S [deleted file]
arch/avr32/kernel/syscall_table.S [deleted file]
arch/avr32/kernel/time.c [deleted file]
arch/avr32/kernel/traps.c [deleted file]
arch/avr32/kernel/vmlinux.lds.S [deleted file]
arch/avr32/lib/Makefile [deleted file]
arch/avr32/lib/__avr32_asr64.S [deleted file]
arch/avr32/lib/__avr32_lsl64.S [deleted file]
arch/avr32/lib/__avr32_lsr64.S [deleted file]
arch/avr32/lib/clear_user.S [deleted file]
arch/avr32/lib/copy_user.S [deleted file]
arch/avr32/lib/csum_partial.S [deleted file]
arch/avr32/lib/csum_partial_copy_generic.S [deleted file]
arch/avr32/lib/delay.c [deleted file]
arch/avr32/lib/findbit.S [deleted file]
arch/avr32/lib/io-readsb.S [deleted file]
arch/avr32/lib/io-readsl.S [deleted file]
arch/avr32/lib/io-readsw.S [deleted file]
arch/avr32/lib/io-writesb.S [deleted file]
arch/avr32/lib/io-writesl.S [deleted file]
arch/avr32/lib/io-writesw.S [deleted file]
arch/avr32/lib/memcpy.S [deleted file]
arch/avr32/lib/memset.S [deleted file]
arch/avr32/lib/strncpy_from_user.S [deleted file]
arch/avr32/lib/strnlen_user.S [deleted file]
arch/avr32/mach-at32ap/Kconfig [deleted file]
arch/avr32/mach-at32ap/Makefile [deleted file]
arch/avr32/mach-at32ap/at32ap700x.c [deleted file]
arch/avr32/mach-at32ap/clock.c [deleted file]
arch/avr32/mach-at32ap/clock.h [deleted file]
arch/avr32/mach-at32ap/extint.c [deleted file]
arch/avr32/mach-at32ap/hmatrix.c [deleted file]
arch/avr32/mach-at32ap/hsmc.c [deleted file]
arch/avr32/mach-at32ap/hsmc.h [deleted file]
arch/avr32/mach-at32ap/include/mach/at32ap700x.h [deleted file]
arch/avr32/mach-at32ap/include/mach/board.h [deleted file]
arch/avr32/mach-at32ap/include/mach/chip.h [deleted file]
arch/avr32/mach-at32ap/include/mach/cpu.h [deleted file]
arch/avr32/mach-at32ap/include/mach/gpio.h [deleted file]
arch/avr32/mach-at32ap/include/mach/hmatrix.h [deleted file]
arch/avr32/mach-at32ap/include/mach/init.h [deleted file]
arch/avr32/mach-at32ap/include/mach/io.h [deleted file]
arch/avr32/mach-at32ap/include/mach/irq.h [deleted file]
arch/avr32/mach-at32ap/include/mach/pm.h [deleted file]
arch/avr32/mach-at32ap/include/mach/portmux.h [deleted file]
arch/avr32/mach-at32ap/include/mach/smc.h [deleted file]
arch/avr32/mach-at32ap/include/mach/sram.h [deleted file]
arch/avr32/mach-at32ap/intc.c [deleted file]
arch/avr32/mach-at32ap/intc.h [deleted file]
arch/avr32/mach-at32ap/pdc.c [deleted file]
arch/avr32/mach-at32ap/pio.c [deleted file]
arch/avr32/mach-at32ap/pio.h [deleted file]
arch/avr32/mach-at32ap/pm-at32ap700x.S [deleted file]
arch/avr32/mach-at32ap/pm.c [deleted file]
arch/avr32/mach-at32ap/pm.h [deleted file]
arch/avr32/mach-at32ap/sdramc.h [deleted file]
arch/avr32/mm/Makefile [deleted file]
arch/avr32/mm/cache.c [deleted file]
arch/avr32/mm/clear_page.S [deleted file]
arch/avr32/mm/copy_page.S [deleted file]
arch/avr32/mm/dma-coherent.c [deleted file]
arch/avr32/mm/fault.c [deleted file]
arch/avr32/mm/init.c [deleted file]
arch/avr32/mm/ioremap.c [deleted file]
arch/avr32/mm/tlb.c [deleted file]
arch/avr32/oprofile/Makefile [deleted file]
arch/avr32/oprofile/backtrace.c [deleted file]
arch/avr32/oprofile/op_model_avr32.c [deleted file]
arch/blackfin/include/asm/Kbuild
arch/blackfin/include/asm/uaccess.h
arch/blackfin/kernel/process.c
arch/blackfin/kernel/time-ts.c
arch/c6x/include/asm/Kbuild
arch/c6x/include/asm/uaccess.h
arch/c6x/kernel/sys_c6x.c
arch/c6x/platforms/timer64.c
arch/cris/arch-v10/lib/usercopy.c
arch/cris/arch-v32/lib/usercopy.c
arch/cris/include/arch-v10/arch/uaccess.h
arch/cris/include/arch-v32/arch/uaccess.h
arch/cris/include/asm/Kbuild
arch/cris/include/asm/uaccess.h
arch/frv/include/asm/Kbuild
arch/frv/include/asm/uaccess.h
arch/frv/kernel/traps.c
arch/frv/mm/extable.c
arch/frv/mm/fault.c
arch/h8300/include/asm/Kbuild
arch/h8300/include/asm/uaccess.h [new file with mode: 0644]
arch/hexagon/include/asm/Kbuild
arch/hexagon/include/asm/uaccess.h
arch/hexagon/kernel/hexagon_ksyms.c
arch/hexagon/kernel/time.c
arch/hexagon/mm/copy_from_user.S
arch/hexagon/mm/copy_to_user.S
arch/ia64/Kconfig
arch/ia64/include/asm/asm-prototypes.h [new file with mode: 0644]
arch/ia64/include/asm/extable.h [new file with mode: 0644]
arch/ia64/include/asm/uaccess.h
arch/ia64/kernel/module.c
arch/ia64/kernel/salinfo.c
arch/ia64/kernel/topology.c
arch/ia64/lib/Makefile
arch/ia64/lib/memcpy_mck.S
arch/ia64/mm/extable.c
arch/ia64/sn/kernel/sn2/sn_hwperf.c
arch/m32r/include/asm/Kbuild
arch/m32r/include/asm/uaccess.h
arch/m32r/kernel/m32r_ksyms.c
arch/m32r/lib/usercopy.c
arch/m68k/coldfire/pit.c
arch/m68k/include/asm/Kbuild
arch/m68k/include/asm/processor.h
arch/m68k/include/asm/uaccess.h
arch/m68k/include/asm/uaccess_mm.h
arch/m68k/include/asm/uaccess_no.h
arch/m68k/kernel/signal.c
arch/m68k/kernel/traps.c
arch/m68k/lib/uaccess.c
arch/m68k/mm/fault.c
arch/metag/include/asm/Kbuild
arch/metag/include/asm/uaccess.h
arch/metag/lib/usercopy.c
arch/microblaze/include/asm/Kbuild
arch/microblaze/include/asm/uaccess.h
arch/mips/Kconfig
arch/mips/Makefile
arch/mips/alchemy/common/time.c
arch/mips/cavium-octeon/octeon-memcpy.S
arch/mips/include/asm/asm-prototypes.h
arch/mips/include/asm/checksum.h
arch/mips/include/asm/fpu.h
arch/mips/include/asm/irq.h
arch/mips/include/asm/r4kcache.h
arch/mips/include/asm/spinlock.h
arch/mips/include/asm/uaccess.h
arch/mips/include/uapi/asm/unistd.h
arch/mips/jz4740/time.c
arch/mips/kernel/asm-offsets.c
arch/mips/kernel/cevt-bcm1480.c
arch/mips/kernel/cevt-ds1287.c
arch/mips/kernel/cevt-gt641xx.c
arch/mips/kernel/cevt-r4k.c
arch/mips/kernel/cevt-sb1250.c
arch/mips/kernel/cevt-txx9.c
arch/mips/kernel/cps-vec.S
arch/mips/kernel/cpu-probe.c
arch/mips/kernel/elf.c
arch/mips/kernel/genex.S
arch/mips/kernel/kgdb.c
arch/mips/kernel/mips-r2-to-r6-emul.c
arch/mips/kernel/perf_event_mipsxx.c
arch/mips/kernel/process.c
arch/mips/kernel/relocate.c
arch/mips/kernel/scall32-o32.S
arch/mips/kernel/scall64-64.S
arch/mips/kernel/scall64-n32.S
arch/mips/kernel/scall64-o32.S
arch/mips/kernel/smp-cps.c
arch/mips/kernel/syscall.c
arch/mips/kernel/traps.c
arch/mips/kernel/unaligned.c
arch/mips/lantiq/xway/sysctrl.c
arch/mips/lib/memcpy.S
arch/mips/loongson32/common/time.c
arch/mips/loongson64/common/cs5536/cs5536_mfgpt.c
arch/mips/loongson64/loongson-3/hpet.c
arch/mips/mm/c-r4k.c
arch/mips/mm/tlbex.c
arch/mips/mti-malta/malta-int.c
arch/mips/mti-malta/malta-time.c
arch/mips/oprofile/backtrace.c
arch/mips/pci/pci-legacy.c
arch/mips/ralink/cevt-rt3352.c
arch/mips/ralink/rt3883.c
arch/mips/sgi-ip27/ip27-timer.c
arch/mn10300/include/asm/Kbuild
arch/mn10300/include/asm/uaccess.h
arch/mn10300/kernel/cevt-mn10300.c
arch/mn10300/kernel/mn10300_ksyms.c
arch/mn10300/lib/usercopy.c
arch/nios2/include/asm/Kbuild
arch/nios2/include/asm/uaccess.h
arch/nios2/mm/uaccess.c
arch/openrisc/include/asm/Kbuild
arch/openrisc/include/asm/uaccess.h
arch/parisc/Kconfig
arch/parisc/include/asm/futex.h
arch/parisc/include/asm/uaccess.h
arch/parisc/lib/lusercopy.S
arch/parisc/lib/memcpy.c
arch/powerpc/Kconfig
arch/powerpc/configs/85xx-hw.config
arch/powerpc/configs/85xx/ge_imp3a_defconfig
arch/powerpc/configs/85xx/xes_mpc85xx_defconfig
arch/powerpc/configs/cell_defconfig
arch/powerpc/configs/pasemi_defconfig
arch/powerpc/configs/ppc64_defconfig
arch/powerpc/configs/ppc64e_defconfig
arch/powerpc/configs/ppc6xx_defconfig
arch/powerpc/crypto/crc32c-vpmsum_glue.c
arch/powerpc/include/asm/exception-64s.h
arch/powerpc/include/asm/extable.h [new file with mode: 0644]
arch/powerpc/include/asm/uaccess.h
arch/powerpc/kernel/align.c
arch/powerpc/kernel/entry_64.S
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/misc_64.S
arch/powerpc/kernel/setup_64.c
arch/powerpc/kernel/smp.c
arch/powerpc/kernel/time.c
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/lib/Makefile
arch/powerpc/lib/copy_32.S
arch/powerpc/lib/copyuser_64.S
arch/powerpc/lib/usercopy_64.c [deleted file]
arch/powerpc/mm/hash_native_64.c
arch/s390/Kconfig
arch/s390/include/asm/extable.h [new file with mode: 0644]
arch/s390/include/asm/pgtable.h
arch/s390/include/asm/uaccess.h
arch/s390/kernel/time.c
arch/s390/kvm/gaccess.c
arch/s390/lib/uaccess.c
arch/score/include/asm/Kbuild
arch/score/include/asm/extable.h [deleted file]
arch/score/include/asm/uaccess.h
arch/score/kernel/time.c
arch/sh/include/asm/extable.h [new file with mode: 0644]
arch/sh/include/asm/uaccess.h
arch/sparc/Kconfig
arch/sparc/include/asm/page_64.h
arch/sparc/include/asm/pgtable_64.h
arch/sparc/include/asm/processor_32.h
arch/sparc/include/asm/processor_64.h
arch/sparc/include/asm/ptrace.h
arch/sparc/include/asm/uaccess.h
arch/sparc/include/asm/uaccess_32.h
arch/sparc/include/asm/uaccess_64.h
arch/sparc/include/uapi/asm/unistd.h
arch/sparc/kernel/head_32.S
arch/sparc/kernel/head_64.S
arch/sparc/kernel/misctrap.S
arch/sparc/kernel/ptrace_64.c
arch/sparc/kernel/rtrap_64.S
arch/sparc/kernel/spiterrs.S
arch/sparc/kernel/sun4v_tlb_miss.S
arch/sparc/kernel/sysfs.c
arch/sparc/kernel/systbls_32.S
arch/sparc/kernel/systbls_64.S
arch/sparc/kernel/time_32.c
arch/sparc/kernel/time_64.c
arch/sparc/kernel/urtt_fill.S
arch/sparc/kernel/winfixup.S
arch/sparc/lib/GENcopy_from_user.S
arch/sparc/lib/GENcopy_to_user.S
arch/sparc/lib/GENpatch.S
arch/sparc/lib/NG2copy_from_user.S
arch/sparc/lib/NG2copy_to_user.S
arch/sparc/lib/NG2memcpy.S
arch/sparc/lib/NG2patch.S
arch/sparc/lib/NG4copy_from_user.S
arch/sparc/lib/NG4copy_to_user.S
arch/sparc/lib/NG4memcpy.S
arch/sparc/lib/NG4memset.S
arch/sparc/lib/NG4patch.S
arch/sparc/lib/NGcopy_from_user.S
arch/sparc/lib/NGcopy_to_user.S
arch/sparc/lib/NGmemcpy.S
arch/sparc/lib/NGpatch.S
arch/sparc/lib/U1copy_from_user.S
arch/sparc/lib/U1copy_to_user.S
arch/sparc/lib/U3copy_to_user.S
arch/sparc/lib/U3patch.S
arch/sparc/lib/copy_in_user.S
arch/sparc/lib/copy_user.S
arch/sparc/mm/hugetlbpage.c
arch/sparc/mm/init_64.c
arch/sparc/mm/srmmu.c
arch/sparc/mm/tlb.c
arch/sparc/mm/tsb.c
arch/tile/configs/tilegx_defconfig
arch/tile/configs/tilepro_defconfig
arch/tile/include/asm/Kbuild
arch/tile/include/asm/uaccess.h
arch/tile/kernel/time.c
arch/tile/lib/exports.c
arch/tile/lib/memcpy_32.S
arch/tile/lib/memcpy_user_64.c
arch/um/include/asm/Kbuild
arch/um/include/asm/uaccess.h
arch/um/include/shared/os.h
arch/um/kernel/skas/uaccess.c
arch/um/kernel/time.c
arch/unicore32/include/asm/Kbuild
arch/unicore32/include/asm/uaccess.h
arch/unicore32/kernel/ksyms.c
arch/unicore32/kernel/process.c
arch/unicore32/kernel/time.c
arch/unicore32/lib/copy_from_user.S
arch/unicore32/lib/copy_to_user.S
arch/x86/Kconfig
arch/x86/Makefile
arch/x86/boot/boot.h
arch/x86/boot/compressed/eboot.c
arch/x86/boot/compressed/kaslr.c
arch/x86/boot/header.S
arch/x86/boot/memory.c
arch/x86/configs/i386_defconfig
arch/x86/configs/x86_64_defconfig
arch/x86/entry/syscalls/syscall_32.tbl
arch/x86/events/amd/iommu.c
arch/x86/events/amd/iommu.h
arch/x86/events/amd/uncore.c
arch/x86/events/intel/bts.c
arch/x86/events/intel/core.c
arch/x86/events/intel/ds.c
arch/x86/events/intel/lbr.c
arch/x86/events/intel/pt.c
arch/x86/events/intel/pt.h
arch/x86/events/perf_event.h
arch/x86/include/asm/acpi.h
arch/x86/include/asm/atomic.h
arch/x86/include/asm/atomic64_64.h
arch/x86/include/asm/cmpxchg.h
arch/x86/include/asm/cpufeatures.h
arch/x86/include/asm/e820.h [deleted file]
arch/x86/include/asm/e820/api.h [new file with mode: 0644]
arch/x86/include/asm/e820/types.h [new file with mode: 0644]
arch/x86/include/asm/gart.h
arch/x86/include/asm/kprobes.h
arch/x86/include/asm/mce.h
arch/x86/include/asm/mpspec.h
arch/x86/include/asm/msr-index.h
arch/x86/include/asm/pci_x86.h
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pmem.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/proto.h
arch/x86/include/asm/reboot.h
arch/x86/include/asm/thread_info.h
arch/x86/include/asm/tlbflush.h
arch/x86/include/asm/uaccess.h
arch/x86/include/asm/uaccess_32.h
arch/x86/include/asm/uaccess_64.h
arch/x86/include/asm/xen/page.h
arch/x86/include/uapi/asm/bootparam.h
arch/x86/include/uapi/asm/prctl.h
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/aperture_64.c
arch/x86/kernel/apic/apic.c
arch/x86/kernel/apic/apic_noop.c
arch/x86/kernel/apic/probe_32.c
arch/x86/kernel/apic/x2apic_uv_x.c
arch/x86/kernel/cpu/centaur.c
arch/x86/kernel/cpu/intel.c
arch/x86/kernel/cpu/mcheck/Makefile
arch/x86/kernel/cpu/mcheck/dev-mcelog.c [new file with mode: 0644]
arch/x86/kernel/cpu/mcheck/mce-genpool.c
arch/x86/kernel/cpu/mcheck/mce-internal.h
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/cpu/mcheck/mce_intel.c
arch/x86/kernel/cpu/mtrr/cleanup.c
arch/x86/kernel/cpu/mtrr/main.c
arch/x86/kernel/crash.c
arch/x86/kernel/e820.c
arch/x86/kernel/early-quirks.c
arch/x86/kernel/ftrace.c
arch/x86/kernel/head32.c
arch/x86/kernel/head64.c
arch/x86/kernel/head_64.S
arch/x86/kernel/kexec-bzimage64.c
arch/x86/kernel/kprobes/common.h
arch/x86/kernel/kprobes/core.c
arch/x86/kernel/kprobes/ftrace.c
arch/x86/kernel/kprobes/opt.c
arch/x86/kernel/mpparse.c
arch/x86/kernel/nmi.c
arch/x86/kernel/probe_roms.c
arch/x86/kernel/process.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/kernel/ptrace.c
arch/x86/kernel/reboot.c
arch/x86/kernel/resource.c
arch/x86/kernel/setup.c
arch/x86/kernel/smp.c
arch/x86/kernel/tboot.c
arch/x86/kernel/x86_init.c
arch/x86/kvm/vmx.c
arch/x86/lguest/boot.c
arch/x86/lib/delay.c
arch/x86/lib/kaslr.c
arch/x86/lib/usercopy.c
arch/x86/lib/usercopy_32.c
arch/x86/lib/usercopy_64.c
arch/x86/mm/amdtopology.c
arch/x86/mm/init.c
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
arch/x86/mm/ioremap.c
arch/x86/mm/kasan_init_64.c
arch/x86/mm/mmio-mod.c
arch/x86/mm/numa.c
arch/x86/mm/pageattr.c
arch/x86/mm/pat.c
arch/x86/mm/pgtable_32.c
arch/x86/mm/srat.c
arch/x86/pci/i386.c
arch/x86/pci/mmconfig-shared.c
arch/x86/pci/mmconfig_32.c
arch/x86/pci/mmconfig_64.c
arch/x86/pci/pcbios.c
arch/x86/platform/efi/Makefile
arch/x86/platform/efi/efi.c
arch/x86/platform/efi/efi_64.c
arch/x86/platform/efi/quirks.c
arch/x86/platform/uv/uv_time.c
arch/x86/power/hibernate_64.c
arch/x86/ras/Kconfig
arch/x86/um/Makefile
arch/x86/um/asm/ptrace.h
arch/x86/um/os-Linux/prctl.c
arch/x86/um/syscalls_32.c [new file with mode: 0644]
arch/x86/um/syscalls_64.c
arch/x86/xen/enlighten.c
arch/x86/xen/mmu.c
arch/x86/xen/setup.c
arch/x86/xen/time.c
arch/xtensa/include/asm/Kbuild
arch/xtensa/include/asm/asm-uaccess.h
arch/xtensa/include/asm/uaccess.h
arch/xtensa/lib/usercopy.S
block/Kconfig
block/Kconfig.iosched
block/Makefile
block/bfq-cgroup.c [new file with mode: 0644]
block/bfq-iosched.c [new file with mode: 0644]
block/bfq-iosched.h [new file with mode: 0644]
block/bfq-wf2q.c [new file with mode: 0644]
block/bio.c
block/blk-cgroup.c
block/blk-core.c
block/blk-exec.c
block/blk-flush.c
block/blk-integrity.c
block/blk-lib.c
block/blk-merge.c
block/blk-mq-debugfs.c
block/blk-mq-pci.c
block/blk-mq-sched.c
block/blk-mq-sched.h
block/blk-mq-sysfs.c
block/blk-mq-tag.c
block/blk-mq.c
block/blk-mq.h
block/blk-settings.c
block/blk-stat.c
block/blk-stat.h
block/blk-sysfs.c
block/blk-throttle.c
block/blk-timeout.c
block/blk-wbt.c
block/blk-wbt.h
block/blk.h
block/bsg-lib.c
block/bsg.c
block/cfq-iosched.c
block/compat_ioctl.c
block/elevator.c
block/genhd.c
block/ioctl.c
block/ioprio.c
block/kyber-iosched.c [new file with mode: 0644]
block/partition-generic.c
block/scsi_ioctl.c
block/sed-opal.c
block/t10-pi.c
crypto/ahash.c
crypto/algif_aead.c
crypto/lrw.c
crypto/xts.c
drivers/acpi/Kconfig
drivers/acpi/Makefile
drivers/acpi/ac.c
drivers/acpi/acpi_extlog.c
drivers/acpi/acpi_ipmi.c
drivers/acpi/acpi_platform.c
drivers/acpi/acpi_processor.c
drivers/acpi/acpi_video.c
drivers/acpi/acpica/utresrc.c
drivers/acpi/apei/ghes.c
drivers/acpi/arm64/Kconfig
drivers/acpi/arm64/Makefile
drivers/acpi/arm64/gtdt.c [new file with mode: 0644]
drivers/acpi/battery.c
drivers/acpi/bgrt.c
drivers/acpi/blacklist.c
drivers/acpi/cppc_acpi.c
drivers/acpi/glue.c
drivers/acpi/internal.h
drivers/acpi/nfit/core.c
drivers/acpi/pmic/intel_pmic_chtwc.c [new file with mode: 0644]
drivers/acpi/pmic/intel_pmic_xpower.c
drivers/acpi/power.c
drivers/acpi/processor_driver.c
drivers/acpi/processor_throttling.c
drivers/acpi/property.c
drivers/acpi/scan.c
drivers/acpi/sysfs.c
drivers/acpi/tables.c
drivers/acpi/utils.c
drivers/ata/Kconfig
drivers/ata/Makefile
drivers/ata/ahci_dm816.c [new file with mode: 0644]
drivers/ata/ahci_octeon.c
drivers/ata/libata-core.c
drivers/ata/libata-scsi.c
drivers/ata/pata_at91.c [deleted file]
drivers/ata/pata_atiixp.c
drivers/ata/pata_macio.c
drivers/ata/pata_mpc52xx.c
drivers/ata/pata_of_platform.c
drivers/ata/sata_fsl.c
drivers/ata/sata_mv.c
drivers/ata/sata_via.c
drivers/base/platform-msi.c
drivers/base/power/domain.c
drivers/base/property.c
drivers/block/Kconfig
drivers/block/Makefile
drivers/block/ataflop.c
drivers/block/brd.c
drivers/block/cciss.c
drivers/block/drbd/drbd_debugfs.c
drivers/block/drbd/drbd_int.h
drivers/block/drbd/drbd_main.c
drivers/block/drbd/drbd_nl.c
drivers/block/drbd/drbd_receiver.c
drivers/block/drbd/drbd_req.c
drivers/block/drbd/drbd_worker.c
drivers/block/floppy.c
drivers/block/hd.c [deleted file]
drivers/block/loop.c
drivers/block/loop.h
drivers/block/mg_disk.c [deleted file]
drivers/block/mtip32xx/mtip32xx.c
drivers/block/mtip32xx/mtip32xx.h
drivers/block/nbd.c
drivers/block/null_blk.c
drivers/block/osdblk.c [deleted file]
drivers/block/paride/pcd.c
drivers/block/paride/pd.c
drivers/block/paride/pf.c
drivers/block/pktcdvd.c
drivers/block/rbd.c
drivers/block/rsxx/dev.c
drivers/block/swim.c
drivers/block/swim3.c
drivers/block/virtio_blk.c
drivers/block/xen-blkfront.c
drivers/block/zram/zram_drv.c
drivers/cdrom/cdrom.c
drivers/char/agp/amd64-agp.c
drivers/char/ipmi/bt-bmc.c
drivers/char/ipmi/ipmi_si_intf.c
drivers/char/ipmi/ipmi_ssif.c
drivers/char/ipmi/ipmi_watchdog.c
drivers/char/mem.c
drivers/char/mmtimer.c
drivers/char/virtio_console.c
drivers/clk/clk-stm32f4.c
drivers/clk/sunxi-ng/Kconfig
drivers/clk/sunxi-ng/ccu-sun8i-a33.c
drivers/clk/sunxi-ng/ccu_common.c
drivers/clk/sunxi-ng/ccu_common.h
drivers/clocksource/Kconfig
drivers/clocksource/Makefile
drivers/clocksource/arc_timer.c
drivers/clocksource/arm_arch_timer.c
drivers/clocksource/asm9260_timer.c
drivers/clocksource/bcm2835_timer.c
drivers/clocksource/bcm_kona_timer.c
drivers/clocksource/clksrc-probe.c
drivers/clocksource/dw_apb_timer.c
drivers/clocksource/em_sti.c
drivers/clocksource/h8300_timer8.c
drivers/clocksource/meson6_timer.c
drivers/clocksource/metag_generic.c
drivers/clocksource/mips-gic-timer.c
drivers/clocksource/nomadik-mtu.c
drivers/clocksource/numachip.c
drivers/clocksource/pxa_timer.c
drivers/clocksource/rockchip_timer.c
drivers/clocksource/samsung_pwm_timer.c
drivers/clocksource/sh_cmt.c
drivers/clocksource/sh_tmu.c
drivers/clocksource/sun4i_timer.c
drivers/clocksource/tegra20_timer.c
drivers/clocksource/time-armada-370-xp.c
drivers/clocksource/time-efm32.c
drivers/clocksource/time-orion.c
drivers/clocksource/timer-atlas7.c
drivers/clocksource/timer-atmel-pit.c
drivers/clocksource/timer-digicolor.c
drivers/clocksource/timer-fttmr010.c [moved from drivers/clocksource/timer-gemini.c with 72% similarity]
drivers/clocksource/timer-integrator-ap.c
drivers/clocksource/timer-nps.c
drivers/clocksource/timer-prima2.c
drivers/clocksource/timer-sp804.c
drivers/clocksource/timer-sun5i.c
drivers/clocksource/vf_pit_timer.c
drivers/cpufreq/Kconfig.arm
drivers/cpufreq/Makefile
drivers/cpufreq/cpufreq.c
drivers/cpufreq/dbx500-cpufreq.c
drivers/cpufreq/ia64-acpi-cpufreq.c
drivers/cpufreq/imx6q-cpufreq.c
drivers/cpufreq/intel_pstate.c
drivers/cpufreq/mt8173-cpufreq.c
drivers/cpufreq/qoriq-cpufreq.c
drivers/cpufreq/sh-cpufreq.c
drivers/cpufreq/sparc-us2e-cpufreq.c
drivers/cpufreq/sparc-us3-cpufreq.c
drivers/cpufreq/tegra186-cpufreq.c [new file with mode: 0644]
drivers/cpuidle/cpuidle-cps.c
drivers/cpuidle/cpuidle-powernv.c
drivers/crypto/caam/caampkc.c
drivers/crypto/caam/ctrl.c
drivers/crypto/caam/intern.h
drivers/crypto/n2_core.c
drivers/dax/Kconfig
drivers/dax/dax.c
drivers/devfreq/governor.h
drivers/edac/Kconfig
drivers/edac/Makefile
drivers/edac/altera_edac.c
drivers/edac/edac_mc.c
drivers/edac/edac_stub.c [deleted file]
drivers/edac/pnd2_edac.c
drivers/edac/sb_edac.c
drivers/edac/skx_edac.c
drivers/edac/thunderx_edac.c [new file with mode: 0644]
drivers/extcon/devres.c
drivers/extcon/extcon.c
drivers/extcon/extcon.h
drivers/firmware/efi/Makefile
drivers/firmware/efi/efi-bgrt.c [moved from arch/x86/platform/efi/efi-bgrt.c with 100% similarity]
drivers/firmware/efi/efi-pstore.c
drivers/firmware/efi/libstub/arm-stub.c
drivers/firmware/efi/libstub/arm32-stub.c
drivers/firmware/efi/libstub/arm64-stub.c
drivers/firmware/efi/libstub/efi-stub-helper.c
drivers/firmware/efi/libstub/efistub.h
drivers/firmware/efi/libstub/fdt.c
drivers/firmware/efi/libstub/gop.c
drivers/firmware/efi/libstub/secureboot.c
drivers/gpu/drm/etnaviv/etnaviv_gpu.c
drivers/gpu/drm/i915/gvt/cfg_space.c
drivers/gpu/drm/i915/gvt/execlist.c
drivers/gpu/drm/i915/gvt/firmware.c
drivers/gpu/drm/i915/gvt/gvt.c
drivers/gpu/drm/i915/gvt/gvt.h
drivers/gpu/drm/i915/gvt/kvmgt.c
drivers/gpu/drm/i915/gvt/vgpu.c
drivers/gpu/drm/i915/i915_drv.c
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/i915/i915_gem.c
drivers/gpu/drm/i915/i915_gem_execbuffer.c
drivers/gpu/drm/i915/i915_gem_gtt.c
drivers/gpu/drm/i915/i915_gem_request.c
drivers/gpu/drm/i915/i915_gem_shrinker.c
drivers/gpu/drm/i915/i915_pci.c
drivers/gpu/drm/i915/i915_perf.c
drivers/gpu/drm/i915/intel_lrc.c
drivers/gpu/drm/i915/intel_ringbuffer.h
drivers/gpu/drm/nouveau/nv50_display.c
drivers/gpu/drm/nouveau/nvkm/engine/device/base.c
drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv31.c
drivers/gpu/drm/nouveau/nvkm/engine/mpeg/nv44.c
drivers/gpu/drm/udl/udl_transfer.c
drivers/hid/hid-core.c
drivers/hid/hid-ids.h
drivers/hid/hid-uclogic.c
drivers/hid/wacom_wac.c
drivers/hsi/clients/ssi_protocol.c
drivers/hwmon/Kconfig
drivers/hwmon/Makefile
drivers/hwmon/ad7414.c
drivers/hwmon/adc128d818.c
drivers/hwmon/ads1015.c
drivers/hwmon/ads7828.c
drivers/hwmon/adt7475.c
drivers/hwmon/aspeed-pwm-tacho.c [new file with mode: 0644]
drivers/hwmon/dell-smm-hwmon.c
drivers/hwmon/hwmon.c
drivers/hwmon/ina209.c
drivers/hwmon/ina2xx.c
drivers/hwmon/lm63.c
drivers/hwmon/lm75.c
drivers/hwmon/lm85.c
drivers/hwmon/lm87.c
drivers/hwmon/lm90.c
drivers/hwmon/lm95245.c
drivers/hwmon/max6697.c
drivers/hwmon/pmbus/adm1275.c
drivers/hwmon/pmbus/ucd9000.c
drivers/hwmon/pmbus/ucd9200.c
drivers/hwmon/stts751.c
drivers/hwmon/tmp102.c
drivers/hwmon/tmp103.c
drivers/hwmon/tmp421.c
drivers/hwmon/twl4030-madc-hwmon.c [deleted file]
drivers/hwmon/w83627ehf.c
drivers/hwtracing/coresight/coresight-etb10.c
drivers/hwtracing/coresight/coresight-etm-perf.c
drivers/hwtracing/coresight/coresight-priv.h
drivers/hwtracing/coresight/coresight-tmc-etf.c
drivers/ide/ide-atapi.c
drivers/ide/ide-cd.c
drivers/ide/ide-cd_ioctl.c
drivers/ide/ide-devsets.c
drivers/ide/ide-disk.c
drivers/ide/ide-dma.c
drivers/ide/ide-eh.c
drivers/ide/ide-floppy.c
drivers/ide/ide-io.c
drivers/ide/ide-ioctls.c
drivers/ide/ide-park.c
drivers/ide/ide-pm.c
drivers/ide/ide-tape.c
drivers/ide/ide-taskfile.c
drivers/iio/accel/hid-sensor-accel-3d.c
drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c
drivers/iio/common/hid-sensors/hid-sensor-attributes.c
drivers/iio/gyro/bmg160_core.c
drivers/iio/industrialio-core.c
drivers/iio/pressure/st_pressure_core.c
drivers/infiniband/ulp/isert/ib_isert.c
drivers/infiniband/ulp/isert/ib_isert.h
drivers/input/joystick/xpad.c
drivers/input/mouse/elantech.c
drivers/input/serio/i8042-x86ia64io.h
drivers/iommu/amd_iommu.c
drivers/iommu/amd_iommu_init.c
drivers/iommu/amd_iommu_proto.h
drivers/iommu/amd_iommu_types.h
drivers/irqchip/Kconfig
drivers/irqchip/Makefile
drivers/irqchip/irq-atmel-aic5.c
drivers/irqchip/irq-ftintc010.c [new file with mode: 0644]
drivers/irqchip/irq-gemini.c [deleted file]
drivers/irqchip/irq-gic-v3-its-platform-msi.c
drivers/irqchip/irq-gic-v3-its.c
drivers/irqchip/irq-imx-gpcv2.c
drivers/irqchip/irq-mbigen.c
drivers/irqchip/irq-mips-gic.c
drivers/irqchip/irq-moxart.c [deleted file]
drivers/irqchip/irq-mtk-cirq.c [new file with mode: 0644]
drivers/irqchip/irq-mtk-sysirq.c
drivers/isdn/capi/kcapi.c
drivers/leds/Kconfig
drivers/leds/Makefile
drivers/leds/led-class.c
drivers/leds/leds-cpcap.c [new file with mode: 0644]
drivers/leds/leds-gpio.c
drivers/leds/leds-lp3952.c
drivers/leds/leds-mt6323.c [new file with mode: 0644]
drivers/leds/leds-pca9532.c
drivers/leds/trigger/ledtrig-cpu.c
drivers/lightnvm/Kconfig
drivers/lightnvm/Makefile
drivers/lightnvm/core.c
drivers/lightnvm/pblk-cache.c [new file with mode: 0644]
drivers/lightnvm/pblk-core.c [new file with mode: 0644]
drivers/lightnvm/pblk-gc.c [new file with mode: 0644]
drivers/lightnvm/pblk-init.c [new file with mode: 0644]
drivers/lightnvm/pblk-map.c [new file with mode: 0644]
drivers/lightnvm/pblk-rb.c [new file with mode: 0644]
drivers/lightnvm/pblk-read.c [new file with mode: 0644]
drivers/lightnvm/pblk-recovery.c [new file with mode: 0644]
drivers/lightnvm/pblk-rl.c [new file with mode: 0644]
drivers/lightnvm/pblk-sysfs.c [new file with mode: 0644]
drivers/lightnvm/pblk-write.c [new file with mode: 0644]
drivers/lightnvm/pblk.h [new file with mode: 0644]
drivers/lightnvm/rrpc.c
drivers/mailbox/Kconfig
drivers/mailbox/Makefile
drivers/mailbox/bcm-flexrm-mailbox.c [new file with mode: 0644]
drivers/mailbox/bcm-pdc-mailbox.c
drivers/mailbox/hi6220-mailbox.c
drivers/mailbox/mailbox-xgene-slimpro.c
drivers/mailbox/mailbox.c
drivers/md/dm-cache-metadata.c
drivers/md/dm-cache-target.c
drivers/md/dm-core.h
drivers/md/dm-crypt.c
drivers/md/dm-io.c
drivers/md/dm-kcopyd.c
drivers/md/dm-linear.c
drivers/md/dm-mpath.c
drivers/md/dm-raid.c
drivers/md/dm-raid1.c
drivers/md/dm-rq.c
drivers/md/dm-stripe.c
drivers/md/dm-table.c
drivers/md/dm-thin.c
drivers/md/dm-verity-fec.c
drivers/md/dm-verity-fec.h
drivers/md/dm.c
drivers/md/linear.c
drivers/md/md.h
drivers/md/multipath.c
drivers/md/raid0.c
drivers/md/raid1.c
drivers/md/raid10.c
drivers/md/raid5.c
drivers/mmc/core/queue.c
drivers/mmc/core/sdio_bus.c
drivers/mmc/host/dw_mmc.c
drivers/mmc/host/sdhci-esdhc-imx.c
drivers/mtd/mtdcore.c
drivers/mtd/mtdsuper.c
drivers/mtd/ubi/block.c
drivers/mtd/ubi/upd.c
drivers/net/bonding/bond_main.c
drivers/net/can/ifi_canfd/ifi_canfd.c
drivers/net/can/rcar/rcar_can.c
drivers/net/can/usb/Kconfig
drivers/net/can/usb/gs_usb.c
drivers/net/can/usb/peak_usb/pcan_usb_core.c
drivers/net/can/usb/peak_usb/pcan_usb_core.h
drivers/net/can/usb/peak_usb/pcan_usb_fd.c
drivers/net/dsa/b53/b53_common.c
drivers/net/dsa/b53/b53_regs.h
drivers/net/ethernet/aquantia/atlantic/aq_main.c
drivers/net/ethernet/aquantia/atlantic/aq_nic.c
drivers/net/ethernet/aquantia/atlantic/aq_ring.c
drivers/net/ethernet/aquantia/atlantic/aq_ring.h
drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c
drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
drivers/net/ethernet/broadcom/bnxt/bnxt.c
drivers/net/ethernet/brocade/bna/bfa_ioc.c
drivers/net/ethernet/cavium/thunder/thunder_bgx.c
drivers/net/ethernet/cavium/thunder/thunder_bgx.h
drivers/net/ethernet/emulex/benet/be_cmds.c
drivers/net/ethernet/ezchip/nps_enet.c
drivers/net/ethernet/faraday/ftgmac100.c
drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c
drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c
drivers/net/ethernet/intel/e1000e/netdev.c
drivers/net/ethernet/intel/i40e/i40e_main.c
drivers/net/ethernet/mediatek/mtk_eth_soc.c
drivers/net/ethernet/mediatek/mtk_eth_soc.h
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
drivers/net/ethernet/mellanox/mlx5/core/lag.c
drivers/net/ethernet/mellanox/mlx5/core/main.c
drivers/net/ethernet/mellanox/mlx5/core/uar.c
drivers/net/ethernet/moxa/moxart_ether.c
drivers/net/ethernet/moxa/moxart_ether.h
drivers/net/ethernet/netronome/nfp/nfp_net_common.c
drivers/net/ethernet/qlogic/qed/qed_dcbx.c
drivers/net/ethernet/renesas/ravb_main.c
drivers/net/ethernet/renesas/sh_eth.c
drivers/net/ethernet/rocker/rocker_ofdpa.c
drivers/net/ethernet/sfc/efx.c
drivers/net/ethernet/sfc/efx.h
drivers/net/ethernet/sfc/falcon/efx.c
drivers/net/ethernet/sfc/workarounds.h
drivers/net/ethernet/ti/Kconfig
drivers/net/ethernet/ti/cpsw.c
drivers/net/ethernet/toshiba/tc35815.c
drivers/net/hyperv/hyperv_net.h
drivers/net/hyperv/netvsc.c
drivers/net/irda/vlsi_ir.c
drivers/net/macsec.c
drivers/net/macvlan.c
drivers/net/phy/dp83640.c
drivers/net/phy/mdio-boardinfo.c
drivers/net/phy/micrel.c
drivers/net/phy/phy.c
drivers/net/team/team.c
drivers/net/usb/Kconfig
drivers/net/usb/cdc_ether.c
drivers/net/usb/ch9200.c
drivers/net/usb/cx82310_eth.c
drivers/net/usb/hso.c
drivers/net/usb/kaweth.c
drivers/net/usb/lan78xx.c
drivers/net/usb/plusb.c
drivers/net/usb/qmi_wwan.c
drivers/net/usb/r8152.c
drivers/net/usb/smsc75xx.c
drivers/net/usb/smsc95xx.c
drivers/net/usb/sr9700.c
drivers/net/usb/usbnet.c
drivers/net/virtio_net.c
drivers/net/vrf.c
drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c
drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c
drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
drivers/net/wireless/intel/iwlwifi/mvm/sta.c
drivers/net/wireless/intel/iwlwifi/mvm/tx.c
drivers/net/wireless/realtek/rtlwifi/base.c
drivers/nvdimm/bus.c
drivers/nvdimm/claim.c
drivers/nvdimm/dimm_devs.c
drivers/nvme/host/core.c
drivers/nvme/host/fabrics.c
drivers/nvme/host/fabrics.h
drivers/nvme/host/fc.c
drivers/nvme/host/lightnvm.c
drivers/nvme/host/nvme.h
drivers/nvme/host/pci.c
drivers/nvme/host/rdma.c
drivers/nvme/host/scsi.c
drivers/nvme/target/admin-cmd.c
drivers/nvme/target/core.c
drivers/nvme/target/discovery.c
drivers/nvme/target/fabrics-cmd.c
drivers/nvme/target/fc.c
drivers/nvme/target/fcloop.c
drivers/nvme/target/io-cmd.c
drivers/nvme/target/loop.c
drivers/nvme/target/nvmet.h
drivers/nvme/target/rdma.c
drivers/pci/dwc/Kconfig
drivers/pci/dwc/pcie-artpec6.c
drivers/pci/dwc/pcie-designware-plat.c
drivers/pci/dwc/pcie-hisi.c
drivers/pci/host/pci-thunder-pem.c
drivers/pinctrl/core.c
drivers/pinctrl/freescale/pinctrl-imx.c
drivers/pinctrl/intel/pinctrl-cherryview.c
drivers/pinctrl/pinctrl-single.c
drivers/pinctrl/samsung/pinctrl-exynos.c
drivers/pinctrl/samsung/pinctrl-exynos.h
drivers/pinctrl/sh-pfc/pinctrl.c
drivers/pinctrl/ti/pinctrl-ti-iodelay.c
drivers/platform/x86/Kconfig
drivers/platform/x86/Makefile
drivers/platform/x86/dell-laptop.c
drivers/platform/x86/dell-wmi-led.c [moved from drivers/leds/dell-led.c with 56% similarity]
drivers/power/avs/rockchip-io-domain.c
drivers/power/reset/Kconfig
drivers/power/reset/Makefile
drivers/power/reset/gemini-poweroff.c [new file with mode: 0644]
drivers/power/reset/syscon-poweroff.c
drivers/power/supply/Kconfig
drivers/power/supply/Makefile
drivers/power/supply/ab8500_bmdata.c
drivers/power/supply/axp288_charger.c
drivers/power/supply/bq24190_charger.c
drivers/power/supply/bq25890_charger.c
drivers/power/supply/charger-manager.c
drivers/power/supply/cpcap-charger.c [new file with mode: 0644]
drivers/power/supply/lego_ev3_battery.c [new file with mode: 0644]
drivers/power/supply/lp8788-charger.c
drivers/power/supply/ltc2941-battery-gauge.c
drivers/power/supply/max17040_battery.c
drivers/power/supply/sbs-charger.c
drivers/power/supply/tps65217_charger.c
drivers/power/supply/twl4030_charger.c
drivers/ptp/ptp_clock.c
drivers/pwm/pwm-lpss-pci.c
drivers/pwm/pwm-lpss-platform.c
drivers/pwm/pwm-lpss.c
drivers/pwm/pwm-lpss.h
drivers/pwm/pwm-rockchip.c
drivers/ras/Makefile
drivers/ras/cec.c [new file with mode: 0644]
drivers/ras/debugfs.c
drivers/ras/debugfs.h [new file with mode: 0644]
drivers/ras/ras.c
drivers/reset/core.c
drivers/s390/net/qeth_core.h
drivers/s390/net/qeth_core_main.c
drivers/s390/net/qeth_l2_main.c
drivers/s390/net/qeth_l3_main.c
drivers/sbus/char/jsflash.c
drivers/scsi/Makefile
drivers/scsi/aacraid/aacraid.h
drivers/scsi/aacraid/commsup.c
drivers/scsi/esas2r/esas2r_ioctl.c
drivers/scsi/ipr.c
drivers/scsi/iscsi_tcp.c
drivers/scsi/lpfc/lpfc.h
drivers/scsi/lpfc/lpfc_attr.c
drivers/scsi/lpfc/lpfc_bsg.c
drivers/scsi/lpfc/lpfc_crtn.h
drivers/scsi/lpfc/lpfc_ct.c
drivers/scsi/lpfc/lpfc_debugfs.c
drivers/scsi/lpfc/lpfc_disc.h
drivers/scsi/lpfc/lpfc_els.c
drivers/scsi/lpfc/lpfc_hbadisc.c
drivers/scsi/lpfc/lpfc_hw.h
drivers/scsi/lpfc/lpfc_hw4.h
drivers/scsi/lpfc/lpfc_init.c
drivers/scsi/lpfc/lpfc_mbox.c
drivers/scsi/lpfc/lpfc_nportdisc.c
drivers/scsi/lpfc/lpfc_nvme.c
drivers/scsi/lpfc/lpfc_nvme.h
drivers/scsi/lpfc/lpfc_nvmet.c
drivers/scsi/lpfc/lpfc_nvmet.h
drivers/scsi/lpfc/lpfc_sli.c
drivers/scsi/lpfc/lpfc_sli4.h
drivers/scsi/lpfc/lpfc_version.h
drivers/scsi/lpfc/lpfc_vport.c
drivers/scsi/osd/osd_initiator.c
drivers/scsi/osst.c
drivers/scsi/qedf/qedf_fip.c
drivers/scsi/qedf/qedf_main.c
drivers/scsi/qla2xxx/qla_bsg.c
drivers/scsi/qla2xxx/qla_os.c
drivers/scsi/scsi_debugfs.c [new file with mode: 0644]
drivers/scsi/scsi_debugfs.h [new file with mode: 0644]
drivers/scsi/scsi_error.c
drivers/scsi/scsi_lib.c
drivers/scsi/scsi_transport_sas.c
drivers/scsi/sd.c
drivers/scsi/sd.h
drivers/scsi/sd_zbc.c
drivers/scsi/sg.c
drivers/scsi/sr.c
drivers/scsi/st.c
drivers/staging/android/ashmem.c
drivers/staging/lustre/lustre/include/lustre_disk.h
drivers/staging/lustre/lustre/llite/llite_lib.c
drivers/target/iscsi/iscsi_target.c
drivers/target/iscsi/iscsi_target_configfs.c
drivers/target/iscsi/iscsi_target_parameters.c
drivers/target/iscsi/iscsi_target_util.c
drivers/target/iscsi/iscsi_target_util.h
drivers/target/target_core_alua.c
drivers/target/target_core_configfs.c
drivers/target/target_core_device.c
drivers/target/target_core_fabric_configfs.c
drivers/target/target_core_pscsi.c
drivers/target/target_core_tpg.c
drivers/target/target_core_transport.c
drivers/target/target_core_user.c
drivers/thermal/Kconfig
drivers/thermal/Makefile
drivers/thermal/db8500_cpufreq_cooling.c [deleted file]
drivers/tty/tty_ldisc.c
drivers/usb/gadget/function/f_tcm.c
drivers/video/backlight/pwm_bl.c
drivers/video/fbdev/efifb.c
drivers/video/fbdev/omap/omapfb_main.c
drivers/video/fbdev/ssd1307fb.c
drivers/video/fbdev/xen-fbfront.c
drivers/virtio/virtio.c
drivers/virtio/virtio_pci_common.c
drivers/virtio/virtio_pci_common.h
drivers/virtio/virtio_pci_legacy.c
drivers/virtio/virtio_pci_modern.c
drivers/xen/xenbus/xenbus_dev_frontend.c
fs/9p/v9fs.c
fs/9p/v9fs.h
fs/9p/vfs_super.c
fs/afs/internal.h
fs/afs/super.c
fs/afs/volume.c
fs/block_dev.c
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/inode.c
fs/btrfs/qgroup.c
fs/btrfs/super.c
fs/btrfs/volumes.c
fs/ceph/addr.c
fs/ceph/debugfs.c
fs/ceph/inode.c
fs/ceph/super.c
fs/ceph/super.h
fs/cifs/cifs_fs_sb.h
fs/cifs/cifsfs.c
fs/cifs/cifsfs.h
fs/cifs/cifsglob.h
fs/cifs/cifssmb.c
fs/cifs/connect.c
fs/cifs/file.c
fs/cifs/ioctl.c
fs/cifs/smb1ops.c
fs/cifs/smb2misc.c
fs/cifs/smb2ops.c
fs/cifs/smb2pdu.c
fs/cifs/smb2proto.h
fs/cifs/smb2transport.c
fs/cifs/transport.c
fs/coda/inode.c
fs/dax.c
fs/ecryptfs/ecryptfs_kernel.h
fs/ecryptfs/main.c
fs/exec.c
fs/exofs/exofs.h
fs/exofs/super.c
fs/ext4/ext4.h
fs/ext4/file.c
fs/ext4/inode.c
fs/ext4/namei.c
fs/ext4/symlink.c
fs/fuse/dev.c
fs/fuse/fuse_i.h
fs/fuse/inode.c
fs/gfs2/ops_fstype.c
fs/hugetlbfs/inode.c
fs/namei.c
fs/ncpfs/inode.c
fs/ncpfs/ncp_fs_sb.h
fs/nfs/client.c
fs/nfs/direct.c
fs/nfs/internal.h
fs/nfs/super.c
fs/nfs/write.c
fs/nfsd/blocklayout.c
fs/nfsd/nfs3xdr.c
fs/nfsd/nfs4proc.c
fs/nfsd/nfssvc.c
fs/nfsd/nfsxdr.c
fs/nfsd/vfs.c
fs/nilfs2/super.c
fs/nsfs.c
fs/ocfs2/cluster/tcp.c
fs/orangefs/devorangefs-req.c
fs/orangefs/orangefs-bufmap.c
fs/orangefs/orangefs-kernel.h
fs/orangefs/super.c
fs/proc/proc_sysctl.c
fs/proc/task_mmu.c
fs/stat.c
fs/super.c
fs/sysfs/file.c
fs/ubifs/debug.c
fs/ubifs/dir.c
fs/ubifs/super.c
fs/ubifs/ubifs.h
fs/userfaultfd.c
fs/xfs/libxfs/xfs_dir2_priv.h
fs/xfs/libxfs/xfs_dir2_sf.c
fs/xfs/libxfs/xfs_inode_fork.c
fs/xfs/libxfs/xfs_inode_fork.h
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_inode.c
fs/xfs/xfs_iops.c
fs/xfs/xfs_itable.c
include/acpi/acpi_bus.h
include/acpi/cppc_acpi.h
include/asm-generic/extable.h [new file with mode: 0644]
include/asm-generic/uaccess.h
include/asm-generic/vmlinux.lds.h
include/clocksource/arm_arch_timer.h
include/crypto/internal/hash.h
include/kvm/arm_vgic.h
include/linux/acpi.h
include/linux/ata.h
include/linux/atomic.h
include/linux/backing-dev-defs.h
include/linux/backing-dev.h
include/linux/bio.h
include/linux/blk-mq.h
include/linux/blk_types.h
include/linux/blkdev.h
include/linux/cgroup-defs.h
include/linux/cgroup.h
include/linux/clockchips.h
include/linux/clocksource.h
include/linux/coda_psdev.h
include/linux/compat.h
include/linux/coresight.h
include/linux/cpufreq.h
include/linux/cpumask.h
include/linux/cpuset.h
include/linux/dell-led.h
include/linux/devfreq.h
include/linux/device-mapper.h
include/linux/edac.h
include/linux/efi-bgrt.h
include/linux/efi.h
include/linux/elevator.h
include/linux/extcon.h
include/linux/fs.h
include/linux/fwnode.h
include/linux/genhd.h
include/linux/hrtimer.h
include/linux/hwmon.h
include/linux/ide.h
include/linux/inet.h
include/linux/init_task.h
include/linux/irqchip/arm-gic.h
include/linux/irqchip/mips-gic.h
include/linux/kernel.h
include/linux/kobject.h
include/linux/kprobes.h
include/linux/leds-pca9532.h
include/linux/leds.h
include/linux/lightnvm.h
include/linux/lockdep.h
include/linux/mailbox/brcm-message.h
include/linux/mfd/cros_ec.h
include/linux/mfd/motorola-cpcap.h
include/linux/mg_disk.h [deleted file]
include/linux/mmc/sdio_func.h
include/linux/mmu_notifier.h
include/linux/module.h
include/linux/mtd/mtd.h
include/linux/nfs_fs_sb.h
include/linux/nvme-fc-driver.h
include/linux/nvme-fc.h
include/linux/nvme.h
include/linux/of.h
include/linux/percpu.h
include/linux/perf_event.h
include/linux/phy.h
include/linux/pinctrl/pinctrl.h
include/linux/pm_domain.h
include/linux/posix-clock.h
include/linux/posix-timers.h
include/linux/power/bq24190_charger.h [deleted file]
include/linux/property.h
include/linux/ras.h
include/linux/refcount.h
include/linux/reset.h
include/linux/sbitmap.h
include/linux/sched.h
include/linux/sched/rt.h
include/linux/smp.h
include/linux/stat.h
include/linux/t10-pi.h
include/linux/thread_info.h
include/linux/tick.h
include/linux/timekeeping.h
include/linux/uaccess.h
include/linux/uio.h
include/linux/virtio.h
include/linux/workqueue.h
include/linux/writeback.h
include/net/sctp/sctp.h
include/net/sctp/structs.h
include/rdma/ib.h
include/scsi/scsi_request.h
include/target/target_core_base.h
include/trace/events/block.h
include/trace/events/sched.h
include/uapi/linux/Kbuild
include/uapi/linux/elf-em.h
include/uapi/linux/ipv6_route.h
include/uapi/linux/lightnvm.h
include/uapi/linux/nbd-netlink.h [new file with mode: 0644]
include/uapi/linux/nbd.h
include/uapi/linux/perf_event.h
include/uapi/linux/stat.h
include/uapi/linux/virtio_pci.h
include/xen/page.h
kernel/audit.c
kernel/audit.h
kernel/auditsc.c
kernel/bpf/core.c
kernel/bpf/syscall.c
kernel/bpf/verifier.c
kernel/cgroup/cgroup-internal.h
kernel/cgroup/cgroup-v1.c
kernel/cgroup/cgroup.c
kernel/cgroup/cpuset.c
kernel/cgroup/namespace.c
kernel/compat.c
kernel/cpu.c
kernel/events/core.c
kernel/events/ring_buffer.c
kernel/fork.c
kernel/futex.c
kernel/irq/affinity.c
kernel/irq/chip.c
kernel/irq/manage.c
kernel/kprobes.c
kernel/kthread.c
kernel/locking/lockdep.c
kernel/locking/lockdep_internals.h
kernel/locking/rtmutex-debug.c
kernel/locking/rtmutex-debug.h
kernel/locking/rtmutex.c
kernel/locking/rtmutex.h
kernel/locking/rtmutex_common.h
kernel/locking/rwsem.c
kernel/locking/test-ww_mutex.c
kernel/module.c
kernel/nsproxy.c
kernel/params.c
kernel/ptrace.c
kernel/sched/core.c
kernel/sched/cpufreq_schedutil.c
kernel/sched/cputime.c
kernel/sched/fair.c
kernel/sched/features.h
kernel/sched/rt.c
kernel/sched/sched-pelt.h [new file with mode: 0644]
kernel/sched/sched.h
kernel/softirq.c
kernel/sysctl.c
kernel/time/alarmtimer.c
kernel/time/clockevents.c
kernel/time/hrtimer.c
kernel/time/posix-clock.c
kernel/time/posix-cpu-timers.c
kernel/time/posix-stubs.c
kernel/time/posix-timers.c
kernel/time/sched_clock.c
kernel/time/tick-sched.c
kernel/time/time.c
kernel/time/timekeeping.c
kernel/time/timer.c
kernel/time/timer_list.c
kernel/trace/Kconfig
kernel/trace/blktrace.c
kernel/trace/bpf_trace.c
kernel/trace/ftrace.c
kernel/trace/ring_buffer.c
kernel/trace/trace.c
kernel/trace/trace.h
kernel/trace/trace_kprobe.c
kernel/workqueue.c
lib/Kconfig.debug
lib/Makefile
lib/cmdline.c
lib/iov_iter.c
lib/kobject.c
lib/refcount.c
lib/sbitmap.c
lib/test_user_copy.c
lib/usercopy.c [new file with mode: 0644]
mm/Kconfig
mm/backing-dev.c
mm/huge_memory.c
mm/internal.h
mm/memory.c
mm/mempolicy.c
mm/migrate.c
mm/page_alloc.c
mm/page_vma_mapped.c
mm/percpu.c
mm/swap.c
mm/swap_cgroup.c
mm/vmstat.c
mm/z3fold.c
mm/zsmalloc.c
net/9p/client.c
net/bridge/br_device.c
net/bridge/br_if.c
net/bridge/br_multicast.c
net/bridge/br_netlink.c
net/bridge/br_private.h
net/core/datagram.c
net/core/dev.c
net/core/flow_dissector.c
net/core/neighbour.c
net/core/netpoll.c
net/core/secure_seq.c
net/core/skbuff.c
net/core/sock.c
net/core/sysctl_net_core.c
net/core/utils.c
net/ipv4/af_inet.c
net/ipv4/ip_sockglue.c
net/ipv4/ipconfig.c
net/ipv4/ipmr.c
net/ipv4/netfilter/ipt_CLUSTERIP.c
net/ipv4/netfilter/nf_nat_snmp_basic.c
net/ipv4/ping.c
net/ipv4/raw.c
net/ipv4/route.c
net/ipv4/tcp.c
net/ipv4/tcp_cong.c
net/ipv4/tcp_input.c
net/ipv4/tcp_output.c
net/ipv4/tcp_recovery.c
net/ipv4/udp_offload.c
net/ipv6/addrconf.c
net/ipv6/af_inet6.c
net/ipv6/datagram.c
net/ipv6/exthdrs.c
net/ipv6/ip6_input.c
net/ipv6/ip6_tunnel.c
net/ipv6/ip6mr.c
net/ipv6/ndisc.c
net/ipv6/raw.c
net/ipv6/route.c
net/ipv6/seg6.c
net/kcm/kcmsock.c
net/key/af_key.c
net/l2tp/l2tp_core.c
net/l2tp/l2tp_core.h
net/l2tp/l2tp_debugfs.c
net/l2tp/l2tp_eth.c
net/l2tp/l2tp_ip.c
net/l2tp/l2tp_ip6.c
net/l2tp/l2tp_netlink.c
net/l2tp/l2tp_ppp.c
net/mac80211/iface.c
net/mac80211/rx.c
net/netfilter/nf_conntrack_ecache.c
net/netfilter/nf_conntrack_expect.c
net/netfilter/nf_conntrack_extend.c
net/netfilter/nf_conntrack_helper.c
net/netfilter/nf_conntrack_netlink.c
net/netfilter/nf_nat_core.c
net/netfilter/nf_nat_redirect.c
net/netfilter/nfnetlink_cthelper.c
net/netfilter/nfnetlink_cttimeout.c
net/netfilter/nfnetlink_queue.c
net/netfilter/nft_hash.c
net/netfilter/xt_TCPMSS.c
net/netfilter/xt_TPROXY.c
net/openvswitch/conntrack.c
net/openvswitch/flow.c
net/packet/af_packet.c
net/qrtr/qrtr.c
net/rds/tcp.c
net/rds/tcp_send.c
net/sched/act_api.c
net/sched/sch_generic.c
net/sctp/associola.c
net/sctp/input.c
net/sctp/output.c
net/sctp/outqueue.c
net/sctp/proc.c
net/sctp/sm_make_chunk.c
net/sctp/sm_statefuns.c
net/sctp/socket.c
net/sctp/stream.c
net/sctp/transport.c
net/tipc/socket.c
net/wireless/sysfs.c
net/xfrm/xfrm_input.c
net/xfrm/xfrm_policy.c
samples/statx/test-statx.c
scripts/Makefile.lib
scripts/checkstack.pl
scripts/checksyscalls.sh
scripts/kconfig/gconf.c
security/Kconfig
security/keys/gc.c
security/keys/keyctl.c
security/keys/process_keys.c
security/tomoyo/network.c
sound/core/seq/seq_lock.c
sound/firewire/lib.h
sound/firewire/oxfw/oxfw.c
sound/pci/hda/dell_wmi_helper.c
sound/soc/intel/boards/bytcr_rt5640.c
sound/soc/intel/boards/bytcr_rt5651.c
sound/soc/soc-topology.c
sound/soc/sti/uniperif.h
sound/soc/sti/uniperif_player.c
sound/soc/sti/uniperif_reader.c
tools/arch/arm/include/uapi/asm/kvm.h
tools/arch/arm64/include/uapi/asm/kvm.h
tools/arch/powerpc/include/uapi/asm/kvm.h
tools/arch/x86/include/asm/atomic.h
tools/arch/x86/include/asm/cmpxchg.h [new file with mode: 0644]
tools/arch/x86/include/asm/cpufeatures.h
tools/arch/x86/lib/memcpy_64.S
tools/build/Makefile.feature
tools/build/feature/Makefile
tools/build/feature/test-all.c
tools/build/feature/test-sched_getcpu.c [new file with mode: 0644]
tools/include/asm-generic/atomic-gcc.h
tools/include/linux/atomic.h
tools/include/linux/bug.h [new file with mode: 0644]
tools/include/linux/compiler-gcc.h
tools/include/linux/compiler.h
tools/include/linux/filter.h
tools/include/linux/hashtable.h
tools/include/linux/kernel.h
tools/include/linux/log2.h
tools/include/linux/refcount.h [new file with mode: 0644]
tools/include/linux/types.h
tools/include/uapi/linux/fcntl.h [new file with mode: 0644]
tools/include/uapi/linux/perf_event.h
tools/include/uapi/linux/stat.h [new file with mode: 0644]
tools/lguest/lguest.c
tools/lib/api/fs/fs.c
tools/lib/api/fs/fs.h
tools/lib/subcmd/help.h
tools/lib/symbol/kallsyms.c
tools/objtool/builtin-check.c
tools/objtool/objtool.c
tools/perf/.gitignore
tools/perf/Build
tools/perf/Documentation/perf-ftrace.txt
tools/perf/Documentation/perf-list.txt
tools/perf/Documentation/perf-record.txt
tools/perf/Documentation/perf-report.txt
tools/perf/Documentation/perf-sched.txt
tools/perf/Documentation/perf-script.txt
tools/perf/Documentation/perf-stat.txt
tools/perf/Documentation/perf-trace.txt
tools/perf/Documentation/perf.data-file-format.txt
tools/perf/MANIFEST
tools/perf/Makefile.config
tools/perf/arch/arm/util/cs-etm.c
tools/perf/arch/arm/util/dwarf-regs.c
tools/perf/arch/arm/util/unwind-libdw.c
tools/perf/arch/arm64/util/dwarf-regs.c
tools/perf/arch/arm64/util/unwind-libunwind.c
tools/perf/arch/common.c
tools/perf/arch/powerpc/util/dwarf-regs.c
tools/perf/arch/powerpc/util/kvm-stat.c
tools/perf/arch/powerpc/util/perf_regs.c
tools/perf/arch/powerpc/util/sym-handling.c
tools/perf/arch/s390/annotate/instructions.c [new file with mode: 0644]
tools/perf/arch/s390/util/kvm-stat.c
tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
tools/perf/arch/x86/tests/intel-cqm.c
tools/perf/arch/x86/tests/perf-time-to-tsc.c
tools/perf/arch/x86/util/auxtrace.c
tools/perf/arch/x86/util/intel-bts.c
tools/perf/arch/x86/util/intel-pt.c
tools/perf/arch/x86/util/kvm-stat.c
tools/perf/arch/x86/util/perf_regs.c
tools/perf/arch/x86/util/unwind-libdw.c
tools/perf/bench/bench.h
tools/perf/bench/futex-hash.c
tools/perf/bench/futex-lock-pi.c
tools/perf/bench/futex-requeue.c
tools/perf/bench/futex-wake-parallel.c
tools/perf/bench/futex-wake.c
tools/perf/bench/futex.h
tools/perf/bench/mem-functions.c
tools/perf/bench/numa.c
tools/perf/bench/sched-messaging.c
tools/perf/bench/sched-pipe.c
tools/perf/builtin-annotate.c
tools/perf/builtin-bench.c
tools/perf/builtin-buildid-cache.c
tools/perf/builtin-buildid-list.c
tools/perf/builtin-c2c.c
tools/perf/builtin-config.c
tools/perf/builtin-data.c
tools/perf/builtin-diff.c
tools/perf/builtin-evlist.c
tools/perf/builtin-ftrace.c
tools/perf/builtin-help.c
tools/perf/builtin-inject.c
tools/perf/builtin-kallsyms.c
tools/perf/builtin-kmem.c
tools/perf/builtin-kvm.c
tools/perf/builtin-list.c
tools/perf/builtin-lock.c
tools/perf/builtin-mem.c
tools/perf/builtin-probe.c
tools/perf/builtin-record.c
tools/perf/builtin-report.c
tools/perf/builtin-sched.c
tools/perf/builtin-script.c
tools/perf/builtin-stat.c
tools/perf/builtin-timechart.c
tools/perf/builtin-top.c
tools/perf/builtin-trace.c
tools/perf/builtin-version.c
tools/perf/builtin.h
tools/perf/check-headers.sh
tools/perf/command-list.txt
tools/perf/perf.c
tools/perf/perf.h
tools/perf/pmu-events/arch/x86/broadwell/uncore.json [new file with mode: 0644]
tools/perf/pmu-events/arch/x86/broadwellde/uncore-cache.json
tools/perf/pmu-events/arch/x86/broadwellde/uncore-memory.json
tools/perf/pmu-events/arch/x86/broadwellde/uncore-power.json
tools/perf/pmu-events/arch/x86/broadwellx/uncore-cache.json
tools/perf/pmu-events/arch/x86/broadwellx/uncore-interconnect.json
tools/perf/pmu-events/arch/x86/broadwellx/uncore-memory.json
tools/perf/pmu-events/arch/x86/broadwellx/uncore-power.json
tools/perf/pmu-events/arch/x86/haswell/uncore.json [new file with mode: 0644]
tools/perf/pmu-events/arch/x86/haswellx/uncore-cache.json
tools/perf/pmu-events/arch/x86/haswellx/uncore-interconnect.json
tools/perf/pmu-events/arch/x86/haswellx/uncore-memory.json
tools/perf/pmu-events/arch/x86/haswellx/uncore-power.json
tools/perf/pmu-events/arch/x86/ivybridge/uncore.json [new file with mode: 0644]
tools/perf/pmu-events/arch/x86/ivytown/uncore-cache.json
tools/perf/pmu-events/arch/x86/ivytown/uncore-interconnect.json
tools/perf/pmu-events/arch/x86/ivytown/uncore-memory.json
tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json
tools/perf/pmu-events/arch/x86/jaketown/uncore-cache.json
tools/perf/pmu-events/arch/x86/jaketown/uncore-interconnect.json
tools/perf/pmu-events/arch/x86/jaketown/uncore-memory.json
tools/perf/pmu-events/arch/x86/jaketown/uncore-power.json
tools/perf/pmu-events/arch/x86/mapfile.csv
tools/perf/pmu-events/arch/x86/sandybridge/uncore.json [new file with mode: 0644]
tools/perf/pmu-events/arch/x86/skylake/uncore.json [new file with mode: 0644]
tools/perf/pmu-events/jevents.c
tools/perf/pmu-events/jevents.h
tools/perf/pmu-events/pmu-events.h
tools/perf/tests/Build
tools/perf/tests/attr.c
tools/perf/tests/backward-ring-buffer.c
tools/perf/tests/bpf.c
tools/perf/tests/builtin-test.c
tools/perf/tests/clang.c
tools/perf/tests/code-reading.c
tools/perf/tests/cpumap.c
tools/perf/tests/dso-data.c
tools/perf/tests/dwarf-unwind.c
tools/perf/tests/event-times.c
tools/perf/tests/evsel-roundtrip-name.c
tools/perf/tests/expr.c [new file with mode: 0644]
tools/perf/tests/hists_common.c
tools/perf/tests/hists_cumulate.c
tools/perf/tests/hists_filter.c
tools/perf/tests/hists_link.c
tools/perf/tests/hists_output.c
tools/perf/tests/is_printable_array.c
tools/perf/tests/mmap-basic.c
tools/perf/tests/mmap-thread-lookup.c
tools/perf/tests/openat-syscall-all-cpus.c
tools/perf/tests/openat-syscall-tp-fields.c
tools/perf/tests/openat-syscall.c
tools/perf/tests/parse-events.c
tools/perf/tests/parse-no-sample-id-all.c
tools/perf/tests/perf-record.c
tools/perf/tests/pmu.c
tools/perf/tests/sample-parsing.c
tools/perf/tests/sdt.c
tools/perf/tests/sw-clock.c
tools/perf/tests/switch-tracking.c
tools/perf/tests/task-exit.c
tools/perf/tests/tests.h
tools/perf/tests/thread-map.c
tools/perf/tests/thread-mg-share.c
tools/perf/tests/unit_number__scnprintf.c
tools/perf/tests/vmlinux-kallsyms.c
tools/perf/trace/beauty/Build [new file with mode: 0644]
tools/perf/trace/beauty/beauty.h [new file with mode: 0644]
tools/perf/trace/beauty/signum.c
tools/perf/trace/beauty/statx.c [new file with mode: 0644]
tools/perf/ui/browser.c
tools/perf/ui/browsers/annotate.c
tools/perf/ui/browsers/header.c
tools/perf/ui/browsers/hists.c
tools/perf/ui/browsers/map.c
tools/perf/ui/gtk/annotate.c
tools/perf/ui/gtk/hists.c
tools/perf/ui/hist.c
tools/perf/ui/setup.c
tools/perf/ui/stdio/hist.c
tools/perf/ui/tui/setup.c
tools/perf/util/Build
tools/perf/util/alias.c [deleted file]
tools/perf/util/annotate.c
tools/perf/util/annotate.h
tools/perf/util/auxtrace.c
tools/perf/util/auxtrace.h
tools/perf/util/bpf-loader.c
tools/perf/util/bpf-loader.h
tools/perf/util/bpf-prologue.c
tools/perf/util/bpf-prologue.h
tools/perf/util/build-id.c
tools/perf/util/build-id.h
tools/perf/util/c++/clang-c.h
tools/perf/util/cache.h
tools/perf/util/callchain.c
tools/perf/util/callchain.h
tools/perf/util/cgroup.c
tools/perf/util/cgroup.h
tools/perf/util/cloexec.c
tools/perf/util/cloexec.h
tools/perf/util/color.h
tools/perf/util/comm.c
tools/perf/util/compress.h [new file with mode: 0644]
tools/perf/util/config.c
tools/perf/util/counts.c
tools/perf/util/cpumap.c
tools/perf/util/cpumap.h
tools/perf/util/ctype.c
tools/perf/util/data-convert-bt.c
tools/perf/util/data.c
tools/perf/util/debug.c
tools/perf/util/debug.h
tools/perf/util/demangle-java.c
tools/perf/util/drv_configs.c
tools/perf/util/dso.c
tools/perf/util/dso.h
tools/perf/util/dump-insn.c [new file with mode: 0644]
tools/perf/util/dump-insn.h [new file with mode: 0644]
tools/perf/util/dwarf-aux.c
tools/perf/util/dwarf-regs.c
tools/perf/util/env.c
tools/perf/util/event.c
tools/perf/util/event.h
tools/perf/util/evlist.c
tools/perf/util/evlist.h
tools/perf/util/evsel.c
tools/perf/util/evsel.h
tools/perf/util/evsel_fprintf.c
tools/perf/util/expr.h [new file with mode: 0644]
tools/perf/util/expr.y [new file with mode: 0644]
tools/perf/util/header.c
tools/perf/util/help-unknown-cmd.c
tools/perf/util/hist.c
tools/perf/util/hist.h
tools/perf/util/intel-bts.c
tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c
tools/perf/util/intel-pt.c
tools/perf/util/jitdump.c
tools/perf/util/llvm-utils.c
tools/perf/util/lzma.c
tools/perf/util/machine.c
tools/perf/util/machine.h
tools/perf/util/map.c
tools/perf/util/map.h
tools/perf/util/mem-events.c
tools/perf/util/namespaces.c [new file with mode: 0644]
tools/perf/util/namespaces.h [new file with mode: 0644]
tools/perf/util/ordered-events.c
tools/perf/util/parse-events.c
tools/perf/util/parse-events.h
tools/perf/util/parse-events.y
tools/perf/util/path.c
tools/perf/util/path.h [new file with mode: 0644]
tools/perf/util/perf-hooks.c
tools/perf/util/perf_regs.c
tools/perf/util/perf_regs.h
tools/perf/util/pmu.c
tools/perf/util/pmu.h
tools/perf/util/print_binary.c [new file with mode: 0644]
tools/perf/util/print_binary.h [new file with mode: 0644]
tools/perf/util/probe-event.c
tools/perf/util/probe-event.h
tools/perf/util/probe-file.c
tools/perf/util/probe-file.h
tools/perf/util/probe-finder.c
tools/perf/util/probe-finder.h
tools/perf/util/python-ext-sources
tools/perf/util/python.c
tools/perf/util/quote.c
tools/perf/util/record.c
tools/perf/util/sane_ctype.h [new file with mode: 0644]
tools/perf/util/scripting-engines/trace-event-perl.c
tools/perf/util/scripting-engines/trace-event-python.c
tools/perf/util/session.c
tools/perf/util/session.h
tools/perf/util/sort.c
tools/perf/util/sort.h
tools/perf/util/srcline.c
tools/perf/util/srcline.h [new file with mode: 0644]
tools/perf/util/stat-shadow.c
tools/perf/util/stat.c
tools/perf/util/stat.h
tools/perf/util/strbuf.c
tools/perf/util/strfilter.c
tools/perf/util/string.c
tools/perf/util/string2.h [new file with mode: 0644]
tools/perf/util/strlist.c
tools/perf/util/symbol-elf.c
tools/perf/util/symbol-minimal.c
tools/perf/util/symbol.c
tools/perf/util/symbol.h
tools/perf/util/term.c
tools/perf/util/thread-stack.c
tools/perf/util/thread.c
tools/perf/util/thread.h
tools/perf/util/thread_map.c
tools/perf/util/thread_map.h
tools/perf/util/time-utils.c
tools/perf/util/time-utils.h
tools/perf/util/tool.h
tools/perf/util/top.h
tools/perf/util/trace-event-parse.c
tools/perf/util/trace-event-read.c
tools/perf/util/units.c [new file with mode: 0644]
tools/perf/util/units.h [new file with mode: 0644]
tools/perf/util/unwind-libdw.c
tools/perf/util/unwind-libdw.h
tools/perf/util/unwind-libunwind-local.c
tools/perf/util/unwind.h
tools/perf/util/util.c
tools/perf/util/util.h
tools/perf/util/values.c
tools/perf/util/vdso.c
tools/perf/util/xyarray.c
tools/perf/util/zlib.c
tools/power/cpupower/utils/helpers/cpuid.c
tools/power/pm-graph/Makefile [new file with mode: 0644]
tools/power/pm-graph/analyze_boot.py [new file with mode: 0755]
tools/power/pm-graph/analyze_suspend.py [moved from scripts/analyze_suspend.py with 91% similarity]
tools/power/pm-graph/bootgraph.8 [new file with mode: 0644]
tools/power/pm-graph/sleepgraph.8 [new file with mode: 0644]
tools/power/x86/intel_pstate_tracer/intel_pstate_tracer.py
tools/power/x86/turbostat/turbostat.8
tools/power/x86/turbostat/turbostat.c
tools/scripts/Makefile.include
tools/testing/selftests/bpf/Makefile
tools/testing/selftests/bpf/test_maps.c
tools/testing/selftests/bpf/test_verifier.c
tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc [new file with mode: 0644]
tools/testing/selftests/net/psock_fanout.c
tools/testing/selftests/net/psock_lib.h
tools/testing/selftests/powerpc/Makefile
virt/kvm/arm/vgic/vgic-init.c
virt/kvm/arm/vgic/vgic-mmio-v2.c
virt/kvm/arm/vgic/vgic-v2.c
virt/kvm/arm/vgic/vgic.h

index 67dc22f..1d6f4e7 100644 (file)
--- a/.mailmap
+++ b/.mailmap
@@ -99,6 +99,8 @@ Linas Vepstas <linas@austin.ibm.com>
 Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@web.de>
 Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@ascom.ch>
 Mark Brown <broonie@sirena.org.uk>
+Martin Kepplinger <martink@posteo.de> <martin.kepplinger@theobroma-systems.com>
+Martin Kepplinger <martink@posteo.de> <martin.kepplinger@ginzinger.com>
 Matthieu CASTET <castet.matthieu@free.fr>
 Mauro Carvalho Chehab <mchehab@kernel.org> <mchehab@brturbo.com.br>
 Mauro Carvalho Chehab <mchehab@kernel.org> <maurochehab@gmail.com>
@@ -171,6 +173,7 @@ Vlad Dogaru <ddvlad@gmail.com> <vlad.dogaru@intel.com>
 Vladimir Davydov <vdavydov.dev@gmail.com> <vdavydov@virtuozzo.com>
 Vladimir Davydov <vdavydov.dev@gmail.com> <vdavydov@parallels.com>
 Takashi YOSHII <takashi.yoshii.zj@renesas.com>
+Yakir Yang <kuankuan.y@gmail.com> <ykk@rock-chips.com>
 Yusuke Goda <goda.yusuke@renesas.com>
 Gustavo Padovan <gustavo@las.ic.unicamp.br>
 Gustavo Padovan <padovan@profusion.mobi>
diff --git a/CREDITS b/CREDITS
index c5626bf..5d09c26 100644 (file)
--- a/CREDITS
+++ b/CREDITS
@@ -1034,6 +1034,10 @@ S: 2037 Walnut #6
 S: Boulder, Colorado 80302
 S: USA
 
+N: Hans-Christian Noren Egtvedt
+E: egtvedt@samfundet.no
+D: AVR32 architecture maintainer.
+
 N: Heiko Eißfeldt
 E: heiko@colossus.escape.de heiko@unifix.de
 D: verify_area stuff, generic SCSI fixes
@@ -3398,6 +3402,10 @@ S: Suite 101
 S: Markham, Ontario L3R 2Z6
 S: Canada
 
+N: Haavard Skinnemoen
+M: Haavard Skinnemoen <hskinnemoen@gmail.com>
+D: AVR32 architecture port to Linux and maintainer.
+
 N: Rick Sladkey
 E: jrs@world.std.com
 D: utility hacker: Emacs, NFS server, mount, kmem-ps, UPS debugger, strace, GDB
diff --git a/Documentation/ABI/obsolete/sysfs-firmware-acpi b/Documentation/ABI/obsolete/sysfs-firmware-acpi
new file mode 100644 (file)
index 0000000..6715a71
--- /dev/null
@@ -0,0 +1,8 @@
+What:          /sys/firmware/acpi/hotplug/force_remove
+Date:          Mar 2017
+Contact:       Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Description:
+               Since the force_remove is inherently broken and dangerous to
+               use for some hotpluggable resources like memory (because ignoring
+               the offline failure might lead to memory corruption and crashes)
+               enabling this knob is not safe and thus unsupported.
index 2da04ce..dea212d 100644 (file)
@@ -213,14 +213,8 @@ What:              /sys/block/<disk>/queue/discard_zeroes_data
 Date:          May 2011
 Contact:       Martin K. Petersen <martin.petersen@oracle.com>
 Description:
-               Devices that support discard functionality may return
-               stale or random data when a previously discarded block
-               is read back. This can cause problems if the filesystem
-               expects discarded blocks to be explicitly cleared. If a
-               device reports that it deterministically returns zeroes
-               when a discarded area is read the discard_zeroes_data
-               parameter will be set to one. Otherwise it will be 0 and
-               the result of reading a discarded area is undefined.
+               Will always return 0.  Don't rely on any specific behavior
+               for discards, and don't read this file.
 
 What:          /sys/block/<disk>/queue/write_same_max_bytes
 Date:          January 2012
index c7fc72d..613f42a 100644 (file)
@@ -44,16 +44,6 @@ Description:
                or 0 (unset).  Attempts to write any other values to it will
                cause -EINVAL to be returned.
 
-What:          /sys/firmware/acpi/hotplug/force_remove
-Date:          May 2013
-Contact:       Rafael J. Wysocki <rafael.j.wysocki@intel.com>
-Description:
-               The number in this file (0 or 1) determines whether (1) or not
-               (0) the ACPI subsystem will allow devices to be hot-removed even
-               if they cannot be put offline gracefully (from the kernel's
-               viewpoint).  That number can be changed by writing a boolean
-               value to this file.
-
 What:          /sys/firmware/acpi/interrupts/
 Date:          February 2008
 Contact:       Len Brown <lenb@kernel.org>
diff --git a/Documentation/acpi/dsd/graph.txt b/Documentation/acpi/dsd/graph.txt
new file mode 100644 (file)
index 0000000..ac09e31
--- /dev/null
@@ -0,0 +1,162 @@
+Graphs
+
+
+_DSD
+----
+
+_DSD (Device Specific Data) [7] is a predefined ACPI device
+configuration object that can be used to convey information on
+hardware features which are not specifically covered by the ACPI
+specification [1][6]. There are two _DSD extensions that are relevant
+for graphs: property [4] and hierarchical data extensions [5]. The
+property extension provides generic key-value pairs whereas the
+hierarchical data extension supports nodes with references to other
+nodes, forming a tree. The nodes in the tree may contain properties as
+defined by the property extension. The two extensions together provide
+a tree-like structure with zero or more properties (key-value pairs)
+in each node of the tree.
+
+The data structure may be accessed at runtime by using the device_*
+and fwnode_* functions defined in include/linux/fwnode.h .
+
+Fwnode represents a generic firmware node object. It is independent of
+the firmware type. In ACPI, fwnodes are _DSD hierarchical data
+extensions objects. A device's _DSD object is represented by an
+fwnode.
+
+The data structure may be referenced elsewhere in the ACPI tables
+by using a hard reference to the device itself and an index to the
+hierarchical data extension array on each depth.
+
+
+Ports and endpoints
+-------------------
+
+The port and endpoint concepts are very similar to those in Devicetree
+[3]. A port represents an interface in a device, and an endpoint
+represents a connection to that interface.
+
+All port nodes are located under the device's "_DSD" node in the
+hierarchical data extension tree. The property extension related to
+each port node must contain the key "port" and an integer value which
+is the number of the port. The object it refers to should be called "PRTX",
+where "X" is the number of the port.
+
+Further on, endpoints are located under the individual port nodes. The
+first hierarchical data extension package list entry of the endpoint
+nodes must begin with "endpoint" and must be followed by the number
+of the endpoint. The object it refers to should be called "EPXY", where
+"X" is the number of the port and "Y" is the number of the endpoint.
+
+Each port node contains a property extension key "port", the value of
+which is the number of the port node. Each endpoint is similarly numbered
+with a property extension key "endpoint". Port numbers must be unique within a
+device and endpoint numbers must be unique within a port.
+
+The endpoint reference uses property extension with "remote-endpoint" property
+name followed by a reference in the same package. Such references consist of the
+remote device reference, the number of the port in the device and finally the
+number of the endpoint in that port. Individual references thus appear as:
+
+    Package() { device, port_number, endpoint_number }
+
+The references to endpoints must always be done both ways, to the
+remote endpoint and back from the referred remote endpoint node.
+
+A simple example of this is shown below:
+
+    Scope (\_SB.PCI0.I2C2)
+    {
+       Device (CAM0)
+       {
+           Name (_DSD, Package () {
+               ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
+               Package () {
+                   Package () { "compatible", Package () { "nokia,smia" } },
+               },
+               ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
+               Package () {
+                   Package () { "port0", "PRT0" },
+               }
+           })
+           Name (PRT0, Package() {
+               ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
+               Package () {
+                   Package () { "port", 0 },
+               },
+               ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
+               Package () {
+                   Package () { "endpoint0", "EP00" },
+               }
+           })
+           Name (EP00, Package() {
+               ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
+               Package () {
+                   Package () { "endpoint", 0 },
+                   Package () { "remote-endpoint", Package() { \_SB.PCI0.ISP, 4, 0 } },
+               }
+           })
+       }
+    }
+
+    Scope (\_SB.PCI0)
+    {
+       Device (ISP)
+       {
+           Name (_DSD, Package () {
+               ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
+               Package () {
+                   Package () { "port4", "PRT4" },
+               }
+           })
+
+           Name (PRT4, Package() {
+               ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
+               Package () {
+                   Package () { "port", 4 }, /* CSI-2 port number */
+               },
+               ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
+               Package () {
+                   Package () { "endpoint0", "EP40" },
+               }
+           })
+
+           Name (EP40, Package() {
+               ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
+               Package () {
+                   Package () { "endpoint", 0 },
+                   Package () { "remote-endpoint", Package () { \_SB.PCI0.I2C2.CAM0, 0, 0 } },
+               }
+           })
+       }
+    }
+
+Here, the port 0 of the "CAM0" device is connected to the port 4 of
+the "ISP" device and vice versa.
+
+
+References
+----------
+
+[1] _DSD (Device Specific Data) Implementation Guide.
+    <URL:http://www.uefi.org/sites/default/files/resources/_DSD-implementation-guide-toplevel-1_1.htm>,
+    referenced 2016-10-03.
+
+[2] Devicetree. <URL:http://www.devicetree.org>, referenced 2016-10-03.
+
+[3] Documentation/devicetree/bindings/graph.txt
+
+[4] Device Properties UUID For _DSD.
+    <URL:http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf>,
+    referenced 2016-10-04.
+
+[5] Hierarchical Data Extension UUID For _DSD.
+    <URL:http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.pdf>,
+    referenced 2016-10-04.
+
+[6] Advanced Configuration and Power Interface Specification.
+    <URL:http://www.uefi.org/sites/default/files/resources/ACPI_6_1.pdf>,
+    referenced 2016-10-04.
+
+[7] _DSD Device Properties Usage Rules.
+    Documentation/acpi/DSD-properties-rules.txt
index defe2ee..3ad7b0d 100644 (file)
@@ -24,7 +24,7 @@ upstream.
    The homepage of ACPICA project is: www.acpica.org, it is maintained and
    supported by Intel Corporation.
 
-   The following figure depicts the Linux ACPI subystem where the ACPICA
+   The following figure depicts the Linux ACPI subsystem where the ACPICA
    adaptation is included:
 
       +---------------------------------------------------------+
@@ -110,7 +110,7 @@ upstream.
    Linux patches.  The patches generated by this process are referred to as
    "linuxized ACPICA patches".  The release process is carried out on a local
    copy the ACPICA git repository.  Each commit in the monthly release is
-   converted into a linuxized ACPICA patch.  Together, they form the montly
+   converted into a linuxized ACPICA patch.  Together, they form the monthly
    ACPICA release patchset for the Linux ACPI community.  This process is
    illustrated in the following figure:
 
@@ -165,7 +165,7 @@ upstream.
        <http://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git>.
 
    Before the linuxized ACPICA patches are sent to the Linux ACPI community
-   for review, there is a quality ensurance build test process to reduce
+   for review, there is a quality assurance build test process to reduce
    porting issues.  Currently this build process only takes care of the
    following kernel configuration options:
    CONFIG_ACPI/CONFIG_ACPI_DEBUG/CONFIG_ACPI_DEBUGGER
@@ -195,12 +195,12 @@ upstream.
       release utilities (please refer to Section 4 below for the details).
    3. Linux specific features - Sometimes it's impossible to use the
       current ACPICA APIs to implement features required by the Linux kernel,
-      so Linux developers occasionaly have to change ACPICA code directly.
+      so Linux developers occasionally have to change ACPICA code directly.
       Those changes may not be acceptable by ACPICA upstream and in such cases
       they are left as committed ACPICA divergences unless the ACPICA side can
       implement new mechanisms as replacements for them.
    4. ACPICA release fixups - ACPICA only tests commits using a set of the
-      user space simulation utilies, thus the linuxized ACPICA patches may
+      user space simulation utilities, thus the linuxized ACPICA patches may
       break the Linux kernel, leaving us build/boot failures.  In order to
       avoid breaking Linux bisection, fixes are applied directly to the
       linuxized ACPICA patches during the release process.  When the release
index 697a00c..02f639a 100644 (file)
@@ -27,7 +27,7 @@ On what hardware does it run?
   today Linux also runs on (at least) the Compaq Alpha AXP, Sun SPARC and
   UltraSPARC, Motorola 68000, PowerPC, PowerPC64, ARM, Hitachi SuperH, Cell,
   IBM S/390, MIPS, HP PA-RISC, Intel IA-64, DEC VAX, AMD x86-64, AXIS CRIS,
-  Xtensa, Tilera TILE, AVR32, ARC and Renesas M32R architectures.
+  Xtensa, Tilera TILE, ARC and Renesas M32R architectures.
 
   Linux is easily portable to most general-purpose 32- or 64-bit architectures
   as long as they have a paged memory management unit (PMMU) and a port of the
index b516164..c74933c 100644 (file)
@@ -86,7 +86,6 @@ parameter is applicable::
        APIC    APIC support is enabled.
        APM     Advanced Power Management support is enabled.
        ARM     ARM architecture is enabled.
-       AVR32   AVR32 architecture is enabled.
        AX25    Appropriate AX.25 support is enabled.
        BLACKFIN Blackfin architecture is enabled.
        CLK     Common clock infrastructure is enabled.
index facc20a..ba29450 100644 (file)
                        [ACPI] acpi_pm
                        [ARM] imx_timer1,OSTS,netx_timer,mpu_timer2,
                                pxa_timer,timer3,32k_counter,timer0_1
-                       [AVR32] avr32
                        [X86-32] pit,hpet,tsc;
                                scx200_hrt on Geode; cyclone on IBM x440
                        [MIPS] MIPS
                        osd-targets. Please see:
                        Documentation/filesystems/pnfs.txt for more explanations
 
-       nmi_debug=      [KNL,AVR32,SH] Specify one or more actions to take
+       nmi_debug=      [KNL,SH] Specify one or more actions to take
                        when a NMI is triggered.
                        Format: [state][,regs][,debounce][,die]
 
        ramdisk_size=   [RAM] Sizes of RAM disks in kilobytes
                        See Documentation/blockdev/ramdisk.txt.
 
+       ras=option[,option,...] [KNL] RAS-specific options
+
+               cec_disable     [X86]
+                               Disable the Correctable Errors Collector,
+                               see CONFIG_RAS_CEC help text.
+
        rcu_nocbs=      [KNL]
                        The argument is a cpu list, as described above.
 
index 2f66683..10f2ddd 100644 (file)
@@ -54,6 +54,7 @@ stable kernels.
 | ARM            | Cortex-A57      | #852523         | N/A                         |
 | ARM            | Cortex-A57      | #834220         | ARM64_ERRATUM_834220        |
 | ARM            | Cortex-A72      | #853709         | N/A                         |
+| ARM            | Cortex-A73      | #858921         | ARM64_ERRATUM_858921        |
 | ARM            | MMU-500         | #841119,#826419 | N/A                         |
 |                |                 |                 |                             |
 | Cavium         | ThunderX ITS    | #22375, #24313  | CAVIUM_ERRATUM_22375        |
index e55103a..8d55b4b 100644 (file)
@@ -1,5 +1,7 @@
 00-INDEX
        - This file
+bfq-iosched.txt
+       - BFQ IO scheduler and its tunables
 biodoc.txt
        - Notes on the Generic Block Layer Rewrite in Linux 2.5
 biovecs.txt
diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
new file mode 100644 (file)
index 0000000..1b87df6
--- /dev/null
@@ -0,0 +1,531 @@
+BFQ (Budget Fair Queueing)
+==========================
+
+BFQ is a proportional-share I/O scheduler, with some extra
+low-latency capabilities. In addition to cgroups support (blkio or io
+controllers), BFQ's main features are:
+- BFQ guarantees a high system and application responsiveness, and a
+  low latency for time-sensitive applications, such as audio or video
+  players;
+- BFQ distributes bandwidth, and not just time, among processes or
+  groups (switching back to time distribution when needed to keep
+  throughput high).
+
+On average CPUs, the current version of BFQ can handle devices
+performing at most ~30 KIOPS; at most ~50 KIOPS on faster CPUs. As a
+reference, 30-50 KIOPS correspond to very high bandwidths with
+sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and
+to 120-200 MB/s with 4KB random I/O. BFQ has not yet been tested on
+multi-queue devices.
+
+The table of contents follows. Impatient readers can jump to Section 3.
+
+CONTENTS
+
+1. When may BFQ be useful?
+ 1-1 Personal systems
+ 1-2 Server systems
+2. How does BFQ work?
+3. What are BFQ's tunables?
+4. BFQ group scheduling
+ 4-1 Service guarantees provided
+ 4-2 Interface
+
+1. When may BFQ be useful?
+==========================
+
+BFQ provides the following benefits on personal and server systems.
+
+1-1 Personal systems
+--------------------
+
+Low latency for interactive applications
+
+Regardless of the actual background workload, BFQ guarantees that, for
+interactive tasks, the storage device is virtually as responsive as if
+it was idle. For example, even if one or more of the following
+background workloads are being executed:
+- one or more large files are being read, written or copied,
+- a tree of source files is being compiled,
+- one or more virtual machines are performing I/O,
+- a software update is in progress,
+- indexing daemons are scanning filesystems and updating their
+  databases,
+starting an application or loading a file from within an application
+takes about the same time as if the storage device was idle. As a
+comparison, with CFQ, NOOP or DEADLINE, and in the same conditions,
+applications experience high latencies, or even become unresponsive
+until the background workload terminates (also on SSDs).
+
+Low latency for soft real-time applications
+
+Also soft real-time applications, such as audio and video
+players/streamers, enjoy a low latency and a low drop rate, regardless
+of the background I/O workload. As a consequence, these applications
+suffer almost no glitches due to the background workload.
+
+Higher speed for code-development tasks
+
+If some additional workload happens to be executed in parallel, then
+BFQ executes the I/O-related components of typical code-development
+tasks (compilation, checkout, merge, ...) much more quickly than CFQ,
+NOOP or DEADLINE.
+
+High throughput
+
+On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and
+up to 150% higher throughput than DEADLINE and NOOP, with all the
+sequential workloads considered in our tests. With random workloads,
+and with all the workloads on flash-based devices, BFQ achieves,
+instead, about the same throughput as the other schedulers.
+
+Strong fairness, bandwidth and delay guarantees
+
+BFQ distributes the device throughput, and not just the device time,
+among I/O-bound applications in proportion to their weights, with any
+workload and regardless of the device parameters. From these bandwidth
+guarantees, it is possible to compute tight per-I/O-request delay
+guarantees by a simple formula. If not configured for strict service
+guarantees, BFQ switches to time-based resource sharing (only) for
+applications that would otherwise cause a throughput loss.
+
+1-2 Server systems
+------------------
+
+Most benefits for server systems follow from the same service
+properties as above. In particular, regardless of whether additional,
+possibly heavy workloads are being served, BFQ guarantees:
+
+. audio and video-streaming with zero or very low jitter and drop
+  rate;
+
+. fast retrieval of WEB pages and embedded objects;
+
+. real-time recording of data in live-dumping applications (e.g.,
+  packet logging);
+
+. responsiveness in local and remote access to a server.
+
+
+2. How does BFQ work?
+=====================
+
+BFQ is a proportional-share I/O scheduler, whose general structure,
+plus a lot of code, are borrowed from CFQ.
+
+- Each process doing I/O on a device is associated with a weight and a
+  (bfq_)queue.
+
+- BFQ grants exclusive access to the device, for a while, to one queue
+  (process) at a time, and implements this service model by
+  associating every queue with a budget, measured in number of
+  sectors.
+
+  - After a queue is granted access to the device, the budget of the
+    queue is decremented, on each request dispatch, by the size of the
+    request.
+
+  - The in-service queue is expired, i.e., its service is suspended,
+    only if one of the following events occurs: 1) the queue finishes
+    its budget, 2) the queue empties, 3) a "budget timeout" fires.
+
+    - The budget timeout prevents processes doing random I/O from
+      holding the device for too long and dramatically reducing
+      throughput.
+
+    - Actually, as in CFQ, a queue associated with a process issuing
+      sync requests may not be expired immediately when it empties. In
+      contrast, BFQ may idle the device for a short time interval,
+      giving the process the chance to go on being served if it issues
+      a new request in time. Device idling typically boosts the
+      throughput on rotational devices, if processes do synchronous
+      and sequential I/O. In addition, under BFQ, device idling is
+      also instrumental in guaranteeing the desired throughput
+      fraction to processes issuing sync requests (see the description
+      of the slice_idle tunable in this document, or [1, 2], for more
+      details).
+
+      - With respect to idling for service guarantees, if several
+       processes are competing for the device at the same time, but
+       all processes (and groups, after the following commit) have
+       the same weight, then BFQ guarantees the expected throughput
+       distribution without ever idling the device. Throughput is
+       thus as high as possible in this common scenario.
+
+  - If low-latency mode is enabled (default configuration), BFQ
+    executes some special heuristics to detect interactive and soft
+    real-time applications (e.g., video or audio players/streamers),
+    and to reduce their latency. The most important action taken to
+    achieve this goal is to give to the queues associated with these
+    applications more than their fair share of the device
+    throughput. For brevity, we call just "weight-raising" the whole
+    sets of actions taken by BFQ to privilege these queues. In
+    particular, BFQ provides a milder form of weight-raising for
+    interactive applications, and a stronger form for soft real-time
+    applications.
+
+  - BFQ automatically deactivates idling for queues born in a burst of
+    queue creations. In fact, these queues are usually associated with
+    the processes of applications and services that benefit mostly
+    from a high throughput. Examples are systemd during boot, or git
+    grep.
+
+  - Like CFQ, BFQ merges queues performing interleaved I/O, i.e.,
+    performing random I/O that becomes mostly sequential if
+    merged. Differently from CFQ, BFQ achieves this goal with a more
+    reactive mechanism, called Early Queue Merge (EQM). EQM is so
+    responsive in detecting interleaved I/O (cooperating processes),
+    that it enables BFQ to achieve a high throughput, by queue
+    merging, even for queues for which CFQ needs a different
+    mechanism, preemption, to get a high throughput. As such EQM is a
+    unified mechanism to achieve a high throughput with interleaved
+    I/O.
+
+  - Queues are scheduled according to a variant of WF2Q+, named
+    B-WF2Q+, and implemented using an augmented rb-tree to preserve an
+    O(log N) overall complexity.  See [2] for more details. B-WF2Q+ is
+    also ready for hierarchical scheduling. However, for a cleaner
+    logical breakdown, the code that enables and completes
+    hierarchical support is provided in the next commit, which focuses
+    exactly on this feature.
+
+  - B-WF2Q+ guarantees a tight deviation with respect to an ideal,
+    perfectly fair, and smooth service. In particular, B-WF2Q+
+    guarantees that each queue receives a fraction of the device
+    throughput proportional to its weight, even if the throughput
+    fluctuates, and regardless of: the device parameters, the current
+    workload and the budgets assigned to the queue.
+
+  - The last, budget-independence, property (although probably
+    counterintuitive in the first place) is definitely beneficial, for
+    the following reasons:
+
+    - First, with any proportional-share scheduler, the maximum
+      deviation with respect to an ideal service is proportional to
+      the maximum budget (slice) assigned to queues. As a consequence,
+      BFQ can keep this deviation tight not only because of the
+      accurate service of B-WF2Q+, but also because BFQ *does not*
+      need to assign a larger budget to a queue to let the queue
+      receive a higher fraction of the device throughput.
+
+    - Second, BFQ is free to choose, for every process (queue), the
+      budget that best fits the needs of the process, or best
+      leverages the I/O pattern of the process. In particular, BFQ
+      updates queue budgets with a simple feedback-loop algorithm that
+      allows a high throughput to be achieved, while still providing
+      tight latency guarantees to time-sensitive applications. When
+      the in-service queue expires, this algorithm computes the next
+      budget of the queue so as to:
+
+      - Let large budgets be eventually assigned to the queues
+       associated with I/O-bound applications performing sequential
+       I/O: in fact, the longer these applications are served once
+       they get access to the device, the higher the throughput is.
+
+      - Let small budgets be eventually assigned to the queues
+       associated with time-sensitive applications (which typically
+       perform sporadic and short I/O), because, the smaller the
+       budget assigned to a queue waiting for service is, the sooner
+       B-WF2Q+ will serve that queue (Subsec 3.3 in [2]).
+
+- If several processes are competing for the device at the same time,
+  but all processes and groups have the same weight, then BFQ
+  guarantees the expected throughput distribution without ever idling
+  the device. It uses preemption instead. Throughput is then much
+  higher in this common scenario.
+
+- ioprio classes are served in strict priority order, i.e.,
+  lower-priority queues are not served as long as there are
+  higher-priority queues.  Among queues in the same class, the
+  bandwidth is distributed in proportion to the weight of each
+  queue. A very thin extra bandwidth is however guaranteed to
+  the Idle class, to prevent it from starving.
+
+
+3. What are BFQ's tunables?
+===========================
+
+The tunables back_seek_max, back_seek_penalty, fifo_expire_async and
+fifo_expire_sync below are the same as in CFQ. Their description is
+just copied from that for CFQ. Some considerations in the description
+of slice_idle are copied from CFQ too.
+
+per-process ioprio and weight
+-----------------------------
+
+Unless the cgroups interface is used (see "4. BFQ group scheduling"),
+weights can be assigned to processes only indirectly, through I/O
+priorities, and according to the relation:
+weight = (IOPRIO_BE_NR - ioprio) * 10.
+
+Beware that, if low_latency is set, then BFQ automatically raises the
+weight of the queues associated with interactive and soft real-time
+applications. Unset this tunable if you need/want to control weights.
+
+slice_idle
+----------
+
+This parameter specifies how long BFQ should idle for next I/O
+request, when certain sync BFQ queues become empty. By default
+slice_idle is a non-zero value. Idling has a double purpose: boosting
+throughput and making sure that the desired throughput distribution is
+respected (see the description of how BFQ works, and, if needed, the
+papers referred to there).
+
+As for throughput, idling can be very helpful on highly seeky media
+like single spindle SATA/SAS disks where we can cut down on overall
+number of seeks and see improved throughput.
+
+Setting slice_idle to 0 will remove all the idling on queues and one
+should see an overall improved throughput on faster storage devices
+like multiple SATA/SAS disks in hardware RAID configuration.
+
+So depending on storage and workload, it might be useful to set
+slice_idle=0.  In general for SATA/SAS disks and software RAID of
+SATA/SAS disks keeping slice_idle enabled should be useful. For any
+configurations where there are multiple spindles behind single LUN
+(Host based hardware RAID controller or for storage arrays), setting
+slice_idle=0 might end up in better throughput and acceptable
+latencies.
+
+Idling is however necessary to have service guarantees enforced in
+case of differentiated weights or differentiated I/O-request lengths.
+To see why, suppose that a given BFQ queue A must get several I/O
+requests served for each request served for another queue B. Idling
+ensures that, if A makes a new I/O request slightly after becoming
+empty, then no request of B is dispatched in the middle, and thus A
+does not lose the possibility to get more than one request dispatched
+before the next request of B is dispatched. Note that idling
+guarantees the desired differentiated treatment of queues only in
+terms of I/O-request dispatches. To guarantee that the actual service
+order then corresponds to the dispatch order, the strict_guarantees
+tunable must be set too.
+
+There is an important flipside for idling: apart from the above cases
+where it is beneficial also for throughput, idling can severely impact
+throughput. One important case is random workload. Because of this
+issue, BFQ tends to avoid idling as much as possible, when it is not
+beneficial also for throughput. As a consequence of this behavior, and
+of further issues described for the strict_guarantees tunable,
+short-term service guarantees may be occasionally violated. And, in
+some cases, these guarantees may be more important than guaranteeing
+maximum throughput. For example, in video playing/streaming, a very
+low drop rate may be more important than maximum throughput. In these
+cases, consider setting the strict_guarantees parameter.
+
+strict_guarantees
+-----------------
+
+If this parameter is set (default: unset), then BFQ
+
+- always performs idling when the in-service queue becomes empty;
+
+- forces the device to serve one I/O request at a time, by dispatching a
+  new request only if there is no outstanding request.
+
+In the presence of differentiated weights or I/O-request sizes, both
+the above conditions are needed to guarantee that every BFQ queue
+receives its allotted share of the bandwidth. The first condition is
+needed for the reasons explained in the description of the slice_idle
+tunable.  The second condition is needed because all modern storage
+devices reorder internally-queued requests, which may trivially break
+the service guarantees enforced by the I/O scheduler.
+
+Setting strict_guarantees may evidently affect throughput.
+
+back_seek_max
+-------------
+
+This specifies, given in Kbytes, the maximum "distance" for backward seeking.
+The distance is the amount of space from the current head location to the
+sectors that are backward in terms of distance.
+
+This parameter allows the scheduler to anticipate requests in the "backward"
+direction and consider them as being the "next" if they are within this
+distance from the current head location.
+
+back_seek_penalty
+-----------------
+
+This parameter is used to compute the cost of backward seeking. If the
+backward distance of request is just 1/back_seek_penalty from a "front"
+request, then the seeking cost of two requests is considered equivalent.
+
+So scheduler will not bias toward one or the other request (otherwise scheduler
+will bias toward front request). Default value of back_seek_penalty is 2.
+
+fifo_expire_async
+-----------------
+
+This parameter is used to set the timeout of asynchronous requests. Default
+value of this is 248ms.
+
+fifo_expire_sync
+----------------
+
+This parameter is used to set the timeout of synchronous requests. Default
+value of this is 124ms. To favor synchronous requests over asynchronous
+ones, this value should be decreased relative to fifo_expire_async.
+
+low_latency
+-----------
+
+This parameter is used to enable/disable BFQ's low latency mode. By
+default, low latency mode is enabled. If enabled, interactive and soft
+real-time applications are privileged and experience a lower latency,
+as explained in more detail in the description of how BFQ works.
+
+DO NOT enable this mode if you need full control on bandwidth
+distribution. In fact, if it is enabled, then BFQ automatically
+increases the bandwidth share of privileged applications, as the main
+means to guarantee a lower latency to them.
+
+timeout_sync
+------------
+
+Maximum amount of device time that can be given to a task (queue) once
+it has been selected for service. On devices with costly seeks,
+increasing this time usually increases maximum throughput. On the
+opposite end, increasing this time coarsens the granularity of the
+short-term bandwidth and latency guarantees, especially if the
+following parameter is set to zero.
+
+max_budget
+----------
+
+Maximum amount of service, measured in sectors, that can be provided
+to a BFQ queue once it is set in service (of course within the limits
+of the above timeout). According to what is said in the description of
+the algorithm, larger values increase the throughput in proportion to
+the percentage of sequential I/O requests issued. The price of larger
+values is that they coarsen the granularity of short-term bandwidth
+and latency guarantees.
+
+The default value is 0, which enables auto-tuning: BFQ sets max_budget
+to the maximum number of sectors that can be served during
+timeout_sync, according to the estimated peak rate.
+
+weights
+-------
+
+Read-only parameter, used to show the weights of the currently active
+BFQ queues.
+
+
+wr_ tunables
+------------
+
+BFQ exports a few parameters to control/tune the behavior of
+low-latency heuristics.
+
+wr_coeff
+
+Factor by which the weight of a weight-raised queue is multiplied. If
+the queue is deemed soft real-time, then the weight is further
+multiplied by an additional, constant factor.
+
+wr_max_time
+
+Maximum duration of a weight-raising period for an interactive task
+(ms). If set to zero (default value), then this value is computed
+automatically, as a function of the peak rate of the device. In any
+case, when the value of this parameter is read, it always reports the
+current duration, regardless of whether it has been set manually or
+computed automatically.
+
+wr_max_softrt_rate
+
+Maximum service rate below which a queue is deemed to be associated
+with a soft real-time application, and is then weight-raised
+accordingly (sectors/sec).
+
+wr_min_idle_time
+
+Minimum idle period after which interactive weight-raising may be
+reactivated for a queue (in ms).
+
+wr_rt_max_time
+
+Maximum weight-raising duration for soft real-time queues (in ms). The
+start time from which this duration is considered is automatically
+moved forward if the queue is detected to be still soft real-time
+before the current soft real-time weight-raising period finishes.
+
+wr_min_inter_arr_async
+
+Minimum period between I/O request arrivals after which weight-raising
+may be reactivated for an already busy async queue (in ms).
+
+
+4. BFQ group scheduling
+=======================
+
+BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely
+blkio and io. In particular, BFQ supports weight-based proportional
+share. To activate cgroups support, set BFQ_GROUP_IOSCHED.
+
+4-1 Service guarantees provided
+-------------------------------
+
+With BFQ, proportional share means true proportional share of the
+device bandwidth, according to group weights. For example, a group
+with weight 200 gets twice the bandwidth, and not just twice the time,
+of a group with weight 100.
+
+BFQ supports hierarchies (group trees) of any depth. Bandwidth is
+distributed among groups and processes in the expected way: for each
+group, the children of the group share the whole bandwidth of the
+group in proportion to their weights. In particular, this implies
+that, for each leaf group, every process of the group receives the
+same share of the whole group bandwidth, unless the ioprio of the
+process is modified.
+
+The resource-sharing guarantee for a group may partially or totally
+switch from bandwidth to time, if providing bandwidth guarantees to
+the group lowers the throughput too much. This switch occurs on a
+per-process basis: if a process of a leaf group causes throughput loss
+if served in such a way to receive its share of the bandwidth, then
+BFQ switches back to just time-based proportional share for that
+process.
+
+4-2 Interface
+-------------
+
+To get proportional sharing of bandwidth with BFQ for a given device,
+BFQ must of course be the active scheduler for that device.
+
+Within each group directory, the names of the files associated with
+BFQ-specific cgroup parameters and stats begin with the "bfq."
+prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for
+BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group
+parameter to set the weight of a group with BFQ is blkio.bfq.weight
+or io.bfq.weight.
+
+Parameters to set
+-----------------
+
+For each group, there is only the following parameter to set.
+
+weight (namely blkio.bfq.weight or io.bfq.weight): the weight of the
+group inside its parent. Available values: 1..10000 (default 100). The
+linear mapping between ioprio and weights, described at the beginning
+of the tunable section, is still valid, but all weights higher than
+IOPRIO_BE_NR*10 are mapped to ioprio 0.
+
+Recall that, if low_latency is set, then BFQ automatically raises the
+weight of the queues associated with interactive and soft real-time
+applications. Unset this tunable if you need/want to control weights.
+
+
+[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
+    Scheduler", Proceedings of the First Workshop on Mobile System
+    Technologies (MST-2015), May 2015.
+    http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
+
+[2] P. Valente and M. Andreolini, "Improving Application
+    Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of
+    the 5th Annual International Systems and Storage Conference
+    (SYSTOR '12), June 2012.
+    Slightly extended version:
+    http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
+                                                       results.pdf
diff --git a/Documentation/block/kyber-iosched.txt b/Documentation/block/kyber-iosched.txt
new file mode 100644 (file)
index 0000000..e94feac
--- /dev/null
@@ -0,0 +1,14 @@
+Kyber I/O scheduler tunables
+============================
+
+The only two tunables for the Kyber scheduler are the target latencies for
+reads and synchronous writes. Kyber will throttle requests in order to meet
+these target latencies.
+
+read_lat_nsec
+-------------
+Target latency for reads (in nanoseconds).
+
+write_lat_nsec
+--------------
+Target latency for synchronous writes (in nanoseconds).
index c0a3bb5..2c1e670 100644 (file)
@@ -43,11 +43,6 @@ large discards are issued, setting this value lower will make Linux issue
 smaller discards and potentially help reduce latencies induced by large
 discard operations.
 
-discard_zeroes_data (RO)
-------------------------
-When read, this file will show if the discarded block are zeroed by the
-device or not. If its value is '1' the blocks are zeroed otherwise not.
-
 hw_sector_size (RO)
 -------------------
 This is the hardware sector size of the device, in bytes.
@@ -192,5 +187,11 @@ scaling back writes. Writing a value of '0' to this file disables the
 feature. Writing a value of '-1' to this file resets the value to the
 default setting.
 
+throttle_sample_time (RW)
+-------------------------
+This is the time window over which blk-throttle samples data, in milliseconds.
+blk-throttle makes decisions based on the samples. A lower time means cgroups
+have more smooth throughput, but higher CPU overhead. This exists only when
+CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
 
 Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git a/Documentation/blockdev/mflash.txt b/Documentation/blockdev/mflash.txt
deleted file mode 100644 (file)
index f7e0505..0000000
+++ /dev/null
@@ -1,84 +0,0 @@
-This document describes m[g]flash support in linux.
-
-Contents
-  1. Overview
-  2. Reserved area configuration
-  3. Example of mflash platform driver registration
-
-1. Overview
-
-Mflash and gflash are embedded flash drive. The only difference is mflash is
-MCP(Multi Chip Package) device. These two device operate exactly same way.
-So the rest mflash repersents mflash and gflash altogether.
-
-Internally, mflash has nand flash and other hardware logics and supports
-2 different operation (ATA, IO) modes. ATA mode doesn't need any new
-driver and currently works well under standard IDE subsystem. Actually it's
-one chip SSD. IO mode is ATA-like custom mode for the host that doesn't have
-IDE interface.
-
-Following are brief descriptions about IO mode.
-A. IO mode based on ATA protocol and uses some custom command. (read confirm,
-write confirm)
-B. IO mode uses SRAM bus interface.
-C. IO mode supports 4kB boot area, so host can boot from mflash.
-
-2. Reserved area configuration
-If host boot from mflash, usually needs raw area for boot loader image. All of
-the mflash's block device operation will be taken this value as start offset.
-Note that boot loader's size of reserved area and kernel configuration value
-must be same.
-
-3. Example of mflash platform driver registration
-Working mflash is very straight forward. Adding platform device stuff to board
-configuration file is all. Here is some pseudo example.
-
-static struct mg_drv_data mflash_drv_data = {
-       /* If you want to polling driver set to 1 */
-       .use_polling = 0,
-       /* device attribution */
-       .dev_attr = MG_BOOT_DEV
-};
-
-static struct resource mg_mflash_rsc[] = {
-       /* Base address of mflash */
-       [0] = {
-               .start = 0x08000000,
-               .end = 0x08000000 + SZ_64K - 1,
-               .flags = IORESOURCE_MEM
-       },
-       /* mflash interrupt pin */
-       [1] = {
-               .start = IRQ_GPIO(84),
-               .end = IRQ_GPIO(84),
-               .flags = IORESOURCE_IRQ
-       },
-       /* mflash reset pin */
-       [2] = {
-               .start = 43,
-               .end = 43,
-               .name = MG_RST_PIN,
-               .flags = IORESOURCE_IO
-       },
-       /* mflash reset-out pin
-        * If you use mflash as storage device (i.e. other than MG_BOOT_DEV),
-        * should assign this */
-       [3] = {
-               .start = 51,
-               .end = 51,
-               .name = MG_RSTOUT_PIN,
-               .flags = IORESOURCE_IO
-       }
-};
-
-static struct platform_device mflash_dev = {
-       .name = MG_DEV_NAME,
-       .id = -1,
-       .dev = {
-               .platform_data = &mflash_drv_data,
-       },
-       .num_resources = ARRAY_SIZE(mg_mflash_rsc),
-       .resource = mg_mflash_rsc
-};
-
-platform_device_register(&mflash_dev);
diff --git a/Documentation/devicetree/bindings/ata/ahci-dm816.txt b/Documentation/devicetree/bindings/ata/ahci-dm816.txt
new file mode 100644 (file)
index 0000000..f8c535f
--- /dev/null
@@ -0,0 +1,21 @@
+Device tree binding for the TI DM816 AHCI SATA Controller
+---------------------------------------------------------
+
+Required properties:
+  - compatible: must be "ti,dm816-ahci"
+  - reg: physical base address and size of the register region used by
+         the controller (as defined by the AHCI 1.1 standard)
+  - interrupts: interrupt specifier (refer to the interrupt binding)
+  - clocks: list of phandle and clock specifier pairs (or only
+            phandles for clock providers with '0' defined for
+            #clock-cells); two clocks must be specified: the functional
+            clock and an external reference clock
+
+Example:
+
+       sata: sata@4a140000 {
+               compatible = "ti,dm816-ahci";
+               reg = <0x4a140000 0x10000>;
+               interrupts = <16>;
+               clocks = <&sysclk5_ck>, <&sata_refclk>;
+       };
diff --git a/Documentation/devicetree/bindings/hwmon/ads7828.txt b/Documentation/devicetree/bindings/hwmon/ads7828.txt
new file mode 100644 (file)
index 0000000..fe0cc4a
--- /dev/null
@@ -0,0 +1,25 @@
+ads7828 properties
+
+Required properties:
+- compatible: Should be one of
+              ti,ads7828
+              ti,ads7830
+- reg: I2C address
+
+Optional properties:
+
+- ti,differential-input
+  Set to use the device in differential mode.
+- vref-supply
+  The external reference on the device is set to this regulator's output. If it
+  does not exist, the internal reference will be used and output by the ads78xx
+  on the "external vref" pin.
+
+  Example ADS7828 node:
+
+  ads7828: ads@48 {
+          compatible = "ti,ads7828";
+          reg = <0x48>;
+          vref-supply = <&vref>;
+          ti,differential-input;
+  };
diff --git a/Documentation/devicetree/bindings/hwmon/aspeed-pwm-tacho.txt b/Documentation/devicetree/bindings/hwmon/aspeed-pwm-tacho.txt
new file mode 100644 (file)
index 0000000..cf44605
--- /dev/null
@@ -0,0 +1,68 @@
+ASPEED AST2400/AST2500 PWM and Fan Tacho controller device driver
+
+The ASPEED PWM controller can support up to 8 PWM outputs. The ASPEED Fan Tacho
+controller can support up to 16 Fan tachometer inputs.
+
+There can be up to 8 fans supported. Each fan can have one PWM output and
+one/two Fan tach inputs.
+
+Required properties for pwm-tacho node:
+- #address-cells : should be 1.
+
+- #size-cells : should be 1.
+
+- reg : address and length of the register set for the device.
+
+- pinctrl-names : a pinctrl state named "default" must be defined.
+
+- pinctrl-0 : phandle referencing pin configuration of the PWM ports.
+
+- compatible : should be "aspeed,ast2400-pwm-tacho" for AST2400 and
+              "aspeed,ast2500-pwm-tacho" for AST2500.
+
+- clocks : a fixed clock providing input clock frequency(PWM
+          and Fan Tach clock)
+
+fan subnode format:
+===================
+Under fan subnode there can be up to 8 child nodes, with each child node
+representing a fan. If there are 8 fans each fan can have one PWM port and
+one/two Fan tach inputs.
+
+Required properties for each child node:
+- reg : should specify PWM source port.
+       integer value in the range 0 to 7 with 0 indicating PWM port A and
+       7 indicating PWM port H.
+
+- aspeed,fan-tach-ch : should specify the Fan tach input channel.
+                integer value in the range 0 through 15, with 0 indicating
+               Fan tach channel 0 and 15 indicating Fan tach channel 15.
+               At least one Fan tach input channel is required.
+
+Examples:
+
+pwm_tacho_fixed_clk: fixedclk {
+       compatible = "fixed-clock";
+       #clock-cells = <0>;
+       clock-frequency = <24000000>;
+};
+
+pwm_tacho: pwmtachocontroller@1e786000 {
+       #address-cells = <1>;
+       #size-cells = <1>;
+       reg = <0x1E786000 0x1000>;
+       compatible = "aspeed,ast2500-pwm-tacho";
+       clocks = <&pwm_tacho_fixed_clk>;
+       pinctrl-names = "default";
+       pinctrl-0 = <&pinctrl_pwm0_default &pinctrl_pwm1_default>;
+
+       fan@0 {
+               reg = <0x00>;
+               aspeed,fan-tach-ch = /bits/ 8 <0x00>;
+       };
+
+       fan@1 {
+               reg = <0x01>;
+               aspeed,fan-tach-ch = /bits/ 8 <0x01 0x02>;
+       };
+};
diff --git a/Documentation/devicetree/bindings/hwmon/lm87.txt b/Documentation/devicetree/bindings/hwmon/lm87.txt
new file mode 100644 (file)
index 0000000..e1b7990
--- /dev/null
@@ -0,0 +1,30 @@
+*LM87 hwmon sensor.
+
+Required properties:
+- compatible: Should be
+       "ti,lm87"
+
+- reg: I2C address
+
+Optional properties:
+- has-temp3: This configures pins 18 and 19 to be used as a second
+             remote temperature sensing channel. By default the pins
+             are configured as voltage input pins in0 and in5.
+
+- has-in6: When set, pin 5 is configured to be used as voltage input
+           in6. Otherwise the pin is set as FAN1 input.
+
+- has-in7: When set, pin 6 is configured to be used as voltage input
+           in7. Otherwise the pin is set as FAN2 input.
+
+- vcc-supply: a phandle for the regulator supplying power, can be
+              configured to measure 5.0V power supply. Default is 3.3V.
+
+Example:
+
+lm87@2e {
+       compatible = "ti,lm87";
+       reg = <0x2e>;
+       has-temp3;
+       vcc-supply = <&reg_5v0>;
+};
@@ -1,9 +1,12 @@
-* Cortina Systems Gemini interrupt controller
+* Faraday Technology FTINTC010 interrupt controller
 
-This interrupt controller is found on the Gemini SoCs.
+This interrupt controller is a stock IP block from Faraday Technology found
+in the Gemini SoCs and other designs.
 
 Required properties:
-- compatible: must be "cortina,gemini-interrupt-controller"
+- compatible: must be one of
+  "faraday,ftintc010"
+  "cortina,gemini-interrupt-controller" (deprecated)
 - reg: The register bank for the interrupt controller.
 - interrupt-controller: Identifies the node as an interrupt controller
 - #interrupt-cells: The number of cells to define the interrupts.
@@ -15,7 +18,7 @@ Required properties:
 Example:
 
 interrupt-controller@48000000 {
-       compatible = "cortina,gemini-interrupt-controller";
+       compatible = "faraday,ftintc010";
        reg = <0x48000000 0x1000>;
        interrupt-controller;
        #interrupt-cells = <2>;
diff --git a/Documentation/devicetree/bindings/interrupt-controller/mediatek,cirq.txt b/Documentation/devicetree/bindings/interrupt-controller/mediatek,cirq.txt
new file mode 100644 (file)
index 0000000..a7efdbc
--- /dev/null
@@ -0,0 +1,35 @@
+* Mediatek 27xx cirq
+
+In Mediatek SOCs, the CIRQ is a low power interrupt controller designed to
+work outside MCUSYS which comprises Cortex-Ax cores, CCI and GIC.
+The external interrupts (outside MCUSYS) will feed through CIRQ and connect
+to GIC in MCUSYS. When CIRQ is enabled, it will record the edge-sensitive
+interrupts and generate a pulse signal to parent interrupt controller when
+flush command is executed. With CIRQ, MCUSYS can be completely turned off
+to improve the system power consumption without losing interrupts.
+
+Required properties:
+- compatible: should be one of
+  - "mediatek,mt2701-cirq" for mt2701 CIRQ
+  - "mediatek,mt8135-cirq" for mt8135 CIRQ
+  - "mediatek,mt8173-cirq" for mt8173 CIRQ
+  and "mediatek,mtk-cirq" as a fallback.
+- interrupt-controller : Identifies the node as an interrupt controller.
+- #interrupt-cells : Use the same format as specified by GIC in arm,gic.txt.
+- interrupt-parent: phandle of irq parent for cirq. The parent must
+  use the same interrupt-cells format as GIC.
+- reg: Physical base address of the cirq registers and length of memory
+  mapped region.
+- mediatek,ext-irq-range: Identifies external irq number range in different
+  SOCs.
+
+Example:
+       cirq: interrupt-controller@10204000 {
+               compatible = "mediatek,mt2701-cirq",
+                            "mediatek,mtk-cirq";
+               interrupt-controller;
+               #interrupt-cells = <3>;
+               interrupt-parent = <&sysirq>;
+               reg = <0 0x10204000 0 0x400>;
+               mediatek,ext-irq-range = <32 200>;
+       };
index 9d1d72c..a89c03b 100644 (file)
@@ -21,13 +21,16 @@ Required properties:
 - interrupt-parent: phandle of irq parent for sysirq. The parent must
   use the same interrupt-cells format as GIC.
 - reg: Physical base address of the intpol registers and length of memory
-  mapped region.
+  mapped region. Could be multiple bases here. Ex: mt6797 needs 2 reg, others
+  need 1.
 
 Example:
-       sysirq: interrupt-controller@10200100 {
-               compatible = "mediatek,mt6589-sysirq", "mediatek,mt6577-sysirq";
+       sysirq: intpol-controller@10220620 {
+               compatible = "mediatek,mt6797-sysirq",
+                            "mediatek,mt6577-sysirq";
                interrupt-controller;
                #interrupt-cells = <3>;
                interrupt-parent = <&gic>;
-               reg = <0 0x10200100 0 0x1c>;
+               reg = <0 0x10220620 0 0x20>,
+                     <0 0x10220690 0 0x10>;
        };
index 6f28969..028268f 100644 (file)
@@ -6,7 +6,9 @@ perform in-band IPMI communication with their host.
 
 Required properties:
 
-- compatible : should be "aspeed,ast2400-ibt-bmc"
+- compatible : should be one of
+       "aspeed,ast2400-ibt-bmc"
+       "aspeed,ast2500-ibt-bmc"
 - reg: physical address and size of the registers
 
 Optional properties:
diff --git a/Documentation/devicetree/bindings/leds/leds-cpcap.txt b/Documentation/devicetree/bindings/leds/leds-cpcap.txt
new file mode 100644 (file)
index 0000000..ebf7cdc
--- /dev/null
@@ -0,0 +1,29 @@
+Motorola CPCAP PMIC LEDs
+------------------------
+
+This module is part of the CPCAP. For more details about the whole
+chip see Documentation/devicetree/bindings/mfd/motorola-cpcap.txt.
+
+Required node properties:
+- compatible: should be one of
+   * "motorola,cpcap-led-mdl"          (Main Display Lighting)
+   * "motorola,cpcap-led-kl"           (Keyboard Lighting)
+   * "motorola,cpcap-led-adl"          (Aux Display Lighting)
+   * "motorola,cpcap-led-red"          (Red Triode)
+   * "motorola,cpcap-led-green"                (Green Triode)
+   * "motorola,cpcap-led-blue"         (Blue Triode)
+   * "motorola,cpcap-led-cf"           (Camera Flash)
+   * "motorola,cpcap-led-bt"           (Bluetooth)
+   * "motorola,cpcap-led-cp"           (Camera Privacy LED)
+- label: see Documentation/devicetree/bindings/leds/common.txt
+- vdd-supply: A phandle to the regulator powering the LED
+
+Example:
+
+&cpcap {
+       cpcap_led_red: red-led {
+               compatible = "motorola,cpcap-led-red";
+               label = "cpcap:red";
+               vdd-supply = <&sw5>;
+       };
+};
diff --git a/Documentation/devicetree/bindings/leds/leds-mt6323.txt b/Documentation/devicetree/bindings/leds/leds-mt6323.txt
new file mode 100644 (file)
index 0000000..45bf9f7
--- /dev/null
@@ -0,0 +1,60 @@
+Device Tree Bindings for LED support on MT6323 PMIC
+
+The MT6323 LED controller is a subfunction provided by the MT6323 PMIC, so the
+LED controllers are defined as a subnode of the function node provided by the
+MT6323 PMIC controller, which is defined as a Multi-Function Device (MFD)
+using a shared bus called the PMIC wrapper for each subfunction to access the
+remote MT6323 PMIC hardware.
+
+For MT6323 MFD bindings see:
+Documentation/devicetree/bindings/mfd/mt6397.txt
+For MediaTek PMIC wrapper bindings see:
+Documentation/devicetree/bindings/soc/mediatek/pwrap.txt
+
+Required properties:
+- compatible : Must be "mediatek,mt6323-led"
+- #address-cells : Must be 1
+- #size-cells : Must be 0
+
+Each led is represented as a child node of the mediatek,mt6323-led that
+describes the initial behavior for each LED physically and currently only four
+LED child nodes can be supported.
+
+Required properties for the LED child node:
+- reg : LED channel number (0..3)
+
+Optional properties for the LED child node:
+- label : See Documentation/devicetree/bindings/leds/common.txt
+- linux,default-trigger : See Documentation/devicetree/bindings/leds/common.txt
+- default-state: See Documentation/devicetree/bindings/leds/common.txt
+
+Example:
+
+       mt6323: pmic {
+               compatible = "mediatek,mt6323";
+
+               ...
+
+               mt6323led: leds {
+                       compatible = "mediatek,mt6323-led";
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+
+                       led@0 {
+                               reg = <0>;
+                               label = "LED0";
+                               linux,default-trigger = "timer";
+                               default-state = "on";
+                       };
+                       led@1 {
+                               reg = <1>;
+                               label = "LED1";
+                               default-state = "off";
+                       };
+                       led@2 {
+                               reg = <2>;
+                               label = "LED2";
+                               default-state = "on";
+                       };
+               };
+       };
index 198f3ba..f769c52 100644 (file)
@@ -17,6 +17,8 @@ Optional sub-node properties:
        - label: see Documentation/devicetree/bindings/leds/common.txt
        - type: Output configuration, see dt-bindings/leds/leds-pca9532.h (default NONE)
        - linux,default-trigger: see Documentation/devicetree/bindings/leds/common.txt
+       - default-state: see Documentation/devicetree/bindings/leds/common.txt
+         This property is only valid for sub-nodes of type <PCA9532_TYPE_LED>.
 
 Example:
   #include <dt-bindings/leds/leds-pca9532.h>
@@ -33,6 +35,14 @@ Example:
       label = "pca:green:power";
       type = <PCA9532_TYPE_LED>;
     };
+    kernel-booting {
+      type = <PCA9532_TYPE_LED>;
+      default-state = "on";
+    };
+    sys-stat {
+      type = <PCA9532_TYPE_LED>;
+      default-state = "keep"; // don't touch, was set by U-Boot
+    };
   };
 
 For more product information please see the link below:
diff --git a/Documentation/devicetree/bindings/mailbox/brcm,iproc-flexrm-mbox.txt b/Documentation/devicetree/bindings/mailbox/brcm,iproc-flexrm-mbox.txt
new file mode 100644 (file)
index 0000000..752ae6b
--- /dev/null
@@ -0,0 +1,59 @@
+Broadcom FlexRM Ring Manager
+============================
+The Broadcom FlexRM ring manager provides a set of rings which can be
+used to submit work to offload engines. An SoC may have multiple FlexRM
+hardware blocks. There is one device tree entry per FlexRM block. The
+FlexRM driver will create a mailbox-controller instance for given FlexRM
+hardware block where each mailbox channel is a separate FlexRM ring.
+
+Required properties:
+--------------------
+- compatible:  Should be "brcm,iproc-flexrm-mbox"
+- reg:         Specifies base physical address and size of the FlexRM
+               ring registers
+- msi-parent:  Phandles (and potential Device IDs) to MSI controllers
+               The FlexRM engine will send MSIs (instead of wired
+               interrupts) to CPU. There is one MSI for each FlexRM ring.
+               Refer to devicetree/bindings/interrupt-controller/msi.txt
+- #mbox-cells: Specifies the number of cells needed to encode a mailbox
+               channel. This should be 3.
+
+               The 1st cell is the mailbox channel number.
+
+               The 2nd cell contains MSI completion threshold. This is the
+               number of completion messages for which FlexRM will inject
+               one MSI interrupt to CPU.
+
+               The 3rd cell contains MSI timer value representing time for
+               which FlexRM will wait to accumulate N completion messages
+               where N is the value specified by 2nd cell above. If FlexRM
+               does not get required number of completion messages in time
+               specified by this cell then it will inject one MSI interrupt
+               to CPU provided at least one completion message is available.
+
+Optional properties:
+--------------------
+- dma-coherent:        Present if DMA operations made by the FlexRM engine (such
+               as DMA descriptor access, access to buffers pointed to by DMA
+               descriptors and read/write pointer updates to DDR) are
+               cache coherent with the CPU.
+
+Example:
+--------
+crypto_mbox: mbox@67000000 {
+       compatible = "brcm,iproc-flexrm-mbox";
+       reg = <0x67000000 0x200000>;
+       msi-parent = <&gic_its 0x7f00>;
+       #mbox-cells = <3>;
+};
+
+crypto@672c0000 {
+       compatible = "brcm,spu2-v2-crypto";
+       reg = <0x672c0000 0x1000>;
+       mboxes = <&crypto_mbox 0 0x1 0xffff>,
+                <&crypto_mbox 1 0x1 0xffff>,
+                <&crypto_mbox 16 0x1 0xffff>,
+                <&crypto_mbox 17 0x1 0xffff>,
+                <&crypto_mbox 30 0x1 0xffff>,
+                <&crypto_mbox 31 0x1 0xffff>;
+};
index 411ccf4..0f3ee81 100644 (file)
@@ -1,9 +1,11 @@
 The PDC driver manages data transfer to and from various offload engines
 on some Broadcom SoCs. An SoC may have multiple PDC hardware blocks. There is
-one device tree entry per block.
+one device tree entry per block.  On some chips, the PDC functionality is
+handled by the FA2 (Northstar Plus).
 
 Required properties:
-- compatible : Should be "brcm,iproc-pdc-mbox".
+- compatible : Should be "brcm,iproc-pdc-mbox" or "brcm,iproc-fa2-mbox" for
+  FA2/Northstar Plus.
 - reg: Should contain PDC registers location and length.
 - interrupts: Should contain the IRQ line for the PDC.
 - #mbox-cells: 1
index b7fa3b9..a339dbb 100644 (file)
@@ -44,13 +44,19 @@ Hip05 Example (note that Hip06 is the same except compatible):
        };
 
 HiSilicon Hip06/Hip07 PCIe host bridge DT (almost-ECAM) description.
+
+Some BIOSes place the host controller in a mode where it is ECAM
+compliant for all devices other than the root complex. In such cases,
+the host controller should be described as below.
+
 The properties and their meanings are identical to those described in
 host-generic-pci.txt except as listed below.
 
 Properties of the host controller node that differ from
 host-generic-pci.txt:
 
-- compatible     : Must be "hisilicon,pcie-almost-ecam"
+- compatible     : Must be "hisilicon,hip06-pcie-ecam", or
+                  "hisilicon,hip07-pcie-ecam"
 
 - reg            : Two entries: First the ECAM configuration space for any
                   other bus underneath the root bus. Second, the base
@@ -59,7 +65,7 @@ host-generic-pci.txt:
 
 Example:
        pcie0: pcie@a0090000 {
-               compatible = "hisilicon,pcie-almost-ecam";
+               compatible = "hisilicon,hip06-pcie-ecam";
                reg = <0 0xb0000000 0 0x2000000>,  /*  ECAM configuration space */
                      <0 0xa0090000 0 0x10000>; /* host bridge registers */
                bus-range = <0  31>;
index 723e1ad..940707d 100644 (file)
@@ -31,7 +31,9 @@ Optional properties:
 
 - domain-idle-states : A phandle of an idle-state that shall be soaked into a
                 generic domain power state. The idle state definitions are
-                compatible with domain-idle-state specified in [1].
+                compatible with domain-idle-state specified in [1]. phandles
+                that are not compatible with domain-idle-state will be
+                ignored.
   The domain-idle-state property reflects the idle state of this PM domain and
   not the idle states of the devices or sub-domains in the PM domain. Devices
   and sub-domains have their own idle-states independent of the parent
diff --git a/Documentation/devicetree/bindings/power/reset/gemini-poweroff.txt b/Documentation/devicetree/bindings/power/reset/gemini-poweroff.txt
new file mode 100644 (file)
index 0000000..7fec3e1
--- /dev/null
@@ -0,0 +1,17 @@
+* Device-Tree bindings for Cortina Systems Gemini Poweroff
+
+This is a special IP block in the Cortina Gemini SoC that only
+deals with different ways to power the system down.
+
+Required properties:
+- compatible: should be "cortina,gemini-power-controller"
+- reg: should contain the physical memory base and size
+- interrupts: should contain the power management interrupt
+
+Example:
+
+power-controller@4b000000 {
+       compatible = "cortina,gemini-power-controller";
+       reg = <0x4b000000 0x100>;
+       interrupts = <26 IRQ_TYPE_EDGE_FALLING>;
+};
index 1e2546f..022ed1f 100644 (file)
@@ -3,13 +3,20 @@ Generic SYSCON mapped register poweroff driver
 This is a generic poweroff driver using syscon to map the poweroff register.
 The poweroff is generally performed with a write to the poweroff register
 defined by the register map pointed by syscon reference plus the offset
-with the mask defined in the poweroff node.
+with the value and mask defined in the poweroff node.
 
 Required properties:
 - compatible: should contain "syscon-poweroff"
 - regmap: this is phandle to the register map node
 - offset: offset in the register map for the poweroff register (in bytes)
-- mask: the poweroff value written to the poweroff register (32 bit access)
+- value: the poweroff value written to the poweroff register (32 bit access)
+
+Optional properties:
+- mask: update only the register bits defined by the mask (32 bit)
+
+Legacy usage:
+If a node doesn't contain a value property but contains a mask property, the
+mask property is used as the value.
 
 Default will be little endian mode, 32 bit access only.
 
index d23dc00..d3a5a93 100644 (file)
@@ -33,6 +33,7 @@ Required properties:
 - compatible: should be one of:
   - "rockchip,rk3188-io-voltage-domain" for rk3188
   - "rockchip,rk3288-io-voltage-domain" for rk3288
+  - "rockchip,rk3328-io-voltage-domain" for rk3328
   - "rockchip,rk3368-io-voltage-domain" for rk3368
   - "rockchip,rk3368-pmu-io-voltage-domain" for rk3368 pmu-domains
   - "rockchip,rk3399-io-voltage-domain" for rk3399
diff --git a/Documentation/devicetree/bindings/power/supply/cpcap-charger.txt b/Documentation/devicetree/bindings/power/supply/cpcap-charger.txt
new file mode 100644 (file)
index 0000000..80bd873
--- /dev/null
@@ -0,0 +1,37 @@
+Motorola CPCAP PMIC battery charger binding
+
+Required properties:
+- compatible: Shall be "motorola,mapphone-cpcap-charger"
+- interrupts: Interrupt specifier for each name in interrupt-names
+- interrupt-names: Should contain the following entries:
+                  "chrg_det", "rvrs_chrg", "chrg_se1b", "se0conn",
+                  "rvrs_mode", "chrgcurr1", "vbusvld", "battdetb"
+- io-channels: IIO ADC channel specifier for each name in io-channel-names
+- io-channel-names: Should contain the following entries:
+                   "battdetb", "battp", "vbus", "chg_isense", "batti"
+
+Optional properties:
+- mode-gpios: Optionally CPCAP charger can have a companion wireless
+             charge controller that is controlled with two GPIOs
+             that are active low.
+
+Example:
+
+cpcap_charger: charger {
+       compatible = "motorola,mapphone-cpcap-charger";
+       interrupts-extended = <
+               &cpcap 13 0 &cpcap 12 0 &cpcap 29 0 &cpcap 28 0
+               &cpcap 22 0 &cpcap 20 0 &cpcap 19 0 &cpcap 54 0
+       >;
+       interrupt-names =
+               "chrg_det", "rvrs_chrg", "chrg_se1b", "se0conn",
+               "rvrs_mode", "chrgcurr1", "vbusvld", "battdetb";
+       mode-gpios = <&gpio3 29 GPIO_ACTIVE_LOW
+                     &gpio3 23 GPIO_ACTIVE_LOW>;
+       io-channels = <&cpcap_adc 0 &cpcap_adc 1
+                      &cpcap_adc 2 &cpcap_adc 5
+                      &cpcap_adc 6>;
+       io-channel-names = "battdetb", "battp",
+                          "vbus", "chg_isense",
+                          "batti";
+};
diff --git a/Documentation/devicetree/bindings/power/supply/lego_ev3_battery.txt b/Documentation/devicetree/bindings/power/supply/lego_ev3_battery.txt
new file mode 100644 (file)
index 0000000..5485633
--- /dev/null
@@ -0,0 +1,21 @@
+LEGO MINDSTORMS EV3 Battery
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+LEGO MINDSTORMS EV3 has some built-in capability for monitoring the battery.
+It uses 6 AA batteries or a special Li-ion rechargeable battery pack that is
+detected by a key switch in the battery compartment.
+
+Required properties:
+ - compatible: Must be "lego,ev3-battery"
+ - io-channels: phandles to analog inputs for reading voltage and current
+ - io-channel-names: Must be "voltage", "current"
+ - rechargeable-gpios: phandle to the rechargeable battery indication gpio
+
+Example:
+
+       battery {
+               compatible = "lego,ev3-battery";
+               io-channels = <&adc 4>, <&adc 3>;
+               io-channel-names = "voltage", "current";
+               rechargeable-gpios = <&gpio 136 GPIO_ACTIVE_LOW>;
+       };
index ea42ae1..a9d7aa6 100644 (file)
@@ -6,8 +6,8 @@ temperature monitoring, and uses a slightly different conversion
 formula for the charge counter.
 
 Required properties:
-- compatible: Should contain "ltc2941" or "ltc2943" which also indicates the
-    type of I2C chip attached.
+- compatible: Should contain "lltc,ltc2941" or "lltc,ltc2943" which also
+    indicates the type of I2C chip attached.
 - reg: The 7-bit I2C address.
 - lltc,resistor-sense: The sense resistor value in milli-ohms. Can be a 32-bit
     negative value when the battery has been connected to the wrong end of the
@@ -20,7 +20,7 @@ Required properties:
 Example from the Topic Miami Florida board:
 
        fuelgauge: ltc2943@64 {
-               compatible = "ltc2943";
+               compatible = "lltc,ltc2943";
                reg = <0x64>;
                lltc,resistor-sense = <15>;
                lltc,prescaler-exponent = <5>; /* 2^(2*5) = 1024 */
diff --git a/Documentation/devicetree/bindings/timer/cortina,gemini-timer.txt b/Documentation/devicetree/bindings/timer/cortina,gemini-timer.txt
deleted file mode 100644 (file)
index 16ea1d3..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-Cortina Systems Gemini timer
-
-This timer is embedded in the Cortina Systems Gemini SoCs.
-
-Required properties:
-
-- compatible : Must be "cortina,gemini-timer"
-- reg : Should contain registers location and length
-- interrupts : Should contain the three timer interrupts with
-  flags for rising edge
-- syscon : a phandle to the global Gemini system controller
-
-Example:
-
-timer@43000000 {
-       compatible = "cortina,gemini-timer";
-       reg = <0x43000000 0x1000>;
-       interrupts = <14 IRQ_TYPE_EDGE_RISING>, /* Timer 1 */
-                  <15 IRQ_TYPE_EDGE_RISING>, /* Timer 2 */
-                  <16 IRQ_TYPE_EDGE_RISING>; /* Timer 3 */
-       syscon = <&syscon>;
-};
diff --git a/Documentation/devicetree/bindings/timer/faraday,fttmr010.txt b/Documentation/devicetree/bindings/timer/faraday,fttmr010.txt
new file mode 100644 (file)
index 0000000..b73ca6c
--- /dev/null
@@ -0,0 +1,33 @@
+Faraday Technology timer
+
+This timer is a generic IP block from Faraday Technology, embedded in the
+Cortina Systems Gemini SoCs and other designs.
+
+Required properties:
+
+- compatible : Must be one of
+  "faraday,fttmr010"
+  "cortina,gemini-timer"
+- reg : Should contain registers location and length
+- interrupts : Should contain the three timer interrupts usually with
+  flags for falling edge
+
+Optionally required properties:
+
+- clocks : a clock to provide the tick rate for "faraday,fttmr010"
+- clock-names : should be "EXTCLK" and "PCLK" for the external tick timer
+  and peripheral clock respectively, for "faraday,fttmr010"
+- syscon : a phandle to the global Gemini system controller if the compatible
+  type is "cortina,gemini-timer"
+
+Example:
+
+timer@43000000 {
+       compatible = "faraday,fttmr010";
+       reg = <0x43000000 0x1000>;
+       interrupts = <14 IRQ_TYPE_EDGE_FALLING>, /* Timer 1 */
+                  <15 IRQ_TYPE_EDGE_FALLING>, /* Timer 2 */
+                  <16 IRQ_TYPE_EDGE_FALLING>; /* Timer 3 */
+       clocks = <&extclk>, <&pclk>;
+       clock-names = "EXTCLK", "PCLK";
+};
index a41b184..16a5f45 100644 (file)
@@ -1,9 +1,15 @@
 Rockchip rk timer
 
 Required properties:
-- compatible: shall be one of:
-  "rockchip,rk3288-timer" - for rk3066, rk3036, rk3188, rk322x, rk3288, rk3368
-  "rockchip,rk3399-timer" - for rk3399
+- compatible: should be:
+  "rockchip,rk3036-timer", "rockchip,rk3288-timer": for Rockchip RK3036
+  "rockchip,rk3066-timer", "rockchip,rk3288-timer": for Rockchip RK3066
+  "rockchip,rk3188-timer", "rockchip,rk3288-timer": for Rockchip RK3188
+  "rockchip,rk3228-timer", "rockchip,rk3288-timer": for Rockchip RK3228
+  "rockchip,rk3229-timer", "rockchip,rk3288-timer": for Rockchip RK3229
+  "rockchip,rk3288-timer": for Rockchip RK3288
+  "rockchip,rk3368-timer", "rockchip,rk3288-timer": for Rockchip RK3368
+  "rockchip,rk3399-timer": for Rockchip RK3399
 - reg: base address of the timer register starting with TIMERS CONTROL register
 - interrupts: should contain the interrupts for Timer0
 - clocks : must contain an entry for each entry in clock-names
index ec0bfb9..830c998 100644 (file)
@@ -265,6 +265,7 @@ sbs Smart Battery System
 schindler      Schindler
 seagate        Seagate Technology PLC
 semtech        Semtech Corporation
+sensirion      Sensirion AG
 sgx    SGX Sensortech
 sharp  Sharp Corporation
 si-en  Si-En Technology Ltd.
index c1b4f91..5575d2d 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index 6d930fc..abb5f27 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: |  ok  |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: |  ok  |
     |         c6x: | TODO |
     |        cris: | TODO |
index 136868b..dbdaffc 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index 728061d..5e97a89 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: |  ok  |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: |  ok  |
     |         c6x: |  ok  |
     |        cris: | TODO |
index 703f578..76bbd7f 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: | TODO |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index 38dea8e..830dbe8 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index 862e15d..0217bf6 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: |  ok  |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: |  ok  |
     |         c6x: | TODO |
     |        cris: | TODO |
index 40f44d0..f9133a9 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: | TODO |
     |       arm64: | TODO |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index a44bfff..529f66e 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: |  ok  |
     |         arm: |  ok  |
     |       arm64: | TODO |
-    |       avr32: |  ok  |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index d87c1ce..4335324 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: |  ok  |
     |         arm: |  ok  |
     |       arm64: | TODO |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index b8999d8..f559f1b 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: |  ok  |
     |       arm64: | TODO |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index 0fa4233..d7acd7b 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index d605c3f..53ed42b 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: |  ok  |
     |       arm64: | TODO |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index 44cc1ff..1494439 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: | TODO |
     |       arm64: | TODO |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index ffa522a..6be9206 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: |  ok  |
     |        cris: | TODO |
index 83d2cf9..0eb08e1 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index 6ca98f9..514ad34 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: |  ok  |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index 12b1c93..532c6f0 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: | TODO |
     |       arm64: | TODO |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index d9c3108..f3eec26 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: | TODO |
     |       arm64: | TODO |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index cf90635..9756abc 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: |  ok  |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: |  ok  |
     |    blackfin: |  ok  |
     |         c6x: | TODO |
     |        cris: | TODO |
index 68c3a5d..62f4ee5 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: | TODO |
     |       arm64: | TODO |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index e973b1a..321b32f 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: | TODO |
     |       arm64: | TODO |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index ac93d7a..79bfa4d 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: | TODO |
     |       arm64: | TODO |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index 4660bf2..00f1606 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: |  ok  |
     |       arm64: | TODO |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index f179b1f..7d516ea 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index 85777c5..f974b8d 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index ac7cd6b..1d3c0f6 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: |  ..  |
     |         arm: |  ..  |
     |       arm64: |  ..  |
-    |       avr32: |  ..  |
     |    blackfin: |  ..  |
     |         c6x: |  ..  |
     |        cris: |  ..  |
index 4f66ec1..a32d5b2 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index 8acb439..caee8f6 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index ff670b2..1cd87f6 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: |  ok  |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: |  ok  |
     |    blackfin: |  ok  |
     |         c6x: |  ok  |
     |        cris: |  ok  |
index a1e3eea..e6d7c7b 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index 4199ffe..15c6071 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index 17f68a0..baee761 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: |  ok  |
     |         arm: | TODO |
     |       arm64: |  ok  |
-    |       avr32: |  ok  |
     |    blackfin: | TODO |
     |         c6x: |  ok  |
     |        cris: | TODO |
index cf3c3e3..9129530 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index ec4dd28..f6829af 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index 9919742..1a09ea9 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: | TODO |
     |       arm64: | TODO |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index 523f830..d170e62 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: |  ok  |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: |  ..  |
     |    blackfin: |  ..  |
     |         c6x: |  ..  |
     |        cris: |  ..  |
index 261b92e..abfab40 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: | TODO |
     |       arm64: | TODO |
-    |       avr32: |  ..  |
     |    blackfin: | TODO |
     |         c6x: |  ..  |
     |        cris: |  ..  |
index df1d1f3..f81f09b 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: | TODO |
     |         arm: | TODO |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index 90c5374..0cc3e11 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: |  ok  |
     |         arm: | TODO |
     |       arm64: | TODO |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index e7c252a..9a3fdac 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: |  ..  |
     |         arm: |  ..  |
     |       arm64: |  ..  |
-    |       avr32: |  ..  |
     |    blackfin: |  ..  |
     |         c6x: |  ..  |
     |        cris: |  ..  |
index 3de5434..dfaa39e 100644 (file)
@@ -10,7 +10,6 @@
     |         arc: |  ok  |
     |         arm: |  ok  |
     |       arm64: |  ok  |
-    |       avr32: | TODO |
     |    blackfin: | TODO |
     |         c6x: | TODO |
     |        cris: | TODO |
index fdcfdd7..fe25787 100644 (file)
@@ -58,8 +58,7 @@ prototypes:
        int (*permission) (struct inode *, int, unsigned int);
        int (*get_acl)(struct inode *, int);
        int (*setattr) (struct dentry *, struct iattr *);
-       int (*getattr) (const struct path *, struct dentry *, struct kstat *,
-                       u32, unsigned int);
+       int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
        ssize_t (*listxattr) (struct dentry *, char *, size_t);
        int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
        void (*update_time)(struct inode *, struct timespec *, int);
index 9528007..5fb17f4 100644 (file)
@@ -600,3 +600,9 @@ in your dentry operations instead.
 [recommended]
        ->readlink is optional for symlinks.  Don't set, unless filesystem needs
        to fake something for readlink(2).
+--
+[mandatory]
+       ->getattr() is now passed a struct path rather than a vfsmount and
+       dentry separately, and it now has request_mask and query_flags arguments
+       to specify the fields and sync type requested by statx.  Filesystems not
+       supporting any statx-specific features may ignore the new arguments.
index 5692117..94dd27e 100644 (file)
@@ -382,8 +382,7 @@ struct inode_operations {
        int (*permission) (struct inode *, int);
        int (*get_acl)(struct inode *, int);
        int (*setattr) (struct dentry *, struct iattr *);
-       int (*getattr) (const struct path *, struct dentry *, struct kstat *,
-                       u32, unsigned int);
+       int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
        ssize_t (*listxattr) (struct dentry *, char *, size_t);
        void (*update_time)(struct inode *, struct timespec *, int);
        int (*atomic_open)(struct inode *, struct dentry *, struct file *,
diff --git a/Documentation/hwmon/aspeed-pwm-tacho b/Documentation/hwmon/aspeed-pwm-tacho
new file mode 100644 (file)
index 0000000..7cfb349
--- /dev/null
@@ -0,0 +1,22 @@
+Kernel driver aspeed-pwm-tacho
+==============================
+
+Supported chips:
+       ASPEED AST2400/2500
+
+Authors:
+       <jaghu@google.com>
+
+Description:
+------------
+This driver implements support for ASPEED AST2400/2500 PWM and Fan Tacho
+controller. The PWM controller supports up to 8 PWM outputs. The Fan tacho
+controller supports up to 16 tachometer inputs.
+
+The driver provides the following sensor accesses in sysfs:
+
+fanX_input     ro      provide current fan rotation value in RPM as reported
+                       by the fan to the device.
+
+pwmX           rw      get or set PWM fan control value. This is an integer
+                       value between 0(off) and 255(full speed).
index 91a2843..47636a8 100644 (file)
@@ -2,7 +2,7 @@ Kernel driver tc654
 ===================
 
 Supported chips:
-  * Microship TC654 and TC655
+  * Microchip TC654 and TC655
     Prefix: 'tc654'
     Datasheet: http://ww1.microchip.com/downloads/en/DeviceDoc/20001734C.pdf
 
diff --git a/Documentation/lightnvm/pblk.txt b/Documentation/lightnvm/pblk.txt
new file mode 100644 (file)
index 0000000..1040ed1
--- /dev/null
@@ -0,0 +1,21 @@
+pblk: Physical Block Device Target
+==================================
+
+pblk implements a fully associative, host-based FTL that exposes a traditional
+block I/O interface. Its primary responsibilities are:
+
+  - Map logical addresses onto physical addresses (4KB granularity) in a
+    logical-to-physical (L2P) table.
+  - Maintain the integrity and consistency of the L2P table as well as its
+    recovery from normal tear down and power outage.
+  - Deal with controller- and media-specific constraints.
+  - Handle I/O errors.
+  - Implement garbage collection.
+  - Maintain consistency across the I/O stack during synchronization points.
+
+For more information please refer to:
+
+  http://lightnvm.io
+
+which maintains updated FAQs, manual pages, technical documentation, tools,
+contacts, etc.
index ba0c15d..79d09e4 100644 (file)
@@ -12,7 +12,7 @@ The following terms are used in this document:
    control and configuration, and a parallel or a serial bus for data.
  - camera host - an interface, to which a camera is connected. Typically a
    specialised interface, present on many SoCs, e.g. PXA27x and PXA3xx, SuperH,
-   AVR32, i.MX27, i.MX31.
+   i.MX27, i.MX31.
  - camera host bus - a connection between a camera host and a camera. Can be
    parallel or serial, consists of data and control lines, e.g. clock, vertical
    and horizontal synchronization signals.
index 54bd5fa..f2af35f 100644 (file)
@@ -77,9 +77,15 @@ static struct pinctrl_desc foo_desc = {
 
 int __init foo_probe(void)
 {
+       int error;
+
        struct pinctrl_dev *pctl;
 
-       return pinctrl_register_and_init(&foo_desc, <PARENT>, NULL, &pctl);
+       error = pinctrl_register_and_init(&foo_desc, <PARENT>, NULL, &pctl);
+       if (error)
+               return error;
+
+       return pinctrl_enable(pctl);
 }
 
 To enable the pinctrl subsystem and the subgroups for PINMUX and PINCONF and
index 64546eb..ee69d75 100644 (file)
@@ -478,15 +478,23 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
     - set the power.last_busy field to the current time
 
   void pm_runtime_use_autosuspend(struct device *dev);
-    - set the power.use_autosuspend flag, enabling autosuspend delays
+    - set the power.use_autosuspend flag, enabling autosuspend delays; call
+      pm_runtime_get_sync if the flag was previously cleared and
+      power.autosuspend_delay is negative
 
   void pm_runtime_dont_use_autosuspend(struct device *dev);
-    - clear the power.use_autosuspend flag, disabling autosuspend delays
+    - clear the power.use_autosuspend flag, disabling autosuspend delays;
+      decrement the device's usage counter if the flag was previously set and
+      power.autosuspend_delay is negative; call pm_runtime_idle
 
   void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);
     - set the power.autosuspend_delay value to 'delay' (expressed in
       milliseconds); if 'delay' is negative then runtime suspends are
-      prevented
+      prevented; if power.use_autosuspend is set, pm_runtime_get_sync may be
+      called or the device's usage counter may be decremented and
+      pm_runtime_idle called depending on if power.autosuspend_delay is
+      changed to or from a negative value; if power.use_autosuspend is clear,
+      pm_runtime_idle is called
 
   unsigned long pm_runtime_autosuspend_expiration(struct device *dev);
     - calculate the time when the current autosuspend delay period will expire,
@@ -836,9 +844,8 @@ of the non-autosuspend counterparts:
        Instead of: pm_runtime_put_sync   use: pm_runtime_put_sync_autosuspend.
 
 Drivers may also continue to use the non-autosuspend helper functions; they
-will behave normally, not taking the autosuspend delay into account.
-Similarly, if the power.use_autosuspend field isn't set then the autosuspend
-helper functions will behave just like the non-autosuspend counterparts.
+will behave normally, which means sometimes taking the autosuspend delay into
+account (see pm_runtime_idle).
 
 Under some circumstances a driver or subsystem may want to prevent a device
 from autosuspending immediately, even though the usage counter is zero and the
index 11ec2d9..61e9c78 100644 (file)
@@ -124,7 +124,7 @@ specified in the following format in the sign-off area:
 
 .. code-block:: none
 
-     Cc: <stable@vger.kernel.org> # 3.3.x-
+     Cc: <stable@vger.kernel.org> # 3.3.x
 
 The tag has the meaning of:
 
diff --git a/Documentation/scheduler/sched-pelt.c b/Documentation/scheduler/sched-pelt.c
new file mode 100644 (file)
index 0000000..e421913
--- /dev/null
@@ -0,0 +1,108 @@
+/*
+ * The following program is used to generate the constants for
+ * computing sched averages.
+ *
+ * ==============================================================
+ *             C program (compile with -lm)
+ * ==============================================================
+ */
+
+#include <math.h>
+#include <stdio.h>
+
+#define HALFLIFE 32
+#define SHIFT 32
+
+double y;
+
+void calc_runnable_avg_yN_inv(void)
+{
+       int i;
+       unsigned int x;
+
+       printf("static const u32 runnable_avg_yN_inv[] = {");
+       for (i = 0; i < HALFLIFE; i++) {
+               x = ((1UL<<32)-1)*pow(y, i);
+
+               if (i % 6 == 0) printf("\n\t");
+               printf("0x%8x, ", x);
+       }
+       printf("\n};\n\n");
+}
+
+int sum = 1024;
+
+void calc_runnable_avg_yN_sum(void)
+{
+       int i;
+
+       printf("static const u32 runnable_avg_yN_sum[] = {\n\t    0,");
+       for (i = 1; i <= HALFLIFE; i++) {
+               if (i == 1)
+                       sum *= y;
+               else
+                       sum = sum*y + 1024*y;
+
+               if (i % 11 == 0)
+                       printf("\n\t");
+
+               printf("%5d,", sum);
+       }
+       printf("\n};\n\n");
+}
+
+int n = -1;
+/* first period */
+long max = 1024;
+
+void calc_converged_max(void)
+{
+       long last = 0, y_inv = ((1UL<<32)-1)*y;
+
+       for (; ; n++) {
+               if (n > -1)
+                       max = ((max*y_inv)>>SHIFT) + 1024;
+                       /*
+                        * This is the same as:
+                        * max = max*y + 1024;
+                        */
+
+               if (last == max)
+                       break;
+
+               last = max;
+       }
+       n--;
+       printf("#define LOAD_AVG_PERIOD %d\n", HALFLIFE);
+       printf("#define LOAD_AVG_MAX %ld\n", max);
+//     printf("#define LOAD_AVG_MAX_N %d\n\n", n);
+}
+
+void calc_accumulated_sum_32(void)
+{
+       int i, x = sum;
+
+       printf("static const u32 __accumulated_sum_N32[] = {\n\t     0,");
+       for (i = 1; i <= n/HALFLIFE+1; i++) {
+               if (i > 1)
+                       x = x/2 + sum;
+
+               if (i % 6 == 0)
+                       printf("\n\t");
+
+               printf("%6d,", x);
+       }
+       printf("\n};\n\n");
+}
+
+void main(void)
+{
+       printf("/* Generated by Documentation/scheduler/sched-pelt; do not modify. */\n\n");
+
+       y = pow(0.5, 1/(double)HALFLIFE);
+
+       calc_runnable_avg_yN_inv();
+//     calc_runnable_avg_yN_sum();
+       calc_converged_max();
+//     calc_accumulated_sum_32();
+}
index 41ef9d8..5ea8505 100644 (file)
@@ -8,8 +8,9 @@ Overview
 --------
 These events are similar to tracepoint based events. Instead of Tracepoint,
 this is based on kprobes (kprobe and kretprobe). So it can probe wherever
-kprobes can probe (this means, all functions body except for __kprobes
-functions). Unlike the Tracepoint based event, this can be added and removed
+kprobes can probe (this means, all functions except those with
+__kprobes/nokprobe_inline annotation and those marked NOKPROBE_SYMBOL).
+Unlike the Tracepoint based event, this can be added and removed
 dynamically, on the fly.
 
 To enable this feature, build your kernel with CONFIG_KPROBE_EVENTS=y.
index 76e61c8..b2f60ca 100644 (file)
@@ -83,6 +83,12 @@ Groups:
 
     Bits for undefined preemption levels are RAZ/WI.
 
+    For historical reasons and to provide ABI compatibility with userspace we
+    export the GICC_PMR register in the format of the GICH_VMCR.VMPriMask
+    field in the lower 5 bits of a word, meaning that userspace must always
+    use the lower 5 bits to communicate with the KVM device and must shift the
+    value left by 3 places to obtain the actual priority mask level.
+
   Limitations:
     - Priorities are not implemented, and registers are RAZ/WI
     - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2.
index b8527c6..97b7adb 100644 (file)
@@ -27,7 +27,7 @@ Offset        Proto   Name            Meaning
 1C0/020        ALL     efi_info        EFI 32 information (struct efi_info)
 1E0/004        ALL     alk_mem_k       Alternative mem check, in KB
 1E4/004        ALL     scratch         Scratch field for the kernel setup code
-1E8/001        ALL     e820_entries    Number of entries in e820_map (below)
+1E8/001        ALL     e820_entries    Number of entries in e820_table (below)
 1E9/001        ALL     eddbuf_entries  Number of entries in eddbuf (below)
 1EA/001        ALL     edd_mbr_sig_buf_entries Number of entries in edd_mbr_sig_buffer
                                (below)
@@ -35,6 +35,6 @@ Offset        Proto   Name            Meaning
 1EC/001        ALL     secure_boot     Secure boot is enabled in the firmware
 1EF/001        ALL     sentinel        Used to detect broken bootloaders
 290/040        ALL     edd_mbr_sig_buffer EDD MBR signatures
-2D0/A00        ALL     e820_map        E820 memory map table
-                               (array of struct e820entry)
+2D0/A00        ALL     e820_table      E820 memory map table
+                               (array of struct e820_entry)
 D00/1EC        ALL     eddbuf          EDD data (array of struct edd_info)
index 1b0a87f..33ecf26 100644 (file)
@@ -2327,21 +2327,6 @@ S:       Maintained
 F:     drivers/auxdisplay/
 F:     include/linux/cfag12864b.h
 
-AVR32 ARCHITECTURE
-M:     Haavard Skinnemoen <hskinnemoen@gmail.com>
-M:     Hans-Christian Egtvedt <egtvedt@samfundet.no>
-W:     http://www.atmel.com/products/AVR32/
-W:     http://mirror.egtvedt.no/avr32linux.org/
-W:     http://avrfreaks.net/
-S:     Maintained
-F:     arch/avr32/
-
-AVR32/AT32AP MACHINE SUPPORT
-M:     Haavard Skinnemoen <hskinnemoen@gmail.com>
-M:     Hans-Christian Egtvedt <egtvedt@samfundet.no>
-S:     Maintained
-F:     arch/avr32/mach-at32ap/
-
 AX.25 NETWORK LAYER
 M:     Ralf Baechle <ralf@linux-mips.org>
 L:     linux-hams@vger.kernel.org
@@ -2544,6 +2529,14 @@ F:       block/
 F:     kernel/trace/blktrace.c
 F:     lib/sbitmap.c
 
+BFQ I/O SCHEDULER
+M:     Paolo Valente <paolo.valente@linaro.org>
+M:     Jens Axboe <axboe@kernel.dk>
+L:     linux-block@vger.kernel.org
+S:     Maintained
+F:     block/bfq-*
+F:     Documentation/block/bfq-iosched.txt
+
 BLOCK2MTD DRIVER
 M:     Joern Engel <joern@lazybastard.org>
 L:     linux-mtd@lists.infradead.org
@@ -2585,12 +2578,26 @@ F:      include/uapi/linux/if_bonding.h
 
 BPF (Safe dynamic programs and tools)
 M:     Alexei Starovoitov <ast@kernel.org>
+M:     Daniel Borkmann <daniel@iogearbox.net>
 L:     netdev@vger.kernel.org
 L:     linux-kernel@vger.kernel.org
 S:     Supported
+F:     arch/x86/net/bpf_jit*
+F:     Documentation/networking/filter.txt
+F:     include/linux/bpf*
+F:     include/linux/filter.h
+F:     include/uapi/linux/bpf*
+F:     include/uapi/linux/filter.h
 F:     kernel/bpf/
-F:     tools/testing/selftests/bpf/
+F:     kernel/trace/bpf_trace.c
 F:     lib/test_bpf.c
+F:     net/bpf/
+F:     net/core/filter.c
+F:     net/sched/act_bpf.c
+F:     net/sched/cls_bpf.c
+F:     samples/bpf/
+F:     tools/net/bpf*
+F:     tools/testing/selftests/bpf/
 
 BROADCOM B44 10/100 ETHERNET DRIVER
 M:     Michael Chan <michael.chan@broadcom.com>
@@ -3449,6 +3456,7 @@ T:        git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git
 T:     git git://git.linaro.org/people/vireshk/linux.git (For ARM Updates)
 B:     https://bugzilla.kernel.org
 F:     Documentation/cpu-freq/
+F:     Documentation/devicetree/bindings/cpufreq/
 F:     drivers/cpufreq/
 F:     include/linux/cpufreq.h
 F:     tools/testing/selftests/cpufreq/
@@ -4117,14 +4125,13 @@ F:      drivers/block/drbd/
 F:     lib/lru_cache.c
 F:     Documentation/blockdev/drbd/
 
-DRIVER CORE, KOBJECTS, DEBUGFS, KERNFS AND SYSFS
+DRIVER CORE, KOBJECTS, DEBUGFS AND SYSFS
 M:     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git
 S:     Supported
 F:     Documentation/kobject.txt
 F:     drivers/base/
 F:     fs/debugfs/
-F:     fs/kernfs/
 F:     fs/sysfs/
 F:     include/linux/debugfs.h
 F:     include/linux/kobj*
@@ -4694,6 +4701,7 @@ L:        linux-edac@vger.kernel.org
 L:     linux-mips@linux-mips.org
 S:     Supported
 F:     drivers/edac/octeon_edac*
+F:     drivers/edac/thunderx_edac*
 
 EDAC-E752X
 M:     Mark Gross <mark.gross@intel.com>
@@ -4928,6 +4936,7 @@ F:        include/linux/netfilter_bridge/
 F:     net/bridge/
 
 ETHERNET PHY LIBRARY
+M:     Andrew Lunn <andrew@lunn.ch>
 M:     Florian Fainelli <f.fainelli@gmail.com>
 L:     netdev@vger.kernel.org
 S:     Maintained
@@ -5406,6 +5415,23 @@ F:       fs/fuse/
 F:     include/uapi/linux/fuse.h
 F:     Documentation/filesystems/fuse.txt
 
+FUTEX SUBSYSTEM
+M:     Thomas Gleixner <tglx@linutronix.de>
+M:     Ingo Molnar <mingo@redhat.com>
+R:     Peter Zijlstra <peterz@infradead.org>
+R:     Darren Hart <dvhart@infradead.org>
+L:     linux-kernel@vger.kernel.org
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core
+S:     Maintained
+F:     kernel/futex.c
+F:     kernel/futex_compat.c
+F:     include/asm-generic/futex.h
+F:     include/linux/futex.h
+F:     include/uapi/linux/futex.h
+F:     tools/testing/selftests/futex/
+F:     tools/perf/bench/futex*
+F:     Documentation/*futex*
+
 FUTURE DOMAIN TMC-16x0 SCSI DRIVER (16-bit)
 M:     Rik Faith <faith@cs.unc.edu>
 L:     linux-scsi@vger.kernel.org
@@ -7089,9 +7115,9 @@ S:        Maintained
 F:     fs/autofs4/
 
 KERNEL BUILD + files below scripts/ (unless maintained elsewhere)
+M:     Masahiro Yamada <yamada.masahiro@socionext.com>
 M:     Michal Marek <mmarek@suse.com>
-T:     git git://git.kernel.org/pub/scm/linux/kernel/git/mmarek/kbuild.git for-next
-T:     git git://git.kernel.org/pub/scm/linux/kernel/git/mmarek/kbuild.git rc-fixes
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git
 L:     linux-kbuild@vger.kernel.org
 S:     Maintained
 F:     Documentation/kbuild/
@@ -7208,6 +7234,14 @@ F:       arch/mips/include/uapi/asm/kvm*
 F:     arch/mips/include/asm/kvm*
 F:     arch/mips/kvm/
 
+KERNFS
+M:     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+M:     Tejun Heo <tj@kernel.org>
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git
+S:     Supported
+F:     include/linux/kernfs.h
+F:     fs/kernfs/
+
 KEXEC
 M:     Eric Biederman <ebiederm@xmission.com>
 W:     http://kernel.org/pub/linux/utils/kernel/kexec/
@@ -8753,6 +8787,7 @@ W:        http://www.linuxfoundation.org/en/Net
 Q:     http://patchwork.ozlabs.org/project/netdev/list/
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git
+B:     mailto:netdev@vger.kernel.org
 S:     Maintained
 F:     net/
 F:     include/net/
@@ -10814,6 +10849,7 @@ F:      drivers/s390/block/dasd*
 F:     block/partitions/ibm.c
 
 S390 NETWORK DRIVERS
+M:     Julian Wiedmann <jwi@linux.vnet.ibm.com>
 M:     Ursula Braun <ubraun@linux.vnet.ibm.com>
 L:     linux-s390@vger.kernel.org
 W:     http://www.ibm.com/developerworks/linux/linux390/
@@ -10844,6 +10880,7 @@ S:      Supported
 F:     drivers/s390/scsi/zfcp_*
 
 S390 IUCV NETWORK LAYER
+M:     Julian Wiedmann <jwi@linux.vnet.ibm.com>
 M:     Ursula Braun <ubraun@linux.vnet.ibm.com>
 L:     linux-s390@vger.kernel.org
 W:     http://www.ibm.com/developerworks/linux/linux390/
@@ -11099,6 +11136,7 @@ F:      drivers/power/supply/bq27xxx_battery_i2c.c
 TIMEKEEPING, CLOCKSOURCE CORE, NTP, ALARMTIMER
 M:     John Stultz <john.stultz@linaro.org>
 M:     Thomas Gleixner <tglx@linutronix.de>
+R:     Stephen Boyd <sboyd@codeaurora.org>
 L:     linux-kernel@vger.kernel.org
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core
 S:     Supported
@@ -12454,7 +12492,6 @@ F:      drivers/clk/ti/
 F:     include/linux/clk/ti.h
 
 TI ETHERNET SWITCH DRIVER (CPSW)
-M:     Mugunthan V N <mugunthanvnm@ti.com>
 R:     Grygorii Strashko <grygorii.strashko@ti.com>
 L:     linux-omap@vger.kernel.org
 L:     netdev@vger.kernel.org
@@ -13295,7 +13332,7 @@ F:      drivers/virtio/
 F:     tools/virtio/
 F:     drivers/net/virtio_net.c
 F:     drivers/block/virtio_blk.c
-F:     include/linux/virtio_*.h
+F:     include/linux/virtio*.h
 F:     include/uapi/linux/virtio_*.h
 F:     drivers/crypto/virtio/
 
index e11989d..4b074a9 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 4
 PATCHLEVEL = 11
 SUBLEVEL = 0
-EXTRAVERSION = -rc5
+EXTRAVERSION =
 NAME = Fearless Coyote
 
 # *DOCUMENTATION*
@@ -372,7 +372,7 @@ LDFLAGS_MODULE  =
 CFLAGS_KERNEL  =
 AFLAGS_KERNEL  =
 LDFLAGS_vmlinux =
-CFLAGS_GCOV    = -fprofile-arcs -ftest-coverage -fno-tree-loop-im -Wno-maybe-uninitialized
+CFLAGS_GCOV    := -fprofile-arcs -ftest-coverage -fno-tree-loop-im $(call cc-disable-warning,maybe-uninitialized,)
 CFLAGS_KCOV    := $(call cc-option,-fsanitize-coverage=trace-pc,)
 
 
@@ -653,6 +653,12 @@ KBUILD_CFLAGS += $(call cc-ifversion, -lt, 0409, \
 # Tell gcc to never replace conditional load with a non-conditional one
 KBUILD_CFLAGS  += $(call cc-option,--param=allow-store-data-races=0)
 
+# check for 'asm goto'
+ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC) $(KBUILD_CFLAGS)), y)
+       KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO
+       KBUILD_AFLAGS += -DCC_HAVE_ASM_GOTO
+endif
+
 include scripts/Makefile.gcc-plugins
 
 ifdef CONFIG_READABLE_ASM
@@ -798,12 +804,6 @@ KBUILD_CFLAGS   += $(call cc-option,-Werror=incompatible-pointer-types)
 # use the deterministic mode of AR if available
 KBUILD_ARFLAGS := $(call ar-option,D)
 
-# check for 'asm goto'
-ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC) $(KBUILD_CFLAGS)), y)
-       KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO
-       KBUILD_AFLAGS += -DCC_HAVE_ASM_GOTO
-endif
-
 include scripts/Makefile.kasan
 include scripts/Makefile.extrawarn
 include scripts/Makefile.ubsan
diff --git a/arch/alpha/include/asm/extable.h b/arch/alpha/include/asm/extable.h
new file mode 100644 (file)
index 0000000..048e209
--- /dev/null
@@ -0,0 +1,55 @@
+#ifndef _ASM_EXTABLE_H
+#define _ASM_EXTABLE_H
+
+/*
+ * About the exception table:
+ *
+ * - insn is a 32-bit pc-relative offset from the faulting insn.
+ * - nextinsn is a 16-bit offset off of the faulting instruction
+ *   (not off of the *next* instruction as branches are).
+ * - errreg is the register in which to place -EFAULT.
+ * - valreg is the final target register for the load sequence
+ *   and will be zeroed.
+ *
+ * Either errreg or valreg may be $31, in which case nothing happens.
+ *
+ * The exception fixup information "just so happens" to be arranged
+ * as in a MEM format instruction.  This lets us emit our three
+ * values like so:
+ *
+ *      lda valreg, nextinsn(errreg)
+ *
+ */
+
+struct exception_table_entry
+{
+       signed int insn;
+       union exception_fixup {
+               unsigned unit;
+               struct {
+                       signed int nextinsn : 16;
+                       unsigned int errreg : 5;
+                       unsigned int valreg : 5;
+               } bits;
+       } fixup;
+};
+
+/* Returns the new pc */
+#define fixup_exception(map_reg, _fixup, pc)                   \
+({                                                             \
+       if ((_fixup)->fixup.bits.valreg != 31)                  \
+               map_reg((_fixup)->fixup.bits.valreg) = 0;       \
+       if ((_fixup)->fixup.bits.errreg != 31)                  \
+               map_reg((_fixup)->fixup.bits.errreg) = -EFAULT; \
+       (pc) + (_fixup)->fixup.bits.nextinsn;                   \
+})
+
+#define ARCH_HAS_RELATIVE_EXTABLE
+
+#define swap_ex_entry_fixup(a, b, tmp, delta)                  \
+       do {                                                    \
+               (a)->fixup.unit = (b)->fixup.unit;              \
+               (b)->fixup.unit = (tmp).fixup.unit;             \
+       } while (0)
+
+#endif
index f939794..fb01dfb 100644 (file)
        "3:     .subsection 2\n"                                \
        "4:     br      1b\n"                                   \
        "       .previous\n"                                    \
-       "       .section __ex_table,\"a\"\n"                    \
-       "       .long   1b-.\n"                                 \
-       "       lda     $31,3b-1b(%1)\n"                        \
-       "       .long   2b-.\n"                                 \
-       "       lda     $31,3b-2b(%1)\n"                        \
-       "       .previous\n"                                    \
+       EXC(1b,3b,%1,$31)                                       \
+       EXC(2b,3b,%1,$31)                                       \
        :       "=&r" (oldval), "=&r"(ret)                      \
        :       "r" (uaddr), "r"(oparg)                         \
        :       "memory")
@@ -101,12 +97,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
        "3:     .subsection 2\n"
        "4:     br      1b\n"
        "       .previous\n"
-       "       .section __ex_table,\"a\"\n"
-       "       .long   1b-.\n"
-       "       lda     $31,3b-1b(%0)\n"
-       "       .long   2b-.\n"
-       "       lda     $31,3b-2b(%0)\n"
-       "       .previous\n"
+       EXC(1b,3b,%0,$31)
+       EXC(2b,3b,%0,$31)
        :       "+r"(ret), "=&r"(prev), "=&r"(cmp)
        :       "r"(uaddr), "r"((long)(int)oldval), "r"(newval)
        :       "memory");
index 94f5875..7b82dc9 100644 (file)
@@ -1,10 +1,6 @@
 #ifndef __ALPHA_UACCESS_H
 #define __ALPHA_UACCESS_H
 
-#include <linux/errno.h>
-#include <linux/sched.h>
-
-
 /*
  * The fs value determines whether argument validity checking should be
  * performed or not.  If get_fs() == USER_DS, checking is performed, with
@@ -20,9 +16,6 @@
 #define KERNEL_DS      ((mm_segment_t) { 0UL })
 #define USER_DS                ((mm_segment_t) { -0x40000000000UL })
 
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
 #define get_fs()  (current_thread_info()->addr_limit)
 #define get_ds()  (KERNEL_DS)
 #define set_fs(x) (current_thread_info()->addr_limit = (x))
  *  - AND "addr+size" doesn't have any high-bits set
  *  - OR we are in kernel mode.
  */
-#define __access_ok(addr, size, segment) \
-       (((segment).seg & (addr | size | (addr+size))) == 0)
+#define __access_ok(addr, size) \
+       ((get_fs().seg & (addr | size | (addr+size))) == 0)
 
-#define access_ok(type, addr, size)                            \
-({                                                             \
-       __chk_user_ptr(addr);                                   \
-       __access_ok(((unsigned long)(addr)), (size), get_fs()); \
+#define access_ok(type, addr, size)                    \
+({                                                     \
+       __chk_user_ptr(addr);                           \
+       __access_ok(((unsigned long)(addr)), (size));   \
 })
 
 /*
@@ -61,9 +54,9 @@
  * (b) require any knowledge of processes at this stage
  */
 #define put_user(x, ptr) \
-  __put_user_check((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), get_fs())
+  __put_user_check((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
 #define get_user(x, ptr) \
-  __get_user_check((x), (ptr), sizeof(*(ptr)), get_fs())
+  __get_user_check((x), (ptr), sizeof(*(ptr)))
 
 /*
  * The "__xxx" versions do not do address space checking, useful when
  * more extensive comments with fixup_inline_exception below for
  * more information.
  */
+#define EXC(label,cont,res,err)                                \
+       ".section __ex_table,\"a\"\n"                   \
+       "       .long "#label"-.\n"                     \
+       "       lda "#res","#cont"-"#label"("#err")\n"  \
+       ".previous\n"
 
 extern void __get_user_unknown(void);
 
@@ -100,23 +98,23 @@ extern void __get_user_unknown(void);
        __gu_err;                                               \
 })
 
-#define __get_user_check(x, ptr, size, segment)                                \
-({                                                                     \
-       long __gu_err = -EFAULT;                                        \
-       unsigned long __gu_val = 0;                                     \
-       const __typeof__(*(ptr)) __user *__gu_addr = (ptr);             \
-       if (__access_ok((unsigned long)__gu_addr, size, segment)) {     \
-               __gu_err = 0;                                           \
-               switch (size) {                                         \
-                 case 1: __get_user_8(__gu_addr); break;               \
-                 case 2: __get_user_16(__gu_addr); break;              \
-                 case 4: __get_user_32(__gu_addr); break;              \
-                 case 8: __get_user_64(__gu_addr); break;              \
-                 default: __get_user_unknown(); break;                 \
-               }                                                       \
-       }                                                               \
-       (x) = (__force __typeof__(*(ptr))) __gu_val;                    \
-       __gu_err;                                                       \
+#define __get_user_check(x, ptr, size)                         \
+({                                                             \
+       long __gu_err = -EFAULT;                                \
+       unsigned long __gu_val = 0;                             \
+       const __typeof__(*(ptr)) __user *__gu_addr = (ptr);     \
+       if (__access_ok((unsigned long)__gu_addr, size)) {      \
+               __gu_err = 0;                                   \
+               switch (size) {                                 \
+                 case 1: __get_user_8(__gu_addr); break;       \
+                 case 2: __get_user_16(__gu_addr); break;      \
+                 case 4: __get_user_32(__gu_addr); break;      \
+                 case 8: __get_user_64(__gu_addr); break;      \
+                 default: __get_user_unknown(); break;         \
+               }                                               \
+       }                                                       \
+       (x) = (__force __typeof__(*(ptr))) __gu_val;            \
+       __gu_err;                                               \
 })
 
 struct __large_struct { unsigned long buf[100]; };
@@ -125,20 +123,14 @@ struct __large_struct { unsigned long buf[100]; };
 #define __get_user_64(addr)                            \
        __asm__("1: ldq %0,%2\n"                        \
        "2:\n"                                          \
-       ".section __ex_table,\"a\"\n"                   \
-       "       .long 1b - .\n"                         \
-       "       lda %0, 2b-1b(%1)\n"                    \
-       ".previous"                                     \
+       EXC(1b,2b,%0,%1)                                \
                : "=r"(__gu_val), "=r"(__gu_err)        \
                : "m"(__m(addr)), "1"(__gu_err))
 
 #define __get_user_32(addr)                            \
        __asm__("1: ldl %0,%2\n"                        \
        "2:\n"                                          \
-       ".section __ex_table,\"a\"\n"                   \
-       "       .long 1b - .\n"                         \
-       "       lda %0, 2b-1b(%1)\n"                    \
-       ".previous"                                     \
+       EXC(1b,2b,%0,%1)                                \
                : "=r"(__gu_val), "=r"(__gu_err)        \
                : "m"(__m(addr)), "1"(__gu_err))
 
@@ -148,20 +140,14 @@ struct __large_struct { unsigned long buf[100]; };
 #define __get_user_16(addr)                            \
        __asm__("1: ldwu %0,%2\n"                       \
        "2:\n"                                          \
-       ".section __ex_table,\"a\"\n"                   \
-       "       .long 1b - .\n"                         \
-       "       lda %0, 2b-1b(%1)\n"                    \
-       ".previous"                                     \
+       EXC(1b,2b,%0,%1)                                \
                : "=r"(__gu_val), "=r"(__gu_err)        \
                : "m"(__m(addr)), "1"(__gu_err))
 
 #define __get_user_8(addr)                             \
        __asm__("1: ldbu %0,%2\n"                       \
        "2:\n"                                          \
-       ".section __ex_table,\"a\"\n"                   \
-       "       .long 1b - .\n"                         \
-       "       lda %0, 2b-1b(%1)\n"                    \
-       ".previous"                                     \
+       EXC(1b,2b,%0,%1)                                \
                : "=r"(__gu_val), "=r"(__gu_err)        \
                : "m"(__m(addr)), "1"(__gu_err))
 #else
@@ -177,12 +163,8 @@ struct __large_struct { unsigned long buf[100]; };
        "       extwh %1,%3,%1\n"                                       \
        "       or %0,%1,%0\n"                                          \
        "3:\n"                                                          \
-       ".section __ex_table,\"a\"\n"                                   \
-       "       .long 1b - .\n"                                         \
-       "       lda %0, 3b-1b(%2)\n"                                    \
-       "       .long 2b - .\n"                                         \
-       "       lda %0, 3b-2b(%2)\n"                                    \
-       ".previous"                                                     \
+       EXC(1b,3b,%0,%2)                                                \
+       EXC(2b,3b,%0,%2)                                                \
                : "=&r"(__gu_val), "=&r"(__gu_tmp), "=r"(__gu_err)      \
                : "r"(addr), "2"(__gu_err));                            \
 }
@@ -191,10 +173,7 @@ struct __large_struct { unsigned long buf[100]; };
        __asm__("1: ldq_u %0,0(%2)\n"                                   \
        "       extbl %0,%2,%0\n"                                       \
        "2:\n"                                                          \
-       ".section __ex_table,\"a\"\n"                                   \
-       "       .long 1b - .\n"                                         \
-       "       lda %0, 2b-1b(%1)\n"                                    \
-       ".previous"                                                     \
+       EXC(1b,2b,%0,%1)                                                \
                : "=&r"(__gu_val), "=r"(__gu_err)                       \
                : "r"(addr), "1"(__gu_err))
 #endif
@@ -215,21 +194,21 @@ extern void __put_user_unknown(void);
        __pu_err;                                               \
 })
 
-#define __put_user_check(x, ptr, size, segment)                                \
-({                                                                     \
-       long __pu_err = -EFAULT;                                        \
-       __typeof__(*(ptr)) __user *__pu_addr = (ptr);                   \
-       if (__access_ok((unsigned long)__pu_addr, size, segment)) {     \
-               __pu_err = 0;                                           \
-               switch (size) {                                         \
-                 case 1: __put_user_8(x, __pu_addr); break;            \
-                 case 2: __put_user_16(x, __pu_addr); break;           \
-                 case 4: __put_user_32(x, __pu_addr); break;           \
-                 case 8: __put_user_64(x, __pu_addr); break;           \
-                 default: __put_user_unknown(); break;                 \
-               }                                                       \
-       }                                                               \
-       __pu_err;                                                       \
+#define __put_user_check(x, ptr, size)                         \
+({                                                             \
+       long __pu_err = -EFAULT;                                \
+       __typeof__(*(ptr)) __user *__pu_addr = (ptr);           \
+       if (__access_ok((unsigned long)__pu_addr, size)) {      \
+               __pu_err = 0;                                   \
+               switch (size) {                                 \
+                 case 1: __put_user_8(x, __pu_addr); break;    \
+                 case 2: __put_user_16(x, __pu_addr); break;   \
+                 case 4: __put_user_32(x, __pu_addr); break;   \
+                 case 8: __put_user_64(x, __pu_addr); break;   \
+                 default: __put_user_unknown(); break;         \
+               }                                               \
+       }                                                       \
+       __pu_err;                                               \
 })
 
 /*
@@ -240,20 +219,14 @@ extern void __put_user_unknown(void);
 #define __put_user_64(x, addr)                                 \
 __asm__ __volatile__("1: stq %r2,%1\n"                         \
        "2:\n"                                                  \
-       ".section __ex_table,\"a\"\n"                           \
-       "       .long 1b - .\n"                                 \
-       "       lda $31,2b-1b(%0)\n"                            \
-       ".previous"                                             \
+       EXC(1b,2b,$31,%0)                                       \
                : "=r"(__pu_err)                                \
                : "m" (__m(addr)), "rJ" (x), "0"(__pu_err))
 
 #define __put_user_32(x, addr)                                 \
 __asm__ __volatile__("1: stl %r2,%1\n"                         \
        "2:\n"                                                  \
-       ".section __ex_table,\"a\"\n"                           \
-       "       .long 1b - .\n"                                 \
-       "       lda $31,2b-1b(%0)\n"                            \
-       ".previous"                                             \
+       EXC(1b,2b,$31,%0)                                       \
                : "=r"(__pu_err)                                \
                : "m"(__m(addr)), "rJ"(x), "0"(__pu_err))
 
@@ -263,20 +236,14 @@ __asm__ __volatile__("1: stl %r2,%1\n"                            \
 #define __put_user_16(x, addr)                                 \
 __asm__ __volatile__("1: stw %r2,%1\n"                         \
        "2:\n"                                                  \
-       ".section __ex_table,\"a\"\n"                           \
-       "       .long 1b - .\n"                                 \
-       "       lda $31,2b-1b(%0)\n"                            \
-       ".previous"                                             \
+       EXC(1b,2b,$31,%0)                                       \
                : "=r"(__pu_err)                                \
                : "m"(__m(addr)), "rJ"(x), "0"(__pu_err))
 
 #define __put_user_8(x, addr)                                  \
 __asm__ __volatile__("1: stb %r2,%1\n"                         \
        "2:\n"                                                  \
-       ".section __ex_table,\"a\"\n"                           \
-       "       .long 1b - .\n"                                 \
-       "       lda $31,2b-1b(%0)\n"                            \
-       ".previous"                                             \
+       EXC(1b,2b,$31,%0)                                       \
                : "=r"(__pu_err)                                \
                : "m"(__m(addr)), "rJ"(x), "0"(__pu_err))
 #else
@@ -298,16 +265,10 @@ __asm__ __volatile__("1: stb %r2,%1\n"                            \
        "3:     stq_u %2,1(%5)\n"                               \
        "4:     stq_u %1,0(%5)\n"                               \
        "5:\n"                                                  \
-       ".section __ex_table,\"a\"\n"                           \
-       "       .long 1b - .\n"                                 \
-       "       lda $31, 5b-1b(%0)\n"                           \
-       "       .long 2b - .\n"                                 \
-       "       lda $31, 5b-2b(%0)\n"                           \
-       "       .long 3b - .\n"                                 \
-       "       lda $31, 5b-3b(%0)\n"                           \
-       "       .long 4b - .\n"                                 \
-       "       lda $31, 5b-4b(%0)\n"                           \
-       ".previous"                                             \
+       EXC(1b,5b,$31,%0)                                       \
+       EXC(2b,5b,$31,%0)                                       \
+       EXC(3b,5b,$31,%0)                                       \
+       EXC(4b,5b,$31,%0)                                       \
                : "=r"(__pu_err), "=&r"(__pu_tmp1),             \
                  "=&r"(__pu_tmp2), "=&r"(__pu_tmp3),           \
                  "=&r"(__pu_tmp4)                              \
@@ -324,12 +285,8 @@ __asm__ __volatile__("1: stb %r2,%1\n"                             \
        "       or %1,%2,%1\n"                                  \
        "2:     stq_u %1,0(%4)\n"                               \
        "3:\n"                                                  \
-       ".section __ex_table,\"a\"\n"                           \
-       "       .long 1b - .\n"                                 \
-       "       lda $31, 3b-1b(%0)\n"                           \
-       "       .long 2b - .\n"                                 \
-       "       lda $31, 3b-2b(%0)\n"                           \
-       ".previous"                                             \
+       EXC(1b,3b,$31,%0)                                       \
+       EXC(2b,3b,$31,%0)                                       \
                : "=r"(__pu_err),                               \
                  "=&r"(__pu_tmp1), "=&r"(__pu_tmp2)            \
                : "r"((unsigned long)(x)), "r"(addr), "0"(__pu_err)); \
@@ -341,153 +298,37 @@ __asm__ __volatile__("1: stb %r2,%1\n"                           \
  * Complex access routines
  */
 
-/* This little bit of silliness is to get the GP loaded for a function
-   that ordinarily wouldn't.  Otherwise we could have it done by the macro
-   directly, which can be optimized the linker.  */
-#ifdef MODULE
-#define __module_address(sym)          "r"(sym),
-#define __module_call(ra, arg, sym)    "jsr $" #ra ",(%" #arg ")," #sym
-#else
-#define __module_address(sym)
-#define __module_call(ra, arg, sym)    "bsr $" #ra "," #sym " !samegp"
-#endif
-
-extern void __copy_user(void);
-
-extern inline long
-__copy_tofrom_user_nocheck(void *to, const void *from, long len)
-{
-       register void * __cu_to __asm__("$6") = to;
-       register const void * __cu_from __asm__("$7") = from;
-       register long __cu_len __asm__("$0") = len;
-
-       __asm__ __volatile__(
-               __module_call(28, 3, __copy_user)
-               : "=r" (__cu_len), "=r" (__cu_from), "=r" (__cu_to)
-               : __module_address(__copy_user)
-                 "0" (__cu_len), "1" (__cu_from), "2" (__cu_to)
-               : "$1", "$2", "$3", "$4", "$5", "$28", "memory");
-
-       return __cu_len;
-}
-
-#define __copy_to_user(to, from, n)                                    \
-({                                                                     \
-       __chk_user_ptr(to);                                             \
-       __copy_tofrom_user_nocheck((__force void *)(to), (from), (n));  \
-})
-#define __copy_from_user(to, from, n)                                  \
-({                                                                     \
-       __chk_user_ptr(from);                                           \
-       __copy_tofrom_user_nocheck((to), (__force void *)(from), (n));  \
-})
-
-#define __copy_to_user_inatomic __copy_to_user
-#define __copy_from_user_inatomic __copy_from_user
+extern long __copy_user(void *to, const void *from, long len);
 
-extern inline long
-copy_to_user(void __user *to, const void *from, long n)
+static inline unsigned long
+raw_copy_from_user(void *to, const void __user *from, unsigned long len)
 {
-       if (likely(__access_ok((unsigned long)to, n, get_fs())))
-               n = __copy_tofrom_user_nocheck((__force void *)to, from, n);
-       return n;
+       return __copy_user(to, (__force const void *)from, len);
 }
 
-extern inline long
-copy_from_user(void *to, const void __user *from, long n)
+static inline unsigned long
+raw_copy_to_user(void __user *to, const void *from, unsigned long len)
 {
-       long res = n;
-       if (likely(__access_ok((unsigned long)from, n, get_fs())))
-               res = __copy_from_user_inatomic(to, from, n);
-       if (unlikely(res))
-               memset(to + (n - res), 0, res);
-       return res;
+       return __copy_user((__force void *)to, from, len);
 }
 
-extern void __do_clear_user(void);
-
-extern inline long
-__clear_user(void __user *to, long len)
-{
-       register void __user * __cl_to __asm__("$6") = to;
-       register long __cl_len __asm__("$0") = len;
-       __asm__ __volatile__(
-               __module_call(28, 2, __do_clear_user)
-               : "=r"(__cl_len), "=r"(__cl_to)
-               : __module_address(__do_clear_user)
-                 "0"(__cl_len), "1"(__cl_to)
-               : "$1", "$2", "$3", "$4", "$5", "$28", "memory");
-       return __cl_len;
-}
+extern long __clear_user(void __user *to, long len);
 
 extern inline long
 clear_user(void __user *to, long len)
 {
-       if (__access_ok((unsigned long)to, len, get_fs()))
+       if (__access_ok((unsigned long)to, len))
                len = __clear_user(to, len);
        return len;
 }
 
-#undef __module_address
-#undef __module_call
-
 #define user_addr_max() \
-        (segment_eq(get_fs(), USER_DS) ? TASK_SIZE : ~0UL)
+        (uaccess_kernel() ? ~0UL : TASK_SIZE)
 
 extern long strncpy_from_user(char *dest, const char __user *src, long count);
 extern __must_check long strlen_user(const char __user *str);
 extern __must_check long strnlen_user(const char __user *str, long n);
 
-/*
- * About the exception table:
- *
- * - insn is a 32-bit pc-relative offset from the faulting insn.
- * - nextinsn is a 16-bit offset off of the faulting instruction
- *   (not off of the *next* instruction as branches are).
- * - errreg is the register in which to place -EFAULT.
- * - valreg is the final target register for the load sequence
- *   and will be zeroed.
- *
- * Either errreg or valreg may be $31, in which case nothing happens.
- *
- * The exception fixup information "just so happens" to be arranged
- * as in a MEM format instruction.  This lets us emit our three
- * values like so:
- *
- *      lda valreg, nextinsn(errreg)
- *
- */
-
-struct exception_table_entry
-{
-       signed int insn;
-       union exception_fixup {
-               unsigned unit;
-               struct {
-                       signed int nextinsn : 16;
-                       unsigned int errreg : 5;
-                       unsigned int valreg : 5;
-               } bits;
-       } fixup;
-};
-
-/* Returns the new pc */
-#define fixup_exception(map_reg, _fixup, pc)                   \
-({                                                             \
-       if ((_fixup)->fixup.bits.valreg != 31)                  \
-               map_reg((_fixup)->fixup.bits.valreg) = 0;       \
-       if ((_fixup)->fixup.bits.errreg != 31)                  \
-               map_reg((_fixup)->fixup.bits.errreg) = -EFAULT; \
-       (pc) + (_fixup)->fixup.bits.nextinsn;                   \
-})
-
-#define ARCH_HAS_RELATIVE_EXTABLE
-
-#define swap_ex_entry_fixup(a, b, tmp, delta)                  \
-       do {                                                    \
-               (a)->fixup.unit = (b)->fixup.unit;              \
-               (b)->fixup.unit = (tmp).fixup.unit;             \
-       } while (0)
-
+#include <asm/extable.h>
 
 #endif /* __ALPHA_UACCESS_H */
index 0b96109..9ec56dc 100644 (file)
@@ -1016,6 +1016,7 @@ SYSCALL_DEFINE2(osf_gettimeofday, struct timeval32 __user *, tv,
 SYSCALL_DEFINE2(osf_settimeofday, struct timeval32 __user *, tv,
                struct timezone __user *, tz)
 {
+       struct timespec64 kts64;
        struct timespec kts;
        struct timezone ktz;
 
@@ -1023,13 +1024,14 @@ SYSCALL_DEFINE2(osf_settimeofday, struct timeval32 __user *, tv,
                if (get_tv32((struct timeval *)&kts, tv))
                        return -EFAULT;
                kts.tv_nsec *= 1000;
+               kts64 = timespec_to_timespec64(kts);
        }
        if (tz) {
                if (copy_from_user(&ktz, tz, sizeof(*tz)))
                        return -EFAULT;
        }
 
-       return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
+       return do_sys_settimeofday64(tv ? &kts64 : NULL, tz ? &ktz : NULL);
 }
 
 asmlinkage long sys_ni_posix_timers(void);
@@ -1290,7 +1292,7 @@ SYSCALL_DEFINE1(old_adjtimex, struct timex32 __user *, txc_p)
        /* copy relevant bits of struct timex. */
        if (copy_from_user(&txc, txc_p, offsetof(struct timex32, time)) ||
            copy_from_user(&txc.tick, &txc_p->tick, sizeof(struct timex32) - 
-                          offsetof(struct timex32, time)))
+                          offsetof(struct timex32, tick)))
          return -EFAULT;
 
        ret = do_adjtimex(&txc);        
index b137390..65bb102 100644 (file)
@@ -482,12 +482,8 @@ do_entUna(void * va, unsigned long opcode, unsigned long reg,
                "       extwl %1,%3,%1\n"
                "       extwh %2,%3,%2\n"
                "3:\n"
-               ".section __ex_table,\"a\"\n"
-               "       .long 1b - .\n"
-               "       lda %1,3b-1b(%0)\n"
-               "       .long 2b - .\n"
-               "       lda %2,3b-2b(%0)\n"
-               ".previous"
+               EXC(1b,3b,%1,%0)
+               EXC(2b,3b,%2,%0)
                        : "=r"(error), "=&r"(tmp1), "=&r"(tmp2)
                        : "r"(va), "0"(0));
                if (error)
@@ -502,12 +498,8 @@ do_entUna(void * va, unsigned long opcode, unsigned long reg,
                "       extll %1,%3,%1\n"
                "       extlh %2,%3,%2\n"
                "3:\n"
-               ".section __ex_table,\"a\"\n"
-               "       .long 1b - .\n"
-               "       lda %1,3b-1b(%0)\n"
-               "       .long 2b - .\n"
-               "       lda %2,3b-2b(%0)\n"
-               ".previous"
+               EXC(1b,3b,%1,%0)
+               EXC(2b,3b,%2,%0)
                        : "=r"(error), "=&r"(tmp1), "=&r"(tmp2)
                        : "r"(va), "0"(0));
                if (error)
@@ -522,12 +514,8 @@ do_entUna(void * va, unsigned long opcode, unsigned long reg,
                "       extql %1,%3,%1\n"
                "       extqh %2,%3,%2\n"
                "3:\n"
-               ".section __ex_table,\"a\"\n"
-               "       .long 1b - .\n"
-               "       lda %1,3b-1b(%0)\n"
-               "       .long 2b - .\n"
-               "       lda %2,3b-2b(%0)\n"
-               ".previous"
+               EXC(1b,3b,%1,%0)
+               EXC(2b,3b,%2,%0)
                        : "=r"(error), "=&r"(tmp1), "=&r"(tmp2)
                        : "r"(va), "0"(0));
                if (error)
@@ -551,16 +539,10 @@ do_entUna(void * va, unsigned long opcode, unsigned long reg,
                "3:     stq_u %2,1(%5)\n"
                "4:     stq_u %1,0(%5)\n"
                "5:\n"
-               ".section __ex_table,\"a\"\n"
-               "       .long 1b - .\n"
-               "       lda %2,5b-1b(%0)\n"
-               "       .long 2b - .\n"
-               "       lda %1,5b-2b(%0)\n"
-               "       .long 3b - .\n"
-               "       lda $31,5b-3b(%0)\n"
-               "       .long 4b - .\n"
-               "       lda $31,5b-4b(%0)\n"
-               ".previous"
+               EXC(1b,5b,%2,%0)
+               EXC(2b,5b,%1,%0)
+               EXC(3b,5b,$31,%0)
+               EXC(4b,5b,$31,%0)
                        : "=r"(error), "=&r"(tmp1), "=&r"(tmp2),
                          "=&r"(tmp3), "=&r"(tmp4)
                        : "r"(va), "r"(una_reg(reg)), "0"(0));
@@ -581,16 +563,10 @@ do_entUna(void * va, unsigned long opcode, unsigned long reg,
                "3:     stq_u %2,3(%5)\n"
                "4:     stq_u %1,0(%5)\n"
                "5:\n"
-               ".section __ex_table,\"a\"\n"
-               "       .long 1b - .\n"
-               "       lda %2,5b-1b(%0)\n"
-               "       .long 2b - .\n"
-               "       lda %1,5b-2b(%0)\n"
-               "       .long 3b - .\n"
-               "       lda $31,5b-3b(%0)\n"
-               "       .long 4b - .\n"
-               "       lda $31,5b-4b(%0)\n"
-               ".previous"
+               EXC(1b,5b,%2,%0)
+               EXC(2b,5b,%1,%0)
+               EXC(3b,5b,$31,%0)
+               EXC(4b,5b,$31,%0)
                        : "=r"(error), "=&r"(tmp1), "=&r"(tmp2),
                          "=&r"(tmp3), "=&r"(tmp4)
                        : "r"(va), "r"(una_reg(reg)), "0"(0));
@@ -611,16 +587,10 @@ do_entUna(void * va, unsigned long opcode, unsigned long reg,
                "3:     stq_u %2,7(%5)\n"
                "4:     stq_u %1,0(%5)\n"
                "5:\n"
-               ".section __ex_table,\"a\"\n\t"
-               "       .long 1b - .\n"
-               "       lda %2,5b-1b(%0)\n"
-               "       .long 2b - .\n"
-               "       lda %1,5b-2b(%0)\n"
-               "       .long 3b - .\n"
-               "       lda $31,5b-3b(%0)\n"
-               "       .long 4b - .\n"
-               "       lda $31,5b-4b(%0)\n"
-               ".previous"
+               EXC(1b,5b,%2,%0)
+               EXC(2b,5b,%1,%0)
+               EXC(3b,5b,$31,%0)
+               EXC(4b,5b,$31,%0)
                        : "=r"(error), "=&r"(tmp1), "=&r"(tmp2),
                          "=&r"(tmp3), "=&r"(tmp4)
                        : "r"(va), "r"(una_reg(reg)), "0"(0));
@@ -802,7 +772,7 @@ do_entUnaUser(void __user * va, unsigned long opcode,
        /* Don't bother reading ds in the access check since we already
           know that this came from the user.  Also rely on the fact that
           the page at TASK_SIZE is unmapped and so can't be touched anyway. */
-       if (!__access_ok((unsigned long)va, 0, USER_DS))
+       if ((unsigned long)va >= TASK_SIZE)
                goto give_sigsegv;
 
        ++unaligned[1].count;
@@ -835,12 +805,8 @@ do_entUnaUser(void __user * va, unsigned long opcode,
                "       extwl %1,%3,%1\n"
                "       extwh %2,%3,%2\n"
                "3:\n"
-               ".section __ex_table,\"a\"\n"
-               "       .long 1b - .\n"
-               "       lda %1,3b-1b(%0)\n"
-               "       .long 2b - .\n"
-               "       lda %2,3b-2b(%0)\n"
-               ".previous"
+               EXC(1b,3b,%1,%0)
+               EXC(2b,3b,%2,%0)
                        : "=r"(error), "=&r"(tmp1), "=&r"(tmp2)
                        : "r"(va), "0"(0));
                if (error)
@@ -855,12 +821,8 @@ do_entUnaUser(void __user * va, unsigned long opcode,
                "       extll %1,%3,%1\n"
                "       extlh %2,%3,%2\n"
                "3:\n"
-               ".section __ex_table,\"a\"\n"
-               "       .long 1b - .\n"
-               "       lda %1,3b-1b(%0)\n"
-               "       .long 2b - .\n"
-               "       lda %2,3b-2b(%0)\n"
-               ".previous"
+               EXC(1b,3b,%1,%0)
+               EXC(2b,3b,%2,%0)
                        : "=r"(error), "=&r"(tmp1), "=&r"(tmp2)
                        : "r"(va), "0"(0));
                if (error)
@@ -875,12 +837,8 @@ do_entUnaUser(void __user * va, unsigned long opcode,
                "       extql %1,%3,%1\n"
                "       extqh %2,%3,%2\n"
                "3:\n"
-               ".section __ex_table,\"a\"\n"
-               "       .long 1b - .\n"
-               "       lda %1,3b-1b(%0)\n"
-               "       .long 2b - .\n"
-               "       lda %2,3b-2b(%0)\n"
-               ".previous"
+               EXC(1b,3b,%1,%0)
+               EXC(2b,3b,%2,%0)
                        : "=r"(error), "=&r"(tmp1), "=&r"(tmp2)
                        : "r"(va), "0"(0));
                if (error)
@@ -895,12 +853,8 @@ do_entUnaUser(void __user * va, unsigned long opcode,
                "       extll %1,%3,%1\n"
                "       extlh %2,%3,%2\n"
                "3:\n"
-               ".section __ex_table,\"a\"\n"
-               "       .long 1b - .\n"
-               "       lda %1,3b-1b(%0)\n"
-               "       .long 2b - .\n"
-               "       lda %2,3b-2b(%0)\n"
-               ".previous"
+               EXC(1b,3b,%1,%0)
+               EXC(2b,3b,%2,%0)
                        : "=r"(error), "=&r"(tmp1), "=&r"(tmp2)
                        : "r"(va), "0"(0));
                if (error)
@@ -915,12 +869,8 @@ do_entUnaUser(void __user * va, unsigned long opcode,
                "       extql %1,%3,%1\n"
                "       extqh %2,%3,%2\n"
                "3:\n"
-               ".section __ex_table,\"a\"\n"
-               "       .long 1b - .\n"
-               "       lda %1,3b-1b(%0)\n"
-               "       .long 2b - .\n"
-               "       lda %2,3b-2b(%0)\n"
-               ".previous"
+               EXC(1b,3b,%1,%0)
+               EXC(2b,3b,%2,%0)
                        : "=r"(error), "=&r"(tmp1), "=&r"(tmp2)
                        : "r"(va), "0"(0));
                if (error)
@@ -944,16 +894,10 @@ do_entUnaUser(void __user * va, unsigned long opcode,
                "3:     stq_u %2,1(%5)\n"
                "4:     stq_u %1,0(%5)\n"
                "5:\n"
-               ".section __ex_table,\"a\"\n"
-               "       .long 1b - .\n"
-               "       lda %2,5b-1b(%0)\n"
-               "       .long 2b - .\n"
-               "       lda %1,5b-2b(%0)\n"
-               "       .long 3b - .\n"
-               "       lda $31,5b-3b(%0)\n"
-               "       .long 4b - .\n"
-               "       lda $31,5b-4b(%0)\n"
-               ".previous"
+               EXC(1b,5b,%2,%0)
+               EXC(2b,5b,%1,%0)
+               EXC(3b,5b,$31,%0)
+               EXC(4b,5b,$31,%0)
                        : "=r"(error), "=&r"(tmp1), "=&r"(tmp2),
                          "=&r"(tmp3), "=&r"(tmp4)
                        : "r"(va), "r"(*reg_addr), "0"(0));
@@ -978,16 +922,10 @@ do_entUnaUser(void __user * va, unsigned long opcode,
                "3:     stq_u %2,3(%5)\n"
                "4:     stq_u %1,0(%5)\n"
                "5:\n"
-               ".section __ex_table,\"a\"\n"
-               "       .long 1b - .\n"
-               "       lda %2,5b-1b(%0)\n"
-               "       .long 2b - .\n"
-               "       lda %1,5b-2b(%0)\n"
-               "       .long 3b - .\n"
-               "       lda $31,5b-3b(%0)\n"
-               "       .long 4b - .\n"
-               "       lda $31,5b-4b(%0)\n"
-               ".previous"
+               EXC(1b,5b,%2,%0)
+               EXC(2b,5b,%1,%0)
+               EXC(3b,5b,$31,%0)
+               EXC(4b,5b,$31,%0)
                        : "=r"(error), "=&r"(tmp1), "=&r"(tmp2),
                          "=&r"(tmp3), "=&r"(tmp4)
                        : "r"(va), "r"(*reg_addr), "0"(0));
@@ -1012,16 +950,10 @@ do_entUnaUser(void __user * va, unsigned long opcode,
                "3:     stq_u %2,7(%5)\n"
                "4:     stq_u %1,0(%5)\n"
                "5:\n"
-               ".section __ex_table,\"a\"\n\t"
-               "       .long 1b - .\n"
-               "       lda %2,5b-1b(%0)\n"
-               "       .long 2b - .\n"
-               "       lda %1,5b-2b(%0)\n"
-               "       .long 3b - .\n"
-               "       lda $31,5b-3b(%0)\n"
-               "       .long 4b - .\n"
-               "       lda $31,5b-4b(%0)\n"
-               ".previous"
+               EXC(1b,5b,%2,%0)
+               EXC(2b,5b,%1,%0)
+               EXC(3b,5b,$31,%0)
+               EXC(4b,5b,$31,%0)
                        : "=r"(error), "=&r"(tmp1), "=&r"(tmp2),
                          "=&r"(tmp3), "=&r"(tmp4)
                        : "r"(va), "r"(*reg_addr), "0"(0));
@@ -1047,7 +979,7 @@ give_sigsegv:
        /* We need to replicate some of the logic in mm/fault.c,
           since we don't have access to the fault code in the
           exception handling return path.  */
-       if (!__access_ok((unsigned long)va, 0, USER_DS))
+       if ((unsigned long)va >= TASK_SIZE)
                info.si_code = SEGV_ACCERR;
        else {
                struct mm_struct *mm = current->mm;
index bf5b931..006f469 100644 (file)
@@ -8,21 +8,6 @@
  * right "bytes left to zero" value (and that it is updated only _after_
  * a successful copy).  There is also some rather minor exception setup
  * stuff.
- *
- * NOTE! This is not directly C-callable, because the calling semantics
- * are different:
- *
- * Inputs:
- *     length in $0
- *     destination address in $6
- *     exception pointer in $7
- *     return address in $28 (exceptions expect it there)
- *
- * Outputs:
- *     bytes left to copy in $0
- *
- * Clobbers:
- *     $1,$2,$3,$4,$5,$6
  */
 #include <asm/export.h>
 
        .set noreorder
        .align 4
 
-       .globl __do_clear_user
-       .ent __do_clear_user
-       .frame  $30, 0, $28
+       .globl __clear_user
+       .ent __clear_user
+       .frame  $30, 0, $26
        .prologue 0
 
 $loop:
        and     $1, 3, $4       # e0    :
        beq     $4, 1f          # .. e1 :
 
-0:     EX( stq_u $31, 0($6) )  # e0    : zero one word
+0:     EX( stq_u $31, 0($16) ) # e0    : zero one word
        subq    $0, 8, $0       # .. e1 :
        subq    $4, 1, $4       # e0    :
-       addq    $6, 8, $6       # .. e1 :
+       addq    $16, 8, $16     # .. e1 :
        bne     $4, 0b          # e1    :
        unop                    #       :
 
 1:     bic     $1, 3, $1       # e0    :
        beq     $1, $tail       # .. e1 :
 
-2:     EX( stq_u $31, 0($6) )  # e0    : zero four words
+2:     EX( stq_u $31, 0($16) ) # e0    : zero four words
        subq    $0, 8, $0       # .. e1 :
-       EX( stq_u $31, 8($6) )  # e0    :
+       EX( stq_u $31, 8($16) ) # e0    :
        subq    $0, 8, $0       # .. e1 :
-       EX( stq_u $31, 16($6) ) # e0    :
+       EX( stq_u $31, 16($16) )        # e0    :
        subq    $0, 8, $0       # .. e1 :
-       EX( stq_u $31, 24($6) ) # e0    :
+       EX( stq_u $31, 24($16) )        # e0    :
        subq    $0, 8, $0       # .. e1 :
        subq    $1, 4, $1       # e0    :
-       addq    $6, 32, $6      # .. e1 :
+       addq    $16, 32, $16    # .. e1 :
        bne     $1, 2b          # e1    :
 
 $tail:
        bne     $2, 1f          # e1    : is there a tail to do?
-       ret     $31, ($28), 1   # .. e1 :
+       ret     $31, ($26), 1   # .. e1 :
 
-1:     EX( ldq_u $5, 0($6) )   # e0    :
+1:     EX( ldq_u $5, 0($16) )  # e0    :
        clr     $0              # .. e1 :
        nop                     # e1    :
        mskqh   $5, $0, $5      # e0    :
-       EX( stq_u $5, 0($6) )   # e0    :
-       ret     $31, ($28), 1   # .. e1 :
+       EX( stq_u $5, 0($16) )  # e0    :
+       ret     $31, ($26), 1   # .. e1 :
 
-__do_clear_user:
-       and     $6, 7, $4       # e0    : find dest misalignment
+__clear_user:
+       and     $17, $17, $0
+       and     $16, 7, $4      # e0    : find dest misalignment
        beq     $0, $zerolength # .. e1 :
        addq    $0, $4, $1      # e0    : bias counter
        and     $1, 7, $2       # e1    : number of bytes in tail
        srl     $1, 3, $1       # e0    :
        beq     $4, $loop       # .. e1 :
 
-       EX( ldq_u $5, 0($6) )   # e0    : load dst word to mask back in
+       EX( ldq_u $5, 0($16) )  # e0    : load dst word to mask back in
        beq     $1, $oneword    # .. e1 : sub-word store?
 
-       mskql   $5, $6, $5      # e0    : take care of misaligned head
-       addq    $6, 8, $6       # .. e1 :
-       EX( stq_u $5, -8($6) )  # e0    :
+       mskql   $5, $16, $5     # e0    : take care of misaligned head
+       addq    $16, 8, $16     # .. e1 :
+       EX( stq_u $5, -8($16) ) # e0    :
        addq    $0, $4, $0      # .. e1 : bytes left -= 8 - misalignment
        subq    $1, 1, $1       # e0    :
        subq    $0, 8, $0       # .. e1 :
@@ -101,15 +87,15 @@ __do_clear_user:
        unop                    #       :
 
 $oneword:
-       mskql   $5, $6, $4      # e0    :
+       mskql   $5, $16, $4     # e0    :
        mskqh   $5, $2, $5      # e0    :
        or      $5, $4, $5      # e1    :
-       EX( stq_u $5, 0($6) )   # e0    :
+       EX( stq_u $5, 0($16) )  # e0    :
        clr     $0              # .. e1 :
 
 $zerolength:
 $exception:
-       ret     $31, ($28), 1   # .. e1 :
+       ret     $31, ($26), 1   # .. e1 :
 
-       .end __do_clear_user
-       EXPORT_SYMBOL(__do_clear_user)
+       .end __clear_user
+       EXPORT_SYMBOL(__clear_user)
index 509f62b..159f1b7 100644 (file)
@@ -9,21 +9,6 @@
  * contains the right "bytes left to copy" value (and that it is updated
  * only _after_ a successful copy). There is also some rather minor
  * exception setup stuff..
- *
- * NOTE! This is not directly C-callable, because the calling semantics are
- * different:
- *
- * Inputs:
- *     length in $0
- *     destination address in $6
- *     source address in $7
- *     return address in $28
- *
- * Outputs:
- *     bytes left to copy in $0
- *
- * Clobbers:
- *     $1,$2,$3,$4,$5,$6,$7
  */
 
 #include <asm/export.h>
        .ent __copy_user
 __copy_user:
        .prologue 0
-       and $6,7,$3
+       and $18,$18,$0
+       and $16,7,$3
        beq $0,$35
        beq $3,$36
        subq $3,8,$3
        .align 4
 $37:
-       EXI( ldq_u $1,0($7) )
-       EXO( ldq_u $2,0($6) )
-       extbl $1,$7,$1
-       mskbl $2,$6,$2
-       insbl $1,$6,$1
+       EXI( ldq_u $1,0($17) )
+       EXO( ldq_u $2,0($16) )
+       extbl $1,$17,$1
+       mskbl $2,$16,$2
+       insbl $1,$16,$1
        addq $3,1,$3
        bis $1,$2,$1
-       EXO( stq_u $1,0($6) )
+       EXO( stq_u $1,0($16) )
        subq $0,1,$0
-       addq $6,1,$6
-       addq $7,1,$7
+       addq $16,1,$16
+       addq $17,1,$17
        beq $0,$41
        bne $3,$37
 $36:
-       and $7,7,$1
+       and $17,7,$1
        bic $0,7,$4
        beq $1,$43
        beq $4,$48
-       EXI( ldq_u $3,0($7) )
+       EXI( ldq_u $3,0($17) )
        .align 4
 $50:
-       EXI( ldq_u $2,8($7) )
+       EXI( ldq_u $2,8($17) )
        subq $4,8,$4
-       extql $3,$7,$3
-       extqh $2,$7,$1
+       extql $3,$17,$3
+       extqh $2,$17,$1
        bis $3,$1,$1
-       EXO( stq $1,0($6) )
-       addq $7,8,$7
+       EXO( stq $1,0($16) )
+       addq $17,8,$17
        subq $0,8,$0
-       addq $6,8,$6
+       addq $16,8,$16
        bis $2,$2,$3
        bne $4,$50
 $48:
        beq $0,$41
        .align 4
 $57:
-       EXI( ldq_u $1,0($7) )
-       EXO( ldq_u $2,0($6) )
-       extbl $1,$7,$1
-       mskbl $2,$6,$2
-       insbl $1,$6,$1
+       EXI( ldq_u $1,0($17) )
+       EXO( ldq_u $2,0($16) )
+       extbl $1,$17,$1
+       mskbl $2,$16,$2
+       insbl $1,$16,$1
        bis $1,$2,$1
-       EXO( stq_u $1,0($6) )
+       EXO( stq_u $1,0($16) )
        subq $0,1,$0
-       addq $6,1,$6
-       addq $7,1,$7
+       addq $16,1,$16
+       addq $17,1,$17
        bne $0,$57
        br $31,$41
        .align 4
@@ -108,27 +94,27 @@ $43:
        beq $4,$65
        .align 4
 $66:
-       EXI( ldq $1,0($7) )
+       EXI( ldq $1,0($17) )
        subq $4,8,$4
-       EXO( stq $1,0($6) )
-       addq $7,8,$7
+       EXO( stq $1,0($16) )
+       addq $17,8,$17
        subq $0,8,$0
-       addq $6,8,$6
+       addq $16,8,$16
        bne $4,$66
 $65:
        beq $0,$41
-       EXI( ldq $2,0($7) )
-       EXO( ldq $1,0($6) )
+       EXI( ldq $2,0($17) )
+       EXO( ldq $1,0($16) )
        mskql $2,$0,$2
        mskqh $1,$0,$1
        bis $2,$1,$2
-       EXO( stq $2,0($6) )
+       EXO( stq $2,0($16) )
        bis $31,$31,$0
 $41:
 $35:
 $exitin:
 $exitout:
-       ret $31,($28),1
+       ret $31,($26),1
 
        .end __copy_user
 EXPORT_SYMBOL(__copy_user)
index 5dfb797..ab42afb 100644 (file)
@@ -45,10 +45,7 @@ __asm__ __volatile__("insqh %1,%2,%0":"=r" (z):"r" (x),"r" (y))
        __asm__ __volatile__(                           \
        "1:     ldq_u %0,%2\n"                          \
        "2:\n"                                          \
-       ".section __ex_table,\"a\"\n"                   \
-       "       .long 1b - .\n"                         \
-       "       lda %0,2b-1b(%1)\n"                     \
-       ".previous"                                     \
+       EXC(1b,2b,%0,%1)                                \
                : "=r"(x), "=r"(__guu_err)              \
                : "m"(__m(ptr)), "1"(0));               \
        __guu_err;                                      \
@@ -60,10 +57,7 @@ __asm__ __volatile__("insqh %1,%2,%0":"=r" (z):"r" (x),"r" (y))
        __asm__ __volatile__(                           \
        "1:     stq_u %2,%1\n"                          \
        "2:\n"                                          \
-       ".section __ex_table,\"a\"\n"                   \
-       "       .long 1b - ."                           \
-       "       lda $31,2b-1b(%0)\n"                    \
-       ".previous"                                     \
+       EXC(1b,2b,$31,%0)                               \
                : "=r"(__puu_err)                       \
                : "m"(__m(addr)), "rJ"(x), "0"(0));     \
        __puu_err;                                      \
index 05bef6b..e179e47 100644 (file)
@@ -9,21 +9,6 @@
  * a successful copy).  There is also some rather minor exception setup
  * stuff.
  *
- * NOTE! This is not directly C-callable, because the calling semantics
- * are different:
- *
- * Inputs:
- *     length in $0
- *     destination address in $6
- *     exception pointer in $7
- *     return address in $28 (exceptions expect it there)
- *
- * Outputs:
- *     bytes left to copy in $0
- *
- * Clobbers:
- *     $1,$2,$3,$4,$5,$6
- *
  * Much of the information about 21264 scheduling/coding comes from:
  *     Compiler Writer's Guide for the Alpha 21264
  *     abbreviated as 'CWG' in other comments here
        .set noreorder
        .align 4
 
-       .globl __do_clear_user
-       .ent __do_clear_user
-       .frame  $30, 0, $28
+       .globl __clear_user
+       .ent __clear_user
+       .frame  $30, 0, $26
        .prologue 0
 
                                # Pipeline info : Slotting & Comments
-__do_clear_user:
-       and     $6, 7, $4       # .. E  .. ..   : find dest head misalignment
+__clear_user:
+       and     $17, $17, $0
+       and     $16, 7, $4      # .. E  .. ..   : find dest head misalignment
        beq     $0, $zerolength # U  .. .. ..   :  U L U L
 
        addq    $0, $4, $1      # .. .. .. E    : bias counter
@@ -75,14 +61,14 @@ __do_clear_user:
 
 /*
  * Head is not aligned.  Write (8 - $4) bytes to head of destination
- * This means $6 is known to be misaligned
+ * This means $16 is known to be misaligned
  */
-       EX( ldq_u $5, 0($6) )   # .. .. .. L    : load dst word to mask back in
+       EX( ldq_u $5, 0($16) )  # .. .. .. L    : load dst word to mask back in
        beq     $1, $onebyte    # .. .. U  ..   : sub-word store?
-       mskql   $5, $6, $5      # .. U  .. ..   : take care of misaligned head
-       addq    $6, 8, $6       # E  .. .. ..   : L U U L
+       mskql   $5, $16, $5     # .. U  .. ..   : take care of misaligned head
+       addq    $16, 8, $16     # E  .. .. ..   : L U U L
 
-       EX( stq_u $5, -8($6) )  # .. .. .. L    :
+       EX( stq_u $5, -8($16) ) # .. .. .. L    :
        subq    $1, 1, $1       # .. .. E  ..   :
        addq    $0, $4, $0      # .. E  .. ..   : bytes left -= 8 - misalignment
        subq    $0, 8, $0       # E  .. .. ..   : U L U L
@@ -93,11 +79,11 @@ __do_clear_user:
  * values upon initial entry to the loop
  * $1 is number of quadwords to clear (zero is a valid value)
  * $2 is number of trailing bytes (0..7) ($2 never used...)
- * $6 is known to be aligned 0mod8
+ * $16 is known to be aligned 0mod8
  */
 $headalign:
        subq    $1, 16, $4      # .. .. .. E    : If < 16, we can not use the huge loop
-       and     $6, 0x3f, $2    # .. .. E  ..   : Forward work for huge loop
+       and     $16, 0x3f, $2   # .. .. E  ..   : Forward work for huge loop
        subq    $2, 0x40, $3    # .. E  .. ..   : bias counter (huge loop)
        blt     $4, $trailquad  # U  .. .. ..   : U L U L
 
@@ -114,21 +100,21 @@ $headalign:
        beq     $3, $bigalign   # U  .. .. ..   : U L U L : Aligned 0mod64
 
 $alignmod64:
-       EX( stq_u $31, 0($6) )  # .. .. .. L
+       EX( stq_u $31, 0($16) ) # .. .. .. L
        addq    $3, 8, $3       # .. .. E  ..
        subq    $0, 8, $0       # .. E  .. ..
        nop                     # E  .. .. ..   : U L U L
 
        nop                     # .. .. .. E
        subq    $1, 1, $1       # .. .. E  ..
-       addq    $6, 8, $6       # .. E  .. ..
+       addq    $16, 8, $16     # .. E  .. ..
        blt     $3, $alignmod64 # U  .. .. ..   : U L U L
 
 $bigalign:
 /*
  * $0 is the number of bytes left
  * $1 is the number of quads left
- * $6 is aligned 0mod64
+ * $16 is aligned 0mod64
  * we know that we'll be taking a minimum of one trip through
  * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
  * We are _not_ going to update $0 after every single store.  That
@@ -145,39 +131,39 @@ $bigalign:
        nop                     # E :
        nop                     # E :
        nop                     # E :
-       bis     $6,$6,$3        # E : U L U L : Initial wh64 address is dest
+       bis     $16,$16,$3      # E : U L U L : Initial wh64 address is dest
        /* This might actually help for the current trip... */
 
 $do_wh64:
        wh64    ($3)            # .. .. .. L1   : memory subsystem hint
        subq    $1, 16, $4      # .. .. E  ..   : Forward calculation - repeat the loop?
-       EX( stq_u $31, 0($6) )  # .. L  .. ..
+       EX( stq_u $31, 0($16) ) # .. L  .. ..
        subq    $0, 8, $0       # E  .. .. ..   : U L U L
 
-       addq    $6, 128, $3     # E : Target address of wh64
-       EX( stq_u $31, 8($6) )  # L :
-       EX( stq_u $31, 16($6) ) # L :
+       addq    $16, 128, $3    # E : Target address of wh64
+       EX( stq_u $31, 8($16) ) # L :
+       EX( stq_u $31, 16($16) )        # L :
        subq    $0, 16, $0      # E : U L L U
 
        nop                     # E :
-       EX( stq_u $31, 24($6) ) # L :
-       EX( stq_u $31, 32($6) ) # L :
+       EX( stq_u $31, 24($16) )        # L :
+       EX( stq_u $31, 32($16) )        # L :
        subq    $0, 168, $5     # E : U L L U : two trips through the loop left?
        /* 168 = 192 - 24, since we've already completed some stores */
 
        subq    $0, 16, $0      # E :
-       EX( stq_u $31, 40($6) ) # L :
-       EX( stq_u $31, 48($6) ) # L :
-       cmovlt  $5, $6, $3      # E : U L L U : Latency 2, extra mapping cycle
+       EX( stq_u $31, 40($16) )        # L :
+       EX( stq_u $31, 48($16) )        # L :
+       cmovlt  $5, $16, $3     # E : U L L U : Latency 2, extra mapping cycle
 
        subq    $1, 8, $1       # E :
        subq    $0, 16, $0      # E :
-       EX( stq_u $31, 56($6) ) # L :
+       EX( stq_u $31, 56($16) )        # L :
        nop                     # E : U L U L
 
        nop                     # E :
        subq    $0, 8, $0       # E :
-       addq    $6, 64, $6      # E :
+       addq    $16, 64, $16    # E :
        bge     $4, $do_wh64    # U : U L U L
 
 $trailquad:
@@ -190,14 +176,14 @@ $trailquad:
        beq     $1, $trailbytes # U  .. .. ..   : U L U L : Only 0..7 bytes to go
 
 $onequad:
-       EX( stq_u $31, 0($6) )  # .. .. .. L
+       EX( stq_u $31, 0($16) ) # .. .. .. L
        subq    $1, 1, $1       # .. .. E  ..
        subq    $0, 8, $0       # .. E  .. ..
        nop                     # E  .. .. ..   : U L U L
 
        nop                     # .. .. .. E
        nop                     # .. .. E  ..
-       addq    $6, 8, $6       # .. E  .. ..
+       addq    $16, 8, $16     # .. E  .. ..
        bgt     $1, $onequad    # U  .. .. ..   : U L U L
 
        # We have an unknown number of bytes left to go.
@@ -211,9 +197,9 @@ $trailbytes:
        # so we will use $0 as the loop counter
        # We know for a fact that $0 > 0 zero due to previous context
 $onebyte:
-       EX( stb $31, 0($6) )    # .. .. .. L
+       EX( stb $31, 0($16) )   # .. .. .. L
        subq    $0, 1, $0       # .. .. E  ..   :
-       addq    $6, 1, $6       # .. E  .. ..   :
+       addq    $16, 1, $16     # .. E  .. ..   :
        bgt     $0, $onebyte    # U  .. .. ..   : U L U L
 
 $zerolength:
@@ -221,6 +207,6 @@ $exception:                 # Destination for exception recovery(?)
        nop                     # .. .. .. E    :
        nop                     # .. .. E  ..   :
        nop                     # .. E  .. ..   :
-       ret     $31, ($28), 1   # L0 .. .. ..   : L U L U
-       .end __do_clear_user
-       EXPORT_SYMBOL(__do_clear_user)
+       ret     $31, ($26), 1   # L0 .. .. ..   : L U L U
+       .end __clear_user
+       EXPORT_SYMBOL(__clear_user)
index be720b5..35e6710 100644 (file)
  * only _after_ a successful copy). There is also some rather minor
  * exception setup stuff..
  *
- * NOTE! This is not directly C-callable, because the calling semantics are
- * different:
- *
- * Inputs:
- *     length in $0
- *     destination address in $6
- *     source address in $7
- *     return address in $28
- *
- * Outputs:
- *     bytes left to copy in $0
- *
- * Clobbers:
- *     $1,$2,$3,$4,$5,$6,$7
- *
  * Much of the information about 21264 scheduling/coding comes from:
  *     Compiler Writer's Guide for the Alpha 21264
  *     abbreviated as 'CWG' in other comments here
                                # Pipeline info: Slotting & Comments
 __copy_user:
        .prologue 0
-       subq $0, 32, $1         # .. E  .. ..   : Is this going to be a small copy?
+       andq $18, $18, $0
+       subq $18, 32, $1        # .. E  .. ..   : Is this going to be a small copy?
        beq $0, $zerolength     # U  .. .. ..   : U L U L
 
-       and $6,7,$3             # .. .. .. E    : is leading dest misalignment
+       and $16,7,$3            # .. .. .. E    : is leading dest misalignment
        ble $1, $onebyteloop    # .. .. U  ..   : 1st branch : small amount of data
        beq $3, $destaligned    # .. U  .. ..   : 2nd (one cycle fetcher stall)
        subq $3, 8, $3          # E  .. .. ..   : L U U L : trip counter
@@ -73,17 +59,17 @@ __copy_user:
  * We know we have at least one trip through this loop
  */
 $aligndest:
-       EXI( ldbu $1,0($7) )    # .. .. .. L    : Keep loads separate from stores
-       addq $6,1,$6            # .. .. E  ..   : Section 3.8 in the CWG
+       EXI( ldbu $1,0($17) )   # .. .. .. L    : Keep loads separate from stores
+       addq $16,1,$16          # .. .. E  ..   : Section 3.8 in the CWG
        addq $3,1,$3            # .. E  .. ..   :
        nop                     # E  .. .. ..   : U L U L
 
 /*
- * the -1 is to compensate for the inc($6) done in a previous quadpack
+ * the -1 is to compensate for the inc($16) done in a previous quadpack
  * which allows us zero dependencies within either quadpack in the loop
  */
-       EXO( stb $1,-1($6) )    # .. .. .. L    :
-       addq $7,1,$7            # .. .. E  ..   : Section 3.8 in the CWG
+       EXO( stb $1,-1($16) )   # .. .. .. L    :
+       addq $17,1,$17          # .. .. E  ..   : Section 3.8 in the CWG
        subq $0,1,$0            # .. E  .. ..   :
        bne $3, $aligndest      # U  .. .. ..   : U L U L
 
@@ -92,29 +78,29 @@ $aligndest:
  * If we arrived via branch, we have a minimum of 32 bytes
  */
 $destaligned:
-       and $7,7,$1             # .. .. .. E    : Check _current_ source alignment
+       and $17,7,$1            # .. .. .. E    : Check _current_ source alignment
        bic $0,7,$4             # .. .. E  ..   : number bytes as a quadword loop
-       EXI( ldq_u $3,0($7) )   # .. L  .. ..   : Forward fetch for fallthrough code
+       EXI( ldq_u $3,0($17) )  # .. L  .. ..   : Forward fetch for fallthrough code
        beq $1,$quadaligned     # U  .. .. ..   : U L U L
 
 /*
- * In the worst case, we've just executed an ldq_u here from 0($7)
+ * In the worst case, we've just executed an ldq_u here from 0($17)
  * and we'll repeat it once if we take the branch
  */
 
 /* Misaligned quadword loop - not unrolled.  Leave it that way. */
 $misquad:
-       EXI( ldq_u $2,8($7) )   # .. .. .. L    :
+       EXI( ldq_u $2,8($17) )  # .. .. .. L    :
        subq $4,8,$4            # .. .. E  ..   :
-       extql $3,$7,$3          # .. U  .. ..   :
-       extqh $2,$7,$1          # U  .. .. ..   : U U L L
+       extql $3,$17,$3         # .. U  .. ..   :
+       extqh $2,$17,$1         # U  .. .. ..   : U U L L
 
        bis $3,$1,$1            # .. .. .. E    :
-       EXO( stq $1,0($6) )     # .. .. L  ..   :
-       addq $7,8,$7            # .. E  .. ..   :
+       EXO( stq $1,0($16) )    # .. .. L  ..   :
+       addq $17,8,$17          # .. E  .. ..   :
        subq $0,8,$0            # E  .. .. ..   : U L L U
 
-       addq $6,8,$6            # .. .. .. E    :
+       addq $16,8,$16          # .. .. .. E    :
        bis $2,$2,$3            # .. .. E  ..   :
        nop                     # .. E  .. ..   :
        bne $4,$misquad         # U  .. .. ..   : U L U L
@@ -125,8 +111,8 @@ $misquad:
        beq $0,$zerolength      # U  .. .. ..   : U L U L
 
 /* We know we have at least one trip through the byte loop */
-       EXI ( ldbu $2,0($7) )   # .. .. .. L    : No loads in the same quad
-       addq $6,1,$6            # .. .. E  ..   : as the store (Section 3.8 in CWG)
+       EXI ( ldbu $2,0($17) )  # .. .. .. L    : No loads in the same quad
+       addq $16,1,$16          # .. .. E  ..   : as the store (Section 3.8 in CWG)
        nop                     # .. E  .. ..   :
        br $31, $dirtyentry     # L0 .. .. ..   : L U U L
 /* Do the trailing byte loop load, then hop into the store part of the loop */
@@ -136,8 +122,8 @@ $misquad:
  * Based upon the usage context, it's worth the effort to unroll this loop
  * $0 - number of bytes to be moved
  * $4 - number of bytes to move as quadwords
- * $6 is current destination address
- * $7 is current source address
+ * $16 is current destination address
+ * $17 is current source address
  */
 $quadaligned:
        subq    $4, 32, $2      # .. .. .. E    : do not unroll for small stuff
@@ -155,29 +141,29 @@ $quadaligned:
  * instruction memory hint instruction).
  */
 $unroll4:
-       EXI( ldq $1,0($7) )     # .. .. .. L
-       EXI( ldq $2,8($7) )     # .. .. L  ..
+       EXI( ldq $1,0($17) )    # .. .. .. L
+       EXI( ldq $2,8($17) )    # .. .. L  ..
        subq    $4,32,$4        # .. E  .. ..
        nop                     # E  .. .. ..   : U U L L
 
-       addq    $7,16,$7        # .. .. .. E
-       EXO( stq $1,0($6) )     # .. .. L  ..
-       EXO( stq $2,8($6) )     # .. L  .. ..
+       addq    $17,16,$17      # .. .. .. E
+       EXO( stq $1,0($16) )    # .. .. L  ..
+       EXO( stq $2,8($16) )    # .. L  .. ..
        subq    $0,16,$0        # E  .. .. ..   : U L L U
 
-       addq    $6,16,$6        # .. .. .. E
-       EXI( ldq $1,0($7) )     # .. .. L  ..
-       EXI( ldq $2,8($7) )     # .. L  .. ..
+       addq    $16,16,$16      # .. .. .. E
+       EXI( ldq $1,0($17) )    # .. .. L  ..
+       EXI( ldq $2,8($17) )    # .. L  .. ..
        subq    $4, 32, $3      # E  .. .. ..   : U U L L : is there enough for another trip?
 
-       EXO( stq $1,0($6) )     # .. .. .. L
-       EXO( stq $2,8($6) )     # .. .. L  ..
+       EXO( stq $1,0($16) )    # .. .. .. L
+       EXO( stq $2,8($16) )    # .. .. L  ..
        subq    $0,16,$0        # .. E  .. ..
-       addq    $7,16,$7        # E  .. .. ..   : U L L U
+       addq    $17,16,$17      # E  .. .. ..   : U L L U
 
        nop                     # .. .. .. E
        nop                     # .. .. E  ..
-       addq    $6,16,$6        # .. E  .. ..
+       addq    $16,16,$16      # .. E  .. ..
        bgt     $3,$unroll4     # U  .. .. ..   : U L U L
 
        nop
@@ -186,14 +172,14 @@ $unroll4:
        beq     $4, $noquads
 
 $onequad:
-       EXI( ldq $1,0($7) )
+       EXI( ldq $1,0($17) )
        subq    $4,8,$4
-       addq    $7,8,$7
+       addq    $17,8,$17
        nop
 
-       EXO( stq $1,0($6) )
+       EXO( stq $1,0($16) )
        subq    $0,8,$0
-       addq    $6,8,$6
+       addq    $16,8,$16
        bne     $4,$onequad
 
 $noquads:
@@ -207,23 +193,23 @@ $noquads:
  * There's no point in doing a lot of complex alignment calculations to try to
  * to quadword stuff for a small amount of data.
  *     $0 - remaining number of bytes left to copy
- *     $6 - current dest addr
- *     $7 - current source addr
+ *     $16 - current dest addr
+ *     $17 - current source addr
  */
 
 $onebyteloop:
-       EXI ( ldbu $2,0($7) )   # .. .. .. L    : No loads in the same quad
-       addq $6,1,$6            # .. .. E  ..   : as the store (Section 3.8 in CWG)
+       EXI ( ldbu $2,0($17) )  # .. .. .. L    : No loads in the same quad
+       addq $16,1,$16          # .. .. E  ..   : as the store (Section 3.8 in CWG)
        nop                     # .. E  .. ..   :
        nop                     # E  .. .. ..   : U L U L
 
 $dirtyentry:
 /*
- * the -1 is to compensate for the inc($6) done in a previous quadpack
+ * the -1 is to compensate for the inc($16) done in a previous quadpack
  * which allows us zero dependencies within either quadpack in the loop
  */
-       EXO ( stb $2,-1($6) )   # .. .. .. L    :
-       addq $7,1,$7            # .. .. E  ..   : quadpack as the load
+       EXO ( stb $2,-1($16) )  # .. .. .. L    :
+       addq $17,1,$17          # .. .. E  ..   : quadpack as the load
        subq $0,1,$0            # .. E  .. ..   : change count _after_ copy
        bgt $0,$onebyteloop     # U  .. .. ..   : U L U L
 
@@ -233,7 +219,7 @@ $exitout:                   # Destination for exception recovery(?)
        nop                     # .. .. .. E
        nop                     # .. .. E  ..
        nop                     # .. E  .. ..
-       ret $31,($28),1         # L0 .. .. ..   : L U L U
+       ret $31,($26),1         # L0 .. .. ..   : L U L U
 
        .end __copy_user
        EXPORT_SYMBOL(__copy_user)
index c9f30f4..5d7fb3e 100644 (file)
@@ -406,6 +406,14 @@ config ARC_HAS_DIV_REM
        bool "Insn: div, divu, rem, remu"
        default y
 
+config ARC_HAS_ACCL_REGS
+       bool "Reg Pair ACCL:ACCH (FPU and/or MPY > 6)"
+       default n
+       help
+         Depending on the configuration, the CPU can contain an accumulator
+         register pair (also referred to as r58:r59). gcc can also use these as
+         GPRs, so the kernel needs to save/restore them per process.
+
 endif  # ISA_ARCV2
 
 endmenu   # "ARC CPU Configuration"
index 63a0401..7bee4e4 100644 (file)
@@ -6,6 +6,7 @@ generic-y += device.h
 generic-y += div64.h
 generic-y += emergency-restart.h
 generic-y += errno.h
+generic-y += extable.h
 generic-y += fb.h
 generic-y += fcntl.h
 generic-y += ftrace.h
index b65930a..54b54da 100644 (file)
 #include <asm/barrier.h>
 #include <asm/smp.h>
 
+#define ATOMIC_INIT(i) { (i) }
+
 #ifndef CONFIG_ARC_PLAT_EZNPS
 
 #define atomic_read(v)  READ_ONCE((v)->counter)
-#define ATOMIC_INIT(i) { (i) }
 
 #ifdef CONFIG_ARC_HAS_LLSC
 
index aee1a77..ac85380 100644 (file)
        ;
        ; Now manually save: r12, sp, fp, gp, r25
 
+#ifdef CONFIG_ARC_HAS_ACCL_REGS
+       PUSH    r59
+       PUSH    r58
+#endif
+
        PUSH    r30
        PUSH    r12
 
        POP     r12
        POP     r30
 
+#ifdef CONFIG_ARC_HAS_ACCL_REGS
+       POP     r58
+       POP     r59
+#endif
+
 .endm
 
 /*------------------------------------------------------------------------*/
index 47111d5..5297faa 100644 (file)
@@ -86,6 +86,10 @@ struct pt_regs {
 
        unsigned long r12, r30;
 
+#ifdef CONFIG_ARC_HAS_ACCL_REGS
+       unsigned long r58, r59; /* ACCL/ACCH used by FPU / DSP MPY */
+#endif
+
        /*------- Below list auto saved by h/w -----------*/
        unsigned long r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11;
 
index 41faf17..f35974e 100644 (file)
 #ifndef _ASM_ARC_UACCESS_H
 #define _ASM_ARC_UACCESS_H
 
-#include <linux/sched.h>
-#include <asm/errno.h>
 #include <linux/string.h>      /* for generic string functions */
 
 
-#define __kernel_ok            (segment_eq(get_fs(), KERNEL_DS))
+#define __kernel_ok            (uaccess_kernel())
 
 /*
  * Algorithmically, for __user_ok() we want do:
 
 
 static inline unsigned long
-__arc_copy_from_user(void *to, const void __user *from, unsigned long n)
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
        long res = 0;
        char val;
@@ -396,11 +394,8 @@ __arc_copy_from_user(void *to, const void __user *from, unsigned long n)
        return res;
 }
 
-extern unsigned long slowpath_copy_to_user(void __user *to, const void *from,
-                                          unsigned long n);
-
 static inline unsigned long
-__arc_copy_to_user(void __user *to, const void *from, unsigned long n)
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
        long res = 0;
        char val;
@@ -726,24 +721,20 @@ static inline long __arc_strnlen_user(const char __user *s, long n)
 }
 
 #ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE
-#define __copy_from_user(t, f, n)      __arc_copy_from_user(t, f, n)
-#define __copy_to_user(t, f, n)                __arc_copy_to_user(t, f, n)
+
+#define INLINE_COPY_TO_USER
+#define INLINE_COPY_FROM_USER
+
 #define __clear_user(d, n)             __arc_clear_user(d, n)
 #define __strncpy_from_user(d, s, n)   __arc_strncpy_from_user(d, s, n)
 #define __strnlen_user(s, n)           __arc_strnlen_user(s, n)
 #else
-extern long arc_copy_from_user_noinline(void *to, const void __user * from,
-               unsigned long n);
-extern long arc_copy_to_user_noinline(void __user *to, const void *from,
-               unsigned long n);
 extern unsigned long arc_clear_user_noinline(void __user *to,
                unsigned long n);
 extern long arc_strncpy_from_user_noinline (char *dst, const char __user *src,
                long count);
 extern long arc_strnlen_user_noinline(const char __user *src, long n);
 
-#define __copy_from_user(t, f, n)      arc_copy_from_user_noinline(t, f, n)
-#define __copy_to_user(t, f, n)                arc_copy_to_user_noinline(t, f, n)
 #define __clear_user(d, n)             arc_clear_user_noinline(d, n)
 #define __strncpy_from_user(d, s, n)   arc_strncpy_from_user_noinline(d, s, n)
 #define __strnlen_user(s, n)           arc_strnlen_user_noinline(s, n)
@@ -752,6 +743,4 @@ extern long arc_strnlen_user_noinline(const char __user *src, long n);
 
 #include <asm-generic/uaccess.h>
 
-extern int fixup_exception(struct pt_regs *regs);
-
 #endif
index fa62404..fc8211f 100644 (file)
@@ -319,7 +319,8 @@ static char *arc_extn_mumbojumbo(int cpu_id, char *buf, int len)
 static void arc_chk_core_config(void)
 {
        struct cpuinfo_arc *cpu = &cpuinfo_arc700[smp_processor_id()];
-       int fpu_enabled;
+       int saved = 0, present = 0;
+       char *opt_nm = NULL;
 
        if (!cpu->extn.timer0)
                panic("Timer0 is not present!\n");
@@ -346,17 +347,28 @@ static void arc_chk_core_config(void)
 
        /*
         * FP hardware/software config sanity
-        * -If hardware contains DPFP, kernel needs to save/restore FPU state
+        * -If hardware present, kernel needs to save/restore FPU state
         * -If not, it will crash trying to save/restore the non-existant regs
-        *
-        * (only DPDP checked since SP has no arch visible regs)
         */
-       fpu_enabled = IS_ENABLED(CONFIG_ARC_FPU_SAVE_RESTORE);
 
-       if (cpu->extn.fpu_dp && !fpu_enabled)
-               pr_warn("CONFIG_ARC_FPU_SAVE_RESTORE needed for working apps\n");
-       else if (!cpu->extn.fpu_dp && fpu_enabled)
-               panic("FPU non-existent, disable CONFIG_ARC_FPU_SAVE_RESTORE\n");
+       if (is_isa_arcompact()) {
+               opt_nm = "CONFIG_ARC_FPU_SAVE_RESTORE";
+               saved = IS_ENABLED(CONFIG_ARC_FPU_SAVE_RESTORE);
+
+               /* only DPFP checked since SP has no arch visible regs */
+               present = cpu->extn.fpu_dp;
+       } else {
+               opt_nm = "CONFIG_ARC_HAS_ACCL_REGS";
+               saved = IS_ENABLED(CONFIG_ARC_HAS_ACCL_REGS);
+
+               /* Accumulator Low:High pair (r58:59) present if DSP MPY or FPU */
+               present = cpu->extn_mpy.dsp | cpu->extn.fpu_sp | cpu->extn.fpu_dp;
+       }
+
+       if (present && !saved)
+               pr_warn("Enable %s for working apps\n", opt_nm);
+       else if (!present && saved)
+               panic("Disable %s, hardware NOT present\n", opt_nm);
 }
 
 /*
index c86906b..72125a3 100644 (file)
@@ -28,20 +28,6 @@ int fixup_exception(struct pt_regs *regs)
 
 #ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
 
-long arc_copy_from_user_noinline(void *to, const void __user *from,
-               unsigned long n)
-{
-       return __arc_copy_from_user(to, from, n);
-}
-EXPORT_SYMBOL(arc_copy_from_user_noinline);
-
-long arc_copy_to_user_noinline(void __user *to, const void *from,
-               unsigned long n)
-{
-       return __arc_copy_to_user(to, from, n);
-}
-EXPORT_SYMBOL(arc_copy_to_user_noinline);
-
 unsigned long arc_clear_user_noinline(void __user *to,
                unsigned long n)
 {
index 0d4e71b..8a7ab5e 100644 (file)
@@ -41,7 +41,6 @@ config ARM
        select HARDIRQS_SW_RESEND
        select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
        select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
-       select HAVE_ARCH_HARDENED_USERCOPY
        select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
        select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
        select HAVE_ARCH_MMAP_RND_BITS if MMU
index efb5eae..d42b98f 100644 (file)
 
        phy1: ethernet-phy@1 {
                reg = <7>;
+               eee-broken-100tx;
+               eee-broken-1000t;
        };
 };
 
index 9e43c44..9ba4b18 100644 (file)
        ti,non-removable;
        bus-width = <4>;
        cap-power-off-card;
+       keep-power-in-suspend;
        pinctrl-names = "default";
        pinctrl-0 = <&mmc2_pins>;
 
index 2c9e56f..bbfb9d5 100644 (file)
                                device_type = "pci";
                                ranges = <0x81000000 0 0          0x03000 0 0x00010000
                                          0x82000000 0 0x20013000 0x13000 0 0xffed000>;
+                               bus-range = <0x00 0xff>;
                                #interrupt-cells = <1>;
                                num-lanes = <1>;
                                linux,pci-domain = <0>;
                                device_type = "pci";
                                ranges = <0x81000000 0 0          0x03000 0 0x00010000
                                          0x82000000 0 0x30013000 0x13000 0 0xffed000>;
+                               bus-range = <0x00 0xff>;
                                #interrupt-cells = <1>;
                                num-lanes = <1>;
                                linux,pci-domain = <1>;
index 8f9a69c..efe5399 100644 (file)
 &i2c3 {
        clock-frequency = <400000>;
        at24@50 {
-               compatible = "at24,24c02";
+               compatible = "atmel,24c64";
                readonly;
                reg = <0x50>;
        };
index cf91254..1aff4ad 100644 (file)
                };
        };
 
+       timer3: timer@2000e000 {
+               compatible = "rockchip,rk3188-timer", "rockchip,rk3288-timer";
+               reg = <0x2000e000 0x20>;
+               interrupts = <GIC_SPI 46 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&cru SCLK_TIMER3>, <&cru PCLK_TIMER3>;
+               clock-names = "timer", "pclk";
+       };
+
+       timer6: timer@200380a0 {
+               compatible = "rockchip,rk3188-timer", "rockchip,rk3288-timer";
+               reg = <0x200380a0 0x20>;
+               interrupts = <GIC_SPI 64 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&cru SCLK_TIMER6>, <&cru PCLK_TIMER0>;
+               clock-names = "timer", "pclk";
+       };
+
        i2s0: i2s@1011a000 {
                compatible = "rockchip,rk3188-i2s", "rockchip,rk3066-i2s";
                reg = <0x1011a000 0x2000>;
 
 &global_timer {
        interrupts = <GIC_PPI 11 0xf04>;
+       status = "disabled";
 };
 
 &local_timer {
index 9dff822..641607d 100644 (file)
        };
 
        timer: timer@110c0000 {
-               compatible = "rockchip,rk3288-timer";
+               compatible = "rockchip,rk3228-timer", "rockchip,rk3288-timer";
                reg = <0x110c0000 0x20>;
                interrupts = <GIC_SPI 43 IRQ_TYPE_LEVEL_HIGH>;
                clocks = <&xin24m>, <&cru PCLK_TIMER>;
index 162e1eb..6c5affe 100644 (file)
                        status = "disabled";
                };
 
-               cpufreq-cooling {
-                       compatible = "stericsson,db8500-cpufreq-cooling";
-                       status = "disabled";
-               };
-
                mcde@a0350000 {
                        compatible = "stericsson,mcde";
                        reg = <0xa0350000 0x1000>, /* MCDE */
index 0467fb3..306af6c 100644 (file)
                        opp-microvolt = <1200000>;
                        clock-latency-ns = <244144>; /* 8 32k periods */
                };
-
-               opp@1200000000 {
-                       opp-hz = /bits/ 64 <1200000000>;
-                       opp-microvolt = <1320000>;
-                       clock-latency-ns = <244144>; /* 8 32k periods */
-               };
        };
 
        cpus {
                        operating-points-v2 = <&cpu0_opp_table>;
                };
 
+               cpu@1 {
+                       operating-points-v2 = <&cpu0_opp_table>;
+               };
+
                cpu@2 {
                        compatible = "arm,cortex-a7";
                        device_type = "cpu";
                        reg = <2>;
+                       operating-points-v2 = <&cpu0_opp_table>;
                };
 
                cpu@3 {
                        compatible = "arm,cortex-a7";
                        device_type = "cpu";
                        reg = <3>;
+                       operating-points-v2 = <&cpu0_opp_table>;
                };
        };
 
index a94126f..6aa7be1 100644 (file)
@@ -748,7 +748,6 @@ CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
 CONFIG_LEDS_TRIGGER_TRANSIENT=y
 CONFIG_LEDS_TRIGGER_CAMERA=y
 CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
 CONFIG_EDAC_HIGHBANK_MC=y
 CONFIG_EDAC_HIGHBANK_L2=y
 CONFIG_RTC_CLASS=y
index 2aac99f..1318f61 100644 (file)
@@ -635,8 +635,7 @@ CONFIG_LEDS_TRIGGER_GPIO=m
 CONFIG_LEDS_TRIGGER_DEFAULT_ON=m
 CONFIG_LEDS_TRIGGER_TRANSIENT=m
 CONFIG_LEDS_TRIGGER_CAMERA=m
-CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=m
+CONFIG_EDAC=m
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DEBUG=y
 CONFIG_RTC_DRV_DS1307=m
index b14e8c7..3a36d99 100644 (file)
@@ -7,6 +7,7 @@ generic-y += early_ioremap.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
+generic-y += extable.h
 generic-y += ioctl.h
 generic-y += ipcbuf.h
 generic-y += irq_regs.h
index e4e6a9d..17f1f1a 100644 (file)
@@ -85,6 +85,18 @@ static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt)
  */
 #define ZIMAGE_OFFSET_LIMIT    SZ_128M
 #define MIN_ZIMAGE_OFFSET      MAX_UNCOMP_KERNEL_SIZE
-#define MAX_FDT_OFFSET         ZIMAGE_OFFSET_LIMIT
+
+/* on ARM, the FDT should be located in the first 128 MB of RAM */
+static inline unsigned long efi_get_max_fdt_addr(unsigned long dram_base)
+{
+       return dram_base + ZIMAGE_OFFSET_LIMIT;
+}
+
+/* on ARM, the initrd should be loaded in a lowmem region */
+static inline unsigned long efi_get_max_initrd_addr(unsigned long dram_base,
+                                                   unsigned long image_addr)
+{
+       return dram_base + SZ_512M;
+}
 
 #endif /* _ASM_ARM_EFI_H */
index b7e0125..2577405 100644 (file)
@@ -12,8 +12,6 @@
  * User space memory access functions
  */
 #include <linux/string.h>
-#include <linux/thread_info.h>
-#include <asm/errno.h>
 #include <asm/memory.h>
 #include <asm/domain.h>
 #include <asm/unified.h>
 #define __put_user_unaligned __put_user
 #endif
 
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1
-
-/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-
-struct exception_table_entry
-{
-       unsigned long insn, fixup;
-};
-
-extern int fixup_exception(struct pt_regs *regs);
+#include <asm/extable.h>
 
 /*
  * These two functions allow hooking accesses to userspace to increase
@@ -271,7 +248,7 @@ static inline void set_fs(mm_segment_t fs)
 #define access_ok(type, addr, size)    (__range_ok(addr, size) == 0)
 
 #define user_addr_max() \
-       (segment_eq(get_fs(), KERNEL_DS) ? ~0UL : get_fs())
+       (uaccess_kernel() ? ~0UL : get_fs())
 
 /*
  * The "__xxx" versions of the user access functions do not verify the
@@ -478,7 +455,7 @@ extern unsigned long __must_check
 arm_copy_from_user(void *to, const void __user *from, unsigned long n);
 
 static inline unsigned long __must_check
-__arch_copy_from_user(void *to, const void __user *from, unsigned long n)
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
        unsigned int __ua_flags;
 
@@ -494,7 +471,7 @@ extern unsigned long __must_check
 __copy_to_user_std(void __user *to, const void *from, unsigned long n);
 
 static inline unsigned long __must_check
-__arch_copy_to_user(void __user *to, const void *from, unsigned long n)
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
 #ifndef CONFIG_UACCESS_WITH_MEMCPY
        unsigned int __ua_flags;
@@ -522,54 +499,22 @@ __clear_user(void __user *addr, unsigned long n)
 }
 
 #else
-#define __arch_copy_from_user(to, from, n)     \
-                                       (memcpy(to, (void __force *)from, n), 0)
-#define __arch_copy_to_user(to, from, n)       \
-                                       (memcpy((void __force *)to, from, n), 0)
-#define __clear_user(addr, n)          (memset((void __force *)addr, 0, n), 0)
-#endif
-
-static inline unsigned long __must_check
-__copy_from_user(void *to, const void __user *from, unsigned long n)
-{
-       check_object_size(to, n, false);
-       return __arch_copy_from_user(to, from, n);
-}
-
-static inline unsigned long __must_check
-copy_from_user(void *to, const void __user *from, unsigned long n)
-{
-       unsigned long res = n;
-
-       check_object_size(to, n, false);
-
-       if (likely(access_ok(VERIFY_READ, from, n)))
-               res = __arch_copy_from_user(to, from, n);
-       if (unlikely(res))
-               memset(to + (n - res), 0, res);
-       return res;
-}
-
-static inline unsigned long __must_check
-__copy_to_user(void __user *to, const void *from, unsigned long n)
+static inline unsigned long
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-       check_object_size(from, n, true);
-
-       return __arch_copy_to_user(to, from, n);
+       memcpy(to, (const void __force *)from, n);
+       return 0;
 }
-
-static inline unsigned long __must_check
-copy_to_user(void __user *to, const void *from, unsigned long n)
+static inline unsigned long
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-       check_object_size(from, n, true);
-
-       if (access_ok(VERIFY_WRITE, to, n))
-               n = __arch_copy_to_user(to, from, n);
-       return n;
+       memcpy((void __force *)to, from, n);
+       return 0;
 }
-
-#define __copy_to_user_inatomic __copy_to_user
-#define __copy_from_user_inatomic __copy_from_user
+#define __clear_user(addr, n)          (memset((void __force *)addr, 0, n), 0)
+#endif
+#define INLINE_COPY_TO_USER
+#define INLINE_COPY_FROM_USER
 
 static inline unsigned long __must_check clear_user(void __user *to, unsigned long n)
 {
index 96dba7c..314eb6a 100644 (file)
@@ -1124,6 +1124,9 @@ static void cpu_hyp_reinit(void)
                if (__hyp_get_vectors() == hyp_default_vectors)
                        cpu_init_hyp_mode(NULL);
        }
+
+       if (vgic_present)
+               kvm_vgic_init_cpu_hardware();
 }
 
 static void cpu_hyp_reset(void)
index 962616f..582a972 100644 (file)
@@ -292,11 +292,18 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
        phys_addr_t addr = start, end = start + size;
        phys_addr_t next;
 
+       assert_spin_locked(&kvm->mmu_lock);
        pgd = kvm->arch.pgd + stage2_pgd_index(addr);
        do {
                next = stage2_pgd_addr_end(addr, end);
                if (!stage2_pgd_none(*pgd))
                        unmap_stage2_puds(kvm, pgd, addr, next);
+               /*
+                * If the range is too large, release the kvm->mmu_lock
+                * to prevent starvation and lockup detector warnings.
+                */
+               if (next != end)
+                       cond_resched_lock(&kvm->mmu_lock);
        } while (pgd++, addr = next, addr != end);
 }
 
@@ -803,6 +810,7 @@ void stage2_unmap_vm(struct kvm *kvm)
        int idx;
 
        idx = srcu_read_lock(&kvm->srcu);
+       down_read(&current->mm->mmap_sem);
        spin_lock(&kvm->mmu_lock);
 
        slots = kvm_memslots(kvm);
@@ -810,6 +818,7 @@ void stage2_unmap_vm(struct kvm *kvm)
                stage2_unmap_memslot(kvm, memslot);
 
        spin_unlock(&kvm->mmu_lock);
+       up_read(&current->mm->mmap_sem);
        srcu_read_unlock(&kvm->srcu, idx);
 }
 
@@ -829,7 +838,10 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
        if (kvm->arch.pgd == NULL)
                return;
 
+       spin_lock(&kvm->mmu_lock);
        unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
+       spin_unlock(&kvm->mmu_lock);
+
        /* Free the HW pgd, one page at a time */
        free_pages_exact(kvm->arch.pgd, S2_PGD_SIZE);
        kvm->arch.pgd = NULL;
@@ -1801,6 +1813,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
            (KVM_PHYS_SIZE >> PAGE_SHIFT))
                return -EFAULT;
 
+       down_read(&current->mm->mmap_sem);
        /*
         * A memory region could potentially cover multiple VMAs, and any holes
         * between them, so iterate over all of them to find out if we can map
@@ -1844,8 +1857,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                        pa += vm_start - vma->vm_start;
 
                        /* IO region dirty page logging not allowed */
-                       if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES)
-                               return -EINVAL;
+                       if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+                               ret = -EINVAL;
+                               goto out;
+                       }
 
                        ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
                                                    vm_end - vm_start,
@@ -1857,7 +1872,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
        } while (hva < reg_end);
 
        if (change == KVM_MR_FLAGS_ONLY)
-               return ret;
+               goto out;
 
        spin_lock(&kvm->mmu_lock);
        if (ret)
@@ -1865,6 +1880,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
        else
                stage2_flush_memslot(kvm, memslot);
        spin_unlock(&kvm->mmu_lock);
+out:
+       up_read(&current->mm->mmap_sem);
        return ret;
 }
 
index 6bd1089..9b4ed17 100644 (file)
@@ -90,7 +90,7 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
        unsigned long ua_flags;
        int atomic;
 
-       if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
+       if (uaccess_kernel()) {
                memcpy((void *)to, from, n);
                return 0;
        }
@@ -162,7 +162,7 @@ __clear_user_memset(void __user *addr, unsigned long n)
 {
        unsigned long ua_flags;
 
-       if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
+       if (uaccess_kernel()) {
                memset((void *)addr, 0, n);
                return 0;
        }
index f69e28b..70db2ab 100644 (file)
@@ -3,8 +3,8 @@ menuconfig ARCH_MOXART
        depends on ARCH_MULTI_V4
        select CPU_FA526
        select ARM_DMA_MEM_BUFFERABLE
+       select FARADAY_FTINTC010
        select MOXART_TIMER
-       select GENERIC_IRQ_CHIP
        select GPIOLIB
        select PHYLIB if NETDEVICES
        help
index c4f2ace..3089d3b 100644 (file)
@@ -270,6 +270,7 @@ extern const struct smp_operations omap4_smp_ops;
 extern int omap4_mpuss_init(void);
 extern int omap4_enter_lowpower(unsigned int cpu, unsigned int power_state);
 extern int omap4_hotplug_cpu(unsigned int cpu, unsigned int power_state);
+extern u32 omap4_get_cpu1_ns_pa_addr(void);
 #else
 static inline int omap4_enter_lowpower(unsigned int cpu,
                                        unsigned int power_state)
index d3fb566..433db6d 100644 (file)
@@ -50,7 +50,7 @@ void omap4_cpu_die(unsigned int cpu)
                omap4_hotplug_cpu(cpu, PWRDM_POWER_OFF);
 
                if (omap_secure_apis_support())
-                       boot_cpu = omap_read_auxcoreboot0();
+                       boot_cpu = omap_read_auxcoreboot0() >> 9;
                else
                        boot_cpu =
                                readl_relaxed(base + OMAP_AUX_CORE_BOOT_0) >> 5;
index 113ab2d..03ec6d3 100644 (file)
@@ -64,6 +64,7 @@
 #include "prm-regbits-44xx.h"
 
 static void __iomem *sar_base;
+static u32 old_cpu1_ns_pa_addr;
 
 #if defined(CONFIG_PM) && defined(CONFIG_SMP)
 
@@ -212,6 +213,11 @@ static void __init save_l2x0_context(void)
 {}
 #endif
 
+u32 omap4_get_cpu1_ns_pa_addr(void)
+{
+       return old_cpu1_ns_pa_addr;
+}
+
 /**
  * omap4_enter_lowpower: OMAP4 MPUSS Low Power Entry Function
  * The purpose of this function is to manage low power programming
@@ -460,22 +466,30 @@ int __init omap4_mpuss_init(void)
 void __init omap4_mpuss_early_init(void)
 {
        unsigned long startup_pa;
+       void __iomem *ns_pa_addr;
 
-       if (!(cpu_is_omap44xx() || soc_is_omap54xx()))
+       if (!(soc_is_omap44xx() || soc_is_omap54xx()))
                return;
 
        sar_base = omap4_get_sar_ram_base();
 
-       if (cpu_is_omap443x())
+       /* Save old NS_PA_ADDR for validity checks later on */
+       if (soc_is_omap44xx())
+               ns_pa_addr = sar_base + CPU1_WAKEUP_NS_PA_ADDR_OFFSET;
+       else
+               ns_pa_addr = sar_base + OMAP5_CPU1_WAKEUP_NS_PA_ADDR_OFFSET;
+       old_cpu1_ns_pa_addr = readl_relaxed(ns_pa_addr);
+
+       if (soc_is_omap443x())
                startup_pa = __pa_symbol(omap4_secondary_startup);
-       else if (cpu_is_omap446x())
+       else if (soc_is_omap446x())
                startup_pa = __pa_symbol(omap4460_secondary_startup);
        else if ((__boot_cpu_mode & MODE_MASK) == HYP_MODE)
                startup_pa = __pa_symbol(omap5_secondary_hyp_startup);
        else
                startup_pa = __pa_symbol(omap5_secondary_startup);
 
-       if (cpu_is_omap44xx())
+       if (soc_is_omap44xx())
                writel_relaxed(startup_pa, sar_base +
                               CPU1_WAKEUP_NS_PA_ADDR_OFFSET);
        else
index fd90125..72506e6 100644 (file)
@@ -94,6 +94,5 @@ ENTRY(omap_read_auxcoreboot0)
        ldr     r12, =0x103
        dsb
        smc     #0
-       mov     r0, r0, lsr #9
        ldmfd   sp!, {r2-r12, pc}
 ENDPROC(omap_read_auxcoreboot0)
index 003353b..3faf454 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/io.h>
 #include <linux/irqchip/arm-gic.h>
 
+#include <asm/sections.h>
 #include <asm/smp_scu.h>
 #include <asm/virt.h>
 
 
 #define OMAP5_CORE_COUNT       0x2
 
+#define AUX_CORE_BOOT0_GP_RELEASE      0x020
+#define AUX_CORE_BOOT0_HS_RELEASE      0x200
+
 struct omap_smp_config {
        unsigned long cpu1_rstctrl_pa;
        void __iomem *cpu1_rstctrl_va;
        void __iomem *scu_base;
+       void __iomem *wakeupgen_base;
        void *startup_addr;
 };
 
@@ -140,7 +145,6 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
        static struct clockdomain *cpu1_clkdm;
        static bool booted;
        static struct powerdomain *cpu1_pwrdm;
-       void __iomem *base = omap_get_wakeupgen_base();
 
        /*
         * Set synchronisation state between this boot processor
@@ -155,9 +159,11 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
         * A barrier is added to ensure that write buffer is drained
         */
        if (omap_secure_apis_support())
-               omap_modify_auxcoreboot0(0x200, 0xfffffdff);
+               omap_modify_auxcoreboot0(AUX_CORE_BOOT0_HS_RELEASE,
+                                        0xfffffdff);
        else
-               writel_relaxed(0x20, base + OMAP_AUX_CORE_BOOT_0);
+               writel_relaxed(AUX_CORE_BOOT0_GP_RELEASE,
+                              cfg.wakeupgen_base + OMAP_AUX_CORE_BOOT_0);
 
        if (!cpu1_clkdm && !cpu1_pwrdm) {
                cpu1_clkdm = clkdm_lookup("mpu1_clkdm");
@@ -261,9 +267,72 @@ static void __init omap4_smp_init_cpus(void)
                set_cpu_possible(i, true);
 }
 
+/*
+ * For now, just make sure the start-up address is not within the booting
+ * kernel space as that means we just overwrote whatever secondary_startup()
+ * code there was.
+ */
+static bool __init omap4_smp_cpu1_startup_valid(unsigned long addr)
+{
+       if ((addr >= __pa(PAGE_OFFSET)) && (addr <= __pa(__bss_start)))
+               return false;
+
+       return true;
+}
+
+/*
+ * We may need to reset CPU1 before configuring, otherwise kexec boot can end
+ * up trying to use old kernel startup address or suspend-resume will
+ * occasionally fail to bring up CPU1 on 4430 if CPU1 fails to enter deeper
+ * idle states.
+ */
+static void __init omap4_smp_maybe_reset_cpu1(struct omap_smp_config *c)
+{
+       unsigned long cpu1_startup_pa, cpu1_ns_pa_addr;
+       bool needs_reset = false;
+       u32 released;
+
+       if (omap_secure_apis_support())
+               released = omap_read_auxcoreboot0() & AUX_CORE_BOOT0_HS_RELEASE;
+       else
+               released = readl_relaxed(cfg.wakeupgen_base +
+                                        OMAP_AUX_CORE_BOOT_0) &
+                                               AUX_CORE_BOOT0_GP_RELEASE;
+       if (released) {
+               pr_warn("smp: CPU1 not parked?\n");
+
+               return;
+       }
+
+       cpu1_startup_pa = readl_relaxed(cfg.wakeupgen_base +
+                                       OMAP_AUX_CORE_BOOT_1);
+       cpu1_ns_pa_addr = omap4_get_cpu1_ns_pa_addr();
+
+       /* Did the configured secondary_startup() get overwritten? */
+       if (!omap4_smp_cpu1_startup_valid(cpu1_startup_pa))
+               needs_reset = true;
+
+       /*
+        * If omap4 or 5 has NS_PA_ADDR configured, CPU1 may be in a
+        * deeper idle state in WFI and will wake to an invalid address.
+        */
+       if ((soc_is_omap44xx() || soc_is_omap54xx()) &&
+           !omap4_smp_cpu1_startup_valid(cpu1_ns_pa_addr))
+               needs_reset = true;
+
+       if (!needs_reset || !c->cpu1_rstctrl_va)
+               return;
+
+       pr_info("smp: CPU1 parked within kernel, needs reset (0x%lx 0x%lx)\n",
+               cpu1_startup_pa, cpu1_ns_pa_addr);
+
+       writel_relaxed(1, c->cpu1_rstctrl_va);
+       readl_relaxed(c->cpu1_rstctrl_va);
+       writel_relaxed(0, c->cpu1_rstctrl_va);
+}
+
 static void __init omap4_smp_prepare_cpus(unsigned int max_cpus)
 {
-       void __iomem *base = omap_get_wakeupgen_base();
        const struct omap_smp_config *c = NULL;
 
        if (soc_is_omap443x())
@@ -281,6 +350,7 @@ static void __init omap4_smp_prepare_cpus(unsigned int max_cpus)
        /* Must preserve cfg.scu_base set earlier */
        cfg.cpu1_rstctrl_pa = c->cpu1_rstctrl_pa;
        cfg.startup_addr = c->startup_addr;
+       cfg.wakeupgen_base = omap_get_wakeupgen_base();
 
        if (soc_is_dra74x() || soc_is_omap54xx()) {
                if ((__boot_cpu_mode & MODE_MASK) == HYP_MODE)
@@ -299,15 +369,7 @@ static void __init omap4_smp_prepare_cpus(unsigned int max_cpus)
        if (cfg.scu_base)
                scu_enable(cfg.scu_base);
 
-       /*
-        * Reset CPU1 before configuring, otherwise kexec will
-        * end up trying to use old kernel startup address.
-        */
-       if (cfg.cpu1_rstctrl_va) {
-               writel_relaxed(1, cfg.cpu1_rstctrl_va);
-               readl_relaxed(cfg.cpu1_rstctrl_va);
-               writel_relaxed(0, cfg.cpu1_rstctrl_va);
-       }
+       omap4_smp_maybe_reset_cpu1(&cfg);
 
        /*
         * Write the address of secondary startup routine into the
@@ -319,7 +381,7 @@ static void __init omap4_smp_prepare_cpus(unsigned int max_cpus)
                omap_auxcoreboot_addr(__pa_symbol(cfg.startup_addr));
        else
                writel_relaxed(__pa_symbol(cfg.startup_addr),
-                              base + OMAP_AUX_CORE_BOOT_1);
+                              cfg.wakeupgen_base + OMAP_AUX_CORE_BOOT_1);
 }
 
 const struct smp_operations omap4_smp_ops __initconst = {
index e920dd8..f989145 100644 (file)
@@ -222,6 +222,14 @@ static int _omap_device_notifier_call(struct notifier_block *nb,
                                dev_err(dev, "failed to idle\n");
                }
                break;
+       case BUS_NOTIFY_BIND_DRIVER:
+               od = to_omap_device(pdev);
+               if (od && (od->_state == OMAP_DEVICE_STATE_ENABLED) &&
+                   pm_runtime_status_suspended(dev)) {
+                       od->_driver_status = BUS_NOTIFY_BIND_DRIVER;
+                       pm_runtime_set_active(dev);
+               }
+               break;
        case BUS_NOTIFY_ADD_DEVICE:
                if (pdev->dev.of_node)
                        omap_device_build_from_dt(pdev);
index 633442a..2a7bb6c 100644 (file)
@@ -6,6 +6,7 @@ menuconfig ARCH_ORION5X
        select GPIOLIB
        select MVEBU_MBUS
        select PCI
+       select PHYLIB if NETDEVICES
        select PLAT_ORION_LEGACY
        help
          Support for the following Marvell Orion 5x series SoCs:
index 63eabb0..475811f 100644 (file)
@@ -935,13 +935,31 @@ static void arm_coherent_dma_free(struct device *dev, size_t size, void *cpu_add
        __arm_dma_free(dev, size, cpu_addr, handle, attrs, true);
 }
 
+/*
+ * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems
+ * that the intention is to allow exporting memory allocated via the
+ * coherent DMA APIs through the dma_buf API, which only accepts a
+ * scattertable.  This presents a couple of problems:
+ * 1. Not all memory allocated via the coherent DMA APIs is backed by
+ *    a struct page
+ * 2. Passing coherent DMA memory into the streaming APIs is not allowed
+ *    as we will try to flush the memory through a different alias to that
+ *    actually being used (and the flushes are redundant.)
+ */
 int arm_dma_get_sgtable(struct device *dev, struct sg_table *sgt,
                 void *cpu_addr, dma_addr_t handle, size_t size,
                 unsigned long attrs)
 {
-       struct page *page = pfn_to_page(dma_to_pfn(dev, handle));
+       unsigned long pfn = dma_to_pfn(dev, handle);
+       struct page *page;
        int ret;
 
+       /* If the PFN is not valid, we do not have a struct page */
+       if (!pfn_valid(pfn))
+               return -ENXIO;
+
+       page = pfn_to_page(pfn);
+
        ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
        if (unlikely(ret))
                return ret;
index 3b5c7aa..33a45bd 100644 (file)
@@ -303,7 +303,10 @@ static inline void set_vbar(unsigned long val)
  */
 static inline bool security_extensions_enabled(void)
 {
-       return !!cpuid_feature_extract(CPUID_EXT_PFR1, 4);
+       /* Check CPUID Identification Scheme before ID_PFR1 read */
+       if ((read_cpuid_id() & 0x000f0000) == 0x000f0000)
+               return !!cpuid_feature_extract(CPUID_EXT_PFR1, 4);
+       return 0;
 }
 
 static unsigned long __init setup_vectors_base(void)
index 9255b6d..aff6994 100644 (file)
@@ -468,6 +468,7 @@ void __init orion_ge11_init(struct mv643xx_eth_platform_data *eth_data,
                    eth_data, &orion_ge11);
 }
 
+#ifdef CONFIG_ARCH_ORION5X
 /*****************************************************************************
  * Ethernet switch
  ****************************************************************************/
@@ -480,6 +481,9 @@ void __init orion_ge00_switch_init(struct dsa_chip_data *d)
        struct mdio_board_info *bd;
        unsigned int i;
 
+       if (!IS_BUILTIN(CONFIG_PHYLIB))
+               return;
+
        for (i = 0; i < ARRAY_SIZE(d->port_names); i++)
                if (!strcmp(d->port_names[i], "cpu"))
                        break;
@@ -493,6 +497,7 @@ void __init orion_ge00_switch_init(struct dsa_chip_data *d)
 
        mdiobus_register_board_info(&orion_ge00_switch_board_info, 1);
 }
+#endif
 
 /*****************************************************************************
  * I2C
index b6dc9d8..ad1f4e6 100644 (file)
@@ -266,11 +266,20 @@ void __kprobes kprobe_handler(struct pt_regs *regs)
 #endif
 
        if (p) {
-               if (cur) {
+               if (!p->ainsn.insn_check_cc(regs->ARM_cpsr)) {
+                       /*
+                        * Probe hit but conditional execution check failed,
+                        * so just skip the instruction and continue as if
+                        * nothing had happened.
+                        * In this case, we can skip recursing check too.
+                        */
+                       singlestep_skip(p, regs);
+               } else if (cur) {
                        /* Kprobe is pending, so we're recursing. */
                        switch (kcb->kprobe_status) {
                        case KPROBE_HIT_ACTIVE:
                        case KPROBE_HIT_SSDONE:
+                       case KPROBE_HIT_SS:
                                /* A pre- or post-handler probe got us here. */
                                kprobes_inc_nmissed_count(p);
                                save_previous_kprobe(kcb);
@@ -279,11 +288,16 @@ void __kprobes kprobe_handler(struct pt_regs *regs)
                                singlestep(p, regs, kcb);
                                restore_previous_kprobe(kcb);
                                break;
+                       case KPROBE_REENTER:
+                               /* A nested probe was hit in FIQ, it is a BUG */
+                               pr_warn("Unrecoverable kprobe detected at %p.\n",
+                                       p->addr);
+                               /* fall through */
                        default:
                                /* impossible cases */
                                BUG();
                        }
-               } else if (p->ainsn.insn_check_cc(regs->ARM_cpsr)) {
+               } else {
                        /* Probe hit and conditional execution check ok. */
                        set_current_kprobe(p);
                        kcb->kprobe_status = KPROBE_HIT_ACTIVE;
@@ -304,13 +318,6 @@ void __kprobes kprobe_handler(struct pt_regs *regs)
                                }
                                reset_current_kprobe();
                        }
-               } else {
-                       /*
-                        * Probe hit but conditional execution check failed,
-                        * so just skip the instruction and continue as if
-                        * nothing had happened.
-                        */
-                       singlestep_skip(p, regs);
                }
        } else if (cur) {
                /* We probably hit a jprobe.  Call its break handler. */
@@ -434,6 +441,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
        struct hlist_node *tmp;
        unsigned long flags, orig_ret_address = 0;
        unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
+       kprobe_opcode_t *correct_ret_addr = NULL;
 
        INIT_HLIST_HEAD(&empty_rp);
        kretprobe_hash_lock(current, &head, &flags);
@@ -456,14 +464,34 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
                        /* another task is sharing our hash bucket */
                        continue;
 
+               orig_ret_address = (unsigned long)ri->ret_addr;
+
+               if (orig_ret_address != trampoline_address)
+                       /*
+                        * This is the real return address. Any other
+                        * instances associated with this task are for
+                        * other calls deeper on the call stack
+                        */
+                       break;
+       }
+
+       kretprobe_assert(ri, orig_ret_address, trampoline_address);
+
+       correct_ret_addr = ri->ret_addr;
+       hlist_for_each_entry_safe(ri, tmp, head, hlist) {
+               if (ri->task != current)
+                       /* another task is sharing our hash bucket */
+                       continue;
+
+               orig_ret_address = (unsigned long)ri->ret_addr;
                if (ri->rp && ri->rp->handler) {
                        __this_cpu_write(current_kprobe, &ri->rp->kp);
                        get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
+                       ri->ret_addr = correct_ret_addr;
                        ri->rp->handler(ri, regs);
                        __this_cpu_write(current_kprobe, NULL);
                }
 
-               orig_ret_address = (unsigned long)ri->ret_addr;
                recycle_rp_inst(ri, &empty_rp);
 
                if (orig_ret_address != trampoline_address)
@@ -475,7 +503,6 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
                        break;
        }
 
-       kretprobe_assert(ri, orig_ret_address, trampoline_address);
        kretprobe_hash_unlock(current, &flags);
 
        hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
index c893726..1c98a87 100644 (file)
@@ -977,7 +977,10 @@ static void coverage_end(void)
 void __naked __kprobes_test_case_start(void)
 {
        __asm__ __volatile__ (
-               "stmdb  sp!, {r4-r11}                           \n\t"
+               "mov    r2, sp                                  \n\t"
+               "bic    r3, r2, #7                              \n\t"
+               "mov    sp, r3                                  \n\t"
+               "stmdb  sp!, {r2-r11}                           \n\t"
                "sub    sp, sp, #"__stringify(TEST_MEMORY_SIZE)"\n\t"
                "bic    r0, lr, #1  @ r0 = inline data          \n\t"
                "mov    r1, sp                                  \n\t"
@@ -997,7 +1000,8 @@ void __naked __kprobes_test_case_end_32(void)
                "movne  pc, r0                                  \n\t"
                "mov    r0, r4                                  \n\t"
                "add    sp, sp, #"__stringify(TEST_MEMORY_SIZE)"\n\t"
-               "ldmia  sp!, {r4-r11}                           \n\t"
+               "ldmia  sp!, {r2-r11}                           \n\t"
+               "mov    sp, r2                                  \n\t"
                "mov    pc, r0                                  \n\t"
        );
 }
@@ -1013,7 +1017,8 @@ void __naked __kprobes_test_case_end_16(void)
                "bxne   r0                                      \n\t"
                "mov    r0, r4                                  \n\t"
                "add    sp, sp, #"__stringify(TEST_MEMORY_SIZE)"\n\t"
-               "ldmia  sp!, {r4-r11}                           \n\t"
+               "ldmia  sp!, {r2-r11}                           \n\t"
+               "mov    sp, r2                                  \n\t"
                "bx     r0                                      \n\t"
        );
 }
index 3741859..67695fa 100644 (file)
@@ -2,6 +2,7 @@ config ARM64
        def_bool y
        select ACPI_CCA_REQUIRED if ACPI
        select ACPI_GENERIC_GSI if ACPI
+       select ACPI_GTDT if ACPI
        select ACPI_REDUCED_HARDWARE_ONLY if ACPI
        select ACPI_MCFG if ACPI
        select ACPI_SPCR_TABLE if ACPI
@@ -60,7 +61,6 @@ config ARM64
        select HAVE_ALIGNED_STRUCT_PAGE if SLUB
        select HAVE_ARCH_AUDITSYSCALL
        select HAVE_ARCH_BITREVERSE
-       select HAVE_ARCH_HARDENED_USERCOPY
        select HAVE_ARCH_HUGE_VMAP
        select HAVE_ARCH_JUMP_LABEL
        select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48)
index 1c64ea2..0565779 100644 (file)
                usbphy: phy@01c19400 {
                        compatible = "allwinner,sun50i-a64-usb-phy";
                        reg = <0x01c19400 0x14>,
+                             <0x01c1a800 0x4>,
                              <0x01c1b800 0x4>;
                        reg-names = "phy_ctrl",
+                                   "pmu0",
                                    "pmu1";
                        clocks = <&ccu CLK_USB_PHY0>,
                                 <&ccu CLK_USB_PHY1>;
index b4b3400..74d08e4 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/bug.h>
 #include <linux/init.h>
 #include <linux/jump_label.h>
+#include <linux/smp.h>
 #include <linux/types.h>
 
 #include <clocksource/arm_arch_timer.h>
@@ -37,24 +38,44 @@ extern struct static_key_false arch_timer_read_ool_enabled;
 #define needs_unstable_timer_counter_workaround()  false
 #endif
 
+enum arch_timer_erratum_match_type {
+       ate_match_dt,
+       ate_match_local_cap_id,
+       ate_match_acpi_oem_info,
+};
+
+struct clock_event_device;
 
 struct arch_timer_erratum_workaround {
-       const char *id;         /* Indicate the Erratum ID */
+       enum arch_timer_erratum_match_type match_type;
+       const void *id;
+       const char *desc;
        u32 (*read_cntp_tval_el0)(void);
        u32 (*read_cntv_tval_el0)(void);
        u64 (*read_cntvct_el0)(void);
+       int (*set_next_event_phys)(unsigned long, struct clock_event_device *);
+       int (*set_next_event_virt)(unsigned long, struct clock_event_device *);
 };
 
-extern const struct arch_timer_erratum_workaround *timer_unstable_counter_workaround;
-
-#define arch_timer_reg_read_stable(reg)                \
-({                                                     \
-       u64 _val;                                       \
-       if (needs_unstable_timer_counter_workaround())          \
-               _val = timer_unstable_counter_workaround->read_##reg();\
-       else                                            \
-               _val = read_sysreg(reg);                \
-       _val;                                           \
+DECLARE_PER_CPU(const struct arch_timer_erratum_workaround *,
+               timer_unstable_counter_workaround);
+
+/*
+ * Read timer system register @reg, going through the per-CPU erratum
+ * workaround when one is installed and provides a read_<reg> hook;
+ * otherwise fall back to a plain read_sysreg().
+ *
+ * Preemption is disabled around the per-CPU lookup so that the hook which
+ * is called belongs to the CPU it was looked up on.  The _notrace helpers
+ * are used on purpose: the traceable variants may call into the tracer
+ * when it is enabled, and the tracer can read the counter for its
+ * timestamps, re-entering this macro without bound.
+ */
+#define arch_timer_reg_read_stable(reg)                                        \
+({                                                                     \
+       u64 _val;                                                       \
+       if (needs_unstable_timer_counter_workaround()) {                \
+               const struct arch_timer_erratum_workaround *wa;         \
+               preempt_disable_notrace();                              \
+               wa = __this_cpu_read(timer_unstable_counter_workaround); \
+               if (wa && wa->read_##reg)                               \
+                       _val = wa->read_##reg();                        \
+               else                                                    \
+                       _val = read_sysreg(reg);                        \
+               preempt_enable_notrace();                               \
+       } else {                                                        \
+               _val = read_sysreg(reg);                                \
+       }                                                               \
+       _val;                                                           \
 })
 
 /*
index fb78a5d..b3aab8a 100644 (file)
@@ -37,7 +37,8 @@
 #define ARM64_HAS_NO_FPSIMD                    16
 #define ARM64_WORKAROUND_REPEAT_TLBI           17
 #define ARM64_WORKAROUND_QCOM_FALKOR_E1003     18
+#define ARM64_WORKAROUND_858921                        19
 
-#define ARM64_NCAPS                            19
+#define ARM64_NCAPS                            20
 
 #endif /* __ASM_CPUCAPS_H */
index fc50271..0984d1b 100644 (file)
@@ -80,6 +80,7 @@
 #define ARM_CPU_PART_FOUNDATION                0xD00
 #define ARM_CPU_PART_CORTEX_A57                0xD07
 #define ARM_CPU_PART_CORTEX_A53                0xD03
+#define ARM_CPU_PART_CORTEX_A73                0xD09
 
 #define APM_CPU_PART_POTENZA           0x000
 
@@ -92,6 +93,7 @@
 
 #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
 #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
+#define MIDR_CORTEX_A73 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A73)
 #define MIDR_THUNDERX  MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
 #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
 #define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1)
index e744528..8f3043a 100644 (file)
@@ -1,6 +1,7 @@
 #ifndef _ASM_EFI_H
 #define _ASM_EFI_H
 
+#include <asm/boot.h>
 #include <asm/cpufeature.h>
 #include <asm/io.h>
 #include <asm/mmu_context.h>
@@ -46,7 +47,28 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md);
  * 2MiB so we know it won't cross a 2MiB boundary.
  */
 #define EFI_FDT_ALIGN  SZ_2M   /* used by allocate_new_fdt_and_exit_boot() */
-#define MAX_FDT_OFFSET SZ_512M
+
+/*
+ * On arm64, the FDT may be located anywhere in system RAM, so no upper
+ * bound is imposed: the result is always ULONG_MAX and @dram_base is not
+ * consulted.
+ */
+static inline unsigned long efi_get_max_fdt_addr(unsigned long dram_base)
+{
+       return ULONG_MAX;
+}
+
+/*
+ * On arm64, we have to ensure that the initrd ends up in the linear region,
+ * which is a 1 GB aligned region of size '1UL << (VA_BITS - 1)' that is
+ * guaranteed to cover the kernel Image.
+ *
+ * Since the EFI stub is part of the kernel Image, we can relax the
+ * usual requirements in Documentation/arm64/booting.txt, which still
+ * apply to other bootloaders, and are required for some kernel
+ * configurations.
+ *
+ * The limit returned is @image_addr rounded down to a 1 GB boundary plus
+ * the size of the linear region; @dram_base is not used in the computation.
+ */
+static inline unsigned long efi_get_max_initrd_addr(unsigned long dram_base,
+                                                   unsigned long image_addr)
+{
+       return (image_addr & ~(SZ_1G - 1UL)) + (1UL << (VA_BITS - 1));
+}
 
 #define efi_call_early(f, ...)         sys_table_arg->boottime->f(__VA_ARGS__)
 #define __efi_call_early(f, ...)       f(__VA_ARGS__)
index d14c478..ad42e79 100644 (file)
 #define ESR_ELx_SYS64_ISS_SYS_CTR_READ (ESR_ELx_SYS64_ISS_SYS_CTR | \
                                         ESR_ELx_SYS64_ISS_DIR_READ)
 
+#define ESR_ELx_SYS64_ISS_SYS_CNTVCT   (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 2, 14, 0) | \
+                                        ESR_ELx_SYS64_ISS_DIR_READ)
 #ifndef __ASSEMBLY__
 #include <asm/types.h>
 
diff --git a/arch/arm64/include/asm/extable.h b/arch/arm64/include/asm/extable.h
new file mode 100644 (file)
index 0000000..42f50f1
--- /dev/null
@@ -0,0 +1,25 @@
+#ifndef __ASM_EXTABLE_H
+#define __ASM_EXTABLE_H
+
+/*
+ * The exception table consists of pairs of relative offsets: the first
+ * is the relative offset to an instruction that is allowed to fault,
+ * and the second is the relative offset at which the program should
+ * continue. No registers are modified, so it is entirely up to the
+ * continuation code to figure out what to do.
+ *
+ * All the routines below use bits of fixup code that are out of line
+ * with the main instruction path.  This means when everything is well,
+ * we don't even have to jump over them.  Further, they do not intrude
+ * on our cache or tlb entries.
+ */
+
+struct exception_table_entry
+{
+       int insn, fixup;        /* relative offsets: faulting insn, continuation point */
+};
+
+#define ARCH_HAS_RELATIVE_EXTABLE
+
+extern int fixup_exception(struct pt_regs *regs);
+#endif
index 5308d69..ba49717 100644 (file)
 #include <linux/bitops.h>
 #include <linux/kasan-checks.h>
 #include <linux/string.h>
-#include <linux/thread_info.h>
 
 #include <asm/cpufeature.h>
 #include <asm/ptrace.h>
-#include <asm/errno.h>
 #include <asm/memory.h>
 #include <asm/compiler.h>
-
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1
-
-/*
- * The exception table consists of pairs of relative offsets: the first
- * is the relative offset to an instruction that is allowed to fault,
- * and the second is the relative offset at which the program should
- * continue. No registers are modified, so it is entirely up to the
- * continuation code to figure out what to do.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-
-struct exception_table_entry
-{
-       int insn, fixup;
-};
-
-#define ARCH_HAS_RELATIVE_EXTABLE
-
-extern int fixup_exception(struct pt_regs *regs);
+#include <asm/extable.h>
 
 #define KERNEL_DS      (-1UL)
 #define get_ds()       (KERNEL_DS)
@@ -357,58 +331,13 @@ do {                                                                      \
 })
 
 extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n);
+#define raw_copy_from_user __arch_copy_from_user
 extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n);
-extern unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n);
+#define raw_copy_to_user __arch_copy_to_user
+extern unsigned long __must_check raw_copy_in_user(void __user *to, const void __user *from, unsigned long n);
 extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n);
-
-static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n)
-{
-       kasan_check_write(to, n);
-       check_object_size(to, n, false);
-       return __arch_copy_from_user(to, from, n);
-}
-
-static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n)
-{
-       kasan_check_read(from, n);
-       check_object_size(from, n, true);
-       return __arch_copy_to_user(to, from, n);
-}
-
-static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n)
-{
-       unsigned long res = n;
-       kasan_check_write(to, n);
-       check_object_size(to, n, false);
-
-       if (access_ok(VERIFY_READ, from, n)) {
-               res = __arch_copy_from_user(to, from, n);
-       }
-       if (unlikely(res))
-               memset(to + (n - res), 0, res);
-       return res;
-}
-
-static inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n)
-{
-       kasan_check_read(from, n);
-       check_object_size(from, n, true);
-
-       if (access_ok(VERIFY_WRITE, to, n)) {
-               n = __arch_copy_to_user(to, from, n);
-       }
-       return n;
-}
-
-static inline unsigned long __must_check copy_in_user(void __user *to, const void __user *from, unsigned long n)
-{
-       if (access_ok(VERIFY_READ, from, n) && access_ok(VERIFY_WRITE, to, n))
-               n = __copy_in_user(to, from, n);
-       return n;
-}
-
-#define __copy_to_user_inatomic __copy_to_user
-#define __copy_from_user_inatomic __copy_from_user
+#define INLINE_COPY_TO_USER
+#define INLINE_COPY_FROM_USER
 
 static inline unsigned long __must_check clear_user(void __user *to, unsigned long n)
 {
index 64d9cbd..e25c11e 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/acpi.h>
 #include <linux/bootmem.h>
 #include <linux/cpumask.h>
+#include <linux/efi-bgrt.h>
 #include <linux/init.h>
 #include <linux/irq.h>
 #include <linux/irqdomain.h>
@@ -233,6 +234,8 @@ done:
                        early_init_dt_scan_chosen_stdout();
        } else {
                parse_spcr(earlycon_init_is_deferred);
+               if (IS_ENABLED(CONFIG_ACPI_BGRT))
+                       acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt);
        }
 }
 
index e9c4dc9..67368c7 100644 (file)
@@ -38,7 +38,7 @@ EXPORT_SYMBOL(clear_page);
 EXPORT_SYMBOL(__arch_copy_from_user);
 EXPORT_SYMBOL(__arch_copy_to_user);
 EXPORT_SYMBOL(__clear_user);
-EXPORT_SYMBOL(__copy_in_user);
+EXPORT_SYMBOL(raw_copy_in_user);
 
        /* physical memory */
 EXPORT_SYMBOL(memstart_addr);
index f6cc67e..2ed2a76 100644 (file)
@@ -53,6 +53,13 @@ static int cpu_enable_trap_ctr_access(void *__unused)
        .midr_range_min = min, \
        .midr_range_max = max
 
+/*
+ * Initializer fragment for an arm64_cpu_capabilities entry that applies to
+ * every variant and revision of @model: the matched MIDR range runs from 0
+ * up to all variant and revision bits set, checked per CPU through
+ * is_affected_midr_range().
+ */
+#define MIDR_ALL_VERSIONS(model) \
+       .def_scope = SCOPE_LOCAL_CPU, \
+       .matches = is_affected_midr_range, \
+       .midr_model = model, \
+       .midr_range_min = 0, \
+       .midr_range_max = (MIDR_VARIANT_MASK | MIDR_REVISION_MASK)
+
 const struct arm64_cpu_capabilities arm64_errata[] = {
 #if    defined(CONFIG_ARM64_ERRATUM_826319) || \
        defined(CONFIG_ARM64_ERRATUM_827319) || \
@@ -151,6 +158,14 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
                           MIDR_CPU_VAR_REV(0, 0)),
        },
 #endif
+#ifdef CONFIG_ARM64_ERRATUM_858921
+       {
+       /* Cortex-A73 all versions */
+               .desc = "ARM erratum 858921",
+               .capability = ARM64_WORKAROUND_858921,
+               MIDR_ALL_VERSIONS(MIDR_CORTEX_A73),
+       },
+#endif
        {
        }
 };
index abda8e8..6eb77ae 100644 (file)
@@ -1090,20 +1090,29 @@ static void __init setup_feature_capabilities(void)
  * Check if the current CPU has a given feature capability.
  * Should be called from non-preemptible context.
  */
-bool this_cpu_has_cap(unsigned int cap)
+static bool __this_cpu_has_cap(const struct arm64_cpu_capabilities *cap_array,
+                              unsigned int cap)
 {
        const struct arm64_cpu_capabilities *caps;
 
        if (WARN_ON(preemptible()))
                return false;
 
-       for (caps = arm64_features; caps->desc; caps++)
+       /*
+        * Walk @cap_array up to its empty sentinel entry.  The walk ends on
+        * a NULL ->matches rather than a NULL ->desc: an entry may have no
+        * description, and stopping there would hide every capability
+        * listed after it.
+        */
+       for (caps = cap_array; caps->matches; caps++)
                if (caps->capability == cap && caps->matches)
                        return caps->matches(caps, SCOPE_LOCAL_CPU);
 
        return false;
 }
 
+extern const struct arm64_cpu_capabilities arm64_errata[];
+
+/*
+ * Report whether the calling CPU has capability @cap, looking it up first
+ * in arm64_features and, only if that lookup reports false, in
+ * arm64_errata.
+ */
+bool this_cpu_has_cap(unsigned int cap)
+{
+       return (__this_cpu_has_cap(arm64_features, cap) ||
+               __this_cpu_has_cap(arm64_errata, cap));
+}
+
 void __init setup_cpu_features(void)
 {
        u32 cwg;
index e52be6a..1de444e 100644 (file)
@@ -505,6 +505,14 @@ static void ctr_read_handler(unsigned int esr, struct pt_regs *regs)
        regs->pc += 4;
 }
 
+/*
+ * Handle a trapped read of CNTVCT_EL0: take the destination register
+ * number from the Rt field of @esr, store the value returned by
+ * arch_counter_get_cntvct() into that register in @regs, and advance the
+ * saved pc by 4 so that execution resumes after the trapped instruction.
+ */
+static void cntvct_read_handler(unsigned int esr, struct pt_regs *regs)
+{
+       int rt = (esr & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT;
+
+       pt_regs_write_reg(regs, rt, arch_counter_get_cntvct());
+       regs->pc += 4;
+}
+
 struct sys64_hook {
        unsigned int esr_mask;
        unsigned int esr_val;
@@ -523,6 +531,12 @@ static struct sys64_hook sys64_hooks[] = {
                .esr_val = ESR_ELx_SYS64_ISS_SYS_CTR_READ,
                .handler = ctr_read_handler,
        },
+       {
+               /* Trap read access to CNTVCT_EL0 */
+               .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK,
+               .esr_val = ESR_ELx_SYS64_ISS_SYS_CNTVCT,
+               .handler = cntvct_read_handler,
+       },
        {},
 };
 
index 47184c3..b24a830 100644 (file)
        .endm
 
 end    .req    x5
-ENTRY(__copy_in_user)
+ENTRY(raw_copy_in_user)
        uaccess_enable_not_uao x3, x4
        add     end, x0, x2
 #include "copy_template.S"
        uaccess_disable_not_uao x3
        mov     x0, #0
        ret
-ENDPROC(__copy_in_user)
+ENDPROC(raw_copy_in_user)
 
        .section .fixup,"ax"
        .align  2
index 4bf899f..1b35b8b 100644 (file)
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 
-static const char *fault_name(unsigned int esr);
+/* One entry of the fault_info[] table, selected by the low bits of the ESR. */
+struct fault_info {
+       int     (*fn)(unsigned long addr, unsigned int esr,
+                     struct pt_regs *regs);    /* handler called by do_mem_abort() */
+       int     sig;            /* signal number handed to __do_user_fault() */
+       int     code;           /* signal code handed to __do_user_fault() */
+       const char *name;       /* description printed for unhandled faults */
+};
+
+static const struct fault_info fault_info[];
+
+/*
+ * Map an ESR value to its fault_info[] entry.  Only the low six bits of
+ * @esr are used, so the index is always in the range 0..63.
+ */
+static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
+{
+       return fault_info + (esr & 63);
+}
 
 #ifdef CONFIG_KPROBES
 static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr)
@@ -197,10 +210,12 @@ static void __do_user_fault(struct task_struct *tsk, unsigned long addr,
                            struct pt_regs *regs)
 {
        struct siginfo si;
+       const struct fault_info *inf;
 
        if (unhandled_signal(tsk, sig) && show_unhandled_signals_ratelimited()) {
+               inf = esr_to_fault_info(esr);
                pr_info("%s[%d]: unhandled %s (%d) at 0x%08lx, esr 0x%03x\n",
-                       tsk->comm, task_pid_nr(tsk), fault_name(esr), sig,
+                       tsk->comm, task_pid_nr(tsk), inf->name, sig,
                        addr, esr);
                show_pte(tsk->mm, addr);
                show_regs(regs);
@@ -219,14 +234,16 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
 {
        struct task_struct *tsk = current;
        struct mm_struct *mm = tsk->active_mm;
+       const struct fault_info *inf;
 
        /*
         * If we are in kernel mode at this point, we have no context to
         * handle this fault with.
         */
-       if (user_mode(regs))
-               __do_user_fault(tsk, addr, esr, SIGSEGV, SEGV_MAPERR, regs);
-       else
+       if (user_mode(regs)) {
+               inf = esr_to_fault_info(esr);
+               __do_user_fault(tsk, addr, esr, inf->sig, inf->code, regs);
+       } else
                __do_kernel_fault(mm, addr, esr, regs);
 }
 
@@ -488,12 +505,7 @@ static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
        return 1;
 }
 
-static const struct fault_info {
-       int     (*fn)(unsigned long addr, unsigned int esr, struct pt_regs *regs);
-       int     sig;
-       int     code;
-       const char *name;
-} fault_info[] = {
+static const struct fault_info fault_info[] = {
        { do_bad,               SIGBUS,  0,             "ttbr address size fault"       },
        { do_bad,               SIGBUS,  0,             "level 1 address size fault"    },
        { do_bad,               SIGBUS,  0,             "level 2 address size fault"    },
@@ -560,19 +572,13 @@ static const struct fault_info {
        { do_bad,               SIGBUS,  0,             "unknown 63"                    },
 };
 
-static const char *fault_name(unsigned int esr)
-{
-       const struct fault_info *inf = fault_info + (esr & 63);
-       return inf->name;
-}
-
 /*
  * Dispatch a data abort to the relevant handler.
  */
 asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
                                         struct pt_regs *regs)
 {
-       const struct fault_info *inf = fault_info + (esr & 63);
+       const struct fault_info *inf = esr_to_fault_info(esr);
        struct siginfo info;
 
        if (!inf->fn(addr, esr, regs))
index e25584d..7514a00 100644 (file)
@@ -294,10 +294,6 @@ static __init int setup_hugepagesz(char *opt)
                hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
        } else if (ps == PUD_SIZE) {
                hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
-       } else if (ps == (PAGE_SIZE * CONT_PTES)) {
-               hugetlb_add_hstate(CONT_PTE_SHIFT);
-       } else if (ps == (PMD_SIZE * CONT_PMDS)) {
-               hugetlb_add_hstate((PMD_SHIFT + CONT_PMD_SHIFT) - PAGE_SHIFT);
        } else {
                hugetlb_bad_size();
                pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10);
@@ -306,13 +302,3 @@ static __init int setup_hugepagesz(char *opt)
        return 1;
 }
 __setup("hugepagesz=", setup_hugepagesz);
-
-#ifdef CONFIG_ARM64_64K_PAGES
-static __init int add_default_hugepagesz(void)
-{
-       if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL)
-               hugetlb_add_hstate(CONT_PTE_SHIFT);
-       return 0;
-}
-arch_initcall(add_default_hugepagesz);
-#endif
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
deleted file mode 100644 (file)
index 7e75d45..0000000
+++ /dev/null
@@ -1,288 +0,0 @@
-config AVR32
-       def_bool y
-       # With EXPERT=n, we get lots of stuff automatically selected
-       # that we usually don't need on AVR32.
-       select EXPERT
-       select HAVE_CLK
-       select HAVE_EXIT_THREAD
-       select HAVE_OPROFILE
-       select HAVE_KPROBES
-       select VIRT_TO_BUS
-       select GENERIC_IRQ_PROBE
-       select GENERIC_ATOMIC64
-       select HARDIRQS_SW_RESEND
-       select GENERIC_IRQ_SHOW
-       select ARCH_HAVE_CUSTOM_GPIO_H
-       select ARCH_WANT_IPC_PARSE_VERSION
-       select ARCH_HAVE_NMI_SAFE_CMPXCHG
-       select GENERIC_CLOCKEVENTS
-       select HAVE_MOD_ARCH_SPECIFIC
-       select MODULES_USE_ELF_RELA
-       select HAVE_NMI
-       help
-         AVR32 is a high-performance 32-bit RISC microprocessor core,
-         designed for cost-sensitive embedded applications, with particular
-         emphasis on low power consumption and high code density.
-
-         There is an AVR32 Linux project with a web page at
-         http://avr32linux.org/.
-
-config STACKTRACE_SUPPORT
-       def_bool y
-
-config LOCKDEP_SUPPORT
-       def_bool y
-
-config TRACE_IRQFLAGS_SUPPORT
-       def_bool y
-
-config RWSEM_GENERIC_SPINLOCK
-       def_bool y
-
-config RWSEM_XCHGADD_ALGORITHM
-       def_bool n
-
-config ARCH_HAS_ILOG2_U32
-       def_bool n
-
-config ARCH_HAS_ILOG2_U64
-       def_bool n
-
-config GENERIC_HWEIGHT
-       def_bool y
-
-config GENERIC_CALIBRATE_DELAY
-       def_bool y
-
-config GENERIC_BUG
-       def_bool y
-       depends on BUG
-
-source "init/Kconfig"
-
-source "kernel/Kconfig.freezer"
-
-menu "System Type and features"
-
-config SUBARCH_AVR32B
-       bool
-config MMU
-       bool
-config PERFORMANCE_COUNTERS
-       bool
-
-config PLATFORM_AT32AP
-       bool
-       select SUBARCH_AVR32B
-       select MMU
-       select PERFORMANCE_COUNTERS
-       select GPIOLIB
-       select GENERIC_ALLOCATOR
-       select HAVE_FB_ATMEL
-
-#
-# CPU types
-#
-
-# AP7000 derivatives
-config CPU_AT32AP700X
-       bool
-       select PLATFORM_AT32AP
-config CPU_AT32AP7000
-       bool
-       select CPU_AT32AP700X
-config CPU_AT32AP7001
-       bool
-       select CPU_AT32AP700X
-config CPU_AT32AP7002
-       bool
-       select CPU_AT32AP700X
-
-# AP700X boards
-config BOARD_ATNGW100_COMMON
-       bool
-       select CPU_AT32AP7000
-
-choice
-       prompt "AVR32 board type"
-       default BOARD_ATSTK1000
-
-config BOARD_ATSTK1000
-       bool "ATSTK1000 evaluation board"
-
-config BOARD_ATNGW100_MKI
-       bool "ATNGW100 Network Gateway"
-       select BOARD_ATNGW100_COMMON
-
-config BOARD_ATNGW100_MKII
-       bool "ATNGW100 mkII Network Gateway"
-       select BOARD_ATNGW100_COMMON
-
-config BOARD_HAMMERHEAD
-       bool "Hammerhead board"
-       select CPU_AT32AP7000
-       select USB_ARCH_HAS_HCD
-       help
-         The Hammerhead platform is built around an AVR32 32-bit microcontroller from Atmel.
-         It offers versatile peripherals, such as ethernet, usb device, usb host etc.
-
-         The board also incorporates a power supply and is a Power over Ethernet (PoE) Powered
-         Device (PD).
-
-         Additionally, a Cyclone III FPGA from Altera is integrated on the board. The FPGA is
-         mapped into the 32-bit AVR memory bus. The FPGA offers two DDR2 SDRAM interfaces, which
-         will cover even the most exceptional need of memory bandwidth. Together with the onboard
-         video decoder the board is ready for video processing.
-
-         For more information see: http://www.miromico.ch/index.php/hammerhead.html 
-
-config BOARD_FAVR_32
-       bool "Favr-32 LCD-board"
-       select CPU_AT32AP7000
-
-config BOARD_MERISC
-       bool "Merisc board"
-       select CPU_AT32AP7000
-       help
-         Merisc is the family name for a range of AVR32-based boards.
-
-         The boards are designed to be used in a man-machine
-         interfacing environment, utilizing a touch-based graphical
-         user interface. They host a vast range of I/O peripherals as
-         well as a large SDRAM & Flash memory bank.
-
-         For more information see: http://www.martinsson.se/merisc
-
-config BOARD_MIMC200
-       bool "MIMC200 CPU board"
-       select CPU_AT32AP7000
-endchoice
-
-source "arch/avr32/boards/atstk1000/Kconfig"
-source "arch/avr32/boards/atngw100/Kconfig"
-source "arch/avr32/boards/hammerhead/Kconfig"
-source "arch/avr32/boards/favr-32/Kconfig"
-source "arch/avr32/boards/merisc/Kconfig"
-
-choice
-       prompt "Boot loader type"
-       default LOADER_U_BOOT
-
-config LOADER_U_BOOT
-       bool "U-Boot (or similar) bootloader"
-endchoice
-
-source "arch/avr32/mach-at32ap/Kconfig"
-
-config LOAD_ADDRESS
-       hex
-       default 0x10000000 if LOADER_U_BOOT=y && CPU_AT32AP700X=y
-
-config ENTRY_ADDRESS
-       hex
-       default 0x90000000 if LOADER_U_BOOT=y && CPU_AT32AP700X=y
-
-config PHYS_OFFSET
-       hex
-       default 0x10000000 if CPU_AT32AP700X=y
-
-source "kernel/Kconfig.preempt"
-
-config QUICKLIST
-       def_bool y
-
-config ARCH_HAVE_MEMORY_PRESENT
-       def_bool n
-
-config NEED_NODE_MEMMAP_SIZE
-       def_bool n
-
-config ARCH_FLATMEM_ENABLE
-       def_bool y
-
-config ARCH_DISCONTIGMEM_ENABLE
-       def_bool n
-
-config ARCH_SPARSEMEM_ENABLE
-       def_bool n
-
-config NODES_SHIFT
-       int
-       default "2"
-       depends on NEED_MULTIPLE_NODES
-
-source "mm/Kconfig"
-
-config OWNERSHIP_TRACE
-       bool "Ownership trace support"
-       default y
-       help
-         Say Y to generate an Ownership Trace message on every context switch,
-         enabling Nexus-compliant debuggers to keep track of the PID of the
-         currently executing task.
-
-config NMI_DEBUGGING
-       bool "NMI Debugging"
-       default n
-       help
-         Say Y here and pass the nmi_debug command-line parameter to
-         the kernel to turn on NMI debugging. Depending on the value
-         of the nmi_debug option, various pieces of information will
-         be dumped to the console when a Non-Maskable Interrupt
-         happens.
-
-# FPU emulation goes here
-
-source "kernel/Kconfig.hz"
-
-config CMDLINE
-       string "Default kernel command line"
-       default ""
-       help
-         If you don't have a boot loader capable of passing a command line string
-         to the kernel, you may specify one here. As a minimum, you should specify
-         the memory size and the root device (e.g., mem=8M, root=/dev/nfs).
-
-endmenu
-
-menu "Power management options"
-
-source "kernel/power/Kconfig"
-
-config ARCH_SUSPEND_POSSIBLE
-       def_bool y
-
-menu "CPU Frequency scaling"
-source "drivers/cpufreq/Kconfig"
-endmenu
-
-endmenu
-
-menu "Bus options"
-
-config PCI
-       bool
-
-source "drivers/pci/Kconfig"
-
-source "drivers/pcmcia/Kconfig"
-
-endmenu
-
-menu "Executable file formats"
-source "fs/Kconfig.binfmt"
-endmenu
-
-source "net/Kconfig"
-
-source "drivers/Kconfig"
-
-source "fs/Kconfig"
-
-source "arch/avr32/Kconfig.debug"
-
-source "security/Kconfig"
-
-source "crypto/Kconfig"
-
-source "lib/Kconfig"
diff --git a/arch/avr32/Kconfig.debug b/arch/avr32/Kconfig.debug
deleted file mode 100644 (file)
index 2283933..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-menu "Kernel hacking"
-
-config TRACE_IRQFLAGS_SUPPORT
-       bool
-       default y
-
-source "lib/Kconfig.debug"
-
-endmenu
diff --git a/arch/avr32/Makefile b/arch/avr32/Makefile
deleted file mode 100644 (file)
index dba48a5..0000000
+++ /dev/null
@@ -1,84 +0,0 @@
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 2004-2006 Atmel Corporation.
-
-# Default target when executing plain make
-.PHONY: all
-all: uImage vmlinux.elf
-
-KBUILD_DEFCONFIG       := atstk1002_defconfig
-
-KBUILD_CFLAGS  += -pipe -fno-builtin -mno-pic -D__linux__
-KBUILD_AFLAGS  += -mrelax -mno-pic
-KBUILD_CFLAGS_MODULE += -mno-relax
-LDFLAGS_vmlinux        += --relax
-
-cpuflags-$(CONFIG_PLATFORM_AT32AP)     += -march=ap
-
-KBUILD_CFLAGS  += $(cpuflags-y)
-KBUILD_AFLAGS  += $(cpuflags-y)
-
-CHECKFLAGS     += -D__avr32__ -D__BIG_ENDIAN
-
-machine-$(CONFIG_PLATFORM_AT32AP) := at32ap
-machdirs       := $(patsubst %,arch/avr32/mach-%/, $(machine-y))
-
-KBUILD_CPPFLAGS        += $(patsubst %,-I$(srctree)/%include,$(machdirs))
-
-head-$(CONFIG_LOADER_U_BOOT)           += arch/avr32/boot/u-boot/head.o
-head-y                                 += arch/avr32/kernel/head.o
-core-y                                 += $(machdirs)
-core-$(CONFIG_BOARD_ATSTK1000)         += arch/avr32/boards/atstk1000/
-core-$(CONFIG_BOARD_ATNGW100_COMMON)   += arch/avr32/boards/atngw100/
-core-$(CONFIG_BOARD_HAMMERHEAD)                += arch/avr32/boards/hammerhead/
-core-$(CONFIG_BOARD_FAVR_32)           += arch/avr32/boards/favr-32/
-core-$(CONFIG_BOARD_MERISC)            += arch/avr32/boards/merisc/
-core-$(CONFIG_BOARD_MIMC200)           += arch/avr32/boards/mimc200/
-core-$(CONFIG_LOADER_U_BOOT)           += arch/avr32/boot/u-boot/
-core-y                                 += arch/avr32/kernel/
-core-y                                 += arch/avr32/mm/
-drivers-$(CONFIG_OPROFILE)             += arch/avr32/oprofile/
-libs-y                                 += arch/avr32/lib/
-
-BOOT_TARGETS := vmlinux.elf vmlinux.bin uImage uImage.srec
-
-.PHONY: $(BOOT_TARGETS) install
-
-boot := arch/$(ARCH)/boot/images
-
-             KBUILD_IMAGE := $(boot)/uImage
-vmlinux.elf: KBUILD_IMAGE := $(boot)/vmlinux.elf
-vmlinux.cso: KBUILD_IMAGE := $(boot)/vmlinux.cso
-uImage.srec: KBUILD_IMAGE := $(boot)/uImage.srec
-uImage:      KBUILD_IMAGE := $(boot)/uImage
-
-quiet_cmd_listing = LST     $@
-      cmd_listing = avr32-linux-objdump $(OBJDUMPFLAGS) -lS $< > $@
-quiet_cmd_disasm  = DIS     $@
-      cmd_disasm  = avr32-linux-objdump $(OBJDUMPFLAGS) -d $< > $@
-
-vmlinux.elf vmlinux.bin uImage.srec uImage vmlinux.cso: vmlinux
-       $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
-
-install: vmlinux
-       $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@
-
-vmlinux.s: vmlinux
-       $(call if_changed,disasm)
-
-vmlinux.lst: vmlinux
-       $(call if_changed,listing)
-
-CLEAN_FILES += vmlinux.s vmlinux.lst
-
-archclean:
-       $(Q)$(MAKE) $(clean)=$(boot)
-
-define archhelp
-  @echo '* vmlinux.elf         - ELF image with load address 0'
-  @echo '  vmlinux.cso         - PathFinder CSO image'
-  @echo '* uImage              - Create a bootable image for U-Boot'
-endef
diff --git a/arch/avr32/boards/atngw100/Kconfig b/arch/avr32/boards/atngw100/Kconfig
deleted file mode 100644 (file)
index 4e55617..0000000
+++ /dev/null
@@ -1,65 +0,0 @@
-# NGW100 customization
-
-if BOARD_ATNGW100_COMMON
-
-config BOARD_ATNGW100_MKII_LCD
-       bool "Enable ATNGW100 mkII LCD interface"
-       depends on BOARD_ATNGW100_MKII
-       help
-         This enables the LCD controller (LCDC) in the AT32AP7000. Since the
-         LCDC is multiplexed with MACB1 (LAN) Ethernet port, only one can be
-         enabled at a time.
-
-         This choice enables the LCDC and disables the MACB1 interface marked
-         LAN on the PCB.
-
-choice
-       prompt "Select an NGW100 add-on board to support"
-       default BOARD_ATNGW100_ADDON_NONE
-
-config BOARD_ATNGW100_ADDON_NONE
-       bool "None"
-
-config BOARD_ATNGW100_EVKLCD10X
-       bool "EVKLCD10X addon board"
-       depends on BOARD_ATNGW100_MKI || BOARD_ATNGW100_MKII_LCD
-       help
-         This enables support for the EVKLCD100 (QVGA) or EVKLCD101 (VGA)
-         addon board for the NGW100 and NGW100 mkII. By enabling this the LCD
-         controller and AC97 controller is added as platform devices.
-
-config BOARD_ATNGW100_MRMT
-       bool "Mediama RMT1/2 add-on board"
-       help
-         This enables support for the Mediama RMT1 or RMT2 board.
-         RMT provides LCD support, AC97 codec and other
-         optional peripherals to the Atmel NGW100.
-
-         This choice disables the detect pin and the write-protect pin for the
-         MCI platform device, since it conflicts with the LCD platform device.
-         The MCI pins can be reenabled by editing the "add device function" but
-         this may break the setup for other displays that use these pins.
-
-endchoice
-
-choice
-       prompt "LCD panel resolution on EVKLCD10X"
-       depends on BOARD_ATNGW100_EVKLCD10X
-       default BOARD_ATNGW100_EVKLCD10X_VGA
-
-config BOARD_ATNGW100_EVKLCD10X_QVGA
-       bool "QVGA (320x240)"
-
-config BOARD_ATNGW100_EVKLCD10X_VGA
-       bool "VGA (640x480)"
-
-config BOARD_ATNGW100_EVKLCD10X_POW_QVGA
-       bool "Powertip QVGA (320x240)"
-
-endchoice
-
-if BOARD_ATNGW100_MRMT
-source "arch/avr32/boards/atngw100/Kconfig_mrmt"
-endif
-
-endif  # BOARD_ATNGW100_COMMON
diff --git a/arch/avr32/boards/atngw100/Kconfig_mrmt b/arch/avr32/boards/atngw100/Kconfig_mrmt
deleted file mode 100644 (file)
index 9a199a2..0000000
+++ /dev/null
@@ -1,80 +0,0 @@
-# RMT for NGW100 customization
-
-choice
-       prompt "RMT Version"
-       help
-         Select the RMTx board version.
-
-config BOARD_MRMT_REV1
-       bool "RMT1"
-config BOARD_MRMT_REV2
-       bool "RMT2"
-
-endchoice
-
-config BOARD_MRMT_AC97
-       bool "Enable AC97 CODEC"
-       help
-         Enable the UCB1400 AC97 CODEC driver.
-
-choice
-       prompt "Touchscreen Driver"
-       default BOARD_MRMT_ADS7846_TS
-
-config BOARD_MRMT_UCB1400_TS
-       bool "Use UCB1400 Touchscreen"
-
-config BOARD_MRMT_ADS7846_TS
-       bool "Use ADS7846 Touchscreen"
-
-endchoice
-
-choice
-       prompt "RMTx LCD Selection"
-       default BOARD_MRMT_LCD_DISABLE
-
-config BOARD_MRMT_LCD_DISABLE
-       bool "LCD Disabled"
-
-config BOARD_MRMT_LCD_LQ043T3DX0X
-       bool "Sharp LQ043T3DX0x or compatible"
-       help
-         If using RMT2, be sure to load the resistor pack selectors accordingly
-
-if BOARD_MRMT_REV2
-config BOARD_MRMT_LCD_KWH043GM08
-       bool "Formike KWH043GM08 or compatible"
-       help
-         Be sure to load the RMT2 resistor pack selectors accordingly
-endif
-
-endchoice
-
-if !BOARD_MRMT_LCD_DISABLE
-config BOARD_MRMT_BL_PWM
-       bool "Use PWM control for LCD Backlight"
-       help
-               Use PWM driver for controlling LCD Backlight.
-               Otherwise, LCD Backlight is always on.
-endif
-
-config BOARD_MRMT_RTC_I2C
-       bool "Use External RTC on I2C Bus"
-       help
-               RMT1 has an optional RTC device on the I2C bus.
-               It is a SII S35390A.  Be sure to select the
-               matching RTC driver.
-
-choice
-       prompt "Wireless Module on ttyS2"
-       default BOARD_MRMT_WIRELESS_ZB
-
-config BOARD_MRMT_WIRELESS_ZB
-       bool "Use ZigBee/802.15.4 Module"
-
-config BOARD_MRMT_WIRELESS_BT
-       bool "Use Bluetooth (HCI) Module"
-
-config BOARD_MRMT_WIRELESS_NONE
-       bool "Not Installed"
-endchoice
diff --git a/arch/avr32/boards/atngw100/Makefile b/arch/avr32/boards/atngw100/Makefile
deleted file mode 100644 (file)
index f4ebe42..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-obj-y                                  += setup.o flash.o
-obj-$(CONFIG_BOARD_ATNGW100_EVKLCD10X) += evklcd10x.o
-obj-$(CONFIG_BOARD_ATNGW100_MRMT)      += mrmt.o
diff --git a/arch/avr32/boards/atngw100/evklcd10x.c b/arch/avr32/boards/atngw100/evklcd10x.c
deleted file mode 100644 (file)
index 64919b0..0000000
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * Board-specific setup code for the ATEVKLCD10X addon board to the ATNGW100
- * Network Gateway
- *
- * Copyright (C) 2008 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- */
-
-#include <linux/init.h>
-#include <linux/linkage.h>
-#include <linux/gpio.h>
-#include <linux/fb.h>
-#include <linux/platform_device.h>
-
-#include <video/atmel_lcdc.h>
-
-#include <asm/setup.h>
-
-#include <mach/at32ap700x.h>
-#include <mach/portmux.h>
-#include <mach/board.h>
-
-#include <sound/atmel-ac97c.h>
-
-static struct ac97c_platform_data __initdata ac97c0_data = {
-       .reset_pin = GPIO_PIN_PB(19),
-};
-
-#ifdef CONFIG_BOARD_ATNGW100_EVKLCD10X_VGA
-static struct fb_videomode __initdata tcg057vglad_modes[] = {
-       {
-               .name           = "640x480 @ 50",
-               .refresh        = 50,
-               .xres           = 640,          .yres           = 480,
-               .pixclock       = KHZ2PICOS(25180),
-
-               .left_margin    = 64,           .right_margin   = 96,
-               .upper_margin   = 34,           .lower_margin   = 11,
-               .hsync_len      = 64,           .vsync_len      = 15,
-
-               .sync           = 0,
-               .vmode          = FB_VMODE_NONINTERLACED,
-       },
-};
-
-static struct fb_monspecs __initdata atevklcd10x_default_monspecs = {
-       .manufacturer           = "KYO",
-       .monitor                = "TCG057VGLAD",
-       .modedb                 = tcg057vglad_modes,
-       .modedb_len             = ARRAY_SIZE(tcg057vglad_modes),
-       .hfmin                  = 19948,
-       .hfmax                  = 31478,
-       .vfmin                  = 50,
-       .vfmax                  = 67,
-       .dclkmax                = 28330000,
-};
-
-static struct atmel_lcdfb_pdata __initdata atevklcd10x_lcdc_data = {
-       .default_bpp            = 16,
-       .default_dmacon         = ATMEL_LCDC_DMAEN | ATMEL_LCDC_DMA2DEN,
-       .default_lcdcon2        = (ATMEL_LCDC_DISTYPE_TFT
-                                  | ATMEL_LCDC_CLKMOD_ALWAYSACTIVE
-                                  | ATMEL_LCDC_MEMOR_BIG),
-       .default_monspecs       = &atevklcd10x_default_monspecs,
-       .guard_time             = 2,
-};
-#elif CONFIG_BOARD_ATNGW100_EVKLCD10X_QVGA
-static struct fb_videomode __initdata tcg057qvlad_modes[] = {
-       {
-               .name           = "320x240 @ 50",
-               .refresh        = 50,
-               .xres           = 320,          .yres           = 240,
-               .pixclock       = KHZ2PICOS(6300),
-
-               .left_margin    = 34,           .right_margin   = 46,
-               .upper_margin   = 7,            .lower_margin   = 15,
-               .hsync_len      = 64,           .vsync_len      = 12,
-
-               .sync           = 0,
-               .vmode          = FB_VMODE_NONINTERLACED,
-       },
-};
-
-static struct fb_monspecs __initdata atevklcd10x_default_monspecs = {
-       .manufacturer           = "KYO",
-       .monitor                = "TCG057QVLAD",
-       .modedb                 = tcg057qvlad_modes,
-       .modedb_len             = ARRAY_SIZE(tcg057qvlad_modes),
-       .hfmin                  = 19948,
-       .hfmax                  = 31478,
-       .vfmin                  = 50,
-       .vfmax                  = 67,
-       .dclkmax                = 7000000,
-};
-
-static struct atmel_lcdfb_pdata __initdata atevklcd10x_lcdc_data = {
-       .default_bpp            = 16,
-       .default_dmacon         = ATMEL_LCDC_DMAEN | ATMEL_LCDC_DMA2DEN,
-       .default_lcdcon2        = (ATMEL_LCDC_DISTYPE_TFT
-                                  | ATMEL_LCDC_CLKMOD_ALWAYSACTIVE
-                                  | ATMEL_LCDC_MEMOR_BIG),
-       .default_monspecs       = &atevklcd10x_default_monspecs,
-       .guard_time             = 2,
-};
-#elif CONFIG_BOARD_ATNGW100_EVKLCD10X_POW_QVGA
-static struct fb_videomode __initdata ph320240t_modes[] = {
-       {
-               .name           = "320x240 @ 60",
-               .refresh        = 60,
-               .xres           = 320,          .yres           = 240,
-               .pixclock       = KHZ2PICOS(6300),
-
-               .left_margin    = 38,           .right_margin   = 20,
-               .upper_margin   = 15,           .lower_margin   = 5,
-               .hsync_len      = 30,           .vsync_len      = 3,
-
-               .sync           = 0,
-               .vmode          = FB_VMODE_NONINTERLACED,
-       },
-};
-
-static struct fb_monspecs __initdata atevklcd10x_default_monspecs = {
-       .manufacturer           = "POW",
-       .monitor                = "PH320240T",
-       .modedb                 = ph320240t_modes,
-       .modedb_len             = ARRAY_SIZE(ph320240t_modes),
-       .hfmin                  = 14400,
-       .hfmax                  = 21600,
-       .vfmin                  = 50,
-       .vfmax                  = 90,
-       .dclkmax                = 6400000,
-};
-
-static struct atmel_lcdfb_pdata __initdata atevklcd10x_lcdc_data = {
-       .default_bpp            = 16,
-       .default_dmacon         = ATMEL_LCDC_DMAEN | ATMEL_LCDC_DMA2DEN,
-       .default_lcdcon2        = (ATMEL_LCDC_DISTYPE_TFT
-                                  | ATMEL_LCDC_CLKMOD_ALWAYSACTIVE
-                                  | ATMEL_LCDC_MEMOR_BIG),
-       .default_monspecs       = &atevklcd10x_default_monspecs,
-       .guard_time             = 2,
-};
-#endif
-
-static void atevklcd10x_lcdc_power_control(struct atmel_lcdfb_pdata *pdata, int on)
-{
-       gpio_set_value(GPIO_PIN_PB(15), on);
-}
-
-static int __init atevklcd10x_init(void)
-{
-       /* PB15 is connected to the enable line on the boost regulator
-        * controlling the backlight for the LCD panel.
-        */
-       at32_select_gpio(GPIO_PIN_PB(15), AT32_GPIOF_OUTPUT);
-       gpio_request(GPIO_PIN_PB(15), "backlight");
-       gpio_direction_output(GPIO_PIN_PB(15), 0);
-
-       atevklcd10x_lcdc_data.atmel_lcdfb_power_control =
-               atevklcd10x_lcdc_power_control;
-
-       at32_add_device_lcdc(0, &atevklcd10x_lcdc_data,
-                       fbmem_start, fbmem_size,
-#ifdef CONFIG_BOARD_ATNGW100_MKII
-                       ATMEL_LCDC_PRI_18BIT | ATMEL_LCDC_PC_DVAL
-#else
-                       ATMEL_LCDC_ALT_18BIT | ATMEL_LCDC_PE_DVAL
-#endif
-                       );
-
-       at32_add_device_ac97c(0, &ac97c0_data, AC97C_BOTH);
-
-       return 0;
-}
-postcore_initcall(atevklcd10x_init);
diff --git a/arch/avr32/boards/atngw100/flash.c b/arch/avr32/boards/atngw100/flash.c
deleted file mode 100644 (file)
index 55ccc9c..0000000
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * ATNGW100 board-specific flash initialization
- *
- * Copyright (C) 2005-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/partitions.h>
-#include <linux/mtd/physmap.h>
-
-#include <mach/smc.h>
-
-static struct smc_timing flash_timing __initdata = {
-       .ncs_read_setup         = 0,
-       .nrd_setup              = 40,
-       .ncs_write_setup        = 0,
-       .nwe_setup              = 10,
-
-       .ncs_read_pulse         = 80,
-       .nrd_pulse              = 40,
-       .ncs_write_pulse        = 65,
-       .nwe_pulse              = 55,
-
-       .read_cycle             = 120,
-       .write_cycle            = 120,
-};
-
-static struct smc_config flash_config __initdata = {
-       .bus_width              = 2,
-       .nrd_controlled         = 1,
-       .nwe_controlled         = 1,
-       .byte_write             = 1,
-};
-
-static struct mtd_partition flash_parts[] = {
-       {
-               .name           = "u-boot",
-               .offset         = 0x00000000,
-               .size           = 0x00020000,           /* 128 KiB */
-               .mask_flags     = MTD_WRITEABLE,
-       },
-       {
-               .name           = "root",
-               .offset         = 0x00020000,
-               .size           = 0x007d0000,
-       },
-       {
-               .name           = "env",
-               .offset         = 0x007f0000,
-               .size           = 0x00010000,
-               .mask_flags     = MTD_WRITEABLE,
-       },
-};
-
-static struct physmap_flash_data flash_data = {
-       .width          = 2,
-       .nr_parts       = ARRAY_SIZE(flash_parts),
-       .parts          = flash_parts,
-};
-
-static struct resource flash_resource = {
-       .start          = 0x00000000,
-       .end            = 0x007fffff,
-       .flags          = IORESOURCE_MEM,
-};
-
-static struct platform_device flash_device = {
-       .name           = "physmap-flash",
-       .id             = 0,
-       .resource       = &flash_resource,
-       .num_resources  = 1,
-       .dev            = {
-               .platform_data = &flash_data,
-       },
-};
-
-/* This needs to be called after the SMC has been initialized */
-static int __init atngw100_flash_init(void)
-{
-       int ret;
-
-       smc_set_timing(&flash_config, &flash_timing);
-       ret = smc_set_configuration(0, &flash_config);
-       if (ret < 0) {
-               printk(KERN_ERR "atngw100: failed to set NOR flash timing\n");
-               return ret;
-       }
-
-       platform_device_register(&flash_device);
-
-       return 0;
-}
-device_initcall(atngw100_flash_init);
diff --git a/arch/avr32/boards/atngw100/mrmt.c b/arch/avr32/boards/atngw100/mrmt.c
deleted file mode 100644 (file)
index 99b0a79..0000000
+++ /dev/null
@@ -1,382 +0,0 @@
-/*
- * Board-specific setup code for Remote Media Terminal 1 (RMT1)
- * add-on board for the ATNGW100 Network Gateway
- *
- * Copyright (C) 2008 Mediama Technologies
- * Based on ATNGW100 Network Gateway (Copyright (C) Atmel)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/gpio.h>
-#include <linux/init.h>
-#include <linux/irq.h>
-#include <linux/linkage.h>
-#include <linux/platform_device.h>
-#include <linux/types.h>
-#include <linux/fb.h>
-#include <linux/leds.h>
-#include <linux/pwm.h>
-#include <linux/leds_pwm.h>
-#include <linux/input.h>
-#include <linux/gpio_keys.h>
-#include <linux/spi/spi.h>
-#include <linux/spi/ads7846.h>
-
-#include <video/atmel_lcdc.h>
-#include <sound/atmel-ac97c.h>
-
-#include <asm/delay.h>
-#include <asm/io.h>
-#include <asm/setup.h>
-
-#include <mach/at32ap700x.h>
-#include <mach/board.h>
-#include <mach/init.h>
-#include <mach/portmux.h>
-
-/* Define board-specifoic GPIO assignments */
-#define PIN_LCD_BL     GPIO_PIN_PA(28)
-#define PWM_CH_BL      0       /* Must match with GPIO pin definition */
-#define PIN_LCD_DISP   GPIO_PIN_PA(31)
-#define        PIN_AC97_RST_N  GPIO_PIN_PA(30)
-#define PB_EXTINT_BASE 25
-#define TS_IRQ         0
-#define PIN_TS_EXTINT  GPIO_PIN_PB(PB_EXTINT_BASE+TS_IRQ)
-#define PIN_PB_LEFT    GPIO_PIN_PB(11)
-#define PIN_PB_RIGHT   GPIO_PIN_PB(12)
-#define PIN_PWR_SW_N   GPIO_PIN_PB(14)
-#define PIN_PWR_ON     GPIO_PIN_PB(13)
-#define PIN_ZB_RST_N   GPIO_PIN_PA(21)
-#define PIN_BT_RST     GPIO_PIN_PA(22)
-#define PIN_LED_SYS    GPIO_PIN_PA(16)
-#define PIN_LED_A      GPIO_PIN_PA(19)
-#define PIN_LED_B      GPIO_PIN_PE(19)
-
-#ifdef CONFIG_BOARD_MRMT_LCD_LQ043T3DX0X
-/* Sharp LQ043T3DX0x (or compatible) panel */
-static struct fb_videomode __initdata lcd_fb_modes[] = {
-       {
-               .name           = "480x272 @ 59.94Hz",
-               .refresh        = 59.94,
-               .xres           = 480,          .yres           = 272,
-               .pixclock       = KHZ2PICOS(9000),
-
-               .left_margin    = 2,            .right_margin   = 2,
-               .upper_margin   = 3,            .lower_margin   = 9,
-               .hsync_len      = 41,           .vsync_len      = 1,
-
-               .sync           = 0,
-               .vmode          = FB_VMODE_NONINTERLACED,
-       },
-};
-
-static struct fb_monspecs __initdata lcd_fb_default_monspecs = {
-       .manufacturer           = "SHA",
-       .monitor                = "LQ043T3DX02",
-       .modedb                 = lcd_fb_modes,
-       .modedb_len             = ARRAY_SIZE(lcd_fb_modes),
-       .hfmin                  = 14915,
-       .hfmax                  = 17638,
-       .vfmin                  = 53,
-       .vfmax                  = 61,
-       .dclkmax                = 9260000,
-};
-
-static struct atmel_lcdfb_pdata __initdata rmt_lcdc_data = {
-       .default_bpp            = 24,
-       .default_dmacon         = ATMEL_LCDC_DMAEN | ATMEL_LCDC_DMA2DEN,
-       .default_lcdcon2        = (ATMEL_LCDC_DISTYPE_TFT
-                                  | ATMEL_LCDC_CLKMOD_ALWAYSACTIVE
-                                  | ATMEL_LCDC_INVCLK_NORMAL
-                                  | ATMEL_LCDC_MEMOR_BIG),
-       .lcd_wiring_mode        = ATMEL_LCDC_WIRING_RGB,
-       .default_monspecs       = &lcd_fb_default_monspecs,
-       .guard_time             = 2,
-};
-#endif
-
-#ifdef CONFIG_BOARD_MRMT_LCD_KWH043GM08
-/* Sharp KWH043GM08-Fxx (or compatible) panel */
-static struct fb_videomode __initdata lcd_fb_modes[] = {
-       {
-               .name           = "480x272 @ 59.94Hz",
-               .refresh        = 59.94,
-               .xres           = 480,          .yres           = 272,
-               .pixclock       = KHZ2PICOS(9000),
-
-               .left_margin    = 2,            .right_margin   = 2,
-               .upper_margin   = 3,            .lower_margin   = 9,
-               .hsync_len      = 41,           .vsync_len      = 1,
-
-               .sync           = 0,
-               .vmode          = FB_VMODE_NONINTERLACED,
-       },
-};
-
-static struct fb_monspecs __initdata lcd_fb_default_monspecs = {
-       .manufacturer           = "FOR",
-       .monitor                = "KWH043GM08",
-       .modedb                 = lcd_fb_modes,
-       .modedb_len             = ARRAY_SIZE(lcd_fb_modes),
-       .hfmin                  = 14915,
-       .hfmax                  = 17638,
-       .vfmin                  = 53,
-       .vfmax                  = 61,
-       .dclkmax                = 9260000,
-};
-
-static struct atmel_lcdfb_pdata __initdata rmt_lcdc_data = {
-       .default_bpp            = 24,
-       .default_dmacon         = ATMEL_LCDC_DMAEN | ATMEL_LCDC_DMA2DEN,
-       .default_lcdcon2        = (ATMEL_LCDC_DISTYPE_TFT
-                                  | ATMEL_LCDC_CLKMOD_ALWAYSACTIVE
-                                  | ATMEL_LCDC_INVCLK_INVERTED
-                                  | ATMEL_LCDC_MEMOR_BIG),
-       .lcd_wiring_mode        = ATMEL_LCDC_WIRING_RGB,
-       .default_monspecs       = &lcd_fb_default_monspecs,
-       .guard_time             = 2,
-};
-#endif
-
-#ifdef CONFIG_BOARD_MRMT_AC97
-static struct ac97c_platform_data __initdata ac97c0_data = {
-       .reset_pin              = PIN_AC97_RST_N,
-};
-#endif
-
-#ifdef CONFIG_BOARD_MRMT_UCB1400_TS
-/* NOTE: IRQ assignment relies on kernel module parameter */
-static struct platform_device rmt_ts_device = {
-       .name   = "ucb1400_ts",
-       .id     = -1,
-};
-#endif
-
-#ifdef CONFIG_BOARD_MRMT_BL_PWM
-/* PWM LEDs: LCD Backlight, etc */
-static struct pwm_lookup pwm_lookup[] = {
-       PWM_LOOKUP("at91sam9rl-pwm", PWM_CH_BL, "leds_pwm", "ds1",
-                  5000, PWM_POLARITY_INVERSED),
-};
-
-static struct led_pwm pwm_leds[] = {
-       {
-               .name = "backlight",
-               .max_brightness = 255,
-       },
-};
-
-static struct led_pwm_platform_data pwm_data = {
-       .num_leds       = ARRAY_SIZE(pwm_leds),
-       .leds           = pwm_leds,
-};
-
-static struct platform_device leds_pwm = {
-       .name   = "leds_pwm",
-       .id     = -1,
-       .dev    = {
-               .platform_data = &pwm_data,
-       },
-};
-#endif
-
-#ifdef CONFIG_BOARD_MRMT_ADS7846_TS
-static int ads7846_pendown_state(void)
-{
-       return !gpio_get_value( PIN_TS_EXTINT );        /* PENIRQ.*/
-}
-
-static struct ads7846_platform_data ads_info = {
-       .model                          = 7846,
-       .keep_vref_on                   = 0,    /* Use external VREF pin */
-       .vref_delay_usecs               = 0,
-       .vref_mv                        = 3300, /* VREF = 3.3V */
-       .settle_delay_usecs             = 800,
-       .penirq_recheck_delay_usecs     = 800,
-       .x_plate_ohms                   = 750,
-       .y_plate_ohms                   = 300,
-       .pressure_max                   = 4096,
-       .debounce_max                   = 1,
-       .debounce_rep                   = 0,
-       .debounce_tol                   = (~0),
-       .get_pendown_state              = ads7846_pendown_state,
-       .filter                         = NULL,
-       .filter_init                    = NULL,
-};
-
-static struct spi_board_info spi01_board_info[] __initdata = {
-       {
-               .modalias       = "ads7846",
-               .max_speed_hz   = 31250*26,
-               .bus_num        = 0,
-               .chip_select    = 1,
-               .platform_data  = &ads_info,
-               .irq            = AT32_EXTINT(TS_IRQ),
-       },
-};
-#endif
-
-/* GPIO Keys: left, right, power, etc */
-static const struct gpio_keys_button rmt_gpio_keys_buttons[] = {
-       [0] = {
-               .type           = EV_KEY,
-               .code           = KEY_POWER,
-               .gpio           = PIN_PWR_SW_N,
-               .active_low     = 1,
-               .desc           = "power button",
-       },
-       [1] = {
-               .type           = EV_KEY,
-               .code           = KEY_LEFT,
-               .gpio           = PIN_PB_LEFT,
-               .active_low     = 1,
-               .desc           = "left button",
-       },
-       [2] = {
-               .type           = EV_KEY,
-               .code           = KEY_RIGHT,
-               .gpio           = PIN_PB_RIGHT,
-               .active_low     = 1,
-               .desc           = "right button",
-       },
-};
-
-static const struct gpio_keys_platform_data rmt_gpio_keys_data = {
-       .nbuttons =     ARRAY_SIZE(rmt_gpio_keys_buttons),
-       .buttons =      (void *) rmt_gpio_keys_buttons,
-};
-
-static struct platform_device rmt_gpio_keys = {
-       .name =         "gpio-keys",
-       .id =           -1,
-       .dev = {
-               .platform_data = (void *) &rmt_gpio_keys_data,
-       }
-};
-
-#ifdef CONFIG_BOARD_MRMT_RTC_I2C
-static struct i2c_board_info __initdata mrmt1_i2c_rtc = {
-       I2C_BOARD_INFO("s35390a", 0x30),
-       .irq            = 0,
-};
-#endif
-
-static void mrmt_power_off(void)
-{
-       /* PWR_ON=0 will force power off */
-       gpio_set_value( PIN_PWR_ON, 0 );
-}
-
-static int __init mrmt1_init(void)
-{
-       gpio_set_value( PIN_PWR_ON, 1 );        /* Ensure PWR_ON is enabled */
-
-       pm_power_off = mrmt_power_off;
-
-       /* Setup USARTS (other than console) */
-       at32_map_usart(2, 1, 0);        /* USART 2: /dev/ttyS1, RMT1:DB9M */
-       at32_map_usart(3, 2, ATMEL_USART_RTS | ATMEL_USART_CTS);
-                       /* USART 3: /dev/ttyS2, RMT1:Wireless, w/ RTS/CTS */
-       at32_add_device_usart(1);
-       at32_add_device_usart(2);
-
-       /* Select GPIO Key pins */
-       at32_select_gpio( PIN_PWR_SW_N, AT32_GPIOF_DEGLITCH);
-       at32_select_gpio( PIN_PB_LEFT, AT32_GPIOF_DEGLITCH);
-       at32_select_gpio( PIN_PB_RIGHT, AT32_GPIOF_DEGLITCH);
-       platform_device_register(&rmt_gpio_keys);
-
-#ifdef CONFIG_BOARD_MRMT_RTC_I2C
-       i2c_register_board_info(0, &mrmt1_i2c_rtc, 1);
-#endif
-
-#ifndef CONFIG_BOARD_MRMT_LCD_DISABLE
-       /* User "alternate" LCDC inferface on Port E & D */
-       /* NB: exclude LCDC_CC pin, as NGW100 reserves it for other use */
-       at32_add_device_lcdc(0, &rmt_lcdc_data,
-               fbmem_start, fbmem_size,
-               (ATMEL_LCDC_ALT_24BIT | ATMEL_LCDC_PE_DVAL ) );
-#endif
-
-#ifdef CONFIG_BOARD_MRMT_AC97
-       at32_add_device_ac97c(0, &ac97c0_data, AC97C_BOTH);
-#endif
-
-#ifdef CONFIG_BOARD_MRMT_ADS7846_TS
-       /* Select the Touchscreen interrupt pin mode */
-       at32_select_periph( GPIO_PIOB_BASE, 1 << (PB_EXTINT_BASE+TS_IRQ),
-                       GPIO_PERIPH_A, AT32_GPIOF_DEGLITCH);
-       irq_set_irq_type(AT32_EXTINT(TS_IRQ), IRQ_TYPE_EDGE_FALLING);
-       at32_spi_setup_slaves(0,spi01_board_info,ARRAY_SIZE(spi01_board_info));
-       spi_register_board_info(spi01_board_info,ARRAY_SIZE(spi01_board_info));
-#endif
-
-#ifdef CONFIG_BOARD_MRMT_UCB1400_TS
-       /* Select the Touchscreen interrupt pin mode */
-       at32_select_periph( GPIO_PIOB_BASE, 1 << (PB_EXTINT_BASE+TS_IRQ),
-                       GPIO_PERIPH_A, AT32_GPIOF_DEGLITCH);
-       platform_device_register(&rmt_ts_device);
-#endif
-
-       at32_select_gpio( PIN_LCD_DISP, AT32_GPIOF_OUTPUT );
-       gpio_request( PIN_LCD_DISP, "LCD_DISP" );
-       gpio_direction_output( PIN_LCD_DISP, 0 );       /* LCD DISP */
-#ifdef CONFIG_BOARD_MRMT_LCD_DISABLE
-       /* Keep Backlight and DISP off */
-       at32_select_gpio( PIN_LCD_BL, AT32_GPIOF_OUTPUT );
-       gpio_request( PIN_LCD_BL, "LCD_BL" );
-       gpio_direction_output( PIN_LCD_BL, 0 );         /* Backlight */
-#else
-       gpio_set_value( PIN_LCD_DISP, 1 );      /* DISP asserted first */
-#ifdef CONFIG_BOARD_MRMT_BL_PWM
-       /* Use PWM for Backlight controls */
-       at32_add_device_pwm(1 << PWM_CH_BL);
-       pwm_add_table(pwm_lookup, ARRAY_SIZE(pwm_lookup));
-       platform_device_register(&leds_pwm);
-#else
-       /* Backlight always on */
-       udelay( 1 );
-       at32_select_gpio( PIN_LCD_BL, AT32_GPIOF_OUTPUT );
-       gpio_request( PIN_LCD_BL, "LCD_BL" );
-       gpio_direction_output( PIN_LCD_BL, 1 );
-#endif
-#endif
-
-       /* Make sure BT and Zigbee modules in reset */
-       at32_select_gpio( PIN_BT_RST, AT32_GPIOF_OUTPUT );
-       gpio_request( PIN_BT_RST, "BT_RST" );
-       gpio_direction_output( PIN_BT_RST, 1 );
-       /* BT Module in Reset */
-
-       at32_select_gpio( PIN_ZB_RST_N, AT32_GPIOF_OUTPUT );
-       gpio_request( PIN_ZB_RST_N, "ZB_RST_N" );
-       gpio_direction_output( PIN_ZB_RST_N, 0 );
-       /* XBee Module in Reset */
-
-#ifdef CONFIG_BOARD_MRMT_WIRELESS_ZB
-       udelay( 1000 );
-       /* Unreset the XBee Module */
-       gpio_set_value( PIN_ZB_RST_N, 1 );
-#endif
-#ifdef CONFIG_BOARD_MRMT_WIRELESS_BT
-       udelay( 1000 );
-       /* Unreset the BT Module */
-       gpio_set_value( PIN_BT_RST, 0 );
-#endif
-
-       return 0;
-}
-arch_initcall(mrmt1_init);
-
-static int __init mrmt1_early_init(void)
-{
-       /* To maintain power-on signal in case boot loader did not already */
-       at32_select_gpio( PIN_PWR_ON, AT32_GPIOF_OUTPUT );
-       gpio_request( PIN_PWR_ON, "PIN_PWR_ON" );
-       gpio_direction_output( PIN_PWR_ON, 1 );
-
-       return 0;
-}
-core_initcall(mrmt1_early_init);
diff --git a/arch/avr32/boards/atngw100/setup.c b/arch/avr32/boards/atngw100/setup.c
deleted file mode 100644 (file)
index afeae89..0000000
+++ /dev/null
@@ -1,324 +0,0 @@
-/*
- * Board-specific setup code for the ATNGW100 Network Gateway
- *
- * Copyright (C) 2005-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/clk.h>
-#include <linux/etherdevice.h>
-#include <linux/gpio.h>
-#include <linux/irq.h>
-#include <linux/i2c.h>
-#include <linux/i2c-gpio.h>
-#include <linux/init.h>
-#include <linux/linkage.h>
-#include <linux/platform_device.h>
-#include <linux/types.h>
-#include <linux/leds.h>
-#include <linux/spi/spi.h>
-#include <linux/atmel-mci.h>
-#include <linux/usb/atmel_usba_udc.h>
-
-#include <asm/io.h>
-#include <asm/setup.h>
-
-#include <mach/at32ap700x.h>
-#include <mach/board.h>
-#include <mach/init.h>
-#include <mach/portmux.h>
-
-/* Oscillator frequencies. These are board-specific */
-unsigned long at32_board_osc_rates[3] = {
-       [0] = 32768,    /* 32.768 kHz on RTC osc */
-       [1] = 20000000, /* 20 MHz on osc0 */
-       [2] = 12000000, /* 12 MHz on osc1 */
-};
-
-/*
- * The ATNGW100 mkII is very similar to the ATNGW100. Both have the AT32AP7000
- * chip on board; the difference is that the ATNGW100 mkII has 128 MB 32-bit
- * SDRAM (the ATNGW100 has 32 MB 16-bit SDRAM) and 256 MB 16-bit NAND flash
- * (the ATNGW100 has none.)
- *
- * The RAM difference is handled by the boot loader, so the only difference we
- * end up handling here is the NAND flash, EBI pin reservation and if LCDC or
- * MACB1 should be enabled.
- */
-#ifdef CONFIG_BOARD_ATNGW100_MKII
-#include <linux/mtd/partitions.h>
-#include <mach/smc.h>
-
-static struct smc_timing nand_timing __initdata = {
-       .ncs_read_setup         = 0,
-       .nrd_setup              = 10,
-       .ncs_write_setup        = 0,
-       .nwe_setup              = 10,
-
-       .ncs_read_pulse         = 30,
-       .nrd_pulse              = 15,
-       .ncs_write_pulse        = 30,
-       .nwe_pulse              = 15,
-
-       .read_cycle             = 30,
-       .write_cycle            = 30,
-
-       .ncs_read_recover       = 0,
-       .nrd_recover            = 15,
-       .ncs_write_recover      = 0,
-       /* WE# high -> RE# low min 60 ns */
-       .nwe_recover            = 50,
-};
-
-static struct smc_config nand_config __initdata = {
-       .bus_width              = 2,
-       .nrd_controlled         = 1,
-       .nwe_controlled         = 1,
-       .nwait_mode             = 0,
-       .byte_write             = 0,
-       .tdf_cycles             = 2,
-       .tdf_mode               = 0,
-};
-
-static struct mtd_partition nand_partitions[] = {
-       {
-               .name           = "main",
-               .offset         = 0x00000000,
-               .size           = MTDPART_SIZ_FULL,
-       },
-};
-
-
-static struct atmel_nand_data atngw100mkii_nand_data __initdata = {
-       .cle            = 21,
-       .ale            = 22,
-       .rdy_pin        = GPIO_PIN_PB(28),
-       .enable_pin     = GPIO_PIN_PE(23),
-       .bus_width_16   = true,
-       .ecc_mode       = NAND_ECC_SOFT,
-       .parts          = nand_partitions,
-       .num_parts      = ARRAY_SIZE(nand_partitions),
-};
-#endif
-
-/* Initialized by bootloader-specific startup code. */
-struct tag *bootloader_tags __initdata;
-
-struct eth_addr {
-       u8 addr[6];
-};
-static struct eth_addr __initdata hw_addr[2];
-static struct macb_platform_data __initdata eth_data[2];
-
-static struct spi_board_info spi0_board_info[] __initdata = {
-       {
-               .modalias       = "mtd_dataflash",
-               .max_speed_hz   = 8000000,
-               .chip_select    = 0,
-       },
-};
-
-static struct mci_platform_data __initdata mci0_data = {
-       .slot[0] = {
-               .bus_width      = 4,
-#if defined(CONFIG_BOARD_ATNGW100_MKII)
-               .detect_pin     = GPIO_PIN_PC(25),
-               .wp_pin         = GPIO_PIN_PE(22),
-#else
-               .detect_pin     = GPIO_PIN_PC(25),
-               .wp_pin         = GPIO_PIN_PE(0),
-#endif
-       },
-};
-
-static struct usba_platform_data atngw100_usba_data __initdata = {
-#if defined(CONFIG_BOARD_ATNGW100_MKII)
-       .vbus_pin       = GPIO_PIN_PE(26),
-#else
-       .vbus_pin       = -ENODEV,
-#endif
-};
-
-/*
- * The next two functions should go away as the boot loader is
- * supposed to initialize the macb address registers with a valid
- * ethernet address. But we need to keep it around for a while until
- * we can be reasonably sure the boot loader does this.
- *
- * The phy_id is ignored as the driver will probe for it.
- */
-static int __init parse_tag_ethernet(struct tag *tag)
-{
-       int i;
-
-       i = tag->u.ethernet.mac_index;
-       if (i < ARRAY_SIZE(hw_addr))
-               memcpy(hw_addr[i].addr, tag->u.ethernet.hw_address,
-                      sizeof(hw_addr[i].addr));
-
-       return 0;
-}
-__tagtable(ATAG_ETHERNET, parse_tag_ethernet);
-
-static void __init set_hw_addr(struct platform_device *pdev)
-{
-       struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       const u8 *addr;
-       void __iomem *regs;
-       struct clk *pclk;
-
-       if (!res)
-               return;
-       if (pdev->id >= ARRAY_SIZE(hw_addr))
-               return;
-
-       addr = hw_addr[pdev->id].addr;
-       if (!is_valid_ether_addr(addr))
-               return;
-
-       /*
-        * Since this is board-specific code, we'll cheat and use the
-        * physical address directly as we happen to know that it's
-        * the same as the virtual address.
-        */
-       regs = (void __iomem __force *)res->start;
-       pclk = clk_get(&pdev->dev, "pclk");
-       if (IS_ERR(pclk))
-               return;
-
-       clk_enable(pclk);
-       __raw_writel((addr[3] << 24) | (addr[2] << 16)
-                    | (addr[1] << 8) | addr[0], regs + 0x98);
-       __raw_writel((addr[5] << 8) | addr[4], regs + 0x9c);
-       clk_disable(pclk);
-       clk_put(pclk);
-}
-
-void __init setup_board(void)
-{
-       at32_map_usart(1, 0, 0);        /* USART 1: /dev/ttyS0, DB9 */
-       at32_setup_serial_console(0);
-}
-
-static const struct gpio_led ngw_leds[] = {
-       { .name = "sys", .gpio = GPIO_PIN_PA(16), .active_low = 1,
-               .default_trigger = "heartbeat",
-       },
-       { .name = "a", .gpio = GPIO_PIN_PA(19), .active_low = 1, },
-       { .name = "b", .gpio = GPIO_PIN_PE(19), .active_low = 1, },
-};
-
-static const struct gpio_led_platform_data ngw_led_data = {
-       .num_leds =     ARRAY_SIZE(ngw_leds),
-       .leds =         (void *) ngw_leds,
-};
-
-static struct platform_device ngw_gpio_leds = {
-       .name =         "leds-gpio",
-       .id =           -1,
-       .dev = {
-               .platform_data = (void *) &ngw_led_data,
-       }
-};
-
-static struct i2c_gpio_platform_data i2c_gpio_data = {
-       .sda_pin                = GPIO_PIN_PA(6),
-       .scl_pin                = GPIO_PIN_PA(7),
-       .sda_is_open_drain      = 1,
-       .scl_is_open_drain      = 1,
-       .udelay                 = 2,    /* close to 100 kHz */
-};
-
-static struct platform_device i2c_gpio_device = {
-       .name           = "i2c-gpio",
-       .id             = 0,
-       .dev            = {
-               .platform_data  = &i2c_gpio_data,
-       },
-};
-
-static struct i2c_board_info __initdata i2c_info[] = {
-       /* NOTE:  original ATtiny24 firmware is at address 0x0b */
-};
-
-static int __init atngw100_init(void)
-{
-       unsigned        i;
-
-       /*
-        * ATNGW100 mkII uses 32-bit SDRAM interface. Reserve the
-        * SDRAM-specific pins so that nobody messes with them.
-        */
-#ifdef CONFIG_BOARD_ATNGW100_MKII
-       at32_reserve_pin(GPIO_PIOE_BASE, ATMEL_EBI_PE_DATA_ALL);
-
-       smc_set_timing(&nand_config, &nand_timing);
-       smc_set_configuration(3, &nand_config);
-       at32_add_device_nand(0, &atngw100mkii_nand_data);
-#endif
-
-       at32_add_device_usart(0);
-
-       set_hw_addr(at32_add_device_eth(0, &eth_data[0]));
-#ifndef CONFIG_BOARD_ATNGW100_MKII_LCD
-       set_hw_addr(at32_add_device_eth(1, &eth_data[1]));
-#endif
-
-       at32_add_device_spi(0, spi0_board_info, ARRAY_SIZE(spi0_board_info));
-       at32_add_device_mci(0, &mci0_data);
-       at32_add_device_usba(0, &atngw100_usba_data);
-
-       for (i = 0; i < ARRAY_SIZE(ngw_leds); i++) {
-               at32_select_gpio(ngw_leds[i].gpio,
-                               AT32_GPIOF_OUTPUT | AT32_GPIOF_HIGH);
-       }
-       platform_device_register(&ngw_gpio_leds);
-
-       /* all these i2c/smbus pins should have external pullups for
-        * open-drain sharing among all I2C devices.  SDA and SCL do;
-        * PB28/EXTINT3 (ATNGW100) and PE21 (ATNGW100 mkII) doesn't; it should
-        * be SMBALERT# (for PMBus), but it's not available off-board.
-        */
-#ifdef CONFIG_BOARD_ATNGW100_MKII
-       at32_select_periph(GPIO_PIOE_BASE, 1 << 21, 0, AT32_GPIOF_PULLUP);
-#else
-       at32_select_periph(GPIO_PIOB_BASE, 1 << 28, 0, AT32_GPIOF_PULLUP);
-#endif
-       at32_select_gpio(i2c_gpio_data.sda_pin,
-               AT32_GPIOF_MULTIDRV | AT32_GPIOF_OUTPUT | AT32_GPIOF_HIGH);
-       at32_select_gpio(i2c_gpio_data.scl_pin,
-               AT32_GPIOF_MULTIDRV | AT32_GPIOF_OUTPUT | AT32_GPIOF_HIGH);
-       platform_device_register(&i2c_gpio_device);
-       i2c_register_board_info(0, i2c_info, ARRAY_SIZE(i2c_info));
-
-       return 0;
-}
-postcore_initcall(atngw100_init);
-
-static int __init atngw100_arch_init(void)
-{
-       /* PB30 (ATNGW100) and PE30 (ATNGW100 mkII) is the otherwise unused
-        * jumper on the mainboard, with an external pullup; the jumper grounds
-        * it. Use it however you like, including letting U-Boot or Linux tweak
-        * boot sequences.
-        */
-#ifdef CONFIG_BOARD_ATNGW100_MKII
-       at32_select_gpio(GPIO_PIN_PE(30), 0);
-       gpio_request(GPIO_PIN_PE(30), "j15");
-       gpio_direction_input(GPIO_PIN_PE(30));
-       gpio_export(GPIO_PIN_PE(30), false);
-#else
-       at32_select_gpio(GPIO_PIN_PB(30), 0);
-       gpio_request(GPIO_PIN_PB(30), "j15");
-       gpio_direction_input(GPIO_PIN_PB(30));
-       gpio_export(GPIO_PIN_PB(30), false);
-#endif
-
-       /* set_irq_type() after the arch_initcall for EIC has run, and
-        * before the I2C subsystem could try using this IRQ.
-        */
-       return irq_set_irq_type(AT32_EXTINT(3), IRQ_TYPE_EDGE_FALLING);
-}
-arch_initcall(atngw100_arch_init);
diff --git a/arch/avr32/boards/atstk1000/Kconfig b/arch/avr32/boards/atstk1000/Kconfig
deleted file mode 100644 (file)
index 8dc4821..0000000
+++ /dev/null
@@ -1,109 +0,0 @@
-# STK1000 customization
-
-if BOARD_ATSTK1000
-
-choice
-       prompt "ATSTK1000 CPU daughterboard type"
-       default BOARD_ATSTK1002
-
-config BOARD_ATSTK1002
-       bool "ATSTK1002"
-       select CPU_AT32AP7000
-
-config BOARD_ATSTK1003
-       bool "ATSTK1003"
-       select CPU_AT32AP7001
-
-config BOARD_ATSTK1004
-       bool "ATSTK1004"
-       select CPU_AT32AP7002
-
-config BOARD_ATSTK1006
-       bool "ATSTK1006"
-       select CPU_AT32AP7000
-
-endchoice
-
-
-config BOARD_ATSTK100X_CUSTOM
-       bool "Non-default STK1002/STK1003/STK1004 jumper settings"
-       help
-         You will normally leave the jumpers on the CPU card at their
-         default settings.  If you need to use certain peripherals,
-         you will need to change some of those jumpers.
-
-if BOARD_ATSTK100X_CUSTOM
-
-config BOARD_ATSTK100X_SW1_CUSTOM
-       bool "SW1: use SSC1 (not SPI0)"
-       help
-         This also prevents using the external DAC as an audio interface,
-         and means you can't initialize the on-board QVGA display.
-
-config BOARD_ATSTK100X_SW2_CUSTOM
-       bool "SW2: use IRDA or TIMER0 (not UART-A, MMC/SD, and PS2-A)"
-       help
-         If you change this you'll want an updated boot loader putting
-         the console on UART-C not UART-A.
-
-config BOARD_ATSTK100X_SW3_CUSTOM
-       bool "SW3: use TIMER1 (not SSC0 and GCLK)"
-       help
-         This also prevents using the external DAC as an audio interface.
-
-config BOARD_ATSTK100X_SW4_CUSTOM
-       bool "SW4: use ISI/Camera (not GPIOs, SPI1, and PS2-B)"
-       help
-         To use the camera interface you'll need a custom card (on the
-         PCI-format connector) connect a video sensor.
-
-config BOARD_ATSTK1002_SW5_CUSTOM
-       bool "SW5: use MACB1 (not LCDC)"
-       depends on BOARD_ATSTK1002
-
-config BOARD_ATSTK1002_SW6_CUSTOM
-       bool "SW6: more GPIOs (not MACB0)"
-       depends on BOARD_ATSTK1002
-
-endif  # custom
-
-config BOARD_ATSTK100X_SPI1
-       bool "Configure SPI1 controller"
-       depends on !BOARD_ATSTK100X_SW4_CUSTOM
-       help
-         All the signals for the second SPI controller are available on
-         GPIO lines and accessed through the J1 jumper block.  Say "y"
-         here to configure that SPI controller.
-
-config BOARD_ATSTK1000_J2_LED
-       bool
-       default BOARD_ATSTK1000_J2_LED8 || BOARD_ATSTK1000_J2_RGB
-
-choice
-       prompt "LEDs connected to J2:"
-       depends on LEDS_GPIO && !BOARD_ATSTK100X_SW4_CUSTOM
-       optional
-       help
-         Select this if you have jumpered the J2 jumper block to the
-         LED0..LED7 amber leds, or to the RGB leds, using a ten-pin
-         IDC cable.  A default "heartbeat" trigger is provided, but
-         you can of course override this.
-
-config BOARD_ATSTK1000_J2_LED8
-       bool "LED0..LED7"
-       help
-         Select this if J2 is jumpered to LED0..LED7 amber leds.
-
-config BOARD_ATSTK1000_J2_RGB
-       bool "RGB leds"
-       help
-         Select this if J2 is jumpered to the RGB leds.
-
-endchoice
-
-config BOARD_ATSTK1000_EXTDAC
-       bool
-       depends on !BOARD_ATSTK100X_SW1_CUSTOM && !BOARD_ATSTK100X_SW3_CUSTOM
-       default y
-
-endif  # stk 1000
diff --git a/arch/avr32/boards/atstk1000/Makefile b/arch/avr32/boards/atstk1000/Makefile
deleted file mode 100644 (file)
index edecee0..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-obj-y                          += setup.o flash.o
-obj-$(CONFIG_BOARD_ATSTK1002)  += atstk1002.o
-obj-$(CONFIG_BOARD_ATSTK1003)  += atstk1003.o
-obj-$(CONFIG_BOARD_ATSTK1004)  += atstk1004.o
-obj-$(CONFIG_BOARD_ATSTK1006)  += atstk1002.o
diff --git a/arch/avr32/boards/atstk1000/atstk1000.h b/arch/avr32/boards/atstk1000/atstk1000.h
deleted file mode 100644 (file)
index 653cc09..0000000
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * ATSTK1000 setup code: Daughterboard interface
- *
- * Copyright (C) 2007 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ARCH_AVR32_BOARDS_ATSTK1000_ATSTK1000_H
-#define __ARCH_AVR32_BOARDS_ATSTK1000_ATSTK1000_H
-
-extern struct atmel_lcdfb_pdata atstk1000_lcdc_data;
-
-void atstk1000_setup_j2_leds(void);
-
-#endif /* __ARCH_AVR32_BOARDS_ATSTK1000_ATSTK1000_H */
diff --git a/arch/avr32/boards/atstk1000/atstk1002.c b/arch/avr32/boards/atstk1000/atstk1002.c
deleted file mode 100644 (file)
index 6c80aba..0000000
+++ /dev/null
@@ -1,330 +0,0 @@
-/*
- * ATSTK1002/ATSTK1006 daughterboard-specific init code
- *
- * Copyright (C) 2005-2007 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/clk.h>
-#include <linux/etherdevice.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/platform_device.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/spi/spi.h>
-#include <linux/spi/at73c213.h>
-#include <linux/atmel-mci.h>
-
-#include <video/atmel_lcdc.h>
-
-#include <asm/io.h>
-#include <asm/setup.h>
-
-#include <mach/at32ap700x.h>
-#include <mach/board.h>
-#include <mach/init.h>
-#include <mach/portmux.h>
-
-#include "atstk1000.h"
-
-/* Oscillator frequencies. These are board specific */
-unsigned long at32_board_osc_rates[3] = {
-       [0] = 32768,    /* 32.768 kHz on RTC osc */
-       [1] = 20000000, /* 20 MHz on osc0 */
-       [2] = 12000000, /* 12 MHz on osc1 */
-};
-
-/*
- * The ATSTK1006 daughterboard is very similar to the ATSTK1002. Both
- * have the AT32AP7000 chip on board; the difference is that the
- * STK1006 has 128 MB SDRAM (the STK1002 uses the 8 MB SDRAM chip on
- * the STK1000 motherboard) and 256 MB NAND flash (the STK1002 has
- * none.)
- *
- * The RAM difference is handled by the boot loader, so the only
- * difference we end up handling here is the NAND flash.
- */
-#ifdef CONFIG_BOARD_ATSTK1006
-#include <linux/mtd/partitions.h>
-#include <mach/smc.h>
-
-static struct smc_timing nand_timing __initdata = {
-       .ncs_read_setup         = 0,
-       .nrd_setup              = 10,
-       .ncs_write_setup        = 0,
-       .nwe_setup              = 10,
-
-       .ncs_read_pulse         = 30,
-       .nrd_pulse              = 15,
-       .ncs_write_pulse        = 30,
-       .nwe_pulse              = 15,
-
-       .read_cycle             = 30,
-       .write_cycle            = 30,
-
-       .ncs_read_recover       = 0,
-       .nrd_recover            = 15,
-       .ncs_write_recover      = 0,
-       /* WE# high -> RE# low min 60 ns */
-       .nwe_recover            = 50,
-};
-
-static struct smc_config nand_config __initdata = {
-       .bus_width              = 1,
-       .nrd_controlled         = 1,
-       .nwe_controlled         = 1,
-       .nwait_mode             = 0,
-       .byte_write             = 0,
-       .tdf_cycles             = 2,
-       .tdf_mode               = 0,
-};
-
-static struct mtd_partition nand_partitions[] = {
-       {
-               .name           = "main",
-               .offset         = 0x00000000,
-               .size           = MTDPART_SIZ_FULL,
-       },
-};
-
-static struct atmel_nand_data atstk1006_nand_data __initdata = {
-       .cle            = 21,
-       .ale            = 22,
-       .rdy_pin        = GPIO_PIN_PB(30),
-       .enable_pin     = GPIO_PIN_PB(29),
-       .ecc_mode       = NAND_ECC_SOFT,
-       .parts          = nand_partitions,
-       .num_parts      = ARRAY_SIZE(nand_partitions),
-};
-#endif
-
-struct eth_addr {
-       u8 addr[6];
-};
-
-static struct eth_addr __initdata hw_addr[2];
-static struct macb_platform_data __initdata eth_data[2] = {
-       {
-               /*
-                * The MDIO pullups on STK1000 are a bit too weak for
-                * the autodetection to work properly, so we have to
-                * mask out everything but the correct address.
-                */
-               .phy_mask       = ~(1U << 16),
-       },
-       {
-               .phy_mask       = ~(1U << 17),
-       },
-};
-
-#ifdef CONFIG_BOARD_ATSTK1000_EXTDAC
-static struct at73c213_board_info at73c213_data = {
-       .ssc_id         = 0,
-       .shortname      = "AVR32 STK1000 external DAC",
-};
-#endif
-
-#ifndef CONFIG_BOARD_ATSTK100X_SW1_CUSTOM
-static struct spi_board_info spi0_board_info[] __initdata = {
-#ifdef CONFIG_BOARD_ATSTK1000_EXTDAC
-       {
-               /* AT73C213 */
-               .modalias       = "at73c213",
-               .max_speed_hz   = 200000,
-               .chip_select    = 0,
-               .mode           = SPI_MODE_1,
-               .platform_data  = &at73c213_data,
-       },
-#endif
-       {
-               /* QVGA display */
-               .modalias       = "ltv350qv",
-               .max_speed_hz   = 16000000,
-               .chip_select    = 1,
-               .mode           = SPI_MODE_3,
-       },
-};
-#endif
-
-#ifdef CONFIG_BOARD_ATSTK100X_SPI1
-static struct spi_board_info spi1_board_info[] __initdata = { {
-       /* patch in custom entries here */
-} };
-#endif
-
-/*
- * The next two functions should go away as the boot loader is
- * supposed to initialize the macb address registers with a valid
- * ethernet address. But we need to keep it around for a while until
- * we can be reasonably sure the boot loader does this.
- *
- * The phy_id is ignored as the driver will probe for it.
- */
-static int __init parse_tag_ethernet(struct tag *tag)
-{
-       int i;
-
-       i = tag->u.ethernet.mac_index;
-       if (i < ARRAY_SIZE(hw_addr))
-               memcpy(hw_addr[i].addr, tag->u.ethernet.hw_address,
-                      sizeof(hw_addr[i].addr));
-
-       return 0;
-}
-__tagtable(ATAG_ETHERNET, parse_tag_ethernet);
-
-static void __init set_hw_addr(struct platform_device *pdev)
-{
-       struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       const u8 *addr;
-       void __iomem *regs;
-       struct clk *pclk;
-
-       if (!res)
-               return;
-       if (pdev->id >= ARRAY_SIZE(hw_addr))
-               return;
-
-       addr = hw_addr[pdev->id].addr;
-       if (!is_valid_ether_addr(addr))
-               return;
-
-       /*
-        * Since this is board-specific code, we'll cheat and use the
-        * physical address directly as we happen to know that it's
-        * the same as the virtual address.
-        */
-       regs = (void __iomem __force *)res->start;
-       pclk = clk_get(&pdev->dev, "pclk");
-       if (IS_ERR(pclk))
-               return;
-
-       clk_enable(pclk);
-       __raw_writel((addr[3] << 24) | (addr[2] << 16)
-                    | (addr[1] << 8) | addr[0], regs + 0x98);
-       __raw_writel((addr[5] << 8) | addr[4], regs + 0x9c);
-       clk_disable(pclk);
-       clk_put(pclk);
-}
-
-#ifdef CONFIG_BOARD_ATSTK1000_EXTDAC
-static void __init atstk1002_setup_extdac(void)
-{
-       struct clk *gclk;
-       struct clk *pll;
-
-       gclk = clk_get(NULL, "gclk0");
-       if (IS_ERR(gclk))
-               goto err_gclk;
-       pll = clk_get(NULL, "pll0");
-       if (IS_ERR(pll))
-               goto err_pll;
-
-       if (clk_set_parent(gclk, pll)) {
-               pr_debug("STK1000: failed to set pll0 as parent for DAC clock\n");
-               goto err_set_clk;
-       }
-
-       at32_select_periph(GPIO_PIOA_BASE, (1 << 30), GPIO_PERIPH_A, 0);
-       at73c213_data.dac_clk = gclk;
-
-err_set_clk:
-       clk_put(pll);
-err_pll:
-       clk_put(gclk);
-err_gclk:
-       return;
-}
-#else
-static void __init atstk1002_setup_extdac(void)
-{
-
-}
-#endif /* CONFIG_BOARD_ATSTK1000_EXTDAC */
-
-void __init setup_board(void)
-{
-#ifdef CONFIG_BOARD_ATSTK100X_SW2_CUSTOM
-       at32_map_usart(0, 1, 0);        /* USART 0/B: /dev/ttyS1, IRDA */
-#else
-       at32_map_usart(1, 0, 0);        /* USART 1/A: /dev/ttyS0, DB9 */
-#endif
-       /* USART 2/unused: expansion connector */
-       at32_map_usart(3, 2, 0);        /* USART 3/C: /dev/ttyS2, DB9 */
-
-       at32_setup_serial_console(0);
-}
-
-#ifndef CONFIG_BOARD_ATSTK100X_SW2_CUSTOM
-
-static struct mci_platform_data __initdata mci0_data = {
-       .slot[0] = {
-               .bus_width      = 4,
-
-/* MMC card detect requires MACB0 *NOT* be used */
-#ifdef CONFIG_BOARD_ATSTK1002_SW6_CUSTOM
-               .detect_pin     = GPIO_PIN_PC(14), /* gpio30/sdcd */
-               .wp_pin         = GPIO_PIN_PC(15), /* gpio31/sdwp */
-#else
-               .detect_pin     = -ENODEV,
-               .wp_pin         = -ENODEV,
-#endif /* SW6 for sd{cd,wp} routing */
-       },
-};
-
-#endif /* SW2 for MMC signal routing */
-
-static int __init atstk1002_init(void)
-{
-       /*
-        * ATSTK1000 uses 32-bit SDRAM interface. Reserve the
-        * SDRAM-specific pins so that nobody messes with them.
-        */
-       at32_reserve_pin(GPIO_PIOE_BASE, ATMEL_EBI_PE_DATA_ALL);
-
-#ifdef CONFIG_BOARD_ATSTK1006
-       smc_set_timing(&nand_config, &nand_timing);
-       smc_set_configuration(3, &nand_config);
-       at32_add_device_nand(0, &atstk1006_nand_data);
-#endif
-
-#ifdef CONFIG_BOARD_ATSTK100X_SW2_CUSTOM
-       at32_add_device_usart(1);
-#else
-       at32_add_device_usart(0);
-#endif
-       at32_add_device_usart(2);
-
-#ifndef CONFIG_BOARD_ATSTK1002_SW6_CUSTOM
-       set_hw_addr(at32_add_device_eth(0, &eth_data[0]));
-#endif
-#ifndef CONFIG_BOARD_ATSTK100X_SW1_CUSTOM
-       at32_add_device_spi(0, spi0_board_info, ARRAY_SIZE(spi0_board_info));
-#endif
-#ifdef CONFIG_BOARD_ATSTK100X_SPI1
-       at32_add_device_spi(1, spi1_board_info, ARRAY_SIZE(spi1_board_info));
-#endif
-#ifndef CONFIG_BOARD_ATSTK100X_SW2_CUSTOM
-       at32_add_device_mci(0, &mci0_data);
-#endif
-#ifdef CONFIG_BOARD_ATSTK1002_SW5_CUSTOM
-       set_hw_addr(at32_add_device_eth(1, &eth_data[1]));
-#else
-       at32_add_device_lcdc(0, &atstk1000_lcdc_data,
-                            fbmem_start, fbmem_size,
-                            ATMEL_LCDC_PRI_24BIT | ATMEL_LCDC_PRI_CONTROL);
-#endif
-       at32_add_device_usba(0, NULL);
-#ifndef CONFIG_BOARD_ATSTK100X_SW3_CUSTOM
-       at32_add_device_ssc(0, ATMEL_SSC_TX);
-#endif
-
-       atstk1000_setup_j2_leds();
-       atstk1002_setup_extdac();
-
-       return 0;
-}
-postcore_initcall(atstk1002_init);
diff --git a/arch/avr32/boards/atstk1000/atstk1003.c b/arch/avr32/boards/atstk1000/atstk1003.c
deleted file mode 100644 (file)
index ff7e232..0000000
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * ATSTK1003 daughterboard-specific init code
- *
- * Copyright (C) 2007 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/clk.h>
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/platform_device.h>
-#include <linux/string.h>
-#include <linux/types.h>
-
-#include <linux/spi/at73c213.h>
-#include <linux/spi/spi.h>
-#include <linux/atmel-mci.h>
-
-#include <asm/setup.h>
-
-#include <mach/at32ap700x.h>
-#include <mach/board.h>
-#include <mach/init.h>
-#include <mach/portmux.h>
-
-#include "atstk1000.h"
-
-/* Oscillator frequencies. These are board specific */
-unsigned long at32_board_osc_rates[3] = {
-       [0] = 32768,    /* 32.768 kHz on RTC osc */
-       [1] = 20000000, /* 20 MHz on osc0 */
-       [2] = 12000000, /* 12 MHz on osc1 */
-};
-
-#ifdef CONFIG_BOARD_ATSTK1000_EXTDAC
-static struct at73c213_board_info at73c213_data = {
-       .ssc_id         = 0,
-       .shortname      = "AVR32 STK1000 external DAC",
-};
-#endif
-
-#ifndef CONFIG_BOARD_ATSTK100X_SW1_CUSTOM
-static struct spi_board_info spi0_board_info[] __initdata = {
-#ifdef CONFIG_BOARD_ATSTK1000_EXTDAC
-       {
-               /* AT73C213 */
-               .modalias       = "at73c213",
-               .max_speed_hz   = 200000,
-               .chip_select    = 0,
-               .mode           = SPI_MODE_1,
-               .platform_data  = &at73c213_data,
-       },
-#endif
-       /*
-        * We can control the LTV350QV LCD panel, but it isn't much
-        * point since we don't have an LCD controller...
-        */
-};
-#endif
-
-#ifdef CONFIG_BOARD_ATSTK100X_SPI1
-static struct spi_board_info spi1_board_info[] __initdata = { {
-       /* patch in custom entries here */
-} };
-#endif
-
-#ifndef CONFIG_BOARD_ATSTK100X_SW2_CUSTOM
-static struct mci_platform_data __initdata mci0_data = {
-       .slot[0] = {
-               .bus_width      = 4,
-               .detect_pin     = -ENODEV,
-               .wp_pin         = -ENODEV,
-       },
-};
-#endif
-
-#ifdef CONFIG_BOARD_ATSTK1000_EXTDAC
-static void __init atstk1003_setup_extdac(void)
-{
-       struct clk *gclk;
-       struct clk *pll;
-
-       gclk = clk_get(NULL, "gclk0");
-       if (IS_ERR(gclk))
-               goto err_gclk;
-       pll = clk_get(NULL, "pll0");
-       if (IS_ERR(pll))
-               goto err_pll;
-
-       if (clk_set_parent(gclk, pll)) {
-               pr_debug("STK1000: failed to set pll0 as parent for DAC clock\n");
-               goto err_set_clk;
-       }
-
-       at32_select_periph(GPIO_PIOA_BASE, (1 << 30), GPIO_PERIPH_A, 0);
-       at73c213_data.dac_clk = gclk;
-
-err_set_clk:
-       clk_put(pll);
-err_pll:
-       clk_put(gclk);
-err_gclk:
-       return;
-}
-#else
-static void __init atstk1003_setup_extdac(void)
-{
-
-}
-#endif /* CONFIG_BOARD_ATSTK1000_EXTDAC */
-
-void __init setup_board(void)
-{
-#ifdef CONFIG_BOARD_ATSTK100X_SW2_CUSTOM
-       at32_map_usart(0, 1, 0);        /* USART 0/B: /dev/ttyS1, IRDA */
-#else
-       at32_map_usart(1, 0, 0);        /* USART 1/A: /dev/ttyS0, DB9 */
-#endif
-       /* USART 2/unused: expansion connector */
-       at32_map_usart(3, 2, 0);        /* USART 3/C: /dev/ttyS2, DB9 */
-
-       at32_setup_serial_console(0);
-}
-
-static int __init atstk1003_init(void)
-{
-       /*
-        * ATSTK1000 uses 32-bit SDRAM interface. Reserve the
-        * SDRAM-specific pins so that nobody messes with them.
-        */
-       at32_reserve_pin(GPIO_PIOE_BASE, ATMEL_EBI_PE_DATA_ALL);
-
-#ifdef CONFIG_BOARD_ATSTK100X_SW2_CUSTOM
-       at32_add_device_usart(1);
-#else
-       at32_add_device_usart(0);
-#endif
-       at32_add_device_usart(2);
-
-#ifndef CONFIG_BOARD_ATSTK100X_SW1_CUSTOM
-       at32_add_device_spi(0, spi0_board_info, ARRAY_SIZE(spi0_board_info));
-#endif
-#ifdef CONFIG_BOARD_ATSTK100X_SPI1
-       at32_add_device_spi(1, spi1_board_info, ARRAY_SIZE(spi1_board_info));
-#endif
-#ifndef CONFIG_BOARD_ATSTK100X_SW2_CUSTOM
-       at32_add_device_mci(0, &mci0_data);
-#endif
-       at32_add_device_usba(0, NULL);
-#ifndef CONFIG_BOARD_ATSTK100X_SW3_CUSTOM
-       at32_add_device_ssc(0, ATMEL_SSC_TX);
-#endif
-
-       atstk1000_setup_j2_leds();
-       atstk1003_setup_extdac();
-
-       return 0;
-}
-postcore_initcall(atstk1003_init);
diff --git a/arch/avr32/boards/atstk1000/atstk1004.c b/arch/avr32/boards/atstk1000/atstk1004.c
deleted file mode 100644 (file)
index 69a9f0f..0000000
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * ATSTK1003 daughterboard-specific init code
- *
- * Copyright (C) 2007 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/clk.h>
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/platform_device.h>
-#include <linux/string.h>
-#include <linux/types.h>
-
-#include <linux/spi/at73c213.h>
-#include <linux/spi/spi.h>
-#include <linux/atmel-mci.h>
-
-#include <video/atmel_lcdc.h>
-
-#include <asm/setup.h>
-
-#include <mach/at32ap700x.h>
-#include <mach/board.h>
-#include <mach/init.h>
-#include <mach/portmux.h>
-
-#include "atstk1000.h"
-
-/* Oscillator frequencies. These are board specific */
-unsigned long at32_board_osc_rates[3] = {
-       [0] = 32768,    /* 32.768 kHz on RTC osc */
-       [1] = 20000000, /* 20 MHz on osc0 */
-       [2] = 12000000, /* 12 MHz on osc1 */
-};
-
-#ifdef CONFIG_BOARD_ATSTK1000_EXTDAC
-static struct at73c213_board_info at73c213_data = {
-       .ssc_id         = 0,
-       .shortname      = "AVR32 STK1000 external DAC",
-};
-#endif
-
-#ifndef CONFIG_BOARD_ATSTK100X_SW1_CUSTOM
-static struct spi_board_info spi0_board_info[] __initdata = {
-#ifdef CONFIG_BOARD_ATSTK1000_EXTDAC
-       {
-               /* AT73C213 */
-               .modalias       = "at73c213",
-               .max_speed_hz   = 200000,
-               .chip_select    = 0,
-               .mode           = SPI_MODE_1,
-               .platform_data  = &at73c213_data,
-       },
-#endif
-       {
-               /* QVGA display */
-               .modalias       = "ltv350qv",
-               .max_speed_hz   = 16000000,
-               .chip_select    = 1,
-               .mode           = SPI_MODE_3,
-       },
-};
-#endif
-
-#ifdef CONFIG_BOARD_ATSTK100X_SPI1
-static struct spi_board_info spi1_board_info[] __initdata = { {
-       /* patch in custom entries here */
-} };
-#endif
-
-#ifndef CONFIG_BOARD_ATSTK100X_SW2_CUSTOM
-static struct mci_platform_data __initdata mci0_data = {
-       .slot[0] = {
-               .bus_width      = 4,
-               .detect_pin     = -ENODEV,
-               .wp_pin         = -ENODEV,
-       },
-};
-#endif
-
-#ifdef CONFIG_BOARD_ATSTK1000_EXTDAC
-static void __init atstk1004_setup_extdac(void)
-{
-       struct clk *gclk;
-       struct clk *pll;
-
-       gclk = clk_get(NULL, "gclk0");
-       if (IS_ERR(gclk))
-               goto err_gclk;
-       pll = clk_get(NULL, "pll0");
-       if (IS_ERR(pll))
-               goto err_pll;
-
-       if (clk_set_parent(gclk, pll)) {
-               pr_debug("STK1000: failed to set pll0 as parent for DAC clock\n");
-               goto err_set_clk;
-       }
-
-       at32_select_periph(GPIO_PIOA_BASE, (1 << 30), GPIO_PERIPH_A, 0);
-       at73c213_data.dac_clk = gclk;
-
-err_set_clk:
-       clk_put(pll);
-err_pll:
-       clk_put(gclk);
-err_gclk:
-       return;
-}
-#else
-static void __init atstk1004_setup_extdac(void)
-{
-
-}
-#endif /* CONFIG_BOARD_ATSTK1000_EXTDAC */
-
-void __init setup_board(void)
-{
-#ifdef CONFIG_BOARD_ATSTK100X_SW2_CUSTOM
-       at32_map_usart(0, 1, 0);        /* USART 0/B: /dev/ttyS1, IRDA */
-#else
-       at32_map_usart(1, 0, 0);        /* USART 1/A: /dev/ttyS0, DB9 */
-#endif
-       /* USART 2/unused: expansion connector */
-       at32_map_usart(3, 2, 0);        /* USART 3/C: /dev/ttyS2, DB9 */
-
-       at32_setup_serial_console(0);
-}
-
-static int __init atstk1004_init(void)
-{
-#ifdef CONFIG_BOARD_ATSTK100X_SW2_CUSTOM
-       at32_add_device_usart(1);
-#else
-       at32_add_device_usart(0);
-#endif
-       at32_add_device_usart(2);
-
-#ifndef CONFIG_BOARD_ATSTK100X_SW1_CUSTOM
-       at32_add_device_spi(0, spi0_board_info, ARRAY_SIZE(spi0_board_info));
-#endif
-#ifdef CONFIG_BOARD_ATSTK100X_SPI1
-       at32_add_device_spi(1, spi1_board_info, ARRAY_SIZE(spi1_board_info));
-#endif
-#ifndef CONFIG_BOARD_ATSTK100X_SW2_CUSTOM
-       at32_add_device_mci(0, &mci0_data);
-#endif
-       at32_add_device_lcdc(0, &atstk1000_lcdc_data,
-                            fbmem_start, fbmem_size,
-                            ATMEL_LCDC_PRI_24BIT | ATMEL_LCDC_PRI_CONTROL);
-       at32_add_device_usba(0, NULL);
-#ifndef CONFIG_BOARD_ATSTK100X_SW3_CUSTOM
-       at32_add_device_ssc(0, ATMEL_SSC_TX);
-#endif
-
-       atstk1000_setup_j2_leds();
-       atstk1004_setup_extdac();
-
-       return 0;
-}
-postcore_initcall(atstk1004_init);
diff --git a/arch/avr32/boards/atstk1000/flash.c b/arch/avr32/boards/atstk1000/flash.c
deleted file mode 100644 (file)
index 6e4d561..0000000
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * ATSTK1000 board-specific flash initialization
- *
- * Copyright (C) 2005-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/partitions.h>
-#include <linux/mtd/physmap.h>
-
-#include <mach/smc.h>
-
-static struct smc_timing flash_timing __initdata = {
-       .ncs_read_setup         = 0,
-       .nrd_setup              = 40,
-       .ncs_write_setup        = 0,
-       .nwe_setup              = 10,
-
-       .ncs_read_pulse         = 80,
-       .nrd_pulse              = 40,
-       .ncs_write_pulse        = 65,
-       .nwe_pulse              = 55,
-
-       .read_cycle             = 120,
-       .write_cycle            = 120,
-};
-
-static struct smc_config flash_config __initdata = {
-       .bus_width              = 2,
-       .nrd_controlled         = 1,
-       .nwe_controlled         = 1,
-       .byte_write             = 1,
-};
-
-static struct mtd_partition flash_parts[] = {
-       {
-               .name           = "u-boot",
-               .offset         = 0x00000000,
-               .size           = 0x00020000,           /* 128 KiB */
-               .mask_flags     = MTD_WRITEABLE,
-       },
-       {
-               .name           = "root",
-               .offset         = 0x00020000,
-               .size           = 0x007d0000,
-       },
-       {
-               .name           = "env",
-               .offset         = 0x007f0000,
-               .size           = 0x00010000,
-               .mask_flags     = MTD_WRITEABLE,
-       },
-};
-
-static struct physmap_flash_data flash_data = {
-       .width          = 2,
-       .nr_parts       = ARRAY_SIZE(flash_parts),
-       .parts          = flash_parts,
-};
-
-static struct resource flash_resource = {
-       .start          = 0x00000000,
-       .end            = 0x007fffff,
-       .flags          = IORESOURCE_MEM,
-};
-
-static struct platform_device flash_device = {
-       .name           = "physmap-flash",
-       .id             = 0,
-       .resource       = &flash_resource,
-       .num_resources  = 1,
-       .dev            = {
-               .platform_data = &flash_data,
-       },
-};
-
-/* This needs to be called after the SMC has been initialized */
-static int __init atstk1000_flash_init(void)
-{
-       int ret;
-
-       smc_set_timing(&flash_config, &flash_timing);
-       ret = smc_set_configuration(0, &flash_config);
-       if (ret < 0) {
-               printk(KERN_ERR "atstk1000: failed to set NOR flash timing\n");
-               return ret;
-       }
-
-       platform_device_register(&flash_device);
-
-       return 0;
-}
-device_initcall(atstk1000_flash_init);
diff --git a/arch/avr32/boards/atstk1000/setup.c b/arch/avr32/boards/atstk1000/setup.c
deleted file mode 100644 (file)
index b6b88f5..0000000
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * ATSTK1000 board-specific setup code.
- *
- * Copyright (C) 2005-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/bootmem.h>
-#include <linux/fb.h>
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/types.h>
-#include <linux/linkage.h>
-
-#include <video/atmel_lcdc.h>
-
-#include <asm/setup.h>
-
-#include <mach/at32ap700x.h>
-#include <mach/board.h>
-#include <mach/portmux.h>
-
-#include "atstk1000.h"
-
-/* Initialized by bootloader-specific startup code. */
-struct tag *bootloader_tags __initdata;
-
-static struct fb_videomode __initdata ltv350qv_modes[] = {
-       {
-               .name           = "320x240 @ 75",
-               .refresh        = 75,
-               .xres           = 320,          .yres           = 240,
-               .pixclock       = KHZ2PICOS(6891),
-
-               .left_margin    = 17,           .right_margin   = 33,
-               .upper_margin   = 10,           .lower_margin   = 10,
-               .hsync_len      = 16,           .vsync_len      = 1,
-
-               .sync           = 0,
-               .vmode          = FB_VMODE_NONINTERLACED,
-       },
-};
-
-static struct fb_monspecs __initdata atstk1000_default_monspecs = {
-       .manufacturer           = "SNG",
-       .monitor                = "LTV350QV",
-       .modedb                 = ltv350qv_modes,
-       .modedb_len             = ARRAY_SIZE(ltv350qv_modes),
-       .hfmin                  = 14820,
-       .hfmax                  = 22230,
-       .vfmin                  = 60,
-       .vfmax                  = 90,
-       .dclkmax                = 30000000,
-};
-
-struct atmel_lcdfb_pdata __initdata atstk1000_lcdc_data = {
-       .default_bpp            = 24,
-       .default_dmacon         = ATMEL_LCDC_DMAEN | ATMEL_LCDC_DMA2DEN,
-       .default_lcdcon2        = (ATMEL_LCDC_DISTYPE_TFT
-                                  | ATMEL_LCDC_INVCLK
-                                  | ATMEL_LCDC_CLKMOD_ALWAYSACTIVE
-                                  | ATMEL_LCDC_MEMOR_BIG),
-       .default_monspecs       = &atstk1000_default_monspecs,
-       .guard_time             = 2,
-};
-
-#ifdef CONFIG_BOARD_ATSTK1000_J2_LED
-#include <linux/leds.h>
-
-static struct gpio_led stk1000_j2_led[] = {
-#ifdef CONFIG_BOARD_ATSTK1000_J2_LED8
-#define LEDSTRING "J2 jumpered to LED8"
-       { .name = "led0:amber", .gpio = GPIO_PIN_PB( 8), },
-       { .name = "led1:amber", .gpio = GPIO_PIN_PB( 9), },
-       { .name = "led2:amber", .gpio = GPIO_PIN_PB(10), },
-       { .name = "led3:amber", .gpio = GPIO_PIN_PB(13), },
-       { .name = "led4:amber", .gpio = GPIO_PIN_PB(14), },
-       { .name = "led5:amber", .gpio = GPIO_PIN_PB(15), },
-       { .name = "led6:amber", .gpio = GPIO_PIN_PB(16), },
-       { .name = "led7:amber", .gpio = GPIO_PIN_PB(30),
-                       .default_trigger = "heartbeat", },
-#else  /* RGB */
-#define LEDSTRING "J2 jumpered to RGB LEDs"
-       { .name = "r1:red",     .gpio = GPIO_PIN_PB( 8), },
-       { .name = "g1:green",   .gpio = GPIO_PIN_PB(10), },
-       { .name = "b1:blue",    .gpio = GPIO_PIN_PB(14), },
-
-       { .name = "r2:red",     .gpio = GPIO_PIN_PB( 9),
-                       .default_trigger = "heartbeat", },
-       { .name = "g2:green",   .gpio = GPIO_PIN_PB(13), },
-       { .name = "b2:blue",    .gpio = GPIO_PIN_PB(15),
-                       .default_trigger = "heartbeat", },
-       /* PB16, PB30 unused */
-#endif
-};
-
-static struct gpio_led_platform_data stk1000_j2_led_data = {
-       .num_leds       = ARRAY_SIZE(stk1000_j2_led),
-       .leds           = stk1000_j2_led,
-};
-
-static struct platform_device stk1000_j2_led_dev = {
-       .name           = "leds-gpio",
-       .id             = 2,    /* gpio block J2 */
-       .dev            = {
-               .platform_data  = &stk1000_j2_led_data,
-       },
-};
-
-void __init atstk1000_setup_j2_leds(void)
-{
-       unsigned        i;
-
-       for (i = 0; i < ARRAY_SIZE(stk1000_j2_led); i++)
-               at32_select_gpio(stk1000_j2_led[i].gpio, AT32_GPIOF_OUTPUT);
-
-       printk("STK1000: " LEDSTRING "\n");
-       platform_device_register(&stk1000_j2_led_dev);
-}
-#else /* CONFIG_BOARD_ATSTK1000_J2_LED */
-void __init atstk1000_setup_j2_leds(void)
-{
-
-}
-#endif /* CONFIG_BOARD_ATSTK1000_J2_LED */
diff --git a/arch/avr32/boards/favr-32/Kconfig b/arch/avr32/boards/favr-32/Kconfig
deleted file mode 100644 (file)
index 2c83d1d..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-# Favr-32 customization
-
-if BOARD_FAVR_32
-
-config BOARD_FAVR32_ABDAC_RATE
-       int "DAC target rate"
-       default 44100
-       range 32000 50000
-       help
-         Specify the target rate the internal DAC should try to match. This
-         will use PLL1 to generate a frequency as close as possible to this
-         rate.
-
-         Must be within the range 32000 to 50000, which should be suitable to
-         generate most other frequencies in power of 2 steps.
-
-         Ex:
-               48000 will also suit 24000 and 12000
-               44100 will also suit 22050 and 11025
-               32000 will also suit 16000 and 8000
-
-endif # BOARD_FAVR_32
diff --git a/arch/avr32/boards/favr-32/Makefile b/arch/avr32/boards/favr-32/Makefile
deleted file mode 100644 (file)
index 234f215..0000000
+++ /dev/null
@@ -1 +0,0 @@
-obj-y  += setup.o flash.o
diff --git a/arch/avr32/boards/favr-32/flash.c b/arch/avr32/boards/favr-32/flash.c
deleted file mode 100644 (file)
index 604bbd5..0000000
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Favr-32 board-specific flash initialization
- *
- * Copyright (C) 2008 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/partitions.h>
-#include <linux/mtd/physmap.h>
-
-#include <mach/smc.h>
-
-static struct smc_timing flash_timing __initdata = {
-       .ncs_read_setup         = 0,
-       .nrd_setup              = 40,
-       .ncs_write_setup        = 0,
-       .nwe_setup              = 10,
-
-       .ncs_read_pulse         = 80,
-       .nrd_pulse              = 40,
-       .ncs_write_pulse        = 65,
-       .nwe_pulse              = 55,
-
-       .read_cycle             = 120,
-       .write_cycle            = 120,
-};
-
-static struct smc_config flash_config __initdata = {
-       .bus_width              = 2,
-       .nrd_controlled         = 1,
-       .nwe_controlled         = 1,
-       .byte_write             = 1,
-};
-
-static struct mtd_partition flash_parts[] = {
-       {
-               .name           = "u-boot",
-               .offset         = 0x00000000,
-               .size           = 0x00020000,           /* 128 KiB */
-               .mask_flags     = MTD_WRITEABLE,
-       },
-       {
-               .name           = "root",
-               .offset         = 0x00020000,
-               .size           = 0x007d0000,
-       },
-       {
-               .name           = "env",
-               .offset         = 0x007f0000,
-               .size           = 0x00010000,
-               .mask_flags     = MTD_WRITEABLE,
-       },
-};
-
-static struct physmap_flash_data flash_data = {
-       .width          = 2,
-       .nr_parts       = ARRAY_SIZE(flash_parts),
-       .parts          = flash_parts,
-};
-
-static struct resource flash_resource = {
-       .start          = 0x00000000,
-       .end            = 0x007fffff,
-       .flags          = IORESOURCE_MEM,
-};
-
-static struct platform_device flash_device = {
-       .name           = "physmap-flash",
-       .id             = 0,
-       .resource       = &flash_resource,
-       .num_resources  = 1,
-       .dev            = {
-               .platform_data = &flash_data,
-       },
-};
-
-/* This needs to be called after the SMC has been initialized */
-static int __init favr32_flash_init(void)
-{
-       int ret;
-
-       smc_set_timing(&flash_config, &flash_timing);
-       ret = smc_set_configuration(0, &flash_config);
-       if (ret < 0) {
-               printk(KERN_ERR "Favr-32: failed to set NOR flash timing\n");
-               return ret;
-       }
-
-       platform_device_register(&flash_device);
-
-       return 0;
-}
-device_initcall(favr32_flash_init);
diff --git a/arch/avr32/boards/favr-32/setup.c b/arch/avr32/boards/favr-32/setup.c
deleted file mode 100644 (file)
index 234cb07..0000000
+++ /dev/null
@@ -1,366 +0,0 @@
-/*
- * Favr-32 board-specific setup code.
- *
- * Copyright (C) 2008 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/clk.h>
-#include <linux/etherdevice.h>
-#include <linux/bootmem.h>
-#include <linux/fb.h>
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/types.h>
-#include <linux/linkage.h>
-#include <linux/gpio.h>
-#include <linux/leds.h>
-#include <linux/atmel-mci.h>
-#include <linux/pwm.h>
-#include <linux/pwm_backlight.h>
-#include <linux/regulator/fixed.h>
-#include <linux/regulator/machine.h>
-#include <linux/spi/spi.h>
-#include <linux/spi/ads7846.h>
-
-#include <sound/atmel-abdac.h>
-
-#include <video/atmel_lcdc.h>
-
-#include <asm/setup.h>
-
-#include <mach/at32ap700x.h>
-#include <mach/init.h>
-#include <mach/board.h>
-#include <mach/portmux.h>
-
-#define PWM_BL_CH 2
-
-/* Oscillator frequencies. These are board-specific */
-unsigned long at32_board_osc_rates[3] = {
-       [0] = 32768,    /* 32.768 kHz on RTC osc */
-       [1] = 20000000, /* 20 MHz on osc0 */
-       [2] = 12000000, /* 12 MHz on osc1 */
-};
-
-/* Initialized by bootloader-specific startup code. */
-struct tag *bootloader_tags __initdata;
-
-static struct atmel_abdac_pdata __initdata abdac0_data = {
-};
-
-struct eth_addr {
-       u8 addr[6];
-};
-static struct eth_addr __initdata hw_addr[1];
-static struct macb_platform_data __initdata eth_data[1] = {
-       {
-               .phy_mask       = ~(1U << 1),
-       },
-};
-
-static int ads7843_get_pendown_state(void)
-{
-       return !gpio_get_value(GPIO_PIN_PB(3));
-}
-
-static struct ads7846_platform_data ads7843_data = {
-       .model                  = 7843,
-       .get_pendown_state      = ads7843_get_pendown_state,
-       .pressure_max           = 255,
-       /*
-        * Values below are for debounce filtering, these can be experimented
-        * with further.
-        */
-       .debounce_max           = 20,
-       .debounce_rep           = 4,
-       .debounce_tol           = 5,
-
-       .keep_vref_on           = true,
-       .settle_delay_usecs     = 500,
-       .penirq_recheck_delay_usecs = 100,
-};
-
-static struct spi_board_info __initdata spi1_board_info[] = {
-       {
-               /* ADS7843 touch controller */
-               .modalias       = "ads7846",
-               .max_speed_hz   = 2000000,
-               .chip_select    = 0,
-               .bus_num        = 1,
-               .platform_data  = &ads7843_data,
-       },
-};
-
-static struct mci_platform_data __initdata mci0_data = {
-       .slot[0] = {
-               .bus_width      = 4,
-               .detect_pin     = -ENODEV,
-               .wp_pin         = -ENODEV,
-       },
-};
-
-static struct fb_videomode __initdata lb104v03_modes[] = {
-       {
-               .name           = "640x480 @ 50",
-               .refresh        = 50,
-               .xres           = 640,          .yres           = 480,
-               .pixclock       = KHZ2PICOS(25100),
-
-               .left_margin    = 90,           .right_margin   = 70,
-               .upper_margin   = 30,           .lower_margin   = 15,
-               .hsync_len      = 12,           .vsync_len      = 2,
-
-               .sync           = 0,
-               .vmode          = FB_VMODE_NONINTERLACED,
-       },
-};
-
-static struct fb_monspecs __initdata favr32_default_monspecs = {
-       .manufacturer           = "LG",
-       .monitor                = "LB104V03",
-       .modedb                 = lb104v03_modes,
-       .modedb_len             = ARRAY_SIZE(lb104v03_modes),
-       .hfmin                  = 27273,
-       .hfmax                  = 31111,
-       .vfmin                  = 45,
-       .vfmax                  = 60,
-       .dclkmax                = 28000000,
-};
-
-struct atmel_lcdfb_pdata __initdata favr32_lcdc_data = {
-       .default_bpp            = 16,
-       .default_dmacon         = ATMEL_LCDC_DMAEN | ATMEL_LCDC_DMA2DEN,
-       .default_lcdcon2        = (ATMEL_LCDC_DISTYPE_TFT
-                                  | ATMEL_LCDC_CLKMOD_ALWAYSACTIVE
-                                  | ATMEL_LCDC_MEMOR_BIG),
-       .default_monspecs       = &favr32_default_monspecs,
-       .guard_time             = 2,
-};
-
-static struct gpio_led favr32_leds[] = {
-       {
-               .name            = "green",
-               .gpio            = GPIO_PIN_PE(19),
-               .default_trigger = "heartbeat",
-               .active_low      = 1,
-       },
-       {
-               .name            = "red",
-               .gpio            = GPIO_PIN_PE(20),
-               .active_low      = 1,
-       },
-};
-
-static struct gpio_led_platform_data favr32_led_data = {
-       .num_leds       = ARRAY_SIZE(favr32_leds),
-       .leds           = favr32_leds,
-};
-
-static struct platform_device favr32_led_dev = {
-       .name           = "leds-gpio",
-       .id             = 0,
-       .dev            = {
-               .platform_data  = &favr32_led_data,
-       },
-};
-
-/*
- * The next two functions should go away as the boot loader is
- * supposed to initialize the macb address registers with a valid
- * ethernet address. But we need to keep it around for a while until
- * we can be reasonably sure the boot loader does this.
- *
- * The phy_id is ignored as the driver will probe for it.
- */
-static int __init parse_tag_ethernet(struct tag *tag)
-{
-       int i;
-
-       i = tag->u.ethernet.mac_index;
-       if (i < ARRAY_SIZE(hw_addr))
-               memcpy(hw_addr[i].addr, tag->u.ethernet.hw_address,
-                      sizeof(hw_addr[i].addr));
-
-       return 0;
-}
-__tagtable(ATAG_ETHERNET, parse_tag_ethernet);
-
-static void __init set_hw_addr(struct platform_device *pdev)
-{
-       struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       const u8 *addr;
-       void __iomem *regs;
-       struct clk *pclk;
-
-       if (!res)
-               return;
-       if (pdev->id >= ARRAY_SIZE(hw_addr))
-               return;
-
-       addr = hw_addr[pdev->id].addr;
-       if (!is_valid_ether_addr(addr))
-               return;
-
-       /*
-        * Since this is board-specific code, we'll cheat and use the
-        * physical address directly as we happen to know that it's
-        * the same as the virtual address.
-        */
-       regs = (void __iomem __force *)res->start;
-       pclk = clk_get(&pdev->dev, "pclk");
-       if (IS_ERR(pclk))
-               return;
-
-       clk_enable(pclk);
-       __raw_writel((addr[3] << 24) | (addr[2] << 16)
-                    | (addr[1] << 8) | addr[0], regs + 0x98);
-       __raw_writel((addr[5] << 8) | addr[4], regs + 0x9c);
-       clk_disable(pclk);
-       clk_put(pclk);
-}
-
-void __init favr32_setup_leds(void)
-{
-       unsigned i;
-
-       for (i = 0; i < ARRAY_SIZE(favr32_leds); i++)
-               at32_select_gpio(favr32_leds[i].gpio, AT32_GPIOF_OUTPUT);
-
-       platform_device_register(&favr32_led_dev);
-}
-
-static struct pwm_lookup pwm_lookup[] = {
-       PWM_LOOKUP("at91sam9rl-pwm", PWM_BL_CH, "pwm-backlight.0", NULL,
-                  5000, PWM_POLARITY_INVERSED),
-};
-
-static struct regulator_consumer_supply fixed_power_consumers[] = {
-       REGULATOR_SUPPLY("power", "pwm-backlight.0"),
-};
-
-static struct platform_pwm_backlight_data pwm_bl_data = {
-       .enable_gpio            = GPIO_PIN_PA(28),
-       .max_brightness         = 255,
-       .dft_brightness         = 255,
-       .lth_brightness         = 50,
-};
-
-static struct platform_device pwm_bl_device = {
-       .name = "pwm-backlight",
-       .dev = {
-               .platform_data = &pwm_bl_data,
-       },
-};
-
-static void __init favr32_setup_atmel_pwm_bl(void)
-{
-       pwm_add_table(pwm_lookup, ARRAY_SIZE(pwm_lookup));
-       regulator_register_always_on(0, "fixed", fixed_power_consumers,
-                                   ARRAY_SIZE(fixed_power_consumers), 3300000);
-       platform_device_register(&pwm_bl_device);
-       at32_select_gpio(pwm_bl_data.enable_gpio, 0);
-}
-
-void __init setup_board(void)
-{
-       at32_map_usart(3, 0, 0);        /* USART 3 => /dev/ttyS0 */
-       at32_setup_serial_console(0);
-}
-
-static int __init set_abdac_rate(struct platform_device *pdev)
-{
-       int retval;
-       struct clk *osc1;
-       struct clk *pll1;
-       struct clk *abdac;
-
-       if (pdev == NULL)
-               return -ENXIO;
-
-       osc1 = clk_get(NULL, "osc1");
-       if (IS_ERR(osc1)) {
-               retval = PTR_ERR(osc1);
-               goto out;
-       }
-
-       pll1 = clk_get(NULL, "pll1");
-       if (IS_ERR(pll1)) {
-               retval = PTR_ERR(pll1);
-               goto out_osc1;
-       }
-
-       abdac = clk_get(&pdev->dev, "sample_clk");
-       if (IS_ERR(abdac)) {
-               retval = PTR_ERR(abdac);
-               goto out_pll1;
-       }
-
-       retval = clk_set_parent(pll1, osc1);
-       if (retval != 0)
-               goto out_abdac;
-
-       /*
-        * Rate is 32000 to 50000 and ABDAC oversamples 256x. Multiply, in
-        * power of 2, to a value above 80 MHz. Power of 2 so it is possible
-        * for the generic clock to divide it down again and 80 MHz is the
-        * lowest frequency for the PLL.
-        */
-       retval = clk_round_rate(pll1,
-                       CONFIG_BOARD_FAVR32_ABDAC_RATE * 256 * 16);
-       if (retval <= 0) {
-               retval = -EINVAL;
-               goto out_abdac;
-       }
-
-       retval = clk_set_rate(pll1, retval);
-       if (retval != 0)
-               goto out_abdac;
-
-       retval = clk_set_parent(abdac, pll1);
-       if (retval != 0)
-               goto out_abdac;
-
-out_abdac:
-       clk_put(abdac);
-out_pll1:
-       clk_put(pll1);
-out_osc1:
-       clk_put(osc1);
-out:
-       return retval;
-}
-
-static int __init favr32_init(void)
-{
-       /*
-        * Favr-32 uses 32-bit SDRAM interface. Reserve the SDRAM-specific
-        * pins so that nobody messes with them.
-        */
-       at32_reserve_pin(GPIO_PIOE_BASE, ATMEL_EBI_PE_DATA_ALL);
-
-       at32_select_gpio(GPIO_PIN_PB(3), 0);    /* IRQ from ADS7843 */
-
-       at32_add_device_usart(0);
-
-       set_hw_addr(at32_add_device_eth(0, &eth_data[0]));
-
-       spi1_board_info[0].irq = gpio_to_irq(GPIO_PIN_PB(3));
-
-       set_abdac_rate(at32_add_device_abdac(0, &abdac0_data));
-
-       at32_add_device_pwm(1 << PWM_BL_CH);
-       at32_add_device_spi(1, spi1_board_info, ARRAY_SIZE(spi1_board_info));
-       at32_add_device_mci(0, &mci0_data);
-       at32_add_device_usba(0, NULL);
-       at32_add_device_lcdc(0, &favr32_lcdc_data, fbmem_start, fbmem_size, 0);
-
-       favr32_setup_leds();
-
-       favr32_setup_atmel_pwm_bl();
-
-       return 0;
-}
-postcore_initcall(favr32_init);
diff --git a/arch/avr32/boards/hammerhead/Kconfig b/arch/avr32/boards/hammerhead/Kconfig
deleted file mode 100644 (file)
index 5c13d78..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-# Hammerhead customization
-
-if BOARD_HAMMERHEAD
-
-config BOARD_HAMMERHEAD_USB
-       bool "Philips ISP116x-hcd USB support"
-       help
-         This enables USB support for Hammerheads internal ISP116x
-         controller from Philips.
-
-         Choose 'Y' here if you want to have your board USB driven.
-
-config BOARD_HAMMERHEAD_LCD
-       bool "Atmel AT91/AT32 LCD support"
-       help
-         This enables LCD support for the Hammerhead board. You may
-         also add support for framebuffer devices (AT91/AT32 LCD Controller)
-         and framebuffer console support to get the most out of your LCD.
-
-         Choose 'Y' here if you have ordered a Corona daugther board and
-         want to have support for your Hantronix HDA-351T-LV LCD.
-
-config BOARD_HAMMERHEAD_SND
-       bool "Atmel AC97 Sound support"
-       help
-         This enables Sound support for the Hammerhead board. You may
-         also go through the ALSA settings to get it working.
-
-         Choose 'Y' here if you have ordered a Corona daugther board and
-         want to make your board funky.
-
-config BOARD_HAMMERHEAD_FPGA
-       bool "Hammerhead FPGA Support"
-       default y
-       help
-         This adds support for the Cyclone III FPGA from Altera
-         found on Miromico's Hammerhead board.
-
-         Choose 'Y' here if you want to have FPGA support enabled.
-         You will have to choose the "Hammerhead FPGA Device Support" in
-         Device Drivers->Misc to be able to use FPGA functionality.
-
-endif  # BOARD_ATNGW100
diff --git a/arch/avr32/boards/hammerhead/Makefile b/arch/avr32/boards/hammerhead/Makefile
deleted file mode 100644 (file)
index c740aa1..0000000
+++ /dev/null
@@ -1 +0,0 @@
-obj-y                          += setup.o flash.o
diff --git a/arch/avr32/boards/hammerhead/flash.c b/arch/avr32/boards/hammerhead/flash.c
deleted file mode 100644 (file)
index e86280c..0000000
+++ /dev/null
@@ -1,381 +0,0 @@
-/*
- * Hammerhead board-specific flash initialization
- *
- * Copyright (C) 2008 Miromico AG
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/partitions.h>
-#include <linux/mtd/physmap.h>
-#include <linux/usb/isp116x.h>
-#include <linux/dma-mapping.h>
-#include <linux/delay.h>
-
-#include <mach/portmux.h>
-#include <mach/at32ap700x.h>
-#include <mach/smc.h>
-
-#include "../../mach-at32ap/clock.h"
-#include "flash.h"
-
-
-#define HAMMERHEAD_USB_PERIPH_GCLK0    0x40000000
-#define HAMMERHEAD_USB_PERIPH_CS2      0x02000000
-#define HAMMERHEAD_USB_PERIPH_EXTINT0  0x02000000
-
-#define HAMMERHEAD_FPGA_PERIPH_MOSI    0x00000002
-#define HAMMERHEAD_FPGA_PERIPH_SCK     0x00000020
-#define HAMMERHEAD_FPGA_PERIPH_EXTINT3 0x10000000
-
-static struct smc_timing flash_timing __initdata = {
-       .ncs_read_setup         = 0,
-       .nrd_setup              = 40,
-       .ncs_write_setup        = 0,
-       .nwe_setup              = 10,
-
-       .ncs_read_pulse         = 80,
-       .nrd_pulse              = 40,
-       .ncs_write_pulse        = 65,
-       .nwe_pulse              = 55,
-
-       .read_cycle             = 120,
-       .write_cycle            = 120,
-};
-
-static struct smc_config flash_config __initdata = {
-       .bus_width              = 2,
-       .nrd_controlled         = 1,
-       .nwe_controlled         = 1,
-       .byte_write             = 1,
-};
-
-static struct mtd_partition flash_parts[] = {
-       {
-               .name           = "u-boot",
-               .offset         = 0x00000000,
-               .size           = 0x00020000,           /* 128 KiB */
-               .mask_flags     = MTD_WRITEABLE,
-       },
-       {
-               .name           = "root",
-               .offset         = 0x00020000,
-               .size           = 0x007d0000,
-       },
-       {
-               .name           = "env",
-               .offset         = 0x007f0000,
-               .size           = 0x00010000,
-               .mask_flags     = MTD_WRITEABLE,
-       },
-};
-
-static struct physmap_flash_data flash_data = {
-       .width          = 2,
-       .nr_parts       = ARRAY_SIZE(flash_parts),
-       .parts          = flash_parts,
-};
-
-static struct resource flash_resource = {
-       .start          = 0x00000000,
-       .end            = 0x007fffff,
-       .flags          = IORESOURCE_MEM,
-};
-
-static struct platform_device flash_device = {
-       .name           = "physmap-flash",
-       .id             = 0,
-       .resource       = &flash_resource,
-       .num_resources  = 1,
-       .dev            = { .platform_data = &flash_data, },
-};
-
-#ifdef CONFIG_BOARD_HAMMERHEAD_USB
-
-static struct smc_timing isp1160_timing __initdata = {
-       .ncs_read_setup         = 75,
-       .nrd_setup              = 75,
-       .ncs_write_setup        = 75,
-       .nwe_setup              = 75,
-
-
-       /* We use conservative timing settings, as the minimal settings aren't
-          stable. There may be room for tweaking. */
-       .ncs_read_pulse         = 75,  /* min. 33ns */
-       .nrd_pulse              = 75,  /* min. 33ns */
-       .ncs_write_pulse        = 75,  /* min. 26ns */
-       .nwe_pulse              = 75,  /* min. 26ns */
-
-       .read_cycle             = 225, /* min. 143ns */
-       .write_cycle            = 225, /* min. 136ns */
-};
-
-static struct smc_config isp1160_config __initdata = {
-       .bus_width              = 2,
-       .nrd_controlled         = 1,
-       .nwe_controlled         = 1,
-       .byte_write             = 0,
-};
-
-/*
- * The platform delay function is only used to enforce the strange
- * read to write delay. This can not be configured in the SMC. All other
- * timings are controlled by the SMC (see timings obove)
- * So in isp116x-hcd.c we should comment out USE_PLATFORM_DELAY
- */
-void isp116x_delay(struct device *dev, int delay)
-{
-       if (delay > 150)
-               ndelay(delay - 150);
-}
-
-static struct  isp116x_platform_data isp1160_data = {
-       .sel15Kres              = 1,    /* use internal downstream resistors */
-       .oc_enable              = 0,    /* external overcurrent detection */
-       .int_edge_triggered     = 0,    /* interrupt is level triggered */
-       .int_act_high           = 0,    /* interrupt is active low */
-       .delay = isp116x_delay,         /* platform delay function */
-};
-
-static struct resource isp1160_resource[] = {
-       {
-               .start          = 0x08000000,
-               .end            = 0x08000001,
-               .flags          = IORESOURCE_MEM,
-       },
-       {
-               .start          = 0x08000002,
-               .end            = 0x08000003,
-               .flags          = IORESOURCE_MEM,
-       },
-       {
-               .start          = 64,
-               .flags          = IORESOURCE_IRQ,
-       },
-};
-
-static struct platform_device isp1160_device = {
-       .name           = "isp116x-hcd",
-       .id             = 0,
-       .resource       = isp1160_resource,
-       .num_resources  = 3,
-       .dev            = {
-               .platform_data = &isp1160_data,
-       },
-};
-#endif
-
-#ifdef CONFIG_BOARD_HAMMERHEAD_USB
-static int __init hammerhead_usbh_init(void)
-{
-       struct clk *gclk;
-       struct clk *osc;
-
-       int ret;
-
-       /* setup smc for usbh */
-       smc_set_timing(&isp1160_config, &isp1160_timing);
-       ret = smc_set_configuration(2, &isp1160_config);
-
-       if (ret < 0) {
-               printk(KERN_ERR
-                      "hammerhead: failed to set ISP1160 USBH timing\n");
-               return ret;
-       }
-
-       /* setup gclk0 to run from osc1 */
-       gclk = clk_get(NULL, "gclk0");
-       if (IS_ERR(gclk)) {
-               ret = PTR_ERR(gclk);
-               goto err_gclk;
-       }
-
-       osc = clk_get(NULL, "osc1");
-       if (IS_ERR(osc)) {
-               ret = PTR_ERR(osc);
-               goto err_osc;
-       }
-
-       ret = clk_set_parent(gclk, osc);
-       if (ret < 0) {
-               pr_debug("hammerhead: failed to set osc1 for USBH clock\n");
-               goto err_set_clk;
-       }
-
-       /* set clock to 6MHz */
-       clk_set_rate(gclk, 6000000);
-
-       /* and enable */
-       clk_enable(gclk);
-
-       /* select GCLK0 peripheral function */
-       at32_select_periph(GPIO_PIOA_BASE, HAMMERHEAD_USB_PERIPH_GCLK0,
-                          GPIO_PERIPH_A, 0);
-
-       /* enable CS2 peripheral function */
-       at32_select_periph(GPIO_PIOE_BASE, HAMMERHEAD_USB_PERIPH_CS2,
-                          GPIO_PERIPH_A, 0);
-
-       /* H_WAKEUP must be driven low */
-       at32_select_gpio(GPIO_PIN_PA(8), AT32_GPIOF_OUTPUT);
-
-       /* Select EXTINT0 for PB25 */
-       at32_select_periph(GPIO_PIOB_BASE, HAMMERHEAD_USB_PERIPH_EXTINT0,
-                          GPIO_PERIPH_A, 0);
-
-       /* register usbh device driver */
-       platform_device_register(&isp1160_device);
-
- err_set_clk:
-       clk_put(osc);
- err_osc:
-       clk_put(gclk);
- err_gclk:
-       return ret;
-}
-#endif
-
-#ifdef CONFIG_BOARD_HAMMERHEAD_FPGA
-static struct smc_timing fpga_timing __initdata = {
-       .ncs_read_setup         = 16,
-       .nrd_setup              = 32,
-       .ncs_read_pulse         = 48,
-       .nrd_pulse              = 32,
-       .read_cycle             = 64,
-
-       .ncs_write_setup        = 16,
-       .nwe_setup              = 16,
-       .ncs_write_pulse        = 32,
-       .nwe_pulse              = 32,
-       .write_cycle            = 64,
-};
-
-static struct smc_config fpga_config __initdata = {
-       .bus_width              = 4,
-       .nrd_controlled         = 1,
-       .nwe_controlled         = 1,
-       .byte_write             = 0,
-};
-
-static struct resource hh_fpga0_resource[] = {
-       {
-               .start          = 0xffe00400,
-               .end            = 0xffe00400 + 0x3ff,
-               .flags          = IORESOURCE_MEM,
-       },
-       {
-               .start          = 4,
-               .end            = 4,
-               .flags          = IORESOURCE_IRQ,
-       },
-       {
-               .start          = 0x0c000000,
-               .end            = 0x0c000100,
-               .flags          = IORESOURCE_MEM,
-       },
-       {
-               .start          = 67,
-               .end            = 67,
-               .flags          = IORESOURCE_IRQ,
-       },
-};
-
-static u64 hh_fpga0_dma_mask = DMA_BIT_MASK(32);
-static struct platform_device hh_fpga0_device = {
-       .name           = "hh_fpga",
-       .id             = 0,
-       .dev            = {
-               .dma_mask = &hh_fpga0_dma_mask,
-               .coherent_dma_mask = DMA_BIT_MASK(32),
-       },
-       .resource       = hh_fpga0_resource,
-       .num_resources  = ARRAY_SIZE(hh_fpga0_resource),
-};
-
-static struct clk hh_fpga0_spi_clk = {
-       .name           = "spi_clk",
-       .dev            = &hh_fpga0_device.dev,
-       .mode           = pba_clk_mode,
-       .get_rate       = pba_clk_get_rate,
-       .index          = 1,
-};
-
-struct platform_device *__init at32_add_device_hh_fpga(void)
-{
-       /* Select peripheral functionallity for SPI SCK and MOSI */
-       at32_select_periph(GPIO_PIOB_BASE, HAMMERHEAD_FPGA_PERIPH_SCK,
-                          GPIO_PERIPH_B, 0);
-       at32_select_periph(GPIO_PIOB_BASE, HAMMERHEAD_FPGA_PERIPH_MOSI,
-                          GPIO_PERIPH_B, 0);
-
-       /* reserve all other needed gpio
-        * We have on board pull ups, so there is no need
-        * to enable gpio pull ups */
-       /* INIT_DONE (input) */
-       at32_select_gpio(GPIO_PIN_PB(0), 0);
-
-       /* nSTATUS (input) */
-       at32_select_gpio(GPIO_PIN_PB(2), 0);
-
-       /* nCONFIG (output, low) */
-       at32_select_gpio(GPIO_PIN_PB(3), AT32_GPIOF_OUTPUT);
-
-       /* CONF_DONE (input) */
-       at32_select_gpio(GPIO_PIN_PB(4), 0);
-
-       /* Select EXTINT3 for PB28 (Interrupt from FPGA) */
-       at32_select_periph(GPIO_PIOB_BASE, HAMMERHEAD_FPGA_PERIPH_EXTINT3,
-                          GPIO_PERIPH_A, 0);
-
-       /* Get our parent clock */
-       hh_fpga0_spi_clk.parent = clk_get(NULL, "pba");
-       clk_put(hh_fpga0_spi_clk.parent);
-
-       /* Register clock in at32 clock tree */
-       at32_clk_register(&hh_fpga0_spi_clk);
-
-       platform_device_register(&hh_fpga0_device);
-       return &hh_fpga0_device;
-}
-#endif
-
-/* This needs to be called after the SMC has been initialized */
-static int __init hammerhead_flash_init(void)
-{
-       int ret;
-
-       smc_set_timing(&flash_config, &flash_timing);
-       ret = smc_set_configuration(0, &flash_config);
-
-       if (ret < 0) {
-               printk(KERN_ERR "hammerhead: failed to set NOR flash timing\n");
-               return ret;
-       }
-
-       platform_device_register(&flash_device);
-
-#ifdef CONFIG_BOARD_HAMMERHEAD_USB
-       hammerhead_usbh_init();
-#endif
-
-#ifdef CONFIG_BOARD_HAMMERHEAD_FPGA
-       /* Setup SMC for FPGA interface */
-       smc_set_timing(&fpga_config, &fpga_timing);
-       ret = smc_set_configuration(3, &fpga_config);
-#endif
-
-
-       if (ret < 0) {
-               printk(KERN_ERR "hammerhead: failed to set FPGA timing\n");
-               return ret;
-       }
-
-       return 0;
-}
-
-device_initcall(hammerhead_flash_init);
diff --git a/arch/avr32/boards/hammerhead/flash.h b/arch/avr32/boards/hammerhead/flash.h
deleted file mode 100644 (file)
index ea70c62..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __BOARDS_HAMMERHEAD_FLASH_H
-#define __BOARDS_HAMMERHEAD_FLASH_H
-
-struct platform_device *at32_add_device_hh_fpga(void);
-
-#endif /* __BOARDS_HAMMERHEAD_FLASH_H */
diff --git a/arch/avr32/boards/hammerhead/setup.c b/arch/avr32/boards/hammerhead/setup.c
deleted file mode 100644 (file)
index dc0e317..0000000
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * Board-specific setup code for the Miromico Hammerhead board
- *
- * Copyright (C) 2008 Miromico AG
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/atmel-mci.h>
-#include <linux/clk.h>
-#include <linux/fb.h>
-#include <linux/etherdevice.h>
-#include <linux/i2c.h>
-#include <linux/i2c-gpio.h>
-#include <linux/init.h>
-#include <linux/linkage.h>
-#include <linux/platform_device.h>
-#include <linux/types.h>
-#include <linux/spi/spi.h>
-
-#include <video/atmel_lcdc.h>
-
-#include <linux/io.h>
-#include <asm/setup.h>
-
-#include <mach/at32ap700x.h>
-#include <mach/board.h>
-#include <mach/init.h>
-#include <mach/portmux.h>
-
-#include <sound/atmel-ac97c.h>
-
-#include "../../mach-at32ap/clock.h"
-#include "flash.h"
-
-/* Oscillator frequencies. These are board-specific */
-unsigned long at32_board_osc_rates[3] = {
-       [0] = 32768,    /* 32.768 kHz on RTC osc */
-       [1] = 25000000, /* 25MHz on osc0 */
-       [2] = 12000000, /* 12 MHz on osc1 */
-};
-
-/* Initialized by bootloader-specific startup code. */
-struct tag *bootloader_tags __initdata;
-
-#ifdef CONFIG_BOARD_HAMMERHEAD_LCD
-static struct fb_videomode __initdata hda350tlv_modes[] = {
-       {
-               .name           = "320x240 @ 75",
-               .refresh        = 75,
-               .xres           = 320,
-               .yres           = 240,
-               .pixclock       = KHZ2PICOS(6891),
-
-               .left_margin    = 48,
-               .right_margin   = 18,
-               .upper_margin   = 18,
-               .lower_margin   = 4,
-               .hsync_len      = 20,
-               .vsync_len      = 2,
-
-               .sync           = 0,
-               .vmode          = FB_VMODE_NONINTERLACED,
-       },
-};
-
-static struct fb_monspecs __initdata hammerhead_hda350t_monspecs = {
-       .manufacturer           = "HAN",
-       .monitor                = "HDA350T-LV",
-       .modedb                 = hda350tlv_modes,
-       .modedb_len             = ARRAY_SIZE(hda350tlv_modes),
-       .hfmin                  = 14900,
-       .hfmax                  = 22350,
-       .vfmin                  = 60,
-       .vfmax                  = 90,
-       .dclkmax                = 10000000,
-};
-
-struct atmel_lcdfb_pdata __initdata hammerhead_lcdc_data = {
-       .default_bpp            = 24,
-       .default_dmacon         = ATMEL_LCDC_DMAEN | ATMEL_LCDC_DMA2DEN,
-       .default_lcdcon2        = (ATMEL_LCDC_DISTYPE_TFT
-                                  | ATMEL_LCDC_INVCLK
-                                  | ATMEL_LCDC_CLKMOD_ALWAYSACTIVE
-                                  | ATMEL_LCDC_MEMOR_BIG),
-       .default_monspecs       = &hammerhead_hda350t_monspecs,
-       .guard_time             = 2,
-};
-#endif
-
-static struct mci_platform_data __initdata mci0_data = {
-       .slot[0] = {
-               .bus_width      = 4,
-               .detect_pin     = -ENODEV,
-               .wp_pin         = -ENODEV,
-       },
-};
-
-struct eth_addr {
-       u8 addr[6];
-};
-
-static struct eth_addr __initdata hw_addr[1];
-static struct macb_platform_data __initdata eth_data[1];
-
-/*
- * The next two functions should go away as the boot loader is
- * supposed to initialize the macb address registers with a valid
- * ethernet address. But we need to keep it around for a while until
- * we can be reasonably sure the boot loader does this.
- *
- * The phy_id is ignored as the driver will probe for it.
- */
-static int __init parse_tag_ethernet(struct tag *tag)
-{
-       int i = tag->u.ethernet.mac_index;
-
-       if (i < ARRAY_SIZE(hw_addr))
-               memcpy(hw_addr[i].addr, tag->u.ethernet.hw_address,
-                      sizeof(hw_addr[i].addr));
-
-       return 0;
-}
-__tagtable(ATAG_ETHERNET, parse_tag_ethernet);
-
-static void __init set_hw_addr(struct platform_device *pdev)
-{
-       struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       const u8 *addr;
-       void __iomem *regs;
-       struct clk *pclk;
-
-       if (!res)
-               return;
-
-       if (pdev->id >= ARRAY_SIZE(hw_addr))
-               return;
-
-       addr = hw_addr[pdev->id].addr;
-
-       if (!is_valid_ether_addr(addr))
-               return;
-
-       /*
-        * Since this is board-specific code, we'll cheat and use the
-        * physical address directly as we happen to know that it's
-        * the same as the virtual address.
-        */
-       regs = (void __iomem __force *)res->start;
-       pclk = clk_get(&pdev->dev, "pclk");
-
-       if (IS_ERR(pclk))
-               return;
-
-       clk_enable(pclk);
-
-       __raw_writel((addr[3] << 24) | (addr[2] << 16) | (addr[1] << 8) |
-                    addr[0], regs + 0x98);
-       __raw_writel((addr[5] << 8) | addr[4], regs + 0x9c);
-
-       clk_disable(pclk);
-       clk_put(pclk);
-}
-
-void __init setup_board(void)
-{
-       at32_map_usart(1, 0, 0);        /* USART 1: /dev/ttyS0, DB9 */
-       at32_setup_serial_console(0);
-}
-
-static struct i2c_gpio_platform_data i2c_gpio_data = {
-       .sda_pin                = GPIO_PIN_PA(6),
-       .scl_pin                = GPIO_PIN_PA(7),
-       .sda_is_open_drain      = 1,
-       .scl_is_open_drain      = 1,
-       .udelay                 = 2,    /* close to 100 kHz */
-};
-
-static struct platform_device i2c_gpio_device = {
-       .name           = "i2c-gpio",
-       .id             = 0,
-       .dev            = { .platform_data = &i2c_gpio_data, },
-};
-
-static struct i2c_board_info __initdata i2c_info[] = {};
-
-#ifdef CONFIG_BOARD_HAMMERHEAD_SND
-static struct ac97c_platform_data ac97c_data = {
-       .reset_pin = GPIO_PIN_PA(16),
-};
-#endif
-
-static int __init hammerhead_init(void)
-{
-       /*
-        * Hammerhead uses 32-bit SDRAM interface. Reserve the
-        * SDRAM-specific pins so that nobody messes with them.
-        */
-       at32_reserve_pin(GPIO_PIOE_BASE, ATMEL_EBI_PE_DATA_ALL);
-
-       at32_add_device_usart(0);
-
-       /* Reserve PB29 (GCLK3). This pin is used as clock source
-        * for ETH PHY (25MHz). GCLK3 setup is done by U-Boot.
-        */
-       at32_reserve_pin(GPIO_PIOB_BASE, (1<<29));
-
-       /*
-        * Hammerhead uses only one ethernet port, so we don't set
-        * address of second port
-        */
-       set_hw_addr(at32_add_device_eth(0, &eth_data[0]));
-
-#ifdef CONFIG_BOARD_HAMMERHEAD_FPGA
-       at32_add_device_hh_fpga();
-#endif
-       at32_add_device_mci(0, &mci0_data);
-
-#ifdef CONFIG_BOARD_HAMMERHEAD_USB
-       at32_add_device_usba(0, NULL);
-#endif
-#ifdef CONFIG_BOARD_HAMMERHEAD_LCD
-       at32_add_device_lcdc(0, &hammerhead_lcdc_data, fbmem_start,
-                            fbmem_size, ATMEL_LCDC_PRI_24BIT);
-#endif
-
-       at32_select_gpio(i2c_gpio_data.sda_pin,
-                        AT32_GPIOF_MULTIDRV | AT32_GPIOF_OUTPUT |
-                        AT32_GPIOF_HIGH);
-       at32_select_gpio(i2c_gpio_data.scl_pin,
-                        AT32_GPIOF_MULTIDRV | AT32_GPIOF_OUTPUT |
-                        AT32_GPIOF_HIGH);
-       platform_device_register(&i2c_gpio_device);
-       i2c_register_board_info(0, i2c_info, ARRAY_SIZE(i2c_info));
-
-#ifdef CONFIG_BOARD_HAMMERHEAD_SND
-       at32_add_device_ac97c(0, &ac97c_data, AC97C_BOTH);
-#endif
-
-       /* Select the Touchscreen interrupt pin mode */
-       at32_select_periph(GPIO_PIOB_BASE, 0x08000000, GPIO_PERIPH_A, 0);
-
-       return 0;
-}
-
-postcore_initcall(hammerhead_init);
diff --git a/arch/avr32/boards/merisc/Kconfig b/arch/avr32/boards/merisc/Kconfig
deleted file mode 100644 (file)
index 7e04327..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-# Merisc customization
-
-if BOARD_MERISC
-
-endif  # BOARD_MERISC
diff --git a/arch/avr32/boards/merisc/Makefile b/arch/avr32/boards/merisc/Makefile
deleted file mode 100644 (file)
index d24c787..0000000
+++ /dev/null
@@ -1 +0,0 @@
-obj-y                                  += setup.o flash.o display.o merisc_sysfs.o
diff --git a/arch/avr32/boards/merisc/display.c b/arch/avr32/boards/merisc/display.c
deleted file mode 100644 (file)
index e7683ee..0000000
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Display setup code for the Merisc board
- *
- * Copyright (C) 2008 Martinsson Elektronik AB
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/fb.h>
-#include <video/atmel_lcdc.h>
-#include <asm/setup.h>
-#include <mach/board.h>
-#include "merisc.h"
-
-static struct fb_videomode merisc_fb_videomode[] = {
-       {
-               .refresh        = 44,
-               .xres           = 640,
-               .yres           = 480,
-               .left_margin    = 96,
-               .right_margin   = 96,
-               .upper_margin   = 34,
-               .lower_margin   = 8,
-               .hsync_len      = 64,
-               .vsync_len      = 64,
-               .name           = "640x480 @ 44",
-               .pixclock       = KHZ2PICOS(25180),
-               .sync           = 0,
-               .vmode          = FB_VMODE_NONINTERLACED,
-       },
-};
-
-static struct fb_monspecs merisc_fb_monspecs = {
-       .manufacturer   = "Kyo",
-       .monitor        = "TCG075VG2AD",
-       .modedb         = merisc_fb_videomode,
-       .modedb_len     = ARRAY_SIZE(merisc_fb_videomode),
-       .hfmin          = 30000,
-       .hfmax          = 33333,
-       .vfmin          = 60,
-       .vfmax          = 90,
-       .dclkmax        = 30000000,
-};
-
-struct atmel_lcdfb_pdata merisc_lcdc_data = {
-       .default_bpp            = 24,
-       .default_dmacon         = ATMEL_LCDC_DMAEN | ATMEL_LCDC_DMA2DEN,
-       .default_lcdcon2        = (ATMEL_LCDC_DISTYPE_TFT
-                                  | ATMEL_LCDC_CLKMOD_ALWAYSACTIVE
-                                  | ATMEL_LCDC_MEMOR_BIG),
-       .default_monspecs       = &merisc_fb_monspecs,
-       .guard_time             = 2,
-};
-
-static int __init merisc_display_init(void)
-{
-       at32_add_device_lcdc(0, &merisc_lcdc_data, fbmem_start,
-                            fbmem_size, 0);
-
-       return 0;
-}
-device_initcall(merisc_display_init);
diff --git a/arch/avr32/boards/merisc/flash.c b/arch/avr32/boards/merisc/flash.c
deleted file mode 100644 (file)
index 8e856fd..0000000
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Merisc board-specific flash initialization
- *
- * Copyright (C) 2008 Martinsson Elektronik AB
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/partitions.h>
-#include <linux/mtd/physmap.h>
-#include <mach/smc.h>
-
-/* Will be translated to units of 14.3 ns, rounded up */
-static struct smc_timing flash_timing __initdata = {
-       .ncs_read_setup         = 1 * 14,
-       .nrd_setup              = 5 * 14,
-       .ncs_write_setup        = 1 * 14,
-       .nwe_setup              = 2 * 14,
-
-       .ncs_read_pulse         = 12 * 14,
-       .nrd_pulse              = 7 * 14,
-       .ncs_write_pulse        = 8 * 14,
-       .nwe_pulse              = 4 * 14,
-
-       .read_cycle             = 14 * 14,
-       .write_cycle            = 10 * 14,
-};
-
-static struct smc_config flash_config __initdata = {
-       .bus_width      = 2,
-       .nrd_controlled = 1,
-       .nwe_controlled = 1,
-       .byte_write     = 1,
-       .tdf_cycles     = 3,
-};
-
-static struct mtd_partition flash_0_parts[] = {
-       {
-               .name           = "boot",
-               .offset         = 0x00000000,
-               .size           = 0x00060000,
-               .mask_flags     = 0,
-       },
-       {
-               .name           = "kernel",
-               .offset         = 0x00060000,
-               .size           = 0x00200000,
-               .mask_flags     = 0,
-       },
-       {
-               .name           = "root",
-               .offset         = 0x00260000,
-               .size           = MTDPART_SIZ_FULL,
-               .mask_flags     = 0,
-       },
-};
-
-static struct mtd_partition flash_1_parts[] = {
-       {
-               .name           = "2ndflash",
-               .offset         = 0x00000000,
-               .size           = MTDPART_SIZ_FULL,
-               .mask_flags     = 0,
-       },
-};
-
-static struct physmap_flash_data flash_data[] = {
-       {
-               .width          = 2,
-               .nr_parts       = ARRAY_SIZE(flash_0_parts),
-               .parts          = flash_0_parts,
-       },
-       {
-               .width          = 2,
-               .nr_parts       = ARRAY_SIZE(flash_1_parts),
-               .parts          = flash_1_parts,
-       }
-};
-
-static struct resource flash_resource[] = {
-       {
-               .start          = 0x00000000,
-               .end            = 0x03ffffff,
-               .flags          = IORESOURCE_MEM,
-       },
-       {
-               .start          = 0x04000000,
-               .end            = 0x07ffffff,
-               .flags          = IORESOURCE_MEM,
-       },
-};
-
-static struct platform_device flash_device[] = {
-       {
-               .name           = "physmap-flash",
-               .id             = 0,
-               .resource       = &flash_resource[0],
-               .num_resources  = 1,
-               .dev            = {
-                       .platform_data  = &flash_data[0],
-               },
-       },
-       {
-               .name           = "physmap-flash",
-               .id             = 1,
-               .resource       = &flash_resource[1],
-               .num_resources  = 1,
-               .dev            = {
-                       .platform_data  = &flash_data[1],
-               },
-       },
-};
-
-static int __init merisc_flash_init(void)
-{
-       int ret;
-       smc_set_timing(&flash_config, &flash_timing);
-
-       ret = smc_set_configuration(0, &flash_config);
-       if (ret < 0) {
-               printk(KERN_ERR "Merisc: failed to set NOR flash timing #0\n");
-               return ret;
-       }
-
-       ret = smc_set_configuration(4, &flash_config);
-       if (ret < 0) {
-               printk(KERN_ERR "Merisc: failed to set NOR flash timing #1\n");
-               return ret;
-       }
-
-       platform_device_register(&flash_device[0]);
-       platform_device_register(&flash_device[1]);
-       return 0;
-}
-device_initcall(merisc_flash_init);
diff --git a/arch/avr32/boards/merisc/merisc.h b/arch/avr32/boards/merisc/merisc.h
deleted file mode 100644 (file)
index 50ffb2f..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Merisc exports
- *
- * Copyright (C) 2008 Martinsson Elektronik AB
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ARCH_AVR32_BOARDS_MERISC_MERISC_H
-#define __ARCH_AVR32_BOARDS_MERISC_MERISC_H
-
-const char *merisc_revision(void);
-const char *merisc_model(void);
-
-extern struct class merisc_class;
-
-#endif /* __ARCH_AVR32_BOARDS_MERISC_MERISC_H */
diff --git a/arch/avr32/boards/merisc/merisc_sysfs.c b/arch/avr32/boards/merisc/merisc_sysfs.c
deleted file mode 100644 (file)
index 5a25231..0000000
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Merisc sysfs exports
- *
- * Copyright (C) 2008 Martinsson Elektronik AB
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <linux/device.h>
-#include <linux/timer.h>
-#include <linux/err.h>
-#include <linux/ctype.h>
-#include "merisc.h"
-
-static ssize_t merisc_model_show(struct class *class, char *buf)
-{
-       ssize_t ret = 0;
-
-       sprintf(buf, "%s\n", merisc_model());
-       ret = strlen(buf) + 1;
-
-       return ret;
-}
-
-static ssize_t merisc_revision_show(struct class *class, char *buf)
-{
-       ssize_t ret = 0;
-
-       sprintf(buf, "%s\n", merisc_revision());
-       ret = strlen(buf) + 1;
-
-       return ret;
-}
-
-static struct class_attribute merisc_class_attrs[] = {
-       __ATTR(model, S_IRUGO, merisc_model_show, NULL),
-       __ATTR(revision, S_IRUGO, merisc_revision_show, NULL),
-       __ATTR_NULL,
-};
-
-struct class merisc_class = {
-       .name =         "merisc",
-       .owner =        THIS_MODULE,
-       .class_attrs =  merisc_class_attrs,
-};
-
-static int __init merisc_sysfs_init(void)
-{
-       int status;
-
-       status = class_register(&merisc_class);
-       if (status < 0)
-               return status;
-
-       return 0;
-}
-
-postcore_initcall(merisc_sysfs_init);
diff --git a/arch/avr32/boards/merisc/setup.c b/arch/avr32/boards/merisc/setup.c
deleted file mode 100644 (file)
index 718a6d7..0000000
+++ /dev/null
@@ -1,305 +0,0 @@
-/*
- * Board-specific setup code for the Merisc
- *
- * Copyright (C) 2008 Martinsson Elektronik AB
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/clk.h>
-#include <linux/etherdevice.h>
-#include <linux/i2c.h>
-#include <linux/i2c-gpio.h>
-#include <linux/gpio.h>
-#include <linux/init.h>
-#include <linux/linkage.h>
-#include <linux/platform_device.h>
-#include <linux/types.h>
-#include <linux/leds.h>
-#include <linux/spi/spi.h>
-#include <linux/spi/ads7846.h>
-#include <linux/irq.h>
-#include <linux/fb.h>
-#include <linux/atmel-mci.h>
-#include <linux/pwm.h>
-#include <linux/leds_pwm.h>
-
-#include <asm/io.h>
-#include <asm/setup.h>
-
-#include <mach/at32ap700x.h>
-#include <mach/board.h>
-#include <mach/init.h>
-#include <mach/portmux.h>
-
-#include "merisc.h"
-
-/* Holds the autodetected board model and revision */
-static int merisc_board_id;
-
-/* Initialized by bootloader-specific startup code. */
-struct tag *bootloader_tags __initdata;
-
-/* Oscillator frequencies. These are board specific */
-unsigned long at32_board_osc_rates[3] = {
-       [0]     = 32768,        /* 32.768 kHz on RTC osc */
-       [1]     = 20000000,     /* 20 MHz on osc0 */
-       [2]     = 12000000,     /* 12 MHz on osc1 */
-};
-
-struct eth_addr {
-       u8 addr[6];
-};
-
-static struct eth_addr __initdata hw_addr[2];
-static struct macb_platform_data __initdata eth_data[2];
-
-static int ads7846_get_pendown_state_PB26(void)
-{
-       return !gpio_get_value(GPIO_PIN_PB(26));
-}
-
-static int ads7846_get_pendown_state_PB28(void)
-{
-       return !gpio_get_value(GPIO_PIN_PB(28));
-}
-
-static struct ads7846_platform_data __initdata ads7846_data = {
-       .model                          = 7846,
-       .vref_delay_usecs               = 100,
-       .vref_mv                        = 0,
-       .keep_vref_on                   = 0,
-       .settle_delay_usecs             = 150,
-       .penirq_recheck_delay_usecs     = 1,
-       .x_plate_ohms                   = 800,
-       .debounce_rep                   = 4,
-       .debounce_max                   = 10,
-       .debounce_tol                   = 50,
-       .get_pendown_state              = ads7846_get_pendown_state_PB26,
-       .filter_init                    = NULL,
-       .filter                         = NULL,
-       .filter_cleanup                 = NULL,
-};
-
-static struct spi_board_info __initdata spi0_board_info[] = {
-       {
-               .modalias       = "ads7846",
-               .max_speed_hz   = 3250000,
-               .chip_select    = 0,
-               .bus_num        = 0,
-               .platform_data  = &ads7846_data,
-               .mode           = SPI_MODE_0,
-       },
-};
-
-static struct mci_platform_data __initdata mci0_data = {
-       .slot[0] = {
-               .bus_width              = 4,
-               .detect_pin             = GPIO_PIN_PE(19),
-               .wp_pin                 = GPIO_PIN_PE(20),
-               .detect_is_active_high  = true,
-       },
-};
-
-static int __init parse_tag_ethernet(struct tag *tag)
-{
-       int i;
-
-       i = tag->u.ethernet.mac_index;
-       if (i < ARRAY_SIZE(hw_addr)) {
-               memcpy(hw_addr[i].addr, tag->u.ethernet.hw_address,
-                      sizeof(hw_addr[i].addr));
-       }
-
-       return 0;
-}
-__tagtable(ATAG_ETHERNET, parse_tag_ethernet);
-
-static void __init set_hw_addr(struct platform_device *pdev)
-{
-       struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       const u8 *addr;
-       void __iomem *regs;
-       struct clk *pclk;
-
-       if (!res)
-               return;
-
-       if (pdev->id >= ARRAY_SIZE(hw_addr))
-               return;
-
-       addr = hw_addr[pdev->id].addr;
-       if (!is_valid_ether_addr(addr))
-               return;
-
-       regs = (void __iomem __force *)res->start;
-       pclk = clk_get(&pdev->dev, "pclk");
-       if (IS_ERR(pclk))
-               return;
-
-       clk_enable(pclk);
-       __raw_writel((addr[3] << 24) | (addr[2] << 16)
-                    | (addr[1] << 8) | addr[0], regs + 0x98);
-       __raw_writel((addr[5] << 8) | addr[4], regs + 0x9c);
-       clk_disable(pclk);
-       clk_put(pclk);
-}
-
-static struct i2c_gpio_platform_data i2c_gpio_data = {
-       .sda_pin                = GPIO_PIN_PA(6),
-       .scl_pin                = GPIO_PIN_PA(7),
-       .sda_is_open_drain      = 1,
-       .scl_is_open_drain      = 1,
-       .udelay                 = 2,
-};
-
-static struct platform_device i2c_gpio_device = {
-       .name   = "i2c-gpio",
-       .id     = 0,
-       .dev    = {
-               .platform_data  = &i2c_gpio_data,
-       },
-};
-
-static struct i2c_board_info __initdata i2c_info[] = {
-       {
-               I2C_BOARD_INFO("pcf8563", 0x51)
-       },
-};
-
-#if IS_ENABLED(CONFIG_LEDS_PWM)
-static struct pwm_lookup pwm_lookup[] = {
-       PWM_LOOKUP("at91sam9rl-pwm", 0, "leds_pwm", "backlight",
-                  5000, PWM_POLARITY_NORMAL),
-};
-
-static struct led_pwm pwm_leds[] = {
-       {
-               .name   = "backlight",
-               .max_brightness = 255,
-       },
-};
-
-static struct led_pwm_platform_data pwm_data = {
-       .num_leds       = ARRAY_SIZE(pwm_leds),
-       .leds           = pwm_leds,
-};
-
-static struct platform_device leds_pwm = {
-       .name   = "leds_pwm",
-       .id     = -1,
-       .dev    = {
-               .platform_data = &pwm_data,
-       },
-};
-#endif
-
-const char *merisc_model(void)
-{
-       switch (merisc_board_id) {
-       case 0:
-       case 1:
-               return "500-01";
-       case 2:
-               return "BT";
-       default:
-               return "Unknown";
-       }
-}
-
-const char *merisc_revision(void)
-{
-       switch (merisc_board_id) {
-       case 0:
-               return "B";
-       case 1:
-               return "D";
-       case 2:
-               return "A";
-       default:
-               return "Unknown";
-       }
-}
-
-static void detect_merisc_board_id(void)
-{
-       /* Board ID pins MUST be set as input or the board may be damaged */
-       at32_select_gpio(GPIO_PIN_PA(24), AT32_GPIOF_PULLUP);
-       at32_select_gpio(GPIO_PIN_PA(25), AT32_GPIOF_PULLUP);
-       at32_select_gpio(GPIO_PIN_PA(26), AT32_GPIOF_PULLUP);
-       at32_select_gpio(GPIO_PIN_PA(27), AT32_GPIOF_PULLUP);
-
-       merisc_board_id = !gpio_get_value(GPIO_PIN_PA(24)) +
-               !gpio_get_value(GPIO_PIN_PA(25)) * 2 +
-               !gpio_get_value(GPIO_PIN_PA(26)) * 4 +
-               !gpio_get_value(GPIO_PIN_PA(27)) * 8;
-}
-
-void __init setup_board(void)
-{
-       at32_map_usart(0, 0, 0);
-       at32_map_usart(1, 1, 0);
-       at32_map_usart(3, 3, 0);
-       at32_setup_serial_console(1);
-}
-
-static int __init merisc_init(void)
-{
-       detect_merisc_board_id();
-
-       printk(KERN_NOTICE "BOARD: Merisc %s revision %s\n", merisc_model(),
-              merisc_revision());
-
-       /* Reserve pins for SDRAM */
-       at32_reserve_pin(GPIO_PIOE_BASE, ATMEL_EBI_PE_DATA_ALL | (1 << 26));
-
-       if (merisc_board_id >= 1)
-               at32_map_usart(2, 2, 0);
-
-       at32_add_device_usart(0);
-       at32_add_device_usart(1);
-       if (merisc_board_id >= 1)
-               at32_add_device_usart(2);
-       at32_add_device_usart(3);
-       set_hw_addr(at32_add_device_eth(0, &eth_data[0]));
-
-       /* ADS7846 PENIRQ */
-       if (merisc_board_id == 0) {
-               ads7846_data.get_pendown_state = ads7846_get_pendown_state_PB26;
-               at32_select_periph(GPIO_PIOB_BASE, 1 << 26,
-                                  GPIO_PERIPH_A, AT32_GPIOF_PULLUP);
-               spi0_board_info[0].irq = AT32_EXTINT(1);
-       } else {
-               ads7846_data.get_pendown_state = ads7846_get_pendown_state_PB28;
-               at32_select_periph(GPIO_PIOB_BASE, 1 << 28, GPIO_PERIPH_A,
-                                  AT32_GPIOF_PULLUP);
-               spi0_board_info[0].irq = AT32_EXTINT(3);
-       }
-
-       /* ADS7846 busy pin */
-       at32_select_gpio(GPIO_PIN_PA(4), AT32_GPIOF_PULLUP);
-
-       at32_add_device_spi(0, spi0_board_info, ARRAY_SIZE(spi0_board_info));
-
-       at32_add_device_mci(0, &mci0_data);
-
-#if IS_ENABLED(CONFIG_LEDS_PWM)
-       pwm_add_table(pwm_lookup, ARRAY_SIZE(pwm_lookup));
-       at32_add_device_pwm((1 << 0) | (1 << 2));
-       platform_device_register(&leds_pwm);
-#else
-       at32_add_device_pwm((1 << 2));
-#endif
-
-       at32_select_gpio(i2c_gpio_data.sda_pin,
-               AT32_GPIOF_MULTIDRV | AT32_GPIOF_OUTPUT | AT32_GPIOF_HIGH);
-       at32_select_gpio(i2c_gpio_data.scl_pin,
-               AT32_GPIOF_MULTIDRV | AT32_GPIOF_OUTPUT | AT32_GPIOF_HIGH);
-       platform_device_register(&i2c_gpio_device);
-
-       i2c_register_board_info(0, i2c_info, ARRAY_SIZE(i2c_info));
-
-       return 0;
-}
-postcore_initcall(merisc_init);
diff --git a/arch/avr32/boards/mimc200/Makefile b/arch/avr32/boards/mimc200/Makefile
deleted file mode 100644 (file)
index c740aa1..0000000
+++ /dev/null
@@ -1 +0,0 @@
-obj-y                          += setup.o flash.o
diff --git a/arch/avr32/boards/mimc200/flash.c b/arch/avr32/boards/mimc200/flash.c
deleted file mode 100644 (file)
index d83d650..0000000
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * MIMC200 board-specific flash initialization
- *
- * Copyright (C) 2008 Mercury IMC Ltd
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/partitions.h>
-#include <linux/mtd/physmap.h>
-
-#include <mach/smc.h>
-
-static struct smc_timing flash_timing __initdata = {
-       .ncs_read_setup         = 0,
-       .nrd_setup              = 15,
-       .ncs_write_setup        = 0,
-       .nwe_setup              = 0,
-
-       .ncs_read_pulse         = 115,
-       .nrd_pulse              = 110,
-       .ncs_write_pulse        = 60,
-       .nwe_pulse              = 60,
-
-       .read_cycle             = 115,
-       .write_cycle            = 100,
-};
-
-static struct smc_config flash_config __initdata = {
-       .bus_width              = 2,
-       .nrd_controlled         = 1,
-       .nwe_controlled         = 1,
-       .byte_write             = 1,
-};
-
-/* system flash definition */
-
-static struct mtd_partition flash_parts_system[] = {
-       {
-               .name           = "u-boot",
-               .offset         = 0x00000000,
-               .size           = 0x00020000,           /* 128 KiB */
-               .mask_flags     = MTD_WRITEABLE,
-       },
-       {
-               .name           = "root",
-               .offset         = 0x00020000,
-               .size           = 0x007c0000,
-       },
-       {
-               .name           = "splash",
-               .offset         = 0x007e0000,
-               .size           = 0x00010000,           /* 64KiB */
-       },
-       {
-               .name           = "env",
-               .offset         = 0x007f0000,
-               .size           = 0x00010000,
-               .mask_flags     = MTD_WRITEABLE,
-       },
-};
-
-static struct physmap_flash_data flash_system = {
-       .width          = 2,
-       .nr_parts       = ARRAY_SIZE(flash_parts_system),
-       .parts          = flash_parts_system,
-};
-
-static struct resource flash_resource_system = {
-       .start          = 0x00000000,
-       .end            = 0x007fffff,
-       .flags          = IORESOURCE_MEM,
-};
-
-static struct platform_device flash_device_system = {
-       .name           = "physmap-flash",
-       .id             = 0,
-       .resource       = &flash_resource_system,
-       .num_resources  = 1,
-       .dev            = {
-               .platform_data = &flash_system,
-       },
-};
-
-/* data flash definition */
-
-static struct mtd_partition flash_parts_data[] = {
-       {
-               .name           = "data",
-               .offset         = 0x00000000,
-               .size           = 0x00800000,
-       },
-};
-
-static struct physmap_flash_data flash_data = {
-       .width          = 2,
-       .nr_parts       = ARRAY_SIZE(flash_parts_data),
-       .parts          = flash_parts_data,
-};
-
-static struct resource flash_resource_data = {
-       .start          = 0x08000000,
-       .end            = 0x087fffff,
-       .flags          = IORESOURCE_MEM,
-};
-
-static struct platform_device flash_device_data = {
-       .name           = "physmap-flash",
-       .id             = 1,
-       .resource       = &flash_resource_data,
-       .num_resources  = 1,
-       .dev            = {
-               .platform_data = &flash_data,
-       },
-};
-
-/* This needs to be called after the SMC has been initialized */
-static int __init mimc200_flash_init(void)
-{
-       int ret;
-
-       smc_set_timing(&flash_config, &flash_timing);
-       ret = smc_set_configuration(0, &flash_config);
-       if (ret < 0) {
-               printk(KERN_ERR "mimc200: failed to set 'System' NOR flash timing\n");
-               return ret;
-       }
-       ret = smc_set_configuration(1, &flash_config);
-       if (ret < 0) {
-               printk(KERN_ERR "mimc200: failed to set 'Data' NOR flash timing\n");
-               return ret;
-       }
-
-       platform_device_register(&flash_device_system);
-       platform_device_register(&flash_device_data);
-
-       return 0;
-}
-device_initcall(mimc200_flash_init);
diff --git a/arch/avr32/boards/mimc200/setup.c b/arch/avr32/boards/mimc200/setup.c
deleted file mode 100644 (file)
index 1cb8e9c..0000000
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * Board-specific setup code for the MIMC200
- *
- * Copyright (C) 2008 Mercury IMC Ltd
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-extern struct atmel_lcdfb_pdata mimc200_lcdc_data;
-
-#include <linux/clk.h>
-#include <linux/etherdevice.h>
-#include <linux/i2c-gpio.h>
-#include <linux/init.h>
-#include <linux/linkage.h>
-#include <linux/platform_device.h>
-#include <linux/types.h>
-#include <linux/leds.h>
-#include <linux/spi/spi.h>
-#include <linux/spi/eeprom.h>
-
-#include <video/atmel_lcdc.h>
-#include <linux/fb.h>
-
-#include <linux/atmel-mci.h>
-#include <linux/io.h>
-#include <asm/setup.h>
-
-#include <mach/at32ap700x.h>
-#include <mach/board.h>
-#include <mach/init.h>
-#include <mach/portmux.h>
-
-/* Oscillator frequencies. These are board-specific */
-unsigned long at32_board_osc_rates[3] = {
-       [0] = 32768,    /* 32.768 kHz on RTC osc */
-       [1] = 10000000, /* 10 MHz on osc0 */
-       [2] = 12000000, /* 12 MHz on osc1 */
-};
-
-/* Initialized by bootloader-specific startup code. */
-struct tag *bootloader_tags __initdata;
-
-static struct fb_videomode __initdata pt0434827_modes[] = {
-       {
-               .name           = "480x272 @ 72",
-               .refresh        = 72,
-               .xres           = 480,          .yres           = 272,
-               .pixclock       = KHZ2PICOS(10000),
-
-               .left_margin    = 1,            .right_margin   = 1,
-               .upper_margin   = 12,           .lower_margin   = 1,
-               .hsync_len      = 42,           .vsync_len      = 1,
-
-               .sync           = 0,
-               .vmode          = FB_VMODE_NONINTERLACED,
-       },
-};
-
-static struct fb_monspecs __initdata mimc200_default_monspecs = {
-       .manufacturer           = "PT",
-       .monitor                = "PT0434827-A401",
-       .modedb                 = pt0434827_modes,
-       .modedb_len             = ARRAY_SIZE(pt0434827_modes),
-       .hfmin                  = 14820,
-       .hfmax                  = 22230,
-       .vfmin                  = 60,
-       .vfmax                  = 85,
-       .dclkmax                = 25200000,
-};
-
-struct atmel_lcdfb_pdata __initdata mimc200_lcdc_data = {
-       .default_bpp            = 16,
-       .default_dmacon         = ATMEL_LCDC_DMAEN | ATMEL_LCDC_DMA2DEN,
-       .default_lcdcon2        = (ATMEL_LCDC_DISTYPE_TFT
-                                  | ATMEL_LCDC_INVCLK
-                                  | ATMEL_LCDC_CLKMOD_ALWAYSACTIVE
-                                  | ATMEL_LCDC_MEMOR_BIG),
-       .default_monspecs       = &mimc200_default_monspecs,
-       .guard_time             = 2,
-};
-
-struct eth_addr {
-       u8 addr[6];
-};
-static struct eth_addr __initdata hw_addr[2];
-static struct macb_platform_data __initdata eth_data[2];
-
-static struct spi_eeprom eeprom_25lc010 = {
-               .name = "25lc010",
-               .byte_len = 128,
-               .page_size = 16,
-               .flags = EE_ADDR1,
-};
-
-static struct spi_board_info spi0_board_info[] __initdata = {
-       {
-               .modalias       = "rtc-ds1390",
-               .max_speed_hz   = 4000000,
-               .chip_select    = 2,
-       },
-       {
-               .modalias       = "at25",
-               .max_speed_hz   = 1000000,
-               .chip_select    = 1,
-               .mode           = SPI_MODE_3,
-               .platform_data  = &eeprom_25lc010,
-       },
-};
-
-static struct mci_platform_data __initdata mci0_data = {
-       .slot[0] = {
-               .bus_width      = 4,
-               .detect_pin     = GPIO_PIN_PA(26),
-               .wp_pin         = GPIO_PIN_PA(27),
-       },
-};
-
-/*
- * The next two functions should go away as the boot loader is
- * supposed to initialize the macb address registers with a valid
- * ethernet address. But we need to keep it around for a while until
- * we can be reasonably sure the boot loader does this.
- *
- * The phy_id is ignored as the driver will probe for it.
- */
-static int __init parse_tag_ethernet(struct tag *tag)
-{
-       int i;
-
-       i = tag->u.ethernet.mac_index;
-       if (i < ARRAY_SIZE(hw_addr))
-               memcpy(hw_addr[i].addr, tag->u.ethernet.hw_address,
-                      sizeof(hw_addr[i].addr));
-
-       return 0;
-}
-__tagtable(ATAG_ETHERNET, parse_tag_ethernet);
-
-static void __init set_hw_addr(struct platform_device *pdev)
-{
-       struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       const u8 *addr;
-       void __iomem *regs;
-       struct clk *pclk;
-
-       if (!res)
-               return;
-       if (pdev->id >= ARRAY_SIZE(hw_addr))
-               return;
-
-       addr = hw_addr[pdev->id].addr;
-       if (!is_valid_ether_addr(addr))
-               return;
-
-       /*
-        * Since this is board-specific code, we'll cheat and use the
-        * physical address directly as we happen to know that it's
-        * the same as the virtual address.
-        */
-       regs = (void __iomem __force *)res->start;
-       pclk = clk_get(&pdev->dev, "pclk");
-       if (IS_ERR(pclk))
-               return;
-
-       clk_enable(pclk);
-       __raw_writel((addr[3] << 24) | (addr[2] << 16)
-                    | (addr[1] << 8) | addr[0], regs + 0x98);
-       __raw_writel((addr[5] << 8) | addr[4], regs + 0x9c);
-       clk_disable(pclk);
-       clk_put(pclk);
-}
-
-void __init setup_board(void)
-{
-       at32_map_usart(0, 0, 0);        /* USART 0: /dev/ttyS0 (TTL --> Altera) */
-       at32_map_usart(1, 1, 0);        /* USART 1: /dev/ttyS1 (RS232) */
-       at32_map_usart(2, 2, 0);        /* USART 2: /dev/ttyS2 (RS485) */
-       at32_map_usart(3, 3, 0);        /* USART 3: /dev/ttyS3 (RS422 Multidrop) */
-}
-
-static struct i2c_gpio_platform_data i2c_gpio_data = {
-       .sda_pin                = GPIO_PIN_PA(6),
-       .scl_pin                = GPIO_PIN_PA(7),
-       .sda_is_open_drain      = 1,
-       .scl_is_open_drain      = 1,
-       .udelay                 = 2,    /* close to 100 kHz */
-};
-
-static struct platform_device i2c_gpio_device = {
-       .name           = "i2c-gpio",
-       .id             = 0,
-       .dev            = {
-       .platform_data  = &i2c_gpio_data,
-       },
-};
-
-static struct i2c_board_info __initdata i2c_info[] = {
-};
-
-static int __init mimc200_init(void)
-{
-       /*
-        * MIMC200 uses 16-bit SDRAM interface, so we don't need to
-        * reserve any pins for it.
-        */
-
-       at32_add_device_usart(0);
-       at32_add_device_usart(1);
-       at32_add_device_usart(2);
-       at32_add_device_usart(3);
-
-       set_hw_addr(at32_add_device_eth(0, &eth_data[0]));
-       set_hw_addr(at32_add_device_eth(1, &eth_data[1]));
-
-       at32_add_device_spi(0, spi0_board_info, ARRAY_SIZE(spi0_board_info));
-       at32_add_device_mci(0, &mci0_data);
-       at32_add_device_usba(0, NULL);
-
-       at32_select_periph(GPIO_PIOB_BASE, 1 << 28, 0, AT32_GPIOF_PULLUP);
-       at32_select_gpio(i2c_gpio_data.sda_pin,
-               AT32_GPIOF_MULTIDRV | AT32_GPIOF_OUTPUT | AT32_GPIOF_HIGH);
-       at32_select_gpio(i2c_gpio_data.scl_pin,
-               AT32_GPIOF_MULTIDRV | AT32_GPIOF_OUTPUT | AT32_GPIOF_HIGH);
-       platform_device_register(&i2c_gpio_device);
-       i2c_register_board_info(0, i2c_info, ARRAY_SIZE(i2c_info));
-
-       at32_add_device_lcdc(0, &mimc200_lcdc_data,
-                            fbmem_start, fbmem_size,
-                            ATMEL_LCDC_CONTROL | ATMEL_LCDC_ALT_CONTROL | ATMEL_LCDC_ALT_24B_DATA);
-
-       return 0;
-}
-postcore_initcall(mimc200_init);
diff --git a/arch/avr32/boot/images/.gitignore b/arch/avr32/boot/images/.gitignore
deleted file mode 100644 (file)
index 64ea9d0..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-uImage
-uImage.srec
-vmlinux.cso
-sfdwarf.log
diff --git a/arch/avr32/boot/images/Makefile b/arch/avr32/boot/images/Makefile
deleted file mode 100644 (file)
index 2a3b539..0000000
+++ /dev/null
@@ -1,57 +0,0 @@
-#
-# Copyright (C) 2004-2006 Atmel Corporation
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-
-extra-y                := vmlinux.bin vmlinux.gz
-
-OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note.gnu.build-id
-$(obj)/vmlinux.bin: vmlinux FORCE
-       $(call if_changed,objcopy)
-
-$(obj)/vmlinux.gz: $(obj)/vmlinux.bin FORCE
-       $(call if_changed,gzip)
-
-UIMAGE_LOADADDR = $(CONFIG_LOAD_ADDRESS)
-UIMAGE_ENTRYADDR = $(CONFIG_ENTRY_ADDRESS)
-UIMAGE_COMPRESSION = gzip
-
-targets += uImage uImage.srec
-$(obj)/uImage: $(obj)/vmlinux.gz
-       $(call if_changed,uimage)
-       @echo '  Image $@ is ready'
-
-OBJCOPYFLAGS_uImage.srec := -I binary -O srec
-$(obj)/uImage.srec: $(obj)/uImage
-       $(call if_changed,objcopy)
-
-OBJCOPYFLAGS_vmlinux.elf := --change-section-lma .text-0x80000000 \
-                           --change-section-lma __ex_table-0x80000000 \
-                           --change-section-lma .rodata-0x80000000 \
-                           --change-section-lma .data-0x80000000 \
-                           --change-section-lma .init-0x80000000 \
-                           --change-section-lma .bss-0x80000000 \
-                           --change-section-lma __param-0x80000000 \
-                           --change-section-lma __ksymtab-0x80000000 \
-                           --change-section-lma __ksymtab_gpl-0x80000000 \
-                           --change-section-lma __kcrctab-0x80000000 \
-                           --change-section-lma __kcrctab_gpl-0x80000000 \
-                           --change-section-lma __ksymtab_strings-0x80000000 \
-                           --set-start 0xa0000000
-$(obj)/vmlinux.elf: vmlinux FORCE
-       $(call if_changed,objcopy)
-
-quiet_cmd_sfdwarf = SFDWARF $@
-      cmd_sfdwarf = sfdwarf $< TO $@ GNUAVR IW $(SFDWARF_FLAGS) > $(obj)/sfdwarf.log
-
-$(obj)/vmlinux.cso: $(obj)/vmlinux.elf FORCE
-       $(call if_changed,sfdwarf)
-
-install: $(BOOTIMAGE)
-       sh $(srctree)/install-kernel.sh $<
-
-# Generated files to be removed upon make clean
-clean-files    := vmlinux.elf vmlinux.bin vmlinux.gz uImage uImage.srec
diff --git a/arch/avr32/boot/u-boot/Makefile b/arch/avr32/boot/u-boot/Makefile
deleted file mode 100644 (file)
index 125ddc9..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-extra-y                := head.o
-
-obj-y          := empty.o
diff --git a/arch/avr32/boot/u-boot/empty.S b/arch/avr32/boot/u-boot/empty.S
deleted file mode 100644 (file)
index 8ac91a5..0000000
+++ /dev/null
@@ -1 +0,0 @@
-/* Empty file */
diff --git a/arch/avr32/boot/u-boot/head.S b/arch/avr32/boot/u-boot/head.S
deleted file mode 100644 (file)
index 2ffc298..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Startup code for use with the u-boot bootloader.
- *
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <asm/setup.h>
-#include <asm/thread_info.h>
-#include <asm/sysreg.h>
-
-       /*
-        * The kernel is loaded where we want it to be and all caches
-        * have just been flushed. We get two parameters from u-boot:
-        *
-        * r12 contains a magic number (ATAG_MAGIC)
-        * r11 points to a tag table providing information about
-        *     the system.
-        */
-       .section .init.text,"ax"
-       .global _start
-_start:
-       /* Initialize .bss */
-       lddpc   r2, bss_start_addr
-       lddpc   r3, end_addr
-       mov     r0, 0
-       mov     r1, 0
-1:      st.d    r2++, r0
-       cp      r2, r3
-       brlo    1b
-
-       /* Initialize status register */
-       lddpc   r0, init_sr
-       mtsr    SYSREG_SR, r0
-
-       /* Set initial stack pointer */
-       lddpc   sp, stack_addr
-       sub     sp, -THREAD_SIZE
-
-#ifdef CONFIG_FRAME_POINTER
-       /* Mark last stack frame */
-       mov     lr, 0
-       mov     r7, 0
-#endif
-
-       /* Check if the boot loader actually provided a tag table */
-       lddpc   r0, magic_number
-       cp.w    r12, r0
-       brne    no_tag_table
-
-       /*
-        * Save the tag table address for later use. This must be done
-        * _after_ .bss has been initialized...
-        */
-       lddpc   r0, tag_table_addr
-       st.w    r0[0], r11
-
-       /* Jump to loader-independent setup code */
-       rjmp    kernel_entry
-
-       .align  2
-magic_number:
-       .long   ATAG_MAGIC
-tag_table_addr:
-       .long   bootloader_tags
-bss_start_addr:
-       .long   __bss_start
-end_addr:
-       .long   _end
-init_sr:
-       .long   0x007f0000      /* Supervisor mode, everything masked */
-stack_addr:
-       .long   init_thread_union
-panic_addr:
-       .long   panic
-
-no_tag_table:
-       sub     r12, pc, (. - 2f)
-       /* branch to panic() which can be far away with that construct */
-       lddpc   pc, panic_addr
-2:     .asciz  "Boot loader didn't provide correct magic number\n"
diff --git a/arch/avr32/configs/atngw100_defconfig b/arch/avr32/configs/atngw100_defconfig
deleted file mode 100644 (file)
index ce00300..0000000
+++ /dev/null
@@ -1,142 +0,0 @@
-# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_LOG_BUF_SHIFT=14
-CONFIG_RELAY=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-# CONFIG_BASE_FULL is not set
-# CONFIG_COMPAT_BRK is not set
-CONFIG_PROFILING=y
-CONFIG_OPROFILE=m
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-# CONFIG_BLK_DEV_BSG is not set
-# CONFIG_IOSCHED_DEADLINE is not set
-CONFIG_BOARD_ATNGW100_MKI=y
-# CONFIG_OWNERSHIP_TRACE is not set
-CONFIG_NMI_DEBUGGING=y
-CONFIG_CPU_FREQ=y
-# CONFIG_CPU_FREQ_STAT is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
-CONFIG_CPU_FREQ_GOV_USERSPACE=y
-CONFIG_AVR32_AT32AP_CPUFREQ=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_XFRM_USER=y
-CONFIG_NET_KEY=y
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_IP_MROUTE=y
-CONFIG_IP_PIMSM_V1=y
-CONFIG_SYN_COOKIES=y
-CONFIG_INET_AH=y
-CONFIG_INET_ESP=y
-CONFIG_INET_IPCOMP=y
-# CONFIG_INET_LRO is not set
-CONFIG_IPV6=y
-CONFIG_INET6_AH=y
-CONFIG_INET6_ESP=y
-CONFIG_INET6_IPCOMP=y
-CONFIG_NETFILTER=y
-# CONFIG_NETFILTER_ADVANCED is not set
-CONFIG_NETFILTER_XTABLES=y
-CONFIG_BRIDGE=m
-CONFIG_VLAN_8021Q=m
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-# CONFIG_FW_LOADER is not set
-CONFIG_MTD=y
-CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_BLOCK=y
-CONFIG_MTD_CFI=y
-CONFIG_MTD_CFI_AMDSTD=y
-CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_DATAFLASH=y
-CONFIG_MTD_UBI=y
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=m
-CONFIG_ATMEL_TCLIB=y
-CONFIG_NETDEVICES=y
-CONFIG_TUN=m
-CONFIG_MACB=y
-CONFIG_PPP=m
-CONFIG_PPP_BSDCOMP=m
-CONFIG_PPP_DEFLATE=m
-CONFIG_PPP_FILTER=y
-CONFIG_PPP_MPPE=m
-CONFIG_PPPOE=m
-CONFIG_PPP_ASYNC=m
-# CONFIG_INPUT is not set
-# CONFIG_SERIO is not set
-# CONFIG_VT is not set
-# CONFIG_LEGACY_PTYS is not set
-# CONFIG_DEVKMEM is not set
-CONFIG_SERIAL_ATMEL=y
-CONFIG_SERIAL_ATMEL_CONSOLE=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_I2C=m
-CONFIG_I2C_CHARDEV=m
-CONFIG_I2C_GPIO=m
-CONFIG_SPI=y
-CONFIG_SPI_ATMEL=y
-CONFIG_SPI_SPIDEV=m
-CONFIG_GPIO_SYSFS=y
-# CONFIG_HWMON is not set
-CONFIG_WATCHDOG=y
-CONFIG_AT32AP700X_WDT=y
-CONFIG_USB_GADGET=y
-CONFIG_USB_GADGET_VBUS_DRAW=350
-CONFIG_USB_ZERO=m
-CONFIG_USB_ETH=m
-CONFIG_USB_GADGETFS=m
-CONFIG_USB_MASS_STORAGE=m
-CONFIG_USB_G_SERIAL=m
-CONFIG_USB_CDC_COMPOSITE=m
-CONFIG_MMC=y
-CONFIG_MMC_TEST=m
-CONFIG_MMC_ATMELMCI=y
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-CONFIG_LEDS_GPIO=y
-CONFIG_LEDS_TRIGGERS=y
-CONFIG_LEDS_TRIGGER_TIMER=y
-CONFIG_LEDS_TRIGGER_HEARTBEAT=y
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_AT32AP700X=y
-CONFIG_DMADEVICES=y
-CONFIG_EXT2_FS=y
-CONFIG_EXT3_FS=y
-# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-# CONFIG_EXT3_FS_XATTR is not set
-CONFIG_EXT4_FS=y
-# CONFIG_DNOTIFY is not set
-CONFIG_FUSE_FS=m
-CONFIG_MSDOS_FS=m
-CONFIG_VFAT_FS=m
-CONFIG_FAT_DEFAULT_CODEPAGE=850
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=y
-CONFIG_JFFS2_FS=y
-CONFIG_UBIFS_FS=y
-CONFIG_NFS_FS=y
-CONFIG_ROOT_NFS=y
-CONFIG_NFSD=m
-CONFIG_NFSD_V3=y
-CONFIG_CIFS=m
-CONFIG_NLS_CODEPAGE_437=m
-CONFIG_NLS_CODEPAGE_850=m
-CONFIG_NLS_ISO8859_1=m
-CONFIG_NLS_UTF8=m
-CONFIG_DEBUG_FS=y
-CONFIG_FRAME_POINTER=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DETECT_HUNG_TASK=y
diff --git a/arch/avr32/configs/atngw100_evklcd100_defconfig b/arch/avr32/configs/atngw100_evklcd100_defconfig
deleted file mode 100644 (file)
index 01ff632..0000000
+++ /dev/null
@@ -1,158 +0,0 @@
-# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_LOG_BUF_SHIFT=14
-CONFIG_RELAY=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-# CONFIG_BASE_FULL is not set
-# CONFIG_COMPAT_BRK is not set
-CONFIG_PROFILING=y
-CONFIG_OPROFILE=m
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-# CONFIG_BLK_DEV_BSG is not set
-# CONFIG_IOSCHED_DEADLINE is not set
-CONFIG_BOARD_ATNGW100_MKI=y
-CONFIG_BOARD_ATNGW100_EVKLCD10X=y
-CONFIG_BOARD_ATNGW100_EVKLCD10X_QVGA=y
-# CONFIG_OWNERSHIP_TRACE is not set
-CONFIG_NMI_DEBUGGING=y
-CONFIG_CPU_FREQ=y
-# CONFIG_CPU_FREQ_STAT is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
-CONFIG_CPU_FREQ_GOV_USERSPACE=y
-CONFIG_AVR32_AT32AP_CPUFREQ=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_XFRM_USER=y
-CONFIG_NET_KEY=y
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_IP_MROUTE=y
-CONFIG_IP_PIMSM_V1=y
-CONFIG_SYN_COOKIES=y
-CONFIG_INET_AH=y
-CONFIG_INET_ESP=y
-CONFIG_INET_IPCOMP=y
-# CONFIG_INET_LRO is not set
-CONFIG_IPV6=y
-CONFIG_INET6_AH=y
-CONFIG_INET6_ESP=y
-CONFIG_INET6_IPCOMP=y
-CONFIG_NETFILTER=y
-# CONFIG_NETFILTER_ADVANCED is not set
-CONFIG_NETFILTER_XTABLES=y
-CONFIG_BRIDGE=m
-CONFIG_VLAN_8021Q=m
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-# CONFIG_FW_LOADER is not set
-CONFIG_MTD=y
-CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_BLOCK=y
-CONFIG_MTD_CFI=y
-CONFIG_MTD_CFI_AMDSTD=y
-CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_DATAFLASH=y
-CONFIG_MTD_UBI=y
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=m
-CONFIG_ATMEL_TCLIB=y
-CONFIG_NETDEVICES=y
-CONFIG_TUN=m
-CONFIG_MACB=y
-CONFIG_PPP=m
-CONFIG_PPP_BSDCOMP=m
-CONFIG_PPP_DEFLATE=m
-CONFIG_PPP_FILTER=y
-CONFIG_PPP_MPPE=m
-CONFIG_PPPOE=m
-CONFIG_PPP_ASYNC=m
-# CONFIG_INPUT_MOUSEDEV is not set
-CONFIG_INPUT_EVDEV=m
-# CONFIG_INPUT_KEYBOARD is not set
-# CONFIG_INPUT_MOUSE is not set
-CONFIG_INPUT_TOUCHSCREEN=y
-CONFIG_TOUCHSCREEN_WM97XX=m
-# CONFIG_SERIO is not set
-# CONFIG_LEGACY_PTYS is not set
-CONFIG_SERIAL_ATMEL=y
-CONFIG_SERIAL_ATMEL_CONSOLE=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_I2C=m
-CONFIG_I2C_CHARDEV=m
-CONFIG_I2C_GPIO=m
-CONFIG_SPI=y
-CONFIG_SPI_ATMEL=y
-CONFIG_SPI_SPIDEV=m
-CONFIG_GPIO_SYSFS=y
-# CONFIG_HWMON is not set
-CONFIG_WATCHDOG=y
-CONFIG_AT32AP700X_WDT=y
-CONFIG_FB=y
-CONFIG_FB_ATMEL=y
-CONFIG_SOUND=y
-CONFIG_SND=y
-CONFIG_SND_MIXER_OSS=m
-CONFIG_SND_PCM_OSS=m
-CONFIG_SND_HRTIMER=y
-# CONFIG_SND_SUPPORT_OLD_API is not set
-# CONFIG_SND_DRIVERS is not set
-CONFIG_SND_ATMEL_AC97C=m
-# CONFIG_SND_SPI is not set
-CONFIG_USB_GADGET=y
-CONFIG_USB_GADGET_VBUS_DRAW=350
-CONFIG_USB_ZERO=m
-CONFIG_USB_ETH=m
-CONFIG_USB_GADGETFS=m
-CONFIG_USB_MASS_STORAGE=m
-CONFIG_USB_G_SERIAL=m
-CONFIG_USB_CDC_COMPOSITE=m
-CONFIG_MMC=y
-CONFIG_MMC_TEST=m
-CONFIG_MMC_ATMELMCI=y
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-CONFIG_LEDS_GPIO=y
-CONFIG_LEDS_TRIGGERS=y
-CONFIG_LEDS_TRIGGER_TIMER=y
-CONFIG_LEDS_TRIGGER_HEARTBEAT=y
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_AT32AP700X=y
-CONFIG_DMADEVICES=y
-CONFIG_EXT2_FS=y
-CONFIG_EXT3_FS=y
-# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-# CONFIG_EXT3_FS_XATTR is not set
-CONFIG_EXT4_FS=y
-# CONFIG_DNOTIFY is not set
-CONFIG_FUSE_FS=m
-CONFIG_MSDOS_FS=m
-CONFIG_VFAT_FS=m
-CONFIG_FAT_DEFAULT_CODEPAGE=850
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=y
-CONFIG_JFFS2_FS=y
-CONFIG_UBIFS_FS=y
-CONFIG_NFS_FS=y
-CONFIG_ROOT_NFS=y
-CONFIG_NFSD=m
-CONFIG_NFSD_V3=y
-CONFIG_CIFS=m
-CONFIG_NLS_CODEPAGE_437=m
-CONFIG_NLS_CODEPAGE_850=m
-CONFIG_NLS_ISO8859_1=m
-CONFIG_NLS_UTF8=m
-CONFIG_DEBUG_FS=y
-CONFIG_FRAME_POINTER=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DETECT_HUNG_TASK=y
diff --git a/arch/avr32/configs/atngw100_evklcd101_defconfig b/arch/avr32/configs/atngw100_evklcd101_defconfig
deleted file mode 100644 (file)
index c4021df..0000000
+++ /dev/null
@@ -1,157 +0,0 @@
-# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_LOG_BUF_SHIFT=14
-CONFIG_RELAY=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-# CONFIG_BASE_FULL is not set
-# CONFIG_COMPAT_BRK is not set
-CONFIG_PROFILING=y
-CONFIG_OPROFILE=m
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-# CONFIG_BLK_DEV_BSG is not set
-# CONFIG_IOSCHED_DEADLINE is not set
-CONFIG_BOARD_ATNGW100_MKI=y
-CONFIG_BOARD_ATNGW100_EVKLCD10X=y
-# CONFIG_OWNERSHIP_TRACE is not set
-CONFIG_NMI_DEBUGGING=y
-CONFIG_CPU_FREQ=y
-# CONFIG_CPU_FREQ_STAT is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
-CONFIG_CPU_FREQ_GOV_USERSPACE=y
-CONFIG_AVR32_AT32AP_CPUFREQ=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_XFRM_USER=y
-CONFIG_NET_KEY=y
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_IP_MROUTE=y
-CONFIG_IP_PIMSM_V1=y
-CONFIG_SYN_COOKIES=y
-CONFIG_INET_AH=y
-CONFIG_INET_ESP=y
-CONFIG_INET_IPCOMP=y
-# CONFIG_INET_LRO is not set
-CONFIG_IPV6=y
-CONFIG_INET6_AH=y
-CONFIG_INET6_ESP=y
-CONFIG_INET6_IPCOMP=y
-CONFIG_NETFILTER=y
-# CONFIG_NETFILTER_ADVANCED is not set
-CONFIG_NETFILTER_XTABLES=y
-CONFIG_BRIDGE=m
-CONFIG_VLAN_8021Q=m
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-# CONFIG_FW_LOADER is not set
-CONFIG_MTD=y
-CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_BLOCK=y
-CONFIG_MTD_CFI=y
-CONFIG_MTD_CFI_AMDSTD=y
-CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_DATAFLASH=y
-CONFIG_MTD_UBI=y
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=m
-CONFIG_ATMEL_TCLIB=y
-CONFIG_NETDEVICES=y
-CONFIG_TUN=m
-CONFIG_MACB=y
-CONFIG_PPP=m
-CONFIG_PPP_BSDCOMP=m
-CONFIG_PPP_DEFLATE=m
-CONFIG_PPP_FILTER=y
-CONFIG_PPP_MPPE=m
-CONFIG_PPPOE=m
-CONFIG_PPP_ASYNC=m
-# CONFIG_INPUT_MOUSEDEV is not set
-CONFIG_INPUT_EVDEV=m
-# CONFIG_INPUT_KEYBOARD is not set
-# CONFIG_INPUT_MOUSE is not set
-CONFIG_INPUT_TOUCHSCREEN=y
-CONFIG_TOUCHSCREEN_WM97XX=m
-# CONFIG_SERIO is not set
-# CONFIG_LEGACY_PTYS is not set
-CONFIG_SERIAL_ATMEL=y
-CONFIG_SERIAL_ATMEL_CONSOLE=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_I2C=m
-CONFIG_I2C_CHARDEV=m
-CONFIG_I2C_GPIO=m
-CONFIG_SPI=y
-CONFIG_SPI_ATMEL=y
-CONFIG_SPI_SPIDEV=m
-CONFIG_GPIO_SYSFS=y
-# CONFIG_HWMON is not set
-CONFIG_WATCHDOG=y
-CONFIG_AT32AP700X_WDT=y
-CONFIG_FB=y
-CONFIG_FB_ATMEL=y
-CONFIG_SOUND=y
-CONFIG_SND=y
-CONFIG_SND_MIXER_OSS=m
-CONFIG_SND_PCM_OSS=m
-CONFIG_SND_HRTIMER=y
-# CONFIG_SND_SUPPORT_OLD_API is not set
-# CONFIG_SND_DRIVERS is not set
-CONFIG_SND_ATMEL_AC97C=m
-# CONFIG_SND_SPI is not set
-CONFIG_USB_GADGET=y
-CONFIG_USB_GADGET_VBUS_DRAW=350
-CONFIG_USB_ZERO=m
-CONFIG_USB_ETH=m
-CONFIG_USB_GADGETFS=m
-CONFIG_USB_MASS_STORAGE=m
-CONFIG_USB_G_SERIAL=m
-CONFIG_USB_CDC_COMPOSITE=m
-CONFIG_MMC=y
-CONFIG_MMC_TEST=m
-CONFIG_MMC_ATMELMCI=y
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-CONFIG_LEDS_GPIO=y
-CONFIG_LEDS_TRIGGERS=y
-CONFIG_LEDS_TRIGGER_TIMER=y
-CONFIG_LEDS_TRIGGER_HEARTBEAT=y
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_AT32AP700X=y
-CONFIG_DMADEVICES=y
-CONFIG_EXT2_FS=y
-CONFIG_EXT3_FS=y
-# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-# CONFIG_EXT3_FS_XATTR is not set
-CONFIG_EXT4_FS=y
-# CONFIG_DNOTIFY is not set
-CONFIG_FUSE_FS=m
-CONFIG_MSDOS_FS=m
-CONFIG_VFAT_FS=m
-CONFIG_FAT_DEFAULT_CODEPAGE=850
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=y
-CONFIG_JFFS2_FS=y
-CONFIG_UBIFS_FS=y
-CONFIG_NFS_FS=y
-CONFIG_ROOT_NFS=y
-CONFIG_NFSD=m
-CONFIG_NFSD_V3=y
-CONFIG_CIFS=m
-CONFIG_NLS_CODEPAGE_437=m
-CONFIG_NLS_CODEPAGE_850=m
-CONFIG_NLS_ISO8859_1=m
-CONFIG_NLS_UTF8=m
-CONFIG_DEBUG_FS=y
-CONFIG_FRAME_POINTER=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DETECT_HUNG_TASK=y
diff --git a/arch/avr32/configs/atngw100_mrmt_defconfig b/arch/avr32/configs/atngw100_mrmt_defconfig
deleted file mode 100644 (file)
index ffcc28d..0000000
+++ /dev/null
@@ -1,136 +0,0 @@
-# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BSD_PROCESS_ACCT_V3=y
-CONFIG_LOG_BUF_SHIFT=14
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-# CONFIG_BASE_FULL is not set
-# CONFIG_SLUB_DEBUG is not set
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_BLK_DEV_BSG is not set
-# CONFIG_IOSCHED_DEADLINE is not set
-# CONFIG_OWNERSHIP_TRACE is not set
-# CONFIG_SUSPEND is not set
-CONFIG_PM=y
-CONFIG_CPU_FREQ=y
-CONFIG_CPU_FREQ_GOV_POWERSAVE=y
-CONFIG_CPU_FREQ_GOV_USERSPACE=y
-CONFIG_CPU_FREQ_GOV_ONDEMAND=y
-CONFIG_AVR32_AT32AP_CPUFREQ=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_INET=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_SYN_COOKIES=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
-# CONFIG_IPV6 is not set
-CONFIG_BT=m
-CONFIG_BT_RFCOMM=m
-CONFIG_BT_RFCOMM_TTY=y
-CONFIG_BT_HIDP=m
-CONFIG_BT_HCIUART=m
-CONFIG_BT_HCIUART_H4=y
-CONFIG_BT_HCIUART_BCSP=y
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-# CONFIG_FW_LOADER is not set
-CONFIG_MTD=y
-CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_BLOCK=y
-CONFIG_MTD_CFI=y
-CONFIG_MTD_CFI_AMDSTD=y
-CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_DATAFLASH=y
-CONFIG_BLK_DEV_LOOP=y
-CONFIG_NETDEVICES=y
-CONFIG_MACB=y
-# CONFIG_INPUT_MOUSEDEV is not set
-CONFIG_INPUT_EVDEV=y
-# CONFIG_KEYBOARD_ATKBD is not set
-CONFIG_KEYBOARD_GPIO=y
-# CONFIG_INPUT_MOUSE is not set
-CONFIG_INPUT_TOUCHSCREEN=y
-CONFIG_TOUCHSCREEN_ADS7846=m
-# CONFIG_SERIO is not set
-CONFIG_VT_HW_CONSOLE_BINDING=y
-# CONFIG_LEGACY_PTYS is not set
-CONFIG_SERIAL_ATMEL=y
-CONFIG_SERIAL_ATMEL_CONSOLE=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_I2C=y
-CONFIG_I2C_CHARDEV=y
-CONFIG_I2C_GPIO=y
-CONFIG_SPI=y
-CONFIG_SPI_ATMEL=y
-CONFIG_SPI_SPIDEV=y
-CONFIG_WATCHDOG=y
-CONFIG_AT32AP700X_WDT=y
-CONFIG_FB=y
-CONFIG_FB_ATMEL=y
-CONFIG_LCD_CLASS_DEVICE=y
-CONFIG_SOUND=m
-CONFIG_SND=m
-CONFIG_SND_MIXER_OSS=m
-CONFIG_SND_PCM_OSS=m
-# CONFIG_SND_SUPPORT_OLD_API is not set
-# CONFIG_SND_VERBOSE_PROCFS is not set
-CONFIG_SND_ATMEL_AC97C=m
-# CONFIG_SND_SPI is not set
-CONFIG_USB_GADGET=m
-CONFIG_USB_GADGET_DEBUG_FILES=y
-CONFIG_USB_MASS_STORAGE=m
-CONFIG_USB_G_SERIAL=m
-CONFIG_MMC=y
-CONFIG_MMC_ATMELMCI=y
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-CONFIG_LEDS_GPIO=y
-CONFIG_LEDS_PWM=y
-CONFIG_LEDS_TRIGGERS=y
-CONFIG_LEDS_TRIGGER_TIMER=y
-CONFIG_LEDS_TRIGGER_HEARTBEAT=y
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_S35390A=m
-CONFIG_RTC_DRV_AT32AP700X=m
-CONFIG_DMADEVICES=y
-CONFIG_UIO=y
-CONFIG_PWM=y
-CONFIG_PWM_ATMEL=y
-CONFIG_EXT2_FS=y
-CONFIG_EXT2_FS_XATTR=y
-CONFIG_EXT3_FS=y
-# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-# CONFIG_DNOTIFY is not set
-CONFIG_MSDOS_FS=y
-CONFIG_VFAT_FS=y
-CONFIG_FAT_DEFAULT_CODEPAGE=850
-CONFIG_NTFS_FS=m
-CONFIG_NTFS_RW=y
-CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=y
-CONFIG_JFFS2_FS=y
-CONFIG_NFS_FS=y
-CONFIG_ROOT_NFS=y
-CONFIG_CIFS=m
-CONFIG_CIFS_STATS=y
-CONFIG_CIFS_WEAK_PW_HASH=y
-CONFIG_CIFS_XATTR=y
-CONFIG_CIFS_POSIX=y
-CONFIG_NLS_CODEPAGE_437=y
-CONFIG_NLS_CODEPAGE_850=y
-CONFIG_NLS_ISO8859_1=y
-CONFIG_NLS_UTF8=y
-CONFIG_DEBUG_FS=y
-CONFIG_FRAME_POINTER=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DETECT_HUNG_TASK=y
-CONFIG_CRC_CCITT=y
diff --git a/arch/avr32/configs/atngw100mkii_defconfig b/arch/avr32/configs/atngw100mkii_defconfig
deleted file mode 100644 (file)
index 0496264..0000000
+++ /dev/null
@@ -1,144 +0,0 @@
-# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_LOG_BUF_SHIFT=14
-CONFIG_RELAY=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-# CONFIG_BASE_FULL is not set
-# CONFIG_COMPAT_BRK is not set
-CONFIG_PROFILING=y
-CONFIG_OPROFILE=m
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-# CONFIG_BLK_DEV_BSG is not set
-# CONFIG_IOSCHED_DEADLINE is not set
-CONFIG_BOARD_ATNGW100_MKII=y
-# CONFIG_OWNERSHIP_TRACE is not set
-CONFIG_NMI_DEBUGGING=y
-CONFIG_CPU_FREQ=y
-# CONFIG_CPU_FREQ_STAT is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
-CONFIG_CPU_FREQ_GOV_USERSPACE=y
-CONFIG_AVR32_AT32AP_CPUFREQ=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_XFRM_USER=y
-CONFIG_NET_KEY=y
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_IP_MROUTE=y
-CONFIG_IP_PIMSM_V1=y
-CONFIG_SYN_COOKIES=y
-CONFIG_INET_AH=y
-CONFIG_INET_ESP=y
-CONFIG_INET_IPCOMP=y
-# CONFIG_INET_LRO is not set
-CONFIG_IPV6=y
-CONFIG_INET6_AH=y
-CONFIG_INET6_ESP=y
-CONFIG_INET6_IPCOMP=y
-CONFIG_NETFILTER=y
-# CONFIG_NETFILTER_ADVANCED is not set
-CONFIG_NETFILTER_XTABLES=y
-CONFIG_BRIDGE=m
-CONFIG_VLAN_8021Q=m
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-# CONFIG_FW_LOADER is not set
-CONFIG_MTD=y
-CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_BLOCK=y
-CONFIG_MTD_CFI=y
-CONFIG_MTD_CFI_INTELEXT=y
-CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_DATAFLASH=y
-CONFIG_MTD_NAND=y
-CONFIG_MTD_NAND_ATMEL=y
-CONFIG_MTD_UBI=y
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=m
-CONFIG_ATMEL_TCLIB=y
-CONFIG_NETDEVICES=y
-CONFIG_TUN=m
-CONFIG_MACB=y
-CONFIG_PPP=m
-CONFIG_PPP_BSDCOMP=m
-CONFIG_PPP_DEFLATE=m
-CONFIG_PPP_FILTER=y
-CONFIG_PPP_MPPE=m
-CONFIG_PPPOE=m
-CONFIG_PPP_ASYNC=m
-# CONFIG_INPUT is not set
-# CONFIG_SERIO is not set
-# CONFIG_VT is not set
-# CONFIG_LEGACY_PTYS is not set
-# CONFIG_DEVKMEM is not set
-CONFIG_SERIAL_ATMEL=y
-CONFIG_SERIAL_ATMEL_CONSOLE=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_I2C=m
-CONFIG_I2C_CHARDEV=m
-CONFIG_I2C_GPIO=m
-CONFIG_SPI=y
-CONFIG_SPI_ATMEL=y
-CONFIG_SPI_SPIDEV=m
-CONFIG_GPIO_SYSFS=y
-# CONFIG_HWMON is not set
-CONFIG_WATCHDOG=y
-CONFIG_AT32AP700X_WDT=y
-CONFIG_USB_GADGET=y
-CONFIG_USB_GADGET_VBUS_DRAW=350
-CONFIG_USB_ZERO=m
-CONFIG_USB_ETH=m
-CONFIG_USB_GADGETFS=m
-CONFIG_USB_MASS_STORAGE=m
-CONFIG_USB_G_SERIAL=m
-CONFIG_USB_CDC_COMPOSITE=m
-CONFIG_MMC=y
-CONFIG_MMC_TEST=m
-CONFIG_MMC_ATMELMCI=y
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-CONFIG_LEDS_GPIO=y
-CONFIG_LEDS_TRIGGERS=y
-CONFIG_LEDS_TRIGGER_TIMER=y
-CONFIG_LEDS_TRIGGER_HEARTBEAT=y
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_AT32AP700X=y
-CONFIG_DMADEVICES=y
-CONFIG_EXT2_FS=y
-CONFIG_EXT3_FS=y
-# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-# CONFIG_EXT3_FS_XATTR is not set
-CONFIG_EXT4_FS=y
-# CONFIG_DNOTIFY is not set
-CONFIG_FUSE_FS=m
-CONFIG_MSDOS_FS=m
-CONFIG_VFAT_FS=m
-CONFIG_FAT_DEFAULT_CODEPAGE=850
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=y
-CONFIG_JFFS2_FS=y
-CONFIG_UBIFS_FS=y
-CONFIG_NFS_FS=y
-CONFIG_ROOT_NFS=y
-CONFIG_NFSD=m
-CONFIG_NFSD_V3=y
-CONFIG_CIFS=m
-CONFIG_NLS_CODEPAGE_437=m
-CONFIG_NLS_CODEPAGE_850=m
-CONFIG_NLS_ISO8859_1=m
-CONFIG_NLS_UTF8=m
-CONFIG_DEBUG_FS=y
-CONFIG_FRAME_POINTER=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DETECT_HUNG_TASK=y
diff --git a/arch/avr32/configs/atngw100mkii_evklcd100_defconfig b/arch/avr32/configs/atngw100mkii_evklcd100_defconfig
deleted file mode 100644 (file)
index 89c2cda..0000000
+++ /dev/null
@@ -1,161 +0,0 @@
-# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_LOG_BUF_SHIFT=14
-CONFIG_RELAY=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-# CONFIG_BASE_FULL is not set
-# CONFIG_COMPAT_BRK is not set
-CONFIG_PROFILING=y
-CONFIG_OPROFILE=m
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-# CONFIG_BLK_DEV_BSG is not set
-# CONFIG_IOSCHED_DEADLINE is not set
-CONFIG_BOARD_ATNGW100_MKII=y
-CONFIG_BOARD_ATNGW100_MKII_LCD=y
-CONFIG_BOARD_ATNGW100_EVKLCD10X=y
-CONFIG_BOARD_ATNGW100_EVKLCD10X_QVGA=y
-# CONFIG_OWNERSHIP_TRACE is not set
-CONFIG_NMI_DEBUGGING=y
-CONFIG_CPU_FREQ=y
-# CONFIG_CPU_FREQ_STAT is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
-CONFIG_CPU_FREQ_GOV_USERSPACE=y
-CONFIG_AVR32_AT32AP_CPUFREQ=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_XFRM_USER=y
-CONFIG_NET_KEY=y
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_IP_MROUTE=y
-CONFIG_IP_PIMSM_V1=y
-CONFIG_SYN_COOKIES=y
-CONFIG_INET_AH=y
-CONFIG_INET_ESP=y
-CONFIG_INET_IPCOMP=y
-# CONFIG_INET_LRO is not set
-CONFIG_IPV6=y
-CONFIG_INET6_AH=y
-CONFIG_INET6_ESP=y
-CONFIG_INET6_IPCOMP=y
-CONFIG_NETFILTER=y
-# CONFIG_NETFILTER_ADVANCED is not set
-CONFIG_NETFILTER_XTABLES=y
-CONFIG_BRIDGE=m
-CONFIG_VLAN_8021Q=m
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-# CONFIG_FW_LOADER is not set
-CONFIG_MTD=y
-CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_BLOCK=y
-CONFIG_MTD_CFI=y
-CONFIG_MTD_CFI_INTELEXT=y
-CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_DATAFLASH=y
-CONFIG_MTD_NAND=y
-CONFIG_MTD_NAND_ATMEL=y
-CONFIG_MTD_UBI=y
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=m
-CONFIG_ATMEL_TCLIB=y
-CONFIG_NETDEVICES=y
-CONFIG_TUN=m
-CONFIG_MACB=y
-CONFIG_PPP=m
-CONFIG_PPP_BSDCOMP=m
-CONFIG_PPP_DEFLATE=m
-CONFIG_PPP_FILTER=y
-CONFIG_PPP_MPPE=m
-CONFIG_PPPOE=m
-CONFIG_PPP_ASYNC=m
-# CONFIG_INPUT_MOUSEDEV is not set
-CONFIG_INPUT_EVDEV=m
-# CONFIG_INPUT_KEYBOARD is not set
-# CONFIG_INPUT_MOUSE is not set
-CONFIG_INPUT_TOUCHSCREEN=y
-CONFIG_TOUCHSCREEN_WM97XX=m
-# CONFIG_SERIO is not set
-# CONFIG_LEGACY_PTYS is not set
-CONFIG_SERIAL_ATMEL=y
-CONFIG_SERIAL_ATMEL_CONSOLE=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_I2C=m
-CONFIG_I2C_CHARDEV=m
-CONFIG_I2C_GPIO=m
-CONFIG_SPI=y
-CONFIG_SPI_ATMEL=y
-CONFIG_SPI_SPIDEV=m
-CONFIG_GPIO_SYSFS=y
-# CONFIG_HWMON is not set
-CONFIG_WATCHDOG=y
-CONFIG_AT32AP700X_WDT=y
-CONFIG_FB=y
-CONFIG_FB_ATMEL=y
-CONFIG_SOUND=y
-CONFIG_SND=y
-CONFIG_SND_MIXER_OSS=m
-CONFIG_SND_PCM_OSS=m
-CONFIG_SND_HRTIMER=y
-# CONFIG_SND_SUPPORT_OLD_API is not set
-# CONFIG_SND_DRIVERS is not set
-CONFIG_SND_ATMEL_AC97C=m
-# CONFIG_SND_SPI is not set
-CONFIG_USB_GADGET=y
-CONFIG_USB_GADGET_VBUS_DRAW=350
-CONFIG_USB_ZERO=m
-CONFIG_USB_ETH=m
-CONFIG_USB_GADGETFS=m
-CONFIG_USB_MASS_STORAGE=m
-CONFIG_USB_G_SERIAL=m
-CONFIG_USB_CDC_COMPOSITE=m
-CONFIG_MMC=y
-CONFIG_MMC_TEST=m
-CONFIG_MMC_ATMELMCI=y
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-CONFIG_LEDS_GPIO=y
-CONFIG_LEDS_TRIGGERS=y
-CONFIG_LEDS_TRIGGER_TIMER=y
-CONFIG_LEDS_TRIGGER_HEARTBEAT=y
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_AT32AP700X=y
-CONFIG_DMADEVICES=y
-CONFIG_EXT2_FS=y
-CONFIG_EXT3_FS=y
-# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-# CONFIG_EXT3_FS_XATTR is not set
-CONFIG_EXT4_FS=y
-# CONFIG_DNOTIFY is not set
-CONFIG_FUSE_FS=m
-CONFIG_MSDOS_FS=m
-CONFIG_VFAT_FS=m
-CONFIG_FAT_DEFAULT_CODEPAGE=850
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=y
-CONFIG_JFFS2_FS=y
-CONFIG_UBIFS_FS=y
-CONFIG_NFS_FS=y
-CONFIG_ROOT_NFS=y
-CONFIG_NFSD=m
-CONFIG_NFSD_V3=y
-CONFIG_CIFS=m
-CONFIG_NLS_CODEPAGE_437=m
-CONFIG_NLS_CODEPAGE_850=m
-CONFIG_NLS_ISO8859_1=m
-CONFIG_NLS_UTF8=m
-CONFIG_DEBUG_FS=y
-CONFIG_FRAME_POINTER=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DETECT_HUNG_TASK=y
diff --git a/arch/avr32/configs/atngw100mkii_evklcd101_defconfig b/arch/avr32/configs/atngw100mkii_evklcd101_defconfig
deleted file mode 100644 (file)
index 1b4d4a8..0000000
+++ /dev/null
@@ -1,160 +0,0 @@
-# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_LOG_BUF_SHIFT=14
-CONFIG_RELAY=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-# CONFIG_BASE_FULL is not set
-# CONFIG_COMPAT_BRK is not set
-CONFIG_PROFILING=y
-CONFIG_OPROFILE=m
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-# CONFIG_BLK_DEV_BSG is not set
-# CONFIG_IOSCHED_DEADLINE is not set
-CONFIG_BOARD_ATNGW100_MKII=y
-CONFIG_BOARD_ATNGW100_MKII_LCD=y
-CONFIG_BOARD_ATNGW100_EVKLCD10X=y
-# CONFIG_OWNERSHIP_TRACE is not set
-CONFIG_NMI_DEBUGGING=y
-CONFIG_CPU_FREQ=y
-# CONFIG_CPU_FREQ_STAT is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
-CONFIG_CPU_FREQ_GOV_USERSPACE=y
-CONFIG_AVR32_AT32AP_CPUFREQ=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_XFRM_USER=y
-CONFIG_NET_KEY=y
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_IP_MROUTE=y
-CONFIG_IP_PIMSM_V1=y
-CONFIG_SYN_COOKIES=y
-CONFIG_INET_AH=y
-CONFIG_INET_ESP=y
-CONFIG_INET_IPCOMP=y
-# CONFIG_INET_LRO is not set
-CONFIG_IPV6=y
-CONFIG_INET6_AH=y
-CONFIG_INET6_ESP=y
-CONFIG_INET6_IPCOMP=y
-CONFIG_NETFILTER=y
-# CONFIG_NETFILTER_ADVANCED is not set
-CONFIG_NETFILTER_XTABLES=y
-CONFIG_BRIDGE=m
-CONFIG_VLAN_8021Q=m
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-# CONFIG_FW_LOADER is not set
-CONFIG_MTD=y
-CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_BLOCK=y
-CONFIG_MTD_CFI=y
-CONFIG_MTD_CFI_INTELEXT=y
-CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_DATAFLASH=y
-CONFIG_MTD_NAND=y
-CONFIG_MTD_NAND_ATMEL=y
-CONFIG_MTD_UBI=y
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=m
-CONFIG_ATMEL_TCLIB=y
-CONFIG_NETDEVICES=y
-CONFIG_TUN=m
-CONFIG_MACB=y
-CONFIG_PPP=m
-CONFIG_PPP_BSDCOMP=m
-CONFIG_PPP_DEFLATE=m
-CONFIG_PPP_FILTER=y
-CONFIG_PPP_MPPE=m
-CONFIG_PPPOE=m
-CONFIG_PPP_ASYNC=m
-# CONFIG_INPUT_MOUSEDEV is not set
-CONFIG_INPUT_EVDEV=m
-# CONFIG_INPUT_KEYBOARD is not set
-# CONFIG_INPUT_MOUSE is not set
-CONFIG_INPUT_TOUCHSCREEN=y
-CONFIG_TOUCHSCREEN_WM97XX=m
-# CONFIG_SERIO is not set
-# CONFIG_LEGACY_PTYS is not set
-CONFIG_SERIAL_ATMEL=y
-CONFIG_SERIAL_ATMEL_CONSOLE=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_I2C=m
-CONFIG_I2C_CHARDEV=m
-CONFIG_I2C_GPIO=m
-CONFIG_SPI=y
-CONFIG_SPI_ATMEL=y
-CONFIG_SPI_SPIDEV=m
-CONFIG_GPIO_SYSFS=y
-# CONFIG_HWMON is not set
-CONFIG_WATCHDOG=y
-CONFIG_AT32AP700X_WDT=y
-CONFIG_FB=y
-CONFIG_FB_ATMEL=y
-CONFIG_SOUND=y
-CONFIG_SND=y
-CONFIG_SND_MIXER_OSS=m
-CONFIG_SND_PCM_OSS=m
-CONFIG_SND_HRTIMER=y
-# CONFIG_SND_SUPPORT_OLD_API is not set
-# CONFIG_SND_DRIVERS is not set
-CONFIG_SND_ATMEL_AC97C=m
-# CONFIG_SND_SPI is not set
-CONFIG_USB_GADGET=y
-CONFIG_USB_GADGET_VBUS_DRAW=350
-CONFIG_USB_ZERO=m
-CONFIG_USB_ETH=m
-CONFIG_USB_GADGETFS=m
-CONFIG_USB_MASS_STORAGE=m
-CONFIG_USB_G_SERIAL=m
-CONFIG_USB_CDC_COMPOSITE=m
-CONFIG_MMC=y
-CONFIG_MMC_TEST=m
-CONFIG_MMC_ATMELMCI=y
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-CONFIG_LEDS_GPIO=y
-CONFIG_LEDS_TRIGGERS=y
-CONFIG_LEDS_TRIGGER_TIMER=y
-CONFIG_LEDS_TRIGGER_HEARTBEAT=y
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_AT32AP700X=y
-CONFIG_DMADEVICES=y
-CONFIG_EXT2_FS=y
-CONFIG_EXT3_FS=y
-# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-# CONFIG_EXT3_FS_XATTR is not set
-CONFIG_EXT4_FS=y
-# CONFIG_DNOTIFY is not set
-CONFIG_FUSE_FS=m
-CONFIG_MSDOS_FS=m
-CONFIG_VFAT_FS=m
-CONFIG_FAT_DEFAULT_CODEPAGE=850
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=y
-CONFIG_JFFS2_FS=y
-CONFIG_UBIFS_FS=y
-CONFIG_NFS_FS=y
-CONFIG_ROOT_NFS=y
-CONFIG_NFSD=m
-CONFIG_NFSD_V3=y
-CONFIG_CIFS=m
-CONFIG_NLS_CODEPAGE_437=m
-CONFIG_NLS_CODEPAGE_850=m
-CONFIG_NLS_ISO8859_1=m
-CONFIG_NLS_UTF8=m
-CONFIG_DEBUG_FS=y
-CONFIG_FRAME_POINTER=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DETECT_HUNG_TASK=y
diff --git a/arch/avr32/configs/atstk1002_defconfig b/arch/avr32/configs/atstk1002_defconfig
deleted file mode 100644 (file)
index 9b8b52e..0000000
+++ /dev/null
@@ -1,157 +0,0 @@
-# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_LOG_BUF_SHIFT=14
-CONFIG_RELAY=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-# CONFIG_BASE_FULL is not set
-# CONFIG_COMPAT_BRK is not set
-CONFIG_PROFILING=y
-CONFIG_OPROFILE=m
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-# CONFIG_BLK_DEV_BSG is not set
-# CONFIG_IOSCHED_DEADLINE is not set
-# CONFIG_OWNERSHIP_TRACE is not set
-CONFIG_NMI_DEBUGGING=y
-CONFIG_CPU_FREQ=y
-# CONFIG_CPU_FREQ_STAT is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
-CONFIG_CPU_FREQ_GOV_USERSPACE=y
-CONFIG_AVR32_AT32AP_CPUFREQ=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_XFRM_USER=m
-CONFIG_NET_KEY=m
-CONFIG_INET=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_NET_IPIP=m
-CONFIG_NET_IPGRE_DEMUX=m
-CONFIG_NET_IPGRE=m
-CONFIG_INET_AH=m
-CONFIG_INET_ESP=m
-CONFIG_INET_XFRM_MODE_TRANSPORT=m
-CONFIG_INET_XFRM_MODE_TUNNEL=m
-CONFIG_INET_XFRM_MODE_BEET=m
-# CONFIG_INET_LRO is not set
-CONFIG_INET6_AH=m
-CONFIG_INET6_ESP=m
-CONFIG_INET6_IPCOMP=m
-CONFIG_IPV6_TUNNEL=m
-CONFIG_BRIDGE=m
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-# CONFIG_FW_LOADER is not set
-CONFIG_MTD=y
-CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_BLOCK=y
-CONFIG_MTD_CFI=y
-CONFIG_MTD_CFI_AMDSTD=y
-CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_UBI=y
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=m
-CONFIG_ATMEL_TCLIB=y
-CONFIG_ATMEL_SSC=m
-# CONFIG_SCSI_PROC_FS is not set
-CONFIG_BLK_DEV_SD=m
-CONFIG_BLK_DEV_SR=m
-# CONFIG_SCSI_LOWLEVEL is not set
-CONFIG_ATA=m
-# CONFIG_SATA_PMP is not set
-CONFIG_PATA_AT32=m
-CONFIG_NETDEVICES=y
-CONFIG_TUN=m
-CONFIG_MACB=y
-CONFIG_PPP=m
-CONFIG_PPP_BSDCOMP=m
-CONFIG_PPP_DEFLATE=m
-CONFIG_PPP_ASYNC=m
-CONFIG_INPUT=m
-CONFIG_INPUT_EVDEV=m
-# CONFIG_KEYBOARD_ATKBD is not set
-CONFIG_KEYBOARD_GPIO=m
-# CONFIG_MOUSE_PS2 is not set
-CONFIG_MOUSE_GPIO=m
-# CONFIG_SERIO is not set
-# CONFIG_VT is not set
-# CONFIG_LEGACY_PTYS is not set
-# CONFIG_DEVKMEM is not set
-CONFIG_SERIAL_ATMEL=y
-CONFIG_SERIAL_ATMEL_CONSOLE=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_I2C=m
-CONFIG_I2C_CHARDEV=m
-CONFIG_I2C_GPIO=m
-CONFIG_SPI=y
-CONFIG_SPI_ATMEL=y
-CONFIG_SPI_SPIDEV=m
-CONFIG_GPIO_SYSFS=y
-# CONFIG_HWMON is not set
-CONFIG_WATCHDOG=y
-CONFIG_AT32AP700X_WDT=y
-CONFIG_FB=y
-CONFIG_FB_ATMEL=y
-CONFIG_LCD_CLASS_DEVICE=y
-CONFIG_LCD_LTV350QV=y
-CONFIG_SOUND=m
-CONFIG_SND=m
-CONFIG_SND_MIXER_OSS=m
-CONFIG_SND_PCM_OSS=m
-# CONFIG_SND_SUPPORT_OLD_API is not set
-# CONFIG_SND_VERBOSE_PROCFS is not set
-CONFIG_SND_AT73C213=m
-CONFIG_USB_GADGET=y
-CONFIG_USB_ZERO=m
-CONFIG_USB_ETH=m
-CONFIG_USB_GADGETFS=m
-CONFIG_USB_MASS_STORAGE=m
-CONFIG_USB_G_SERIAL=m
-CONFIG_USB_CDC_COMPOSITE=m
-CONFIG_MMC=y
-CONFIG_MMC_TEST=m
-CONFIG_MMC_ATMELMCI=y
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-CONFIG_LEDS_GPIO=m
-CONFIG_LEDS_PWM=m
-CONFIG_LEDS_TRIGGERS=y
-CONFIG_LEDS_TRIGGER_TIMER=m
-CONFIG_LEDS_TRIGGER_HEARTBEAT=m
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_AT32AP700X=y
-CONFIG_DMADEVICES=y
-CONFIG_PWM=y
-CONFIG_PWM_ATMEL=m
-CONFIG_EXT2_FS=y
-CONFIG_EXT3_FS=y
-# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-# CONFIG_EXT3_FS_XATTR is not set
-CONFIG_EXT4_FS=y
-# CONFIG_DNOTIFY is not set
-CONFIG_FUSE_FS=m
-CONFIG_MSDOS_FS=m
-CONFIG_VFAT_FS=m
-CONFIG_FAT_DEFAULT_CODEPAGE=850
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=y
-CONFIG_JFFS2_FS=y
-CONFIG_UBIFS_FS=y
-CONFIG_NFS_FS=y
-CONFIG_ROOT_NFS=y
-CONFIG_CIFS=m
-CONFIG_NLS_CODEPAGE_437=m
-CONFIG_NLS_CODEPAGE_850=m
-CONFIG_NLS_ISO8859_1=m
-CONFIG_NLS_UTF8=m
-CONFIG_DEBUG_FS=y
-CONFIG_FRAME_POINTER=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DETECT_HUNG_TASK=y
diff --git a/arch/avr32/configs/atstk1003_defconfig b/arch/avr32/configs/atstk1003_defconfig
deleted file mode 100644 (file)
index ccce1a0..0000000
+++ /dev/null
@@ -1,137 +0,0 @@
-# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_LOG_BUF_SHIFT=14
-CONFIG_RELAY=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-# CONFIG_BASE_FULL is not set
-# CONFIG_COMPAT_BRK is not set
-CONFIG_PROFILING=y
-CONFIG_OPROFILE=m
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-# CONFIG_BLK_DEV_BSG is not set
-# CONFIG_IOSCHED_DEADLINE is not set
-CONFIG_BOARD_ATSTK1003=y
-# CONFIG_OWNERSHIP_TRACE is not set
-CONFIG_NMI_DEBUGGING=y
-CONFIG_CPU_FREQ=y
-# CONFIG_CPU_FREQ_STAT is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
-CONFIG_CPU_FREQ_GOV_USERSPACE=y
-CONFIG_AVR32_AT32AP_CPUFREQ=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_INET=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
-# CONFIG_INET_DIAG is not set
-# CONFIG_IPV6 is not set
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-# CONFIG_FW_LOADER is not set
-CONFIG_MTD=y
-CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_BLOCK=y
-CONFIG_MTD_CFI=y
-CONFIG_MTD_CFI_AMDSTD=y
-CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_UBI=y
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=m
-CONFIG_ATMEL_TCLIB=y
-CONFIG_ATMEL_SSC=m
-# CONFIG_SCSI_PROC_FS is not set
-CONFIG_BLK_DEV_SD=m
-CONFIG_BLK_DEV_SR=m
-# CONFIG_SCSI_LOWLEVEL is not set
-CONFIG_ATA=m
-# CONFIG_SATA_PMP is not set
-CONFIG_PATA_AT32=m
-CONFIG_NETDEVICES=y
-CONFIG_PPP=m
-CONFIG_PPP_BSDCOMP=m
-CONFIG_PPP_DEFLATE=m
-CONFIG_PPP_ASYNC=m
-CONFIG_INPUT=m
-CONFIG_INPUT_EVDEV=m
-# CONFIG_KEYBOARD_ATKBD is not set
-CONFIG_KEYBOARD_GPIO=m
-# CONFIG_MOUSE_PS2 is not set
-CONFIG_MOUSE_GPIO=m
-# CONFIG_SERIO is not set
-# CONFIG_VT is not set
-# CONFIG_LEGACY_PTYS is not set
-# CONFIG_DEVKMEM is not set
-CONFIG_SERIAL_ATMEL=y
-CONFIG_SERIAL_ATMEL_CONSOLE=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_I2C=m
-CONFIG_I2C_CHARDEV=m
-CONFIG_I2C_GPIO=m
-CONFIG_SPI=y
-CONFIG_SPI_ATMEL=y
-CONFIG_SPI_SPIDEV=m
-CONFIG_GPIO_SYSFS=y
-# CONFIG_HWMON is not set
-CONFIG_WATCHDOG=y
-CONFIG_AT32AP700X_WDT=y
-CONFIG_SOUND=m
-CONFIG_SND=m
-CONFIG_SND_MIXER_OSS=m
-CONFIG_SND_PCM_OSS=m
-# CONFIG_SND_DRIVERS is not set
-CONFIG_SND_AT73C213=m
-CONFIG_USB_GADGET=y
-CONFIG_USB_ZERO=m
-CONFIG_USB_ETH=m
-CONFIG_USB_GADGETFS=m
-CONFIG_USB_MASS_STORAGE=m
-CONFIG_USB_G_SERIAL=m
-CONFIG_USB_CDC_COMPOSITE=m
-CONFIG_MMC=y
-CONFIG_MMC_TEST=m
-CONFIG_MMC_ATMELMCI=y
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-CONFIG_LEDS_GPIO=m
-CONFIG_LEDS_PWM=m
-CONFIG_LEDS_TRIGGERS=y
-CONFIG_LEDS_TRIGGER_TIMER=m
-CONFIG_LEDS_TRIGGER_HEARTBEAT=m
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_AT32AP700X=y
-CONFIG_DMADEVICES=y
-CONFIG_PWM=y
-CONFIG_PWM_ATMEL=m
-CONFIG_EXT2_FS=y
-CONFIG_EXT3_FS=y
-# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-# CONFIG_EXT3_FS_XATTR is not set
-CONFIG_EXT4_FS=y
-# CONFIG_DNOTIFY is not set
-CONFIG_FUSE_FS=m
-CONFIG_MSDOS_FS=m
-CONFIG_VFAT_FS=m
-CONFIG_FAT_DEFAULT_CODEPAGE=850
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=y
-CONFIG_JFFS2_FS=y
-CONFIG_UBIFS_FS=y
-# CONFIG_NETWORK_FILESYSTEMS is not set
-CONFIG_NLS_CODEPAGE_437=m
-CONFIG_NLS_CODEPAGE_850=m
-CONFIG_NLS_ISO8859_1=m
-CONFIG_NLS_UTF8=m
-CONFIG_DEBUG_FS=y
-CONFIG_FRAME_POINTER=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DETECT_HUNG_TASK=y
diff --git a/arch/avr32/configs/atstk1004_defconfig b/arch/avr32/configs/atstk1004_defconfig
deleted file mode 100644 (file)
index e64288f..0000000
+++ /dev/null
@@ -1,135 +0,0 @@
-# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_LOG_BUF_SHIFT=14
-CONFIG_RELAY=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-# CONFIG_BASE_FULL is not set
-# CONFIG_COMPAT_BRK is not set
-CONFIG_PROFILING=y
-CONFIG_OPROFILE=m
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-# CONFIG_BLK_DEV_BSG is not set
-# CONFIG_IOSCHED_DEADLINE is not set
-CONFIG_BOARD_ATSTK1004=y
-# CONFIG_OWNERSHIP_TRACE is not set
-CONFIG_NMI_DEBUGGING=y
-CONFIG_CPU_FREQ=y
-# CONFIG_CPU_FREQ_STAT is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
-CONFIG_CPU_FREQ_GOV_USERSPACE=y
-CONFIG_AVR32_AT32AP_CPUFREQ=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_INET=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
-# CONFIG_INET_DIAG is not set
-# CONFIG_IPV6 is not set
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-# CONFIG_FW_LOADER is not set
-CONFIG_MTD=y
-CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_BLOCK=y
-CONFIG_MTD_CFI=y
-CONFIG_MTD_CFI_AMDSTD=y
-CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_UBI=y
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=m
-CONFIG_ATMEL_TCLIB=y
-CONFIG_ATMEL_SSC=m
-# CONFIG_SCSI_PROC_FS is not set
-CONFIG_BLK_DEV_SD=m
-CONFIG_BLK_DEV_SR=m
-# CONFIG_SCSI_LOWLEVEL is not set
-CONFIG_ATA=m
-# CONFIG_SATA_PMP is not set
-CONFIG_PATA_AT32=m
-CONFIG_NETDEVICES=y
-CONFIG_PPP=m
-CONFIG_PPP_BSDCOMP=m
-CONFIG_PPP_DEFLATE=m
-CONFIG_PPP_ASYNC=m
-CONFIG_INPUT=m
-CONFIG_INPUT_EVDEV=m
-# CONFIG_KEYBOARD_ATKBD is not set
-CONFIG_KEYBOARD_GPIO=m
-# CONFIG_MOUSE_PS2 is not set
-CONFIG_MOUSE_GPIO=m
-# CONFIG_SERIO is not set
-# CONFIG_VT is not set
-# CONFIG_LEGACY_PTYS is not set
-# CONFIG_DEVKMEM is not set
-CONFIG_SERIAL_ATMEL=y
-CONFIG_SERIAL_ATMEL_CONSOLE=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_I2C=m
-CONFIG_I2C_CHARDEV=m
-CONFIG_I2C_GPIO=m
-CONFIG_SPI=y
-CONFIG_SPI_ATMEL=y
-CONFIG_SPI_SPIDEV=m
-CONFIG_GPIO_SYSFS=y
-# CONFIG_HWMON is not set
-CONFIG_WATCHDOG=y
-CONFIG_AT32AP700X_WDT=y
-CONFIG_FB=y
-CONFIG_FB_ATMEL=y
-CONFIG_LCD_CLASS_DEVICE=y
-CONFIG_LCD_LTV350QV=y
-CONFIG_USB_GADGET=y
-CONFIG_USB_ZERO=m
-CONFIG_USB_ETH=m
-CONFIG_USB_GADGETFS=m
-CONFIG_USB_MASS_STORAGE=m
-CONFIG_USB_G_SERIAL=m
-CONFIG_USB_CDC_COMPOSITE=m
-CONFIG_MMC=y
-CONFIG_MMC_TEST=m
-CONFIG_MMC_ATMELMCI=y
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-CONFIG_LEDS_GPIO=m
-CONFIG_LEDS_PWM=m
-CONFIG_LEDS_TRIGGERS=y
-CONFIG_LEDS_TRIGGER_TIMER=m
-CONFIG_LEDS_TRIGGER_HEARTBEAT=m
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_AT32AP700X=y
-CONFIG_DMADEVICES=y
-CONFIG_PWM=y
-CONFIG_PWM_ATMEL=m
-CONFIG_EXT2_FS=y
-CONFIG_EXT3_FS=y
-# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-# CONFIG_EXT3_FS_XATTR is not set
-CONFIG_EXT4_FS=y
-# CONFIG_DNOTIFY is not set
-CONFIG_FUSE_FS=m
-CONFIG_MSDOS_FS=m
-CONFIG_VFAT_FS=m
-CONFIG_FAT_DEFAULT_CODEPAGE=850
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=y
-CONFIG_JFFS2_FS=y
-CONFIG_UBIFS_FS=y
-# CONFIG_NETWORK_FILESYSTEMS is not set
-CONFIG_NLS_CODEPAGE_437=m
-CONFIG_NLS_CODEPAGE_850=m
-CONFIG_NLS_ISO8859_1=m
-CONFIG_NLS_UTF8=m
-CONFIG_DEBUG_FS=y
-CONFIG_FRAME_POINTER=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DETECT_HUNG_TASK=y
diff --git a/arch/avr32/configs/atstk1006_defconfig b/arch/avr32/configs/atstk1006_defconfig
deleted file mode 100644 (file)
index 7d669f7..0000000
+++ /dev/null
@@ -1,160 +0,0 @@
-# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_LOG_BUF_SHIFT=14
-CONFIG_RELAY=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-# CONFIG_BASE_FULL is not set
-# CONFIG_COMPAT_BRK is not set
-CONFIG_PROFILING=y
-CONFIG_OPROFILE=m
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-# CONFIG_BLK_DEV_BSG is not set
-# CONFIG_IOSCHED_DEADLINE is not set
-CONFIG_BOARD_ATSTK1006=y
-# CONFIG_OWNERSHIP_TRACE is not set
-CONFIG_NMI_DEBUGGING=y
-CONFIG_CPU_FREQ=y
-# CONFIG_CPU_FREQ_STAT is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
-CONFIG_CPU_FREQ_GOV_USERSPACE=y
-CONFIG_AVR32_AT32AP_CPUFREQ=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_XFRM_USER=m
-CONFIG_NET_KEY=m
-CONFIG_INET=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_NET_IPIP=m
-CONFIG_NET_IPGRE_DEMUX=m
-CONFIG_NET_IPGRE=m
-CONFIG_INET_AH=m
-CONFIG_INET_ESP=m
-CONFIG_INET_XFRM_MODE_TRANSPORT=m
-CONFIG_INET_XFRM_MODE_TUNNEL=m
-CONFIG_INET_XFRM_MODE_BEET=m
-# CONFIG_INET_LRO is not set
-CONFIG_INET6_AH=m
-CONFIG_INET6_ESP=m
-CONFIG_INET6_IPCOMP=m
-CONFIG_IPV6_TUNNEL=m
-CONFIG_BRIDGE=m
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-# CONFIG_FW_LOADER is not set
-CONFIG_MTD=y
-CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_BLOCK=y
-CONFIG_MTD_CFI=y
-CONFIG_MTD_CFI_AMDSTD=y
-CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_NAND=y
-CONFIG_MTD_NAND_ATMEL=y
-CONFIG_MTD_UBI=y
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=m
-CONFIG_ATMEL_TCLIB=y
-CONFIG_ATMEL_SSC=m
-# CONFIG_SCSI_PROC_FS is not set
-CONFIG_BLK_DEV_SD=m
-CONFIG_BLK_DEV_SR=m
-# CONFIG_SCSI_LOWLEVEL is not set
-CONFIG_ATA=m
-# CONFIG_SATA_PMP is not set
-CONFIG_PATA_AT32=m
-CONFIG_NETDEVICES=y
-CONFIG_TUN=m
-CONFIG_MACB=y
-CONFIG_PPP=m
-CONFIG_PPP_BSDCOMP=m
-CONFIG_PPP_DEFLATE=m
-CONFIG_PPP_ASYNC=m
-CONFIG_INPUT=m
-CONFIG_INPUT_EVDEV=m
-# CONFIG_KEYBOARD_ATKBD is not set
-CONFIG_KEYBOARD_GPIO=m
-# CONFIG_MOUSE_PS2 is not set
-CONFIG_MOUSE_GPIO=m
-# CONFIG_SERIO is not set
-# CONFIG_VT is not set
-# CONFIG_LEGACY_PTYS is not set
-# CONFIG_DEVKMEM is not set
-CONFIG_SERIAL_ATMEL=y
-CONFIG_SERIAL_ATMEL_CONSOLE=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_I2C=m
-CONFIG_I2C_CHARDEV=m
-CONFIG_I2C_GPIO=m
-CONFIG_SPI=y
-CONFIG_SPI_ATMEL=y
-CONFIG_SPI_SPIDEV=m
-CONFIG_GPIO_SYSFS=y
-# CONFIG_HWMON is not set
-CONFIG_WATCHDOG=y
-CONFIG_AT32AP700X_WDT=y
-CONFIG_FB=y
-CONFIG_FB_ATMEL=y
-CONFIG_LCD_CLASS_DEVICE=y
-CONFIG_LCD_LTV350QV=y
-CONFIG_SOUND=m
-CONFIG_SND=m
-CONFIG_SND_MIXER_OSS=m
-CONFIG_SND_PCM_OSS=m
-# CONFIG_SND_SUPPORT_OLD_API is not set
-# CONFIG_SND_VERBOSE_PROCFS is not set
-CONFIG_SND_AT73C213=m
-CONFIG_USB_GADGET=y
-CONFIG_USB_ZERO=m
-CONFIG_USB_ETH=m
-CONFIG_USB_GADGETFS=m
-CONFIG_USB_MASS_STORAGE=m
-CONFIG_USB_G_SERIAL=m
-CONFIG_USB_CDC_COMPOSITE=m
-CONFIG_MMC=y
-CONFIG_MMC_TEST=m
-CONFIG_MMC_ATMELMCI=y
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-CONFIG_LEDS_GPIO=m
-CONFIG_LEDS_PWM=m
-CONFIG_LEDS_TRIGGERS=y
-CONFIG_LEDS_TRIGGER_TIMER=m
-CONFIG_LEDS_TRIGGER_HEARTBEAT=m
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_AT32AP700X=y
-CONFIG_DMADEVICES=y
-CONFIG_PWM=y
-CONFIG_PWM_ATMEL=m
-CONFIG_EXT2_FS=y
-CONFIG_EXT3_FS=y
-# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
-# CONFIG_EXT3_FS_XATTR is not set
-CONFIG_EXT4_FS=y
-# CONFIG_DNOTIFY is not set
-CONFIG_FUSE_FS=m
-CONFIG_MSDOS_FS=m
-CONFIG_VFAT_FS=m
-CONFIG_FAT_DEFAULT_CODEPAGE=850
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=y
-CONFIG_JFFS2_FS=y
-CONFIG_UBIFS_FS=y
-CONFIG_NFS_FS=y
-CONFIG_ROOT_NFS=y
-CONFIG_CIFS=m
-CONFIG_NLS_CODEPAGE_437=m
-CONFIG_NLS_CODEPAGE_850=m
-CONFIG_NLS_ISO8859_1=m
-CONFIG_NLS_UTF8=m
-CONFIG_DEBUG_FS=y
-CONFIG_FRAME_POINTER=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DETECT_HUNG_TASK=y
diff --git a/arch/avr32/configs/favr-32_defconfig b/arch/avr32/configs/favr-32_defconfig
deleted file mode 100644 (file)
index 560c52f..0000000
+++ /dev/null
@@ -1,143 +0,0 @@
-# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_LOG_BUF_SHIFT=14
-CONFIG_RELAY=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-# CONFIG_BASE_FULL is not set
-# CONFIG_COMPAT_BRK is not set
-CONFIG_PROFILING=y
-CONFIG_OPROFILE=m
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-# CONFIG_BLK_DEV_BSG is not set
-# CONFIG_IOSCHED_DEADLINE is not set
-CONFIG_BOARD_FAVR_32=y
-# CONFIG_OWNERSHIP_TRACE is not set
-CONFIG_NMI_DEBUGGING=y
-CONFIG_CPU_FREQ=y
-# CONFIG_CPU_FREQ_STAT is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
-CONFIG_CPU_FREQ_GOV_USERSPACE=y
-CONFIG_AVR32_AT32AP_CPUFREQ=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_XFRM_USER=m
-CONFIG_NET_KEY=m
-CONFIG_INET=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_NET_IPIP=m
-CONFIG_INET_AH=m
-CONFIG_INET_ESP=m
-CONFIG_INET_XFRM_MODE_TRANSPORT=m
-CONFIG_INET_XFRM_MODE_TUNNEL=m
-CONFIG_INET_XFRM_MODE_BEET=m
-# CONFIG_INET_LRO is not set
-CONFIG_IPV6=y
-CONFIG_INET6_AH=m
-CONFIG_INET6_ESP=m
-CONFIG_INET6_IPCOMP=m
-CONFIG_INET6_XFRM_MODE_TRANSPORT=m
-CONFIG_INET6_XFRM_MODE_TUNNEL=m
-CONFIG_INET6_XFRM_MODE_BEET=m
-CONFIG_IPV6_SIT=m
-CONFIG_IPV6_TUNNEL=m
-CONFIG_BRIDGE=m
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-# CONFIG_FW_LOADER is not set
-CONFIG_MTD=y
-CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_BLOCK=y
-CONFIG_MTD_CFI=y
-CONFIG_MTD_CFI_AMDSTD=y
-CONFIG_MTD_PHYSMAP=y
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=m
-CONFIG_ATMEL_TCLIB=y
-CONFIG_ATMEL_SSC=m
-CONFIG_NETDEVICES=y
-CONFIG_MACB=y
-CONFIG_PPP=m
-CONFIG_PPP_BSDCOMP=m
-CONFIG_PPP_DEFLATE=m
-CONFIG_PPP_ASYNC=m
-CONFIG_INPUT_MOUSEDEV=m
-CONFIG_INPUT_EVDEV=m
-# CONFIG_KEYBOARD_ATKBD is not set
-CONFIG_KEYBOARD_GPIO=m
-# CONFIG_MOUSE_PS2 is not set
-CONFIG_MOUSE_GPIO=m
-CONFIG_INPUT_TOUCHSCREEN=y
-CONFIG_TOUCHSCREEN_ADS7846=m
-# CONFIG_SERIO is not set
-# CONFIG_CONSOLE_TRANSLATIONS is not set
-# CONFIG_LEGACY_PTYS is not set
-# CONFIG_DEVKMEM is not set
-CONFIG_SERIAL_ATMEL=y
-CONFIG_SERIAL_ATMEL_CONSOLE=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_I2C=m
-CONFIG_I2C_CHARDEV=m
-CONFIG_I2C_GPIO=m
-CONFIG_SPI=y
-CONFIG_SPI_ATMEL=y
-CONFIG_SPI_SPIDEV=m
-CONFIG_GPIO_SYSFS=y
-# CONFIG_HWMON is not set
-CONFIG_WATCHDOG=y
-CONFIG_AT32AP700X_WDT=y
-CONFIG_FB=y
-CONFIG_FB_ATMEL=y
-# CONFIG_LCD_CLASS_DEVICE is not set
-CONFIG_BACKLIGHT_PWM=m
-CONFIG_SOUND=m
-CONFIG_SOUND_PRIME=m
-CONFIG_USB_GADGET=y
-CONFIG_USB_ZERO=m
-CONFIG_USB_ETH=m
-CONFIG_USB_GADGETFS=m
-CONFIG_USB_MASS_STORAGE=m
-CONFIG_USB_G_SERIAL=m
-CONFIG_USB_CDC_COMPOSITE=m
-CONFIG_MMC=y
-CONFIG_MMC_ATMELMCI=y
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-CONFIG_LEDS_GPIO=y
-CONFIG_LEDS_TRIGGERS=y
-CONFIG_LEDS_TRIGGER_TIMER=y
-CONFIG_LEDS_TRIGGER_HEARTBEAT=y
-CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_AT32AP700X=y
-CONFIG_DMADEVICES=y
-CONFIG_PWM=y
-CONFIG_PWM_ATMEL=y
-CONFIG_EXT2_FS=y
-CONFIG_EXT3_FS=y
-# CONFIG_EXT3_FS_XATTR is not set
-# CONFIG_DNOTIFY is not set
-CONFIG_FUSE_FS=m
-CONFIG_MSDOS_FS=m
-CONFIG_VFAT_FS=m
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=y
-CONFIG_JFFS2_FS=y
-# CONFIG_JFFS2_FS_WRITEBUFFER is not set
-CONFIG_NFS_FS=y
-CONFIG_ROOT_NFS=y
-CONFIG_NLS_CODEPAGE_437=m
-CONFIG_NLS_ISO8859_1=m
-CONFIG_NLS_UTF8=m
-CONFIG_DEBUG_FS=y
-CONFIG_FRAME_POINTER=y
-CONFIG_MAGIC_SYSRQ=y
-# CONFIG_CRYPTO_HW is not set
diff --git a/arch/avr32/configs/hammerhead_defconfig b/arch/avr32/configs/hammerhead_defconfig
deleted file mode 100644 (file)
index d57fadb..0000000
+++ /dev/null
@@ -1,145 +0,0 @@
-# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BSD_PROCESS_ACCT_V3=y
-CONFIG_LOG_BUF_SHIFT=14
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-# CONFIG_BASE_FULL is not set
-# CONFIG_COMPAT_BRK is not set
-CONFIG_PROFILING=y
-CONFIG_OPROFILE=m
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_BLK_DEV_BSG is not set
-# CONFIG_IOSCHED_DEADLINE is not set
-CONFIG_BOARD_HAMMERHEAD=y
-CONFIG_BOARD_HAMMERHEAD_USB=y
-CONFIG_BOARD_HAMMERHEAD_LCD=y
-CONFIG_BOARD_HAMMERHEAD_SND=y
-# CONFIG_BOARD_HAMMERHEAD_FPGA is not set
-# CONFIG_OWNERSHIP_TRACE is not set
-CONFIG_CPU_FREQ=y
-# CONFIG_CPU_FREQ_STAT is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
-CONFIG_CPU_FREQ_GOV_USERSPACE=y
-CONFIG_AVR32_AT32AP_CPUFREQ=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_XFRM_USER=y
-CONFIG_NET_KEY=y
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_IP_MROUTE=y
-CONFIG_IP_PIMSM_V1=y
-CONFIG_SYN_COOKIES=y
-CONFIG_INET_AH=y
-CONFIG_INET_ESP=y
-CONFIG_INET_IPCOMP=y
-# CONFIG_INET_LRO is not set
-# CONFIG_IPV6 is not set
-CONFIG_NETFILTER=y
-# CONFIG_NETFILTER_ADVANCED is not set
-CONFIG_NETFILTER_XTABLES=y
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-# CONFIG_FW_LOADER is not set
-CONFIG_MTD=y
-CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_BLOCK=y
-CONFIG_MTD_CFI=y
-CONFIG_MTD_CFI_AMDSTD=y
-CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_DATAFLASH=y
-CONFIG_BLK_DEV_RAM=m
-CONFIG_ATMEL_TCLIB=y
-CONFIG_SCSI=m
-CONFIG_BLK_DEV_SD=m
-CONFIG_NETDEVICES=y
-CONFIG_MACB=y
-CONFIG_INPUT_FF_MEMLESS=m
-CONFIG_INPUT_EVDEV=m
-CONFIG_INPUT_TOUCHSCREEN=y
-# CONFIG_LEGACY_PTYS is not set
-CONFIG_SERIAL_ATMEL=y
-CONFIG_SERIAL_ATMEL_CONSOLE=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_I2C=m
-CONFIG_I2C_CHARDEV=m
-CONFIG_I2C_GPIO=m
-CONFIG_SPI=y
-CONFIG_SPI_ATMEL=y
-CONFIG_SPI_SPIDEV=m
-# CONFIG_HWMON is not set
-CONFIG_WATCHDOG=y
-CONFIG_AT32AP700X_WDT=y
-CONFIG_FB=y
-CONFIG_FB_ATMEL=y
-CONFIG_FRAMEBUFFER_CONSOLE=y
-CONFIG_SOUND=m
-CONFIG_SND=m
-CONFIG_SND_SEQUENCER=m
-CONFIG_SND_MIXER_OSS=m
-CONFIG_SND_PCM_OSS=m
-CONFIG_SND_SEQUENCER_OSS=y
-# CONFIG_SND_SUPPORT_OLD_API is not set
-CONFIG_HID_A4TECH=m
-CONFIG_HID_APPLE=m
-CONFIG_HID_BELKIN=m
-CONFIG_HID_CHERRY=m
-CONFIG_HID_CHICONY=m
-CONFIG_HID_CYPRESS=m
-CONFIG_HID_EZKEY=m
-CONFIG_HID_GYRATION=m
-CONFIG_HID_LOGITECH=m
-CONFIG_HID_MICROSOFT=m
-CONFIG_HID_MONTEREY=m
-CONFIG_HID_PANTHERLORD=m
-CONFIG_HID_PETALYNX=m
-CONFIG_HID_SAMSUNG=m
-CONFIG_HID_SUNPLUS=m
-CONFIG_USB=m
-CONFIG_USB_MON=m
-CONFIG_USB_ISP116X_HCD=m
-CONFIG_USB_STORAGE=m
-CONFIG_USB_GADGET=y
-CONFIG_USB_ZERO=m
-CONFIG_USB_ETH=m
-CONFIG_USB_GADGETFS=m
-CONFIG_USB_MASS_STORAGE=m
-CONFIG_USB_G_SERIAL=m
-CONFIG_MMC=m
-CONFIG_MMC_ATMELMCI=m
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_AT32AP700X=y
-CONFIG_EXT2_FS=m
-# CONFIG_DNOTIFY is not set
-CONFIG_MSDOS_FS=y
-CONFIG_VFAT_FS=m
-CONFIG_FAT_DEFAULT_CODEPAGE=850
-CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=y
-CONFIG_JFFS2_FS=y
-CONFIG_NFS_FS=y
-CONFIG_ROOT_NFS=y
-CONFIG_NLS_CODEPAGE_437=m
-CONFIG_NLS_CODEPAGE_850=m
-CONFIG_NLS_ISO8859_1=m
-CONFIG_NLS_UTF8=m
-CONFIG_FRAME_POINTER=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_CRYPTO_ECB=m
-CONFIG_CRYPTO_PCBC=m
-CONFIG_CRYPTO_ARC4=m
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
-CONFIG_CRC_CCITT=m
-CONFIG_CRC_ITU_T=m
-CONFIG_CRC7=m
diff --git a/arch/avr32/configs/merisc_defconfig b/arch/avr32/configs/merisc_defconfig
deleted file mode 100644 (file)
index e6a9cb7..0000000
+++ /dev/null
@@ -1,115 +0,0 @@
-# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BSD_PROCESS_ACCT_V3=y
-CONFIG_LOG_BUF_SHIFT=14
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-# CONFIG_BASE_FULL is not set
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_BLK_DEV_BSG is not set
-# CONFIG_IOSCHED_DEADLINE is not set
-CONFIG_BOARD_MERISC=y
-CONFIG_AP700X_32_BIT_SMC=y
-# CONFIG_OWNERSHIP_TRACE is not set
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_XFRM_USER=y
-CONFIG_NET_KEY=y
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_IP_MROUTE=y
-CONFIG_IP_PIMSM_V1=y
-CONFIG_SYN_COOKIES=y
-CONFIG_INET_AH=y
-CONFIG_INET_ESP=y
-CONFIG_INET_IPCOMP=y
-# CONFIG_INET_LRO is not set
-# CONFIG_IPV6 is not set
-CONFIG_CAN=y
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-# CONFIG_FW_LOADER is not set
-CONFIG_MTD=y
-CONFIG_MTD_BLOCK=y
-CONFIG_MTD_CFI=y
-CONFIG_MTD_JEDECPROBE=y
-CONFIG_MTD_CFI_AMDSTD=y
-CONFIG_MTD_ABSENT=y
-CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_BLOCK2MTD=y
-CONFIG_BLK_DEV_LOOP=y
-CONFIG_ATMEL_SSC=y
-CONFIG_SCSI=y
-CONFIG_BLK_DEV_SD=y
-# CONFIG_SCSI_LOWLEVEL is not set
-CONFIG_NETDEVICES=y
-CONFIG_MACB=y
-# CONFIG_INPUT_MOUSEDEV is not set
-CONFIG_INPUT_EVDEV=y
-# CONFIG_KEYBOARD_ATKBD is not set
-# CONFIG_INPUT_MOUSE is not set
-CONFIG_INPUT_TOUCHSCREEN=y
-CONFIG_TOUCHSCREEN_ADS7846=y
-CONFIG_INPUT_MISC=y
-CONFIG_INPUT_UINPUT=y
-# CONFIG_SERIO is not set
-# CONFIG_CONSOLE_TRANSLATIONS is not set
-# CONFIG_LEGACY_PTYS is not set
-# CONFIG_DEVKMEM is not set
-CONFIG_SERIAL_ATMEL=y
-CONFIG_SERIAL_ATMEL_CONSOLE=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_I2C=y
-CONFIG_I2C_CHARDEV=y
-CONFIG_I2C_GPIO=y
-CONFIG_SPI=y
-CONFIG_SPI_ATMEL=y
-CONFIG_SPI_SPIDEV=y
-CONFIG_GPIO_SYSFS=y
-# CONFIG_HWMON is not set
-CONFIG_WATCHDOG=y
-CONFIG_FB=y
-CONFIG_FB_ATMEL=y
-# CONFIG_LCD_CLASS_DEVICE is not set
-CONFIG_FRAMEBUFFER_CONSOLE=y
-CONFIG_LOGO=y
-CONFIG_MMC=y
-CONFIG_MMC_ATMELMCI=y
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-CONFIG_LEDS_PWM=y
-CONFIG_RTC_CLASS=y
-# CONFIG_RTC_HCTOSYS is not set
-CONFIG_RTC_DRV_PCF8563=y
-CONFIG_DMADEVICES=y
-CONFIG_UIO=y
-CONFIG_PWM=y
-CONFIG_PWM_ATMEL=m
-CONFIG_EXT2_FS=y
-# CONFIG_DNOTIFY is not set
-CONFIG_FUSE_FS=y
-CONFIG_MSDOS_FS=y
-CONFIG_VFAT_FS=y
-CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=y
-CONFIG_JFFS2_FS=y
-CONFIG_JFFS2_FS_WBUF_VERIFY=y
-CONFIG_CRAMFS=y
-CONFIG_NFS_FS=y
-CONFIG_ROOT_NFS=y
-CONFIG_NLS_CODEPAGE_437=y
-CONFIG_NLS_CODEPAGE_850=y
-CONFIG_NLS_ISO8859_1=y
-CONFIG_NLS_UTF8=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
-# CONFIG_CRYPTO_HW is not set
diff --git a/arch/avr32/configs/mimc200_defconfig b/arch/avr32/configs/mimc200_defconfig
deleted file mode 100644 (file)
index 49c7e89..0000000
+++ /dev/null
@@ -1,114 +0,0 @@
-# CONFIG_LOCALVERSION_AUTO is not set
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BSD_PROCESS_ACCT_V3=y
-CONFIG_LOG_BUF_SHIFT=14
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-# CONFIG_BASE_FULL is not set
-# CONFIG_COMPAT_BRK is not set
-CONFIG_PROFILING=y
-# CONFIG_BLK_DEV_BSG is not set
-# CONFIG_IOSCHED_DEADLINE is not set
-CONFIG_BOARD_MIMC200=y
-# CONFIG_OWNERSHIP_TRACE is not set
-CONFIG_NMI_DEBUGGING=y
-CONFIG_CPU_FREQ=y
-# CONFIG_CPU_FREQ_STAT is not set
-CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
-CONFIG_CPU_FREQ_GOV_USERSPACE=y
-CONFIG_AVR32_AT32AP_CPUFREQ=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_XFRM_USER=y
-CONFIG_NET_KEY=y
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_IP_MROUTE=y
-CONFIG_IP_PIMSM_V1=y
-CONFIG_SYN_COOKIES=y
-CONFIG_INET_AH=y
-CONFIG_INET_ESP=y
-CONFIG_INET_IPCOMP=y
-# CONFIG_INET_LRO is not set
-CONFIG_INET6_AH=y
-CONFIG_INET6_ESP=y
-CONFIG_INET6_IPCOMP=y
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-# CONFIG_PREVENT_FIRMWARE_BUILD is not set
-# CONFIG_FW_LOADER is not set
-CONFIG_MTD=y
-CONFIG_MTD_CMDLINE_PARTS=y
-CONFIG_MTD_BLOCK=y
-CONFIG_MTD_CFI=y
-CONFIG_MTD_CFI_AMDSTD=y
-CONFIG_MTD_PHYSMAP=y
-CONFIG_MTD_DATAFLASH=y
-CONFIG_ATMEL_TCLIB=y
-CONFIG_EEPROM_AT24=y
-CONFIG_EEPROM_AT25=y
-CONFIG_NETDEVICES=y
-CONFIG_MACB=y
-# CONFIG_INPUT is not set
-# CONFIG_SERIO is not set
-# CONFIG_VT is not set
-# CONFIG_LEGACY_PTYS is not set
-# CONFIG_DEVKMEM is not set
-CONFIG_SERIAL_ATMEL=y
-CONFIG_SERIAL_ATMEL_CONSOLE=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_I2C=y
-CONFIG_I2C_CHARDEV=y
-CONFIG_I2C_GPIO=y
-CONFIG_SPI=y
-CONFIG_SPI_ATMEL=y
-CONFIG_GPIO_SYSFS=y
-# CONFIG_HWMON is not set
-CONFIG_WATCHDOG=y
-CONFIG_AT32AP700X_WDT=y
-CONFIG_FB=y
-CONFIG_FB_ATMEL=y
-# CONFIG_USB_SUPPORT is not set
-CONFIG_MMC=y
-CONFIG_MMC_TEST=y
-CONFIG_MMC_ATMELMCI=y
-CONFIG_MMC_SPI=y
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-CONFIG_LEDS_GPIO=y
-CONFIG_LEDS_TRIGGERS=y
-CONFIG_LEDS_TRIGGER_TIMER=y
-CONFIG_LEDS_TRIGGER_HEARTBEAT=y
-CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_DS1390=y
-CONFIG_DMADEVICES=y
-CONFIG_EXT2_FS=y
-CONFIG_EXT3_FS=y
-# CONFIG_EXT3_FS_XATTR is not set
-# CONFIG_DNOTIFY is not set
-CONFIG_MSDOS_FS=y
-CONFIG_VFAT_FS=y
-CONFIG_FAT_DEFAULT_CODEPAGE=850
-CONFIG_TMPFS=y
-CONFIG_CONFIGFS_FS=y
-CONFIG_JFFS2_FS=y
-CONFIG_NFS_FS=y
-CONFIG_ROOT_NFS=y
-CONFIG_NLS_CODEPAGE_437=y
-CONFIG_NLS_CODEPAGE_850=y
-CONFIG_NLS_ISO8859_1=y
-CONFIG_NLS_UTF8=y
-CONFIG_FRAME_POINTER=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_CRYPTO_ECB=y
-CONFIG_CRYPTO_PCBC=y
-CONFIG_CRYPTO_ARC4=y
-CONFIG_CRC_CCITT=y
diff --git a/arch/avr32/include/asm/Kbuild b/arch/avr32/include/asm/Kbuild
deleted file mode 100644 (file)
index 3d7ef2c..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-
-generic-y += clkdev.h
-generic-y += delay.h
-generic-y += device.h
-generic-y += div64.h
-generic-y += emergency-restart.h
-generic-y += exec.h
-generic-y += futex.h
-generic-y += irq_regs.h
-generic-y += irq_work.h
-generic-y += local.h
-generic-y += local64.h
-generic-y += mcs_spinlock.h
-generic-y += mm-arch-hooks.h
-generic-y += param.h
-generic-y += percpu.h
-generic-y += preempt.h
-generic-y += sections.h
-generic-y += topology.h
-generic-y += trace_clock.h
-generic-y += vga.h
-generic-y += word-at-a-time.h
-generic-y += xor.h
diff --git a/arch/avr32/include/asm/addrspace.h b/arch/avr32/include/asm/addrspace.h
deleted file mode 100644 (file)
index 5a47a79..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Definitions for the address spaces of the AVR32 CPUs. Heavily based on
- * include/asm-sh/addrspace.h
- *
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_ADDRSPACE_H
-#define __ASM_AVR32_ADDRSPACE_H
-
-#ifdef CONFIG_MMU
-
-/* Memory segments when segmentation is enabled */
-#define P0SEG          0x00000000
-#define P1SEG          0x80000000
-#define P2SEG          0xa0000000
-#define P3SEG          0xc0000000
-#define P4SEG          0xe0000000
-
-/* Returns the privileged segment base of a given address */
-#define PXSEG(a)       (((unsigned long)(a)) & 0xe0000000)
-
-/* Returns the physical address of a PnSEG (n=1,2) address */
-#define PHYSADDR(a)    (((unsigned long)(a)) & 0x1fffffff)
-
-/*
- * Map an address to a certain privileged segment
- */
-#define P1SEGADDR(a) ((__typeof__(a))(((unsigned long)(a) & 0x1fffffff) \
-                                     | P1SEG))
-#define P2SEGADDR(a) ((__typeof__(a))(((unsigned long)(a) & 0x1fffffff) \
-                                     | P2SEG))
-#define P3SEGADDR(a) ((__typeof__(a))(((unsigned long)(a) & 0x1fffffff) \
-                                     | P3SEG))
-#define P4SEGADDR(a) ((__typeof__(a))(((unsigned long)(a) & 0x1fffffff) \
-                                     | P4SEG))
-
-#endif /* CONFIG_MMU */
-
-#endif /* __ASM_AVR32_ADDRSPACE_H */
diff --git a/arch/avr32/include/asm/asm-offsets.h b/arch/avr32/include/asm/asm-offsets.h
deleted file mode 100644 (file)
index d370ee3..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <generated/asm-offsets.h>
diff --git a/arch/avr32/include/asm/asm.h b/arch/avr32/include/asm/asm.h
deleted file mode 100644 (file)
index a2c64f4..0000000
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_ASM_H__
-#define __ASM_AVR32_ASM_H__
-
-#include <asm/sysreg.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-
-#define mask_interrupts                ssrf    SYSREG_GM_OFFSET
-#define mask_exceptions                ssrf    SYSREG_EM_OFFSET
-#define unmask_interrupts      csrf    SYSREG_GM_OFFSET
-#define unmask_exceptions      csrf    SYSREG_EM_OFFSET
-
-#ifdef CONFIG_FRAME_POINTER
-       .macro  save_fp
-       st.w    --sp, r7
-       .endm
-       .macro  restore_fp
-       ld.w    r7, sp++
-       .endm
-       .macro  zero_fp
-       mov     r7, 0
-       .endm
-#else
-       .macro  save_fp
-       .endm
-       .macro  restore_fp
-       .endm
-       .macro  zero_fp
-       .endm
-#endif
-       .macro  get_thread_info reg
-       mov     \reg, sp
-       andl    \reg, ~(THREAD_SIZE - 1) & 0xffff
-       .endm
-
-       /* Save and restore registers */
-       .macro  save_min sr, tmp=lr
-       pushm   lr
-       mfsr    \tmp, \sr
-       zero_fp
-       st.w    --sp, \tmp
-       .endm
-
-       .macro  restore_min sr, tmp=lr
-       ld.w    \tmp, sp++
-       mtsr    \sr, \tmp
-       popm    lr
-       .endm
-
-       .macro  save_half sr, tmp=lr
-       save_fp
-       pushm   r8-r9,r10,r11,r12,lr
-       zero_fp
-       mfsr    \tmp, \sr
-       st.w    --sp, \tmp
-       .endm
-
-       .macro  restore_half sr, tmp=lr
-       ld.w    \tmp, sp++
-       mtsr    \sr, \tmp
-       popm    r8-r9,r10,r11,r12,lr
-       restore_fp
-       .endm
-
-       .macro  save_full_user sr, tmp=lr
-       stmts   --sp, r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,sp,lr
-       st.w    --sp, lr
-       zero_fp
-       mfsr    \tmp, \sr
-       st.w    --sp, \tmp
-       .endm
-
-       .macro  restore_full_user sr, tmp=lr
-       ld.w    \tmp, sp++
-       mtsr    \sr, \tmp
-       ld.w    lr, sp++
-       ldmts   sp++, r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,sp,lr
-       .endm
-
-       /* uaccess macros */
-       .macro branch_if_kernel scratch, label
-       get_thread_info \scratch
-       ld.w    \scratch, \scratch[TI_flags]
-       bld     \scratch, TIF_USERSPACE
-       brcc    \label
-       .endm
-
-       .macro ret_if_privileged scratch, addr, size, ret
-       sub     \scratch, \size, 1
-       add     \scratch, \addr
-       retcs   \ret
-       retmi   \ret
-       .endm
-
-#endif /* __ASM_AVR32_ASM_H__ */
diff --git a/arch/avr32/include/asm/atomic.h b/arch/avr32/include/asm/atomic.h
deleted file mode 100644 (file)
index 3d5ce38..0000000
+++ /dev/null
@@ -1,243 +0,0 @@
-/*
- * Atomic operations that C can't guarantee us.  Useful for
- * resource counting etc.
- *
- * But use these as seldom as possible since they are slower than
- * regular operations.
- *
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_ATOMIC_H
-#define __ASM_AVR32_ATOMIC_H
-
-#include <linux/types.h>
-#include <asm/cmpxchg.h>
-
-#define ATOMIC_INIT(i)  { (i) }
-
-#define atomic_read(v)         READ_ONCE((v)->counter)
-#define atomic_set(v, i)       WRITE_ONCE(((v)->counter), (i))
-
-#define ATOMIC_OP_RETURN(op, asm_op, asm_con)                          \
-static inline int __atomic_##op##_return(int i, atomic_t *v)           \
-{                                                                      \
-       int result;                                                     \
-                                                                       \
-       asm volatile(                                                   \
-               "/* atomic_" #op "_return */\n"                         \
-               "1:     ssrf    5\n"                                    \
-               "       ld.w    %0, %2\n"                               \
-               "       " #asm_op "     %0, %3\n"                       \
-               "       stcond  %1, %0\n"                               \
-               "       brne    1b"                                     \
-               : "=&r" (result), "=o" (v->counter)                     \
-               : "m" (v->counter), #asm_con (i)                        \
-               : "cc");                                                \
-                                                                       \
-       return result;                                                  \
-}
-
-#define ATOMIC_FETCH_OP(op, asm_op, asm_con)                           \
-static inline int __atomic_fetch_##op(int i, atomic_t *v)              \
-{                                                                      \
-       int result, val;                                                \
-                                                                       \
-       asm volatile(                                                   \
-               "/* atomic_fetch_" #op " */\n"                          \
-               "1:     ssrf    5\n"                                    \
-               "       ld.w    %0, %3\n"                               \
-               "       mov     %1, %0\n"                               \
-               "       " #asm_op "     %1, %4\n"                       \
-               "       stcond  %2, %1\n"                               \
-               "       brne    1b"                                     \
-               : "=&r" (result), "=&r" (val), "=o" (v->counter)        \
-               : "m" (v->counter), #asm_con (i)                        \
-               : "cc");                                                \
-                                                                       \
-       return result;                                                  \
-}
-
-ATOMIC_OP_RETURN(sub, sub, rKs21)
-ATOMIC_OP_RETURN(add, add, r)
-ATOMIC_FETCH_OP (sub, sub, rKs21)
-ATOMIC_FETCH_OP (add, add, r)
-
-#define ATOMIC_OPS(op, asm_op)                                         \
-ATOMIC_OP_RETURN(op, asm_op, r)                                                \
-static inline void atomic_##op(int i, atomic_t *v)                     \
-{                                                                      \
-       (void)__atomic_##op##_return(i, v);                             \
-}                                                                      \
-ATOMIC_FETCH_OP(op, asm_op, r)                                         \
-static inline int atomic_fetch_##op(int i, atomic_t *v)                \
-{                                                                      \
-       return __atomic_fetch_##op(i, v);                               \
-}
-
-ATOMIC_OPS(and, and)
-ATOMIC_OPS(or, or)
-ATOMIC_OPS(xor, eor)
-
-#undef ATOMIC_OPS
-#undef ATOMIC_FETCH_OP
-#undef ATOMIC_OP_RETURN
-
-/*
- * Probably found the reason why we want to use sub with the signed 21-bit
- * limit, it uses one less register than the add instruction that can add up to
- * 32-bit values.
- *
- * Both instructions are 32-bit, to use a 16-bit instruction the immediate is
- * very small; 4 bit.
- *
- * sub 32-bit, type IV, takes a register and subtracts a 21-bit immediate.
- * add 32-bit, type II, adds two register values together.
- */
-#define IS_21BIT_CONST(i)                                              \
-       (__builtin_constant_p(i) && ((i) >= -1048575) && ((i) <= 1048576))
-
-/*
- * atomic_add_return - add integer to atomic variable
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v. Returns the resulting value.
- */
-static inline int atomic_add_return(int i, atomic_t *v)
-{
-       if (IS_21BIT_CONST(i))
-               return __atomic_sub_return(-i, v);
-
-       return __atomic_add_return(i, v);
-}
-
-static inline int atomic_fetch_add(int i, atomic_t *v)
-{
-       if (IS_21BIT_CONST(i))
-               return __atomic_fetch_sub(-i, v);
-
-       return __atomic_fetch_add(i, v);
-}
-
-/*
- * atomic_sub_return - subtract the atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v. Returns the resulting value.
- */
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
-       if (IS_21BIT_CONST(i))
-               return __atomic_sub_return(i, v);
-
-       return __atomic_add_return(-i, v);
-}
-
-static inline int atomic_fetch_sub(int i, atomic_t *v)
-{
-       if (IS_21BIT_CONST(i))
-               return __atomic_fetch_sub(i, v);
-
-       return __atomic_fetch_add(-i, v);
-}
-
-/*
- * __atomic_add_unless - add unless the number is a given value
- * @v: pointer of type atomic_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as it was not @u.
- * Returns the old value of @v.
-*/
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
-{
-       int tmp, old = atomic_read(v);
-
-       if (IS_21BIT_CONST(a)) {
-               asm volatile(
-                       "/* __atomic_sub_unless */\n"
-                       "1:     ssrf    5\n"
-                       "       ld.w    %0, %2\n"
-                       "       cp.w    %0, %4\n"
-                       "       breq    1f\n"
-                       "       sub     %0, %3\n"
-                       "       stcond  %1, %0\n"
-                       "       brne    1b\n"
-                       "1:"
-                       : "=&r"(tmp), "=o"(v->counter)
-                       : "m"(v->counter), "rKs21"(-a), "rKs21"(u)
-                       : "cc", "memory");
-       } else {
-               asm volatile(
-                       "/* __atomic_add_unless */\n"
-                       "1:     ssrf    5\n"
-                       "       ld.w    %0, %2\n"
-                       "       cp.w    %0, %4\n"
-                       "       breq    1f\n"
-                       "       add     %0, %3\n"
-                       "       stcond  %1, %0\n"
-                       "       brne    1b\n"
-                       "1:"
-                       : "=&r"(tmp), "=o"(v->counter)
-                       : "m"(v->counter), "r"(a), "ir"(u)
-                       : "cc", "memory");
-       }
-
-       return old;
-}
-
-#undef IS_21BIT_CONST
-
-/*
- * atomic_sub_if_positive - conditionally subtract integer from atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically test @v and subtract @i if @v is greater or equal than @i.
- * The function returns the old value of @v minus @i.
- */
-static inline int atomic_sub_if_positive(int i, atomic_t *v)
-{
-       int result;
-
-       asm volatile(
-               "/* atomic_sub_if_positive */\n"
-               "1:     ssrf    5\n"
-               "       ld.w    %0, %2\n"
-               "       sub     %0, %3\n"
-               "       brlt    1f\n"
-               "       stcond  %1, %0\n"
-               "       brne    1b\n"
-               "1:"
-               : "=&r"(result), "=o"(v->counter)
-               : "m"(v->counter), "ir"(i)
-               : "cc", "memory");
-
-       return result;
-}
-
-#define atomic_xchg(v, new)    (xchg(&((v)->counter), new))
-#define atomic_cmpxchg(v, o, n)        (cmpxchg(&((v)->counter), (o), (n)))
-
-#define atomic_sub(i, v)       (void)atomic_sub_return(i, v)
-#define atomic_add(i, v)       (void)atomic_add_return(i, v)
-#define atomic_dec(v)          atomic_sub(1, (v))
-#define atomic_inc(v)          atomic_add(1, (v))
-
-#define atomic_dec_return(v)   atomic_sub_return(1, v)
-#define atomic_inc_return(v)   atomic_add_return(1, v)
-
-#define atomic_sub_and_test(i, v) (atomic_sub_return(i, v) == 0)
-#define atomic_inc_and_test(v) (atomic_add_return(1, v) == 0)
-#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0)
-#define atomic_add_negative(i, v) (atomic_add_return(i, v) < 0)
-
-#define atomic_dec_if_positive(v) atomic_sub_if_positive(1, v)
-
-#endif /*  __ASM_AVR32_ATOMIC_H */
diff --git a/arch/avr32/include/asm/barrier.h b/arch/avr32/include/asm/barrier.h
deleted file mode 100644 (file)
index 7151007..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_BARRIER_H
-#define __ASM_AVR32_BARRIER_H
-
-/*
- * Weirdest thing ever.. no full barrier, but it has a write barrier!
- */
-#define wmb()  asm volatile("sync 0" : : : "memory")
-
-#ifdef CONFIG_SMP
-# error "The AVR32 port does not support SMP"
-#endif
-
-#include <asm-generic/barrier.h>
-
-#endif /* __ASM_AVR32_BARRIER_H */
diff --git a/arch/avr32/include/asm/bitops.h b/arch/avr32/include/asm/bitops.h
deleted file mode 100644 (file)
index 910d537..0000000
+++ /dev/null
@@ -1,314 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_BITOPS_H
-#define __ASM_AVR32_BITOPS_H
-
-#ifndef _LINUX_BITOPS_H
-#error only <linux/bitops.h> can be included directly
-#endif
-
-#include <asm/byteorder.h>
-#include <asm/barrier.h>
-
-/*
- * set_bit - Atomically set a bit in memory
- * @nr: the bit to set
- * @addr: the address to start counting from
- *
- * This function is atomic and may not be reordered.  See __set_bit()
- * if you do not require the atomic guarantees.
- *
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- */
-static inline void set_bit(int nr, volatile void * addr)
-{
-       unsigned long *p = ((unsigned long *)addr) + nr / BITS_PER_LONG;
-       unsigned long tmp;
-
-       if (__builtin_constant_p(nr)) {
-               asm volatile(
-                       "1:     ssrf    5\n"
-                       "       ld.w    %0, %2\n"
-                       "       sbr     %0, %3\n"
-                       "       stcond  %1, %0\n"
-                       "       brne    1b"
-                       : "=&r"(tmp), "=o"(*p)
-                       : "m"(*p), "i"(nr)
-                       : "cc");
-       } else {
-               unsigned long mask = 1UL << (nr % BITS_PER_LONG);
-               asm volatile(
-                       "1:     ssrf    5\n"
-                       "       ld.w    %0, %2\n"
-                       "       or      %0, %3\n"
-                       "       stcond  %1, %0\n"
-                       "       brne    1b"
-                       : "=&r"(tmp), "=o"(*p)
-                       : "m"(*p), "r"(mask)
-                       : "cc");
-       }
-}
-
-/*
- * clear_bit - Clears a bit in memory
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * clear_bit() is atomic and may not be reordered.  However, it does
- * not contain a memory barrier, so if it is used for locking purposes,
- * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic()
- * in order to ensure changes are visible on other processors.
- */
-static inline void clear_bit(int nr, volatile void * addr)
-{
-       unsigned long *p = ((unsigned long *)addr) + nr / BITS_PER_LONG;
-       unsigned long tmp;
-
-       if (__builtin_constant_p(nr)) {
-               asm volatile(
-                       "1:     ssrf    5\n"
-                       "       ld.w    %0, %2\n"
-                       "       cbr     %0, %3\n"
-                       "       stcond  %1, %0\n"
-                       "       brne    1b"
-                       : "=&r"(tmp), "=o"(*p)
-                       : "m"(*p), "i"(nr)
-                       : "cc");
-       } else {
-               unsigned long mask = 1UL << (nr % BITS_PER_LONG);
-               asm volatile(
-                       "1:     ssrf    5\n"
-                       "       ld.w    %0, %2\n"
-                       "       andn    %0, %3\n"
-                       "       stcond  %1, %0\n"
-                       "       brne    1b"
-                       : "=&r"(tmp), "=o"(*p)
-                       : "m"(*p), "r"(mask)
-                       : "cc");
-       }
-}
-
-/*
- * change_bit - Toggle a bit in memory
- * @nr: Bit to change
- * @addr: Address to start counting from
- *
- * change_bit() is atomic and may not be reordered.
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- */
-static inline void change_bit(int nr, volatile void * addr)
-{
-       unsigned long *p = ((unsigned long *)addr) + nr / BITS_PER_LONG;
-       unsigned long mask = 1UL << (nr % BITS_PER_LONG);
-       unsigned long tmp;
-
-       asm volatile(
-               "1:     ssrf    5\n"
-               "       ld.w    %0, %2\n"
-               "       eor     %0, %3\n"
-               "       stcond  %1, %0\n"
-               "       brne    1b"
-               : "=&r"(tmp), "=o"(*p)
-               : "m"(*p), "r"(mask)
-               : "cc");
-}
-
-/*
- * test_and_set_bit - Set a bit and return its old value
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.
- * It also implies a memory barrier.
- */
-static inline int test_and_set_bit(int nr, volatile void * addr)
-{
-       unsigned long *p = ((unsigned long *)addr) + nr / BITS_PER_LONG;
-       unsigned long mask = 1UL << (nr % BITS_PER_LONG);
-       unsigned long tmp, old;
-
-       if (__builtin_constant_p(nr)) {
-               asm volatile(
-                       "1:     ssrf    5\n"
-                       "       ld.w    %0, %3\n"
-                       "       mov     %2, %0\n"
-                       "       sbr     %0, %4\n"
-                       "       stcond  %1, %0\n"
-                       "       brne    1b"
-                       : "=&r"(tmp), "=o"(*p), "=&r"(old)
-                       : "m"(*p), "i"(nr)
-                       : "memory", "cc");
-       } else {
-               asm volatile(
-                       "1:     ssrf    5\n"
-                       "       ld.w    %2, %3\n"
-                       "       or      %0, %2, %4\n"
-                       "       stcond  %1, %0\n"
-                       "       brne    1b"
-                       : "=&r"(tmp), "=o"(*p), "=&r"(old)
-                       : "m"(*p), "r"(mask)
-                       : "memory", "cc");
-       }
-
-       return (old & mask) != 0;
-}
-
-/*
- * test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.
- * It also implies a memory barrier.
- */
-static inline int test_and_clear_bit(int nr, volatile void * addr)
-{
-       unsigned long *p = ((unsigned long *)addr) + nr / BITS_PER_LONG;
-       unsigned long mask = 1UL << (nr % BITS_PER_LONG);
-       unsigned long tmp, old;
-
-       if (__builtin_constant_p(nr)) {
-               asm volatile(
-                       "1:     ssrf    5\n"
-                       "       ld.w    %0, %3\n"
-                       "       mov     %2, %0\n"
-                       "       cbr     %0, %4\n"
-                       "       stcond  %1, %0\n"
-                       "       brne    1b"
-                       : "=&r"(tmp), "=o"(*p), "=&r"(old)
-                       : "m"(*p), "i"(nr)
-                       : "memory", "cc");
-       } else {
-               asm volatile(
-                       "1:     ssrf    5\n"
-                       "       ld.w    %0, %3\n"
-                       "       mov     %2, %0\n"
-                       "       andn    %0, %4\n"
-                       "       stcond  %1, %0\n"
-                       "       brne    1b"
-                       : "=&r"(tmp), "=o"(*p), "=&r"(old)
-                       : "m"(*p), "r"(mask)
-                       : "memory", "cc");
-       }
-
-       return (old & mask) != 0;
-}
-
-/*
- * test_and_change_bit - Change a bit and return its old value
- * @nr: Bit to change
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.
- * It also implies a memory barrier.
- */
-static inline int test_and_change_bit(int nr, volatile void * addr)
-{
-       unsigned long *p = ((unsigned long *)addr) + nr / BITS_PER_LONG;
-       unsigned long mask = 1UL << (nr % BITS_PER_LONG);
-       unsigned long tmp, old;
-
-       asm volatile(
-               "1:     ssrf    5\n"
-               "       ld.w    %2, %3\n"
-               "       eor     %0, %2, %4\n"
-               "       stcond  %1, %0\n"
-               "       brne    1b"
-               : "=&r"(tmp), "=o"(*p), "=&r"(old)
-               : "m"(*p), "r"(mask)
-               : "memory", "cc");
-
-       return (old & mask) != 0;
-}
-
-#include <asm-generic/bitops/non-atomic.h>
-
-/* Find First bit Set */
-static inline unsigned long __ffs(unsigned long word)
-{
-       unsigned long result;
-
-       asm("brev %1\n\t"
-           "clz %0,%1"
-           : "=r"(result), "=&r"(word)
-           : "1"(word));
-       return result;
-}
-
-/* Find First Zero */
-static inline unsigned long ffz(unsigned long word)
-{
-       return __ffs(~word);
-}
-
-/* Find Last bit Set */
-static inline int fls(unsigned long word)
-{
-       unsigned long result;
-
-       asm("clz %0,%1" : "=r"(result) : "r"(word));
-       return 32 - result;
-}
-
-static inline int __fls(unsigned long word)
-{
-       return fls(word) - 1;
-}
-
-unsigned long find_first_zero_bit(const unsigned long *addr,
-                                 unsigned long size);
-#define find_first_zero_bit find_first_zero_bit
-
-unsigned long find_next_zero_bit(const unsigned long *addr,
-                                unsigned long size,
-                                unsigned long offset);
-#define find_next_zero_bit find_next_zero_bit
-
-unsigned long find_first_bit(const unsigned long *addr,
-                            unsigned long size);
-#define find_first_bit find_first_bit
-
-unsigned long find_next_bit(const unsigned long *addr,
-                                unsigned long size,
-                                unsigned long offset);
-#define find_next_bit find_next_bit
-
-/*
- * ffs: find first bit set. This is defined the same way as
- * the libc and compiler builtin ffs routines, therefore
- * differs in spirit from the above ffz (man ffs).
- *
- * The difference is that bit numbering starts at 1, and if no bit is set,
- * the function returns 0.
- */
-static inline int ffs(unsigned long word)
-{
-       if(word == 0)
-               return 0;
-       return __ffs(word) + 1;
-}
-
-#include <asm-generic/bitops/fls64.h>
-#include <asm-generic/bitops/sched.h>
-#include <asm-generic/bitops/hweight.h>
-#include <asm-generic/bitops/lock.h>
-
-extern unsigned long find_next_zero_bit_le(const void *addr,
-               unsigned long size, unsigned long offset);
-#define find_next_zero_bit_le find_next_zero_bit_le
-
-extern unsigned long find_next_bit_le(const void *addr,
-               unsigned long size, unsigned long offset);
-#define find_next_bit_le find_next_bit_le
-
-#include <asm-generic/bitops/le.h>
-#include <asm-generic/bitops/ext2-atomic.h>
-
-#endif /* __ASM_AVR32_BITOPS_H */
diff --git a/arch/avr32/include/asm/bug.h b/arch/avr32/include/asm/bug.h
deleted file mode 100644 (file)
index 85a92d0..0000000
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (C) 2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_BUG_H
-#define __ASM_AVR32_BUG_H
-
-#ifdef CONFIG_BUG
-
-/*
- * According to our Chief Architect, this compact opcode is very
- * unlikely to ever be implemented.
- */
-#define AVR32_BUG_OPCODE       0x5df0
-
-#ifdef CONFIG_DEBUG_BUGVERBOSE
-
-#define _BUG_OR_WARN(flags)                                            \
-       asm volatile(                                                   \
-               "1:     .hword  %0\n"                                   \
-               "       .section __bug_table,\"a\",@progbits\n"         \
-               "2:     .long   1b\n"                                   \
-               "       .long   %1\n"                                   \
-               "       .short  %2\n"                                   \
-               "       .short  %3\n"                                   \
-               "       .org    2b + %4\n"                              \
-               "       .previous"                                      \
-               :                                                       \
-               : "i"(AVR32_BUG_OPCODE), "i"(__FILE__),                 \
-                 "i"(__LINE__), "i"(flags),                            \
-                 "i"(sizeof(struct bug_entry)))
-
-#else
-
-#define _BUG_OR_WARN(flags)                                            \
-       asm volatile(                                                   \
-               "1:     .hword  %0\n"                                   \
-               "       .section __bug_table,\"a\",@progbits\n"         \
-               "2:     .long   1b\n"                                   \
-               "       .short  %1\n"                                   \
-               "       .org    2b + %2\n"                              \
-               "       .previous"                                      \
-               :                                                       \
-               : "i"(AVR32_BUG_OPCODE), "i"(flags),                    \
-                 "i"(sizeof(struct bug_entry)))
-
-#endif /* CONFIG_DEBUG_BUGVERBOSE */
-
-#define BUG()                                                          \
-       do {                                                            \
-               _BUG_OR_WARN(0);                                        \
-               unreachable();                                          \
-       } while (0)
-
-#define WARN_ON(condition)                                                     \
-       ({                                                              \
-               int __ret_warn_on = !!(condition);                      \
-               if (unlikely(__ret_warn_on))                            \
-                       _BUG_OR_WARN(BUGFLAG_WARNING);                  \
-               unlikely(__ret_warn_on);                                \
-       })
-
-#define HAVE_ARCH_BUG
-#define HAVE_ARCH_WARN_ON
-
-#endif /* CONFIG_BUG */
-
-#include <asm-generic/bug.h>
-
-struct pt_regs;
-void die(const char *str, struct pt_regs *regs, long err);
-void _exception(long signr, struct pt_regs *regs, int code,
-               unsigned long addr);
-
-#endif /* __ASM_AVR32_BUG_H */
diff --git a/arch/avr32/include/asm/bugs.h b/arch/avr32/include/asm/bugs.h
deleted file mode 100644 (file)
index 278661b..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * This is included by init/main.c to check for architecture-dependent bugs.
- *
- * Needs:
- *      void check_bugs(void);
- */
-#ifndef __ASM_AVR32_BUGS_H
-#define __ASM_AVR32_BUGS_H
-
-static void __init check_bugs(void)
-{
-       boot_cpu_data.loops_per_jiffy = loops_per_jiffy;
-}
-
-#endif /* __ASM_AVR32_BUGS_H */
diff --git a/arch/avr32/include/asm/cache.h b/arch/avr32/include/asm/cache.h
deleted file mode 100644 (file)
index c3a58a1..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef __ASM_AVR32_CACHE_H
-#define __ASM_AVR32_CACHE_H
-
-#define L1_CACHE_SHIFT 5
-#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
-
-/*
- * Memory returned by kmalloc() may be used for DMA, so we must make
- * sure that all such allocations are cache aligned. Otherwise,
- * unrelated code may cause parts of the buffer to be read into the
- * cache before the transfer is done, causing old data to be seen by
- * the CPU.
- */
-#define ARCH_DMA_MINALIGN      L1_CACHE_BYTES
-
-#ifndef __ASSEMBLER__
-struct cache_info {
-       unsigned int ways;
-       unsigned int sets;
-       unsigned int linesz;
-};
-#endif /* __ASSEMBLER */
-
-/* Cache operation constants */
-#define ICACHE_FLUSH           0x00
-#define ICACHE_INVALIDATE      0x01
-#define ICACHE_LOCK            0x02
-#define ICACHE_UNLOCK          0x03
-#define ICACHE_PREFETCH                0x04
-
-#define DCACHE_FLUSH           0x08
-#define DCACHE_LOCK            0x09
-#define DCACHE_UNLOCK          0x0a
-#define DCACHE_INVALIDATE      0x0b
-#define DCACHE_CLEAN           0x0c
-#define DCACHE_CLEAN_INVAL     0x0d
-
-#endif /* __ASM_AVR32_CACHE_H */
diff --git a/arch/avr32/include/asm/cacheflush.h b/arch/avr32/include/asm/cacheflush.h
deleted file mode 100644 (file)
index 96e5382..0000000
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_CACHEFLUSH_H
-#define __ASM_AVR32_CACHEFLUSH_H
-
-/* Keep includes the same across arches.  */
-#include <linux/mm.h>
-
-#define CACHE_OP_ICACHE_INVALIDATE     0x01
-#define CACHE_OP_DCACHE_INVALIDATE     0x0b
-#define CACHE_OP_DCACHE_CLEAN          0x0c
-#define CACHE_OP_DCACHE_CLEAN_INVAL    0x0d
-
-/*
- * Invalidate any cacheline containing virtual address vaddr without
- * writing anything back to memory.
- *
- * Note that this function may corrupt unrelated data structures when
- * applied on buffers that are not cacheline aligned in both ends.
- */
-static inline void invalidate_dcache_line(void *vaddr)
-{
-       asm volatile("cache %0[0], %1"
-                    :
-                    : "r"(vaddr), "n"(CACHE_OP_DCACHE_INVALIDATE)
-                    : "memory");
-}
-
-/*
- * Make sure any cacheline containing virtual address vaddr is written
- * to memory.
- */
-static inline void clean_dcache_line(void *vaddr)
-{
-       asm volatile("cache %0[0], %1"
-                    :
-                    : "r"(vaddr), "n"(CACHE_OP_DCACHE_CLEAN)
-                    : "memory");
-}
-
-/*
- * Make sure any cacheline containing virtual address vaddr is written
- * to memory and then invalidate it.
- */
-static inline void flush_dcache_line(void *vaddr)
-{
-       asm volatile("cache %0[0], %1"
-                    :
-                    : "r"(vaddr), "n"(CACHE_OP_DCACHE_CLEAN_INVAL)
-                    : "memory");
-}
-
-/*
- * Invalidate any instruction cacheline containing virtual address
- * vaddr.
- */
-static inline void invalidate_icache_line(void *vaddr)
-{
-       asm volatile("cache %0[0], %1"
-                    :
-                    : "r"(vaddr), "n"(CACHE_OP_ICACHE_INVALIDATE)
-                    : "memory");
-}
-
-/*
- * Applies the above functions on all lines that are touched by the
- * specified virtual address range.
- */
-void invalidate_dcache_region(void *start, size_t len);
-void clean_dcache_region(void *start, size_t len);
-void flush_dcache_region(void *start, size_t len);
-void invalidate_icache_region(void *start, size_t len);
-
-/*
- * Make sure any pending writes are completed before continuing.
- */
-#define flush_write_buffer() asm volatile("sync 0" : : : "memory")
-
-/*
- * The following functions are called when a virtual mapping changes.
- * We do not need to flush anything in this case.
- */
-#define flush_cache_all()                      do { } while (0)
-#define flush_cache_mm(mm)                     do { } while (0)
-#define flush_cache_dup_mm(mm)                 do { } while (0)
-#define flush_cache_range(vma, start, end)     do { } while (0)
-#define flush_cache_page(vma, vmaddr, pfn)     do { } while (0)
-#define flush_cache_vmap(start, end)           do { } while (0)
-#define flush_cache_vunmap(start, end)         do { } while (0)
-
-/*
- * I think we need to implement this one to be able to reliably
- * execute pages from RAMDISK. However, if we implement the
- * flush_dcache_*() functions, it might not be needed anymore.
- *
- * #define flush_icache_page(vma, page)                do { } while (0)
- */
-extern void flush_icache_page(struct vm_area_struct *vma, struct page *page);
-
-/*
- * These are (I think) related to D-cache aliasing.  We might need to
- * do something here, but only for certain configurations.  No such
- * configurations exist at this time.
- */
-#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
-#define flush_dcache_page(page)                        do { } while (0)
-#define flush_dcache_mmap_lock(page)           do { } while (0)
-#define flush_dcache_mmap_unlock(page)         do { } while (0)
-
-/*
- * These are for I/D cache coherency. In this case, we do need to
- * flush with all configurations.
- */
-extern void flush_icache_range(unsigned long start, unsigned long end);
-
-extern void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
-               unsigned long vaddr, void *dst, const void *src,
-               unsigned long len);
-
-static inline void copy_from_user_page(struct vm_area_struct *vma,
-               struct page *page, unsigned long vaddr, void *dst,
-               const void *src, unsigned long len)
-{
-       memcpy(dst, src, len);
-}
-
-#endif /* __ASM_AVR32_CACHEFLUSH_H */
diff --git a/arch/avr32/include/asm/checksum.h b/arch/avr32/include/asm/checksum.h
deleted file mode 100644 (file)
index 4ab7d5b..0000000
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_CHECKSUM_H
-#define __ASM_AVR32_CHECKSUM_H
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 32-bit boundary
- */
-__wsum csum_partial(const void *buff, int len, __wsum sum);
-
-/*
- * the same as csum_partial, but copies from src while it
- * checksums, and handles user-space pointer exceptions correctly, when needed.
- *
- * here even more important to align src and dst on a 32-bit (or even
- * better 64-bit) boundary
- */
-__wsum csum_partial_copy_generic(const void *src, void *dst, int len,
-                                      __wsum sum, int *src_err_ptr,
-                                      int *dst_err_ptr);
-
-/*
- *     Note: when you get a NULL pointer exception here this means someone
- *     passed in an incorrect kernel address to one of these functions.
- *
- *     If you use these functions directly please don't forget the
- *     access_ok().
- */
-static inline
-__wsum csum_partial_copy_nocheck(const void *src, void *dst,
-                                      int len, __wsum sum)
-{
-       return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL);
-}
-
-static inline
-__wsum csum_partial_copy_from_user(const void __user *src, void *dst,
-                                         int len, __wsum sum, int *err_ptr)
-{
-       return csum_partial_copy_generic((const void __force *)src, dst, len,
-                                        sum, err_ptr, NULL);
-}
-
-/*
- *     This is a version of ip_compute_csum() optimized for IP headers,
- *     which always checksum on 4 octet boundaries.
- */
-static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
-{
-       unsigned int sum, tmp;
-
-       __asm__ __volatile__(
-               "       ld.w    %0, %1++\n"
-               "       ld.w    %3, %1++\n"
-               "       sub     %2, 4\n"
-               "       add     %0, %3\n"
-               "       ld.w    %3, %1++\n"
-               "       adc     %0, %0, %3\n"
-               "       ld.w    %3, %1++\n"
-               "       adc     %0, %0, %3\n"
-               "       acr     %0\n"
-               "1:     ld.w    %3, %1++\n"
-               "       add     %0, %3\n"
-               "       acr     %0\n"
-               "       sub     %2, 1\n"
-               "       brne    1b\n"
-               "       lsl     %3, %0, 16\n"
-               "       andl    %0, 0\n"
-               "       mov     %2, 0xffff\n"
-               "       add     %0, %3\n"
-               "       adc     %0, %0, %2\n"
-               "       com     %0\n"
-               "       lsr     %0, 16\n"
-               : "=r"(sum), "=r"(iph), "=r"(ihl), "=r"(tmp)
-               : "1"(iph), "2"(ihl)
-               : "memory", "cc");
-       return (__force __sum16)sum;
-}
-
-/*
- *     Fold a partial checksum
- */
-
-static inline __sum16 csum_fold(__wsum sum)
-{
-       unsigned int tmp;
-
-       asm("   bfextu  %1, %0, 0, 16\n"
-           "   lsr     %0, 16\n"
-           "   add     %0, %1\n"
-           "   bfextu  %1, %0, 16, 16\n"
-           "   add     %0, %1"
-           : "=&r"(sum), "=&r"(tmp)
-           : "0"(sum));
-
-       return (__force __sum16)~sum;
-}
-
-static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
-                                       __u32 len, __u8 proto,
-                                       __wsum sum)
-{
-       asm("   add     %0, %1\n"
-           "   adc     %0, %0, %2\n"
-           "   adc     %0, %0, %3\n"
-           "   acr     %0"
-           : "=r"(sum)
-           : "r"(daddr), "r"(saddr), "r"(len + proto),
-             "0"(sum)
-           : "cc");
-
-       return sum;
-}
-
-/*
- * computes the checksum of the TCP/UDP pseudo-header
- * returns a 16-bit checksum, already complemented
- */
-static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-                                       __u32 len, __u8 proto,
-                                       __wsum sum)
-{
-       return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));
-}
-
-/*
- * this routine is used for miscellaneous IP-like checksums, mainly
- * in icmp.c
- */
-
-static inline __sum16 ip_compute_csum(const void *buff, int len)
-{
-    return csum_fold(csum_partial(buff, len, 0));
-}
-
-#endif /* __ASM_AVR32_CHECKSUM_H */
diff --git a/arch/avr32/include/asm/cmpxchg.h b/arch/avr32/include/asm/cmpxchg.h
deleted file mode 100644 (file)
index 572739b..0000000
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Atomic operations that C can't guarantee us.  Useful for
- * resource counting etc.
- *
- * But use these as seldom as possible since they are slower than
- * regular operations.
- *
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_CMPXCHG_H
-#define __ASM_AVR32_CMPXCHG_H
-
-#define xchg(ptr,x) \
-       ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr))))
-
-extern void __xchg_called_with_bad_pointer(void);
-
-static inline unsigned long xchg_u32(u32 val, volatile u32 *m)
-{
-       u32 ret;
-
-       asm volatile("xchg %[ret], %[m], %[val]"
-                       : [ret] "=&r"(ret), "=m"(*m)
-                       : "m"(*m), [m] "r"(m), [val] "r"(val)
-                       : "memory");
-       return ret;
-}
-
-static inline unsigned long __xchg(unsigned long x,
-                                      volatile void *ptr,
-                                      int size)
-{
-       switch(size) {
-       case 4:
-               return xchg_u32(x, ptr);
-       default:
-               __xchg_called_with_bad_pointer();
-               return x;
-       }
-}
-
-static inline unsigned long __cmpxchg_u32(volatile int *m, unsigned long old,
-                                         unsigned long new)
-{
-       __u32 ret;
-
-       asm volatile(
-               "1:     ssrf    5\n"
-               "       ld.w    %[ret], %[m]\n"
-               "       cp.w    %[ret], %[old]\n"
-               "       brne    2f\n"
-               "       stcond  %[m], %[new]\n"
-               "       brne    1b\n"
-               "2:\n"
-               : [ret] "=&r"(ret), [m] "=m"(*m)
-               : "m"(m), [old] "Ks21r"(old), [new] "r"(new)
-               : "memory", "cc");
-       return ret;
-}
-
-extern unsigned long __cmpxchg_u64_unsupported_on_32bit_kernels(
-        volatile int * m, unsigned long old, unsigned long new);
-#define __cmpxchg_u64 __cmpxchg_u64_unsupported_on_32bit_kernels
-
-/* This function doesn't exist, so you'll get a linker error
-   if something tries to do an invalid cmpxchg().  */
-extern void __cmpxchg_called_with_bad_pointer(void);
-
-static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
-                                     unsigned long new, int size)
-{
-       switch (size) {
-       case 4:
-               return __cmpxchg_u32(ptr, old, new);
-       case 8:
-               return __cmpxchg_u64(ptr, old, new);
-       }
-
-       __cmpxchg_called_with_bad_pointer();
-       return old;
-}
-
-#define cmpxchg(ptr, old, new)                                 \
-       ((typeof(*(ptr)))__cmpxchg((ptr), (unsigned long)(old), \
-                                  (unsigned long)(new),        \
-                                  sizeof(*(ptr))))
-
-#include <asm-generic/cmpxchg-local.h>
-
-static inline unsigned long __cmpxchg_local(volatile void *ptr,
-                                     unsigned long old,
-                                     unsigned long new, int size)
-{
-       switch (size) {
-       case 4:
-               return __cmpxchg_u32(ptr, old, new);
-       default:
-               return __cmpxchg_local_generic(ptr, old, new, size);
-       }
-
-       return old;
-}
-
-#define cmpxchg_local(ptr, old, new)                                   \
-       ((typeof(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(old),   \
-                                  (unsigned long)(new),                \
-                                  sizeof(*(ptr))))
-
-#define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n))
-
-#endif /* __ASM_AVR32_CMPXCHG_H */
diff --git a/arch/avr32/include/asm/current.h b/arch/avr32/include/asm/current.h
deleted file mode 100644 (file)
index c7b0549..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef __ASM_AVR32_CURRENT_H
-#define __ASM_AVR32_CURRENT_H
-
-#include <linux/thread_info.h>
-
-struct task_struct;
-
-inline static struct task_struct * get_current(void)
-{
-       return current_thread_info()->task;
-}
-
-#define current get_current()
-
-#endif /* __ASM_AVR32_CURRENT_H */
diff --git a/arch/avr32/include/asm/dma-mapping.h b/arch/avr32/include/asm/dma-mapping.h
deleted file mode 100644 (file)
index 7388451..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef __ASM_AVR32_DMA_MAPPING_H
-#define __ASM_AVR32_DMA_MAPPING_H
-
-extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
-       int direction);
-
-extern const struct dma_map_ops avr32_dma_ops;
-
-static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
-{
-       return &avr32_dma_ops;
-}
-
-#endif /* __ASM_AVR32_DMA_MAPPING_H */
diff --git a/arch/avr32/include/asm/dma.h b/arch/avr32/include/asm/dma.h
deleted file mode 100644 (file)
index 9e91205..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __ASM_AVR32_DMA_H
-#define __ASM_AVR32_DMA_H
-
-/* The maximum address that we can perform a DMA transfer to on this platform.
- * Not really applicable to AVR32, but some functions need it. */
-#define MAX_DMA_ADDRESS                0xffffffff
-
-#endif /* __ASM_AVR32_DMA_H */
diff --git a/arch/avr32/include/asm/elf.h b/arch/avr32/include/asm/elf.h
deleted file mode 100644 (file)
index 0388ece..0000000
+++ /dev/null
@@ -1,105 +0,0 @@
-#ifndef __ASM_AVR32_ELF_H
-#define __ASM_AVR32_ELF_H
-
-/* AVR32 relocation numbers */
-#define R_AVR32_NONE           0
-#define R_AVR32_32             1
-#define R_AVR32_16             2
-#define R_AVR32_8              3
-#define R_AVR32_32_PCREL       4
-#define R_AVR32_16_PCREL       5
-#define R_AVR32_8_PCREL                6
-#define R_AVR32_DIFF32         7
-#define R_AVR32_DIFF16         8
-#define R_AVR32_DIFF8          9
-#define R_AVR32_GOT32          10
-#define R_AVR32_GOT16          11
-#define R_AVR32_GOT8           12
-#define R_AVR32_21S            13
-#define R_AVR32_16U            14
-#define R_AVR32_16S            15
-#define R_AVR32_8S             16
-#define R_AVR32_8S_EXT         17
-#define R_AVR32_22H_PCREL      18
-#define R_AVR32_18W_PCREL      19
-#define R_AVR32_16B_PCREL      20
-#define R_AVR32_16N_PCREL      21
-#define R_AVR32_14UW_PCREL     22
-#define R_AVR32_11H_PCREL      23
-#define R_AVR32_10UW_PCREL     24
-#define R_AVR32_9H_PCREL       25
-#define R_AVR32_9UW_PCREL      26
-#define R_AVR32_HI16           27
-#define R_AVR32_LO16           28
-#define R_AVR32_GOTPC          29
-#define R_AVR32_GOTCALL                30
-#define R_AVR32_LDA_GOT                31
-#define R_AVR32_GOT21S         32
-#define R_AVR32_GOT18SW                33
-#define R_AVR32_GOT16S         34
-#define R_AVR32_GOT7UW         35
-#define R_AVR32_32_CPENT       36
-#define R_AVR32_CPCALL         37
-#define R_AVR32_16_CP          38
-#define R_AVR32_9W_CP          39
-#define R_AVR32_RELATIVE       40
-#define R_AVR32_GLOB_DAT       41
-#define R_AVR32_JMP_SLOT       42
-#define R_AVR32_ALIGN          43
-
-/*
- * ELF register definitions..
- */
-
-#include <asm/ptrace.h>
-#include <asm/user.h>
-
-typedef unsigned long elf_greg_t;
-
-#define ELF_NGREG (sizeof (struct pt_regs) / sizeof (elf_greg_t))
-typedef elf_greg_t elf_gregset_t[ELF_NGREG];
-
-typedef struct user_fpu_struct elf_fpregset_t;
-
-/*
- * This is used to ensure we don't load something for the wrong architecture.
- */
-#define elf_check_arch(x) ( (x)->e_machine == EM_AVR32 )
-
-/*
- * These are used to set parameters in the core dumps.
- */
-#define ELF_CLASS      ELFCLASS32
-#ifdef __LITTLE_ENDIAN__
-#define ELF_DATA       ELFDATA2LSB
-#else
-#define ELF_DATA       ELFDATA2MSB
-#endif
-#define ELF_ARCH       EM_AVR32
-
-#define ELF_EXEC_PAGESIZE      4096
-
-/* This is the location that an ET_DYN program is loaded if exec'ed.  Typical
-   use of this is to invoke "./ld.so someprog" to test out a new version of
-   the loader.  We need to make sure that it is out of the way of the program
-   that it will "exec", and that there is sufficient room for the brk.  */
-
-#define ELF_ET_DYN_BASE         (TASK_SIZE / 3 * 2)
-
-
-/* This yields a mask that user programs can use to figure out what
-   instruction set this CPU supports.  This could be done in user space,
-   but it's not easy, and we've already done it here.  */
-
-#define ELF_HWCAP      (0)
-
-/* This yields a string that ld.so will use to load implementation
-   specific libraries for optimization.  This is more specific in
-   intent than poking at uname or /proc/cpuinfo.
-
-   For the moment, we have only optimizations for the Intel generations,
-   but that could change... */
-
-#define ELF_PLATFORM  (NULL)
-
-#endif /* __ASM_AVR32_ELF_H */
diff --git a/arch/avr32/include/asm/fb.h b/arch/avr32/include/asm/fb.h
deleted file mode 100644 (file)
index 41baf84..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef _ASM_FB_H_
-#define _ASM_FB_H_
-
-#include <linux/fb.h>
-#include <linux/fs.h>
-#include <asm/page.h>
-
-static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma,
-                               unsigned long off)
-{
-       vma->vm_page_prot = __pgprot((pgprot_val(vma->vm_page_prot)
-                                     & ~_PAGE_CACHABLE)
-                                    | (_PAGE_BUFFER | _PAGE_DIRTY));
-}
-
-static inline int fb_is_primary_device(struct fb_info *info)
-{
-       return 0;
-}
-
-#endif /* _ASM_FB_H_ */
diff --git a/arch/avr32/include/asm/ftrace.h b/arch/avr32/include/asm/ftrace.h
deleted file mode 100644 (file)
index 40a8c17..0000000
+++ /dev/null
@@ -1 +0,0 @@
-/* empty */
diff --git a/arch/avr32/include/asm/gpio.h b/arch/avr32/include/asm/gpio.h
deleted file mode 100644 (file)
index b771f71..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __ASM_AVR32_GPIO_H
-#define __ASM_AVR32_GPIO_H
-
-#include <mach/gpio.h>
-
-#endif /* __ASM_AVR32_GPIO_H */
diff --git a/arch/avr32/include/asm/hardirq.h b/arch/avr32/include/asm/hardirq.h
deleted file mode 100644 (file)
index 9e36e3f..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __ASM_AVR32_HARDIRQ_H
-#define __ASM_AVR32_HARDIRQ_H
-#ifndef __ASSEMBLY__
-#include <asm-generic/hardirq.h>
-#endif /* __ASSEMBLY__ */
-#endif /* __ASM_AVR32_HARDIRQ_H */
diff --git a/arch/avr32/include/asm/hw_irq.h b/arch/avr32/include/asm/hw_irq.h
deleted file mode 100644 (file)
index a36f9fc..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef __ASM_AVR32_HW_IRQ_H
-#define __ASM_AVR32_HW_IRQ_H
-
-static inline void hw_resend_irq(struct irq_chip *h, unsigned int i)
-{
-       /* Nothing to do */
-}
-
-#endif /* __ASM_AVR32_HW_IRQ_H */
diff --git a/arch/avr32/include/asm/io.h b/arch/avr32/include/asm/io.h
deleted file mode 100644 (file)
index f855646..0000000
+++ /dev/null
@@ -1,329 +0,0 @@
-#ifndef __ASM_AVR32_IO_H
-#define __ASM_AVR32_IO_H
-
-#include <linux/bug.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/types.h>
-
-#include <asm/addrspace.h>
-#include <asm/byteorder.h>
-
-#include <mach/io.h>
-
-/* virt_to_phys will only work when address is in P1 or P2 */
-static __inline__ unsigned long virt_to_phys(volatile void *address)
-{
-       return PHYSADDR(address);
-}
-
-static __inline__ void * phys_to_virt(unsigned long address)
-{
-       return (void *)P1SEGADDR(address);
-}
-
-#define cached_to_phys(addr)   ((unsigned long)PHYSADDR(addr))
-#define uncached_to_phys(addr) ((unsigned long)PHYSADDR(addr))
-#define phys_to_cached(addr)   ((void *)P1SEGADDR(addr))
-#define phys_to_uncached(addr) ((void *)P2SEGADDR(addr))
-
-/*
- * Generic IO read/write.  These perform native-endian accesses.  Note
- * that some architectures will want to re-define __raw_{read,write}w.
- */
-extern void __raw_writesb(void __iomem *addr, const void *data, int bytelen);
-extern void __raw_writesw(void __iomem *addr, const void *data, int wordlen);
-extern void __raw_writesl(void __iomem *addr, const void *data, int longlen);
-
-extern void __raw_readsb(const void __iomem *addr, void *data, int bytelen);
-extern void __raw_readsw(const void __iomem *addr, void *data, int wordlen);
-extern void __raw_readsl(const void __iomem *addr, void *data, int longlen);
-
-static inline void __raw_writeb(u8 v, volatile void __iomem *addr)
-{
-       *(volatile u8 __force *)addr = v;
-}
-static inline void __raw_writew(u16 v, volatile void __iomem *addr)
-{
-       *(volatile u16 __force *)addr = v;
-}
-static inline void __raw_writel(u32 v, volatile void __iomem *addr)
-{
-       *(volatile u32 __force *)addr = v;
-}
-
-static inline u8 __raw_readb(const volatile void __iomem *addr)
-{
-       return *(const volatile u8 __force *)addr;
-}
-static inline u16 __raw_readw(const volatile void __iomem *addr)
-{
-       return *(const volatile u16 __force *)addr;
-}
-static inline u32 __raw_readl(const volatile void __iomem *addr)
-{
-       return *(const volatile u32 __force *)addr;
-}
-
-/* Convert I/O port address to virtual address */
-#ifndef __io
-# define __io(p)       ((void *)phys_to_uncached(p))
-#endif
-
-/*
- * Not really sure about the best way to slow down I/O on
- * AVR32. Defining it as a no-op until we have an actual test case.
- */
-#define SLOW_DOWN_IO   do { } while (0)
-
-#define __BUILD_MEMORY_SINGLE(pfx, bwl, type)                          \
-static inline void                                                     \
-pfx##write##bwl(type val, volatile void __iomem *addr)                 \
-{                                                                      \
-       volatile type *__addr;                                          \
-       type __val;                                                     \
-                                                                       \
-       __addr = (void *)__swizzle_addr_##bwl((unsigned long)(addr));   \
-       __val = pfx##ioswab##bwl(__addr, val);                          \
-                                                                       \
-       BUILD_BUG_ON(sizeof(type) > sizeof(unsigned long));             \
-                                                                       \
-       *__addr = __val;                                                \
-}                                                                      \
-                                                                       \
-static inline type pfx##read##bwl(const volatile void __iomem *addr)   \
-{                                                                      \
-       volatile type *__addr;                                          \
-       type __val;                                                     \
-                                                                       \
-       __addr = (void *)__swizzle_addr_##bwl((unsigned long)(addr));   \
-                                                                       \
-       BUILD_BUG_ON(sizeof(type) > sizeof(unsigned long));             \
-                                                                       \
-       __val = *__addr;                                                \
-       return pfx##ioswab##bwl(__addr, __val);                         \
-}
-
-#define __BUILD_IOPORT_SINGLE(pfx, bwl, type, p, slow)                 \
-static inline void pfx##out##bwl##p(type val, unsigned long port)      \
-{                                                                      \
-       volatile type *__addr;                                          \
-       type __val;                                                     \
-                                                                       \
-       __addr = __io(__swizzle_addr_##bwl(port));                      \
-       __val = pfx##ioswab##bwl(__addr, val);                          \
-                                                                       \
-       BUILD_BUG_ON(sizeof(type) > sizeof(unsigned long));             \
-                                                                       \
-       *__addr = __val;                                                \
-       slow;                                                           \
-}                                                                      \
-                                                                       \
-static inline type pfx##in##bwl##p(unsigned long port)                 \
-{                                                                      \
-       volatile type *__addr;                                          \
-       type __val;                                                     \
-                                                                       \
-       __addr = __io(__swizzle_addr_##bwl(port));                      \
-                                                                       \
-       BUILD_BUG_ON(sizeof(type) > sizeof(unsigned long));             \
-                                                                       \
-       __val = *__addr;                                                \
-       slow;                                                           \
-                                                                       \
-       return pfx##ioswab##bwl(__addr, __val);                         \
-}
-
-#define __BUILD_MEMORY_PFX(bus, bwl, type)                             \
-       __BUILD_MEMORY_SINGLE(bus, bwl, type)
-
-#define BUILDIO_MEM(bwl, type)                                         \
-       __BUILD_MEMORY_PFX(, bwl, type)                                 \
-       __BUILD_MEMORY_PFX(__mem_, bwl, type)
-
-#define __BUILD_IOPORT_PFX(bus, bwl, type)                             \
-       __BUILD_IOPORT_SINGLE(bus, bwl, type, ,)                        \
-       __BUILD_IOPORT_SINGLE(bus, bwl, type, _p, SLOW_DOWN_IO)
-
-#define BUILDIO_IOPORT(bwl, type)                                      \
-       __BUILD_IOPORT_PFX(, bwl, type)                                 \
-       __BUILD_IOPORT_PFX(__mem_, bwl, type)
-
-BUILDIO_MEM(b, u8)
-BUILDIO_MEM(w, u16)
-BUILDIO_MEM(l, u32)
-
-BUILDIO_IOPORT(b, u8)
-BUILDIO_IOPORT(w, u16)
-BUILDIO_IOPORT(l, u32)
-
-#define readb_relaxed                  readb
-#define readw_relaxed                  readw
-#define readl_relaxed                  readl
-
-#define readb_be                       __raw_readb
-#define readw_be                       __raw_readw
-#define readl_be                       __raw_readl
-
-#define writeb_relaxed                 writeb
-#define writew_relaxed                 writew
-#define writel_relaxed                 writel
-
-#define writeb_be                      __raw_writeb
-#define writew_be                      __raw_writew
-#define writel_be                      __raw_writel
-
-#define __BUILD_MEMORY_STRING(bwl, type)                               \
-static inline void writes##bwl(volatile void __iomem *addr,            \
-                              const void *data, unsigned int count)    \
-{                                                                      \
-       const type *__data = data;                                      \
-                                                                       \
-       while (count--)                                                 \
-               __mem_write##bwl(*__data++, addr);                      \
-}                                                                      \
-                                                                       \
-static inline void reads##bwl(const volatile void __iomem *addr,       \
-                             void *data, unsigned int count)           \
-{                                                                      \
-       type *__data = data;                                            \
-                                                                       \
-       while (count--)                                                 \
-               *__data++ = __mem_read##bwl(addr);                      \
-}
-
-#define __BUILD_IOPORT_STRING(bwl, type)                               \
-static inline void outs##bwl(unsigned long port, const void *data,     \
-                            unsigned int count)                        \
-{                                                                      \
-       const type *__data = data;                                      \
-                                                                       \
-       while (count--)                                                 \
-               __mem_out##bwl(*__data++, port);                        \
-}                                                                      \
-                                                                       \
-static inline void ins##bwl(unsigned long port, void *data,            \
-                          unsigned int count)                          \
-{                                                                      \
-       type *__data = data;                                            \
-                                                                       \
-       while (count--)                                                 \
-               *__data++ = __mem_in##bwl(port);                        \
-}
-
-#define BUILDSTRING(bwl, type)                                         \
-       __BUILD_MEMORY_STRING(bwl, type)                                \
-       __BUILD_IOPORT_STRING(bwl, type)
-
-BUILDSTRING(b, u8)
-BUILDSTRING(w, u16)
-BUILDSTRING(l, u32)
-
-/*
- * io{read,write}{8,16,32} macros in both le (for PCI style consumers) and native be
- */
-#ifndef ioread8
-
-#define ioread8(p)             ((unsigned int)readb(p))
-
-#define ioread16(p)            ((unsigned int)readw(p))
-#define ioread16be(p)          ((unsigned int)__raw_readw(p))
-
-#define ioread32(p)            ((unsigned int)readl(p))
-#define ioread32be(p)          ((unsigned int)__raw_readl(p))
-
-#define iowrite8(v,p)          writeb(v, p)
-
-#define iowrite16(v,p)         writew(v, p)
-#define iowrite16be(v,p)       __raw_writew(v, p)
-
-#define iowrite32(v,p)         writel(v, p)
-#define iowrite32be(v,p)       __raw_writel(v, p)
-
-#define ioread8_rep(p,d,c)     readsb(p,d,c)
-#define ioread16_rep(p,d,c)    readsw(p,d,c)
-#define ioread32_rep(p,d,c)    readsl(p,d,c)
-
-#define iowrite8_rep(p,s,c)    writesb(p,s,c)
-#define iowrite16_rep(p,s,c)   writesw(p,s,c)
-#define iowrite32_rep(p,s,c)   writesl(p,s,c)
-
-#endif
-
-static inline void memcpy_fromio(void * to, const volatile void __iomem *from,
-                                unsigned long count)
-{
-       memcpy(to, (const void __force *)from, count);
-}
-
-static inline void  memcpy_toio(volatile void __iomem *to, const void * from,
-                               unsigned long count)
-{
-       memcpy((void __force *)to, from, count);
-}
-
-static inline void memset_io(volatile void __iomem *addr, unsigned char val,
-                            unsigned long count)
-{
-       memset((void __force *)addr, val, count);
-}
-
-#define mmiowb()
-
-#define IO_SPACE_LIMIT 0xffffffff
-
-extern void __iomem *__ioremap(unsigned long offset, size_t size,
-                              unsigned long flags);
-extern void __iounmap(void __iomem *addr);
-
-/*
- * ioremap     -   map bus memory into CPU space
- * @offset     bus address of the memory
- * @size       size of the resource to map
- *
- * ioremap performs a platform specific sequence of operations to make
- * bus memory CPU accessible via the readb/.../writel functions and
- * the other mmio helpers. The returned address is not guaranteed to
- * be usable directly as a virtual address.
- */
-#define ioremap(offset, size)                  \
-       __ioremap((offset), (size), 0)
-
-#define ioremap_nocache(offset, size)          \
-       __ioremap((offset), (size), 0)
-
-#define iounmap(addr)                          \
-       __iounmap(addr)
-
-#define ioremap_wc ioremap_nocache
-#define ioremap_wt ioremap_nocache
-#define ioremap_uc ioremap_nocache
-
-#define cached(addr) P1SEGADDR(addr)
-#define uncached(addr) P2SEGADDR(addr)
-
-#define virt_to_bus virt_to_phys
-#define bus_to_virt phys_to_virt
-#define page_to_bus page_to_phys
-#define bus_to_page phys_to_page
-
-/*
- * Create a virtual mapping cookie for an IO port range.  There exists
- * no such thing as port-based I/O on AVR32, so a regular ioremap()
- * should do what we need.
- */
-#define ioport_map(port, nr)   ioremap(port, nr)
-#define ioport_unmap(port)     iounmap(port)
-
-/*
- * Convert a physical pointer to a virtual kernel pointer for /dev/mem
- * access
- */
-#define xlate_dev_mem_ptr(p)    __va(p)
-
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)   p
-
-#endif /* __ASM_AVR32_IO_H */
diff --git a/arch/avr32/include/asm/irq.h b/arch/avr32/include/asm/irq.h
deleted file mode 100644 (file)
index 6fa8913..0000000
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef __ASM_AVR32_IRQ_H
-#define __ASM_AVR32_IRQ_H
-
-#define NR_INTERNAL_IRQS       64
-
-#include <mach/irq.h>
-
-#ifndef NR_IRQS
-#define NR_IRQS                        (NR_INTERNAL_IRQS)
-#endif
-
-#define irq_canonicalize(i)    (i)
-
-#ifndef __ASSEMBLER__
-int nmi_enable(void);
-void nmi_disable(void);
-
-/*
- * Returns a bitmask of pending interrupts in a group.
- */
-extern unsigned long intc_get_pending(unsigned int group);
-#endif
-
-#endif /* __ASM_AVR32_IOCTLS_H */
diff --git a/arch/avr32/include/asm/irqflags.h b/arch/avr32/include/asm/irqflags.h
deleted file mode 100644 (file)
index 006e948..0000000
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_IRQFLAGS_H
-#define __ASM_AVR32_IRQFLAGS_H
-
-#include <linux/types.h>
-#include <asm/sysreg.h>
-
-static inline unsigned long arch_local_save_flags(void)
-{
-       return sysreg_read(SR);
-}
-
-/*
- * This will restore ALL status register flags, not only the interrupt
- * mask flag.
- *
- * The empty asm statement informs the compiler of this fact while
- * also serving as a barrier.
- */
-static inline void arch_local_irq_restore(unsigned long flags)
-{
-       sysreg_write(SR, flags);
-       asm volatile("" : : : "memory", "cc");
-}
-
-static inline void arch_local_irq_disable(void)
-{
-       asm volatile("ssrf %0" : : "n"(SYSREG_GM_OFFSET) : "memory");
-}
-
-static inline void arch_local_irq_enable(void)
-{
-       asm volatile("csrf %0" : : "n"(SYSREG_GM_OFFSET) : "memory");
-}
-
-static inline bool arch_irqs_disabled_flags(unsigned long flags)
-{
-       return (flags & SYSREG_BIT(GM)) != 0;
-}
-
-static inline bool arch_irqs_disabled(void)
-{
-       return arch_irqs_disabled_flags(arch_local_save_flags());
-}
-
-static inline unsigned long arch_local_irq_save(void)
-{
-       unsigned long flags = arch_local_save_flags();
-
-       arch_local_irq_disable();
-
-       return flags;
-}
-
-#endif /* __ASM_AVR32_IRQFLAGS_H */
diff --git a/arch/avr32/include/asm/kdebug.h b/arch/avr32/include/asm/kdebug.h
deleted file mode 100644 (file)
index f930ce2..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef __ASM_AVR32_KDEBUG_H
-#define __ASM_AVR32_KDEBUG_H
-
-/* Grossly misnamed. */
-enum die_val {
-       DIE_BREAKPOINT,
-       DIE_SSTEP,
-       DIE_NMI,
-       DIE_OOPS,
-};
-
-#endif /* __ASM_AVR32_KDEBUG_H */
diff --git a/arch/avr32/include/asm/kmap_types.h b/arch/avr32/include/asm/kmap_types.h
deleted file mode 100644 (file)
index 479330b..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef __ASM_AVR32_KMAP_TYPES_H
-#define __ASM_AVR32_KMAP_TYPES_H
-
-#ifdef CONFIG_DEBUG_HIGHMEM
-# define KM_TYPE_NR 29
-#else
-# define KM_TYPE_NR 14
-#endif
-
-#endif /* __ASM_AVR32_KMAP_TYPES_H */
diff --git a/arch/avr32/include/asm/kprobes.h b/arch/avr32/include/asm/kprobes.h
deleted file mode 100644 (file)
index 28dfc61..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Kernel Probes (KProbes)
- *
- * Copyright (C) 2005-2006 Atmel Corporation
- * Copyright (C) IBM Corporation, 2002, 2004
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_KPROBES_H
-#define __ASM_AVR32_KPROBES_H
-
-#include <asm-generic/kprobes.h>
-
-#define BREAKPOINT_INSTRUCTION 0xd673  /* breakpoint */
-
-#ifdef CONFIG_KPROBES
-#include <linux/types.h>
-
-typedef u16    kprobe_opcode_t;
-#define MAX_INSN_SIZE          2
-#define MAX_STACK_SIZE         64      /* 32 would probably be OK */
-
-#define kretprobe_blacklist_size 0
-
-#define arch_remove_kprobe(p)  do { } while (0)
-
-/* Architecture specific copy of original instruction */
-struct arch_specific_insn {
-       kprobe_opcode_t insn[MAX_INSN_SIZE];
-};
-
-struct prev_kprobe {
-       struct kprobe *kp;
-       unsigned int status;
-};
-
-/* per-cpu kprobe control block */
-struct kprobe_ctlblk {
-       unsigned int kprobe_status;
-       struct prev_kprobe prev_kprobe;
-       struct pt_regs jprobe_saved_regs;
-       char jprobes_stack[MAX_STACK_SIZE];
-};
-
-extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
-extern int kprobe_exceptions_notify(struct notifier_block *self,
-                                   unsigned long val, void *data);
-
-#define flush_insn_slot(p)     do { } while (0)
-
-#endif /* CONFIG_KPROBES */
-#endif /* __ASM_AVR32_KPROBES_H */
diff --git a/arch/avr32/include/asm/linkage.h b/arch/avr32/include/asm/linkage.h
deleted file mode 100644 (file)
index f7b285e..0000000
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef __ASM_LINKAGE_H
-#define __ASM_LINKAGE_H
-
-#define __ALIGN .balign 2
-#define __ALIGN_STR ".balign 2"
-
-#endif /* __ASM_LINKAGE_H */
diff --git a/arch/avr32/include/asm/mmu.h b/arch/avr32/include/asm/mmu.h
deleted file mode 100644 (file)
index 60c2d26..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef __ASM_AVR32_MMU_H
-#define __ASM_AVR32_MMU_H
-
-/* Default "unsigned long" context */
-typedef unsigned long mm_context_t;
-
-#define MMU_ITLB_ENTRIES       64
-#define MMU_DTLB_ENTRIES       64
-
-#endif /* __ASM_AVR32_MMU_H */
diff --git a/arch/avr32/include/asm/mmu_context.h b/arch/avr32/include/asm/mmu_context.h
deleted file mode 100644 (file)
index cd87abb..0000000
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * ASID handling taken from SH implementation.
- *   Copyright (C) 1999 Niibe Yutaka
- *   Copyright (C) 2003 Paul Mundt
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_MMU_CONTEXT_H
-#define __ASM_AVR32_MMU_CONTEXT_H
-
-#include <linux/mm_types.h>
-
-#include <asm/tlbflush.h>
-#include <asm/sysreg.h>
-#include <asm-generic/mm_hooks.h>
-
-/*
- * The MMU "context" consists of two things:
- *    (a) TLB cache version
- *    (b) ASID (Address Space IDentifier)
- */
-#define MMU_CONTEXT_ASID_MASK          0x000000ff
-#define MMU_CONTEXT_VERSION_MASK       0xffffff00
-#define MMU_CONTEXT_FIRST_VERSION       0x00000100
-#define NO_CONTEXT                     0
-
-#define MMU_NO_ASID                    0x100
-
-/* Virtual Page Number mask */
-#define MMU_VPN_MASK   0xfffff000
-
-/* Cache of MMU context last used */
-extern unsigned long mmu_context_cache;
-
-/*
- * Get MMU context if needed
- */
-static inline void
-get_mmu_context(struct mm_struct *mm)
-{
-       unsigned long mc = mmu_context_cache;
-
-       if (((mm->context ^ mc) & MMU_CONTEXT_VERSION_MASK) == 0)
-               /* It's up to date, do nothing */
-               return;
-
-       /* It's old, we need to get new context with new version */
-       mc = ++mmu_context_cache;
-       if (!(mc & MMU_CONTEXT_ASID_MASK)) {
-               /*
-                * We have exhausted all ASIDs of this version.
-                * Flush the TLB and start new cycle.
-                */
-               flush_tlb_all();
-               /*
-                * Fix version. Note that we avoid version #0
-                * to distinguish NO_CONTEXT.
-                */
-               if (!mc)
-                       mmu_context_cache = mc = MMU_CONTEXT_FIRST_VERSION;
-       }
-       mm->context = mc;
-}
-
-/*
- * Initialize the context related info for a new mm_struct
- * instance.
- */
-static inline int init_new_context(struct task_struct *tsk,
-                                      struct mm_struct *mm)
-{
-       mm->context = NO_CONTEXT;
-       return 0;
-}
-
-/*
- * Destroy context related info for an mm_struct that is about
- * to be put to rest.
- */
-static inline void destroy_context(struct mm_struct *mm)
-{
-       /* Do nothing */
-}
-
-static inline void set_asid(unsigned long asid)
-{
-       /* XXX: We're destroying TLBEHI[8:31] */
-       sysreg_write(TLBEHI, asid & MMU_CONTEXT_ASID_MASK);
-       cpu_sync_pipeline();
-}
-
-static inline unsigned long get_asid(void)
-{
-       unsigned long asid;
-
-       asid = sysreg_read(TLBEHI);
-       return asid & MMU_CONTEXT_ASID_MASK;
-}
-
-static inline void activate_context(struct mm_struct *mm)
-{
-       get_mmu_context(mm);
-       set_asid(mm->context & MMU_CONTEXT_ASID_MASK);
-}
-
-static inline void switch_mm(struct mm_struct *prev,
-                                struct mm_struct *next,
-                                struct task_struct *tsk)
-{
-       if (likely(prev != next)) {
-               unsigned long __pgdir = (unsigned long)next->pgd;
-
-               sysreg_write(PTBR, __pgdir);
-               activate_context(next);
-       }
-}
-
-#define deactivate_mm(tsk,mm) do { } while(0)
-
-#define activate_mm(prev, next) switch_mm((prev), (next), NULL)
-
-static inline void
-enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-}
-
-
-static inline void enable_mmu(void)
-{
-       sysreg_write(MMUCR, (SYSREG_BIT(MMUCR_S)
-                            | SYSREG_BIT(E)
-                            | SYSREG_BIT(MMUCR_I)));
-       nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop();
-
-       if (mmu_context_cache == NO_CONTEXT)
-               mmu_context_cache = MMU_CONTEXT_FIRST_VERSION;
-
-       set_asid(mmu_context_cache & MMU_CONTEXT_ASID_MASK);
-}
-
-static inline void disable_mmu(void)
-{
-       sysreg_write(MMUCR, SYSREG_BIT(MMUCR_S));
-}
-
-#endif /* __ASM_AVR32_MMU_CONTEXT_H */
diff --git a/arch/avr32/include/asm/module.h b/arch/avr32/include/asm/module.h
deleted file mode 100644 (file)
index 3f083d3..0000000
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef __ASM_AVR32_MODULE_H
-#define __ASM_AVR32_MODULE_H
-
-#include <asm-generic/module.h>
-
-struct mod_arch_syminfo {
-       unsigned long got_offset;
-       int got_initialized;
-};
-
-struct mod_arch_specific {
-       /* Starting offset of got in the module core memory. */
-       unsigned long got_offset;
-       /* Size of the got. */
-       unsigned long got_size;
-       /* Number of symbols in syminfo. */
-       int nsyms;
-       /* Additional symbol information (got offsets). */
-       struct mod_arch_syminfo *syminfo;
-};
-
-#define MODULE_PROC_FAMILY "AVR32v1"
-
-#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY
-
-#endif /* __ASM_AVR32_MODULE_H */
diff --git a/arch/avr32/include/asm/ocd.h b/arch/avr32/include/asm/ocd.h
deleted file mode 100644 (file)
index 6bef094..0000000
+++ /dev/null
@@ -1,543 +0,0 @@
-/*
- * AVR32 OCD Interface and register definitions
- *
- * Copyright (C) 2004-2007 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_OCD_H
-#define __ASM_AVR32_OCD_H
-
-/* OCD Register offsets. Abbreviations used below:
- *
- *      BP      Breakpoint
- *      Comm    Communication
- *      DT      Data Trace
- *      PC      Program Counter
- *      PID     Process ID
- *      R/W     Read/Write
- *      WP      Watchpoint
- */
-#define OCD_DID                                0x0000  /* Device ID */
-#define OCD_DC                         0x0008  /* Development Control */
-#define OCD_DS                         0x0010  /* Development Status */
-#define OCD_RWCS                       0x001c  /* R/W Access Control */
-#define OCD_RWA                                0x0024  /* R/W Access Address */
-#define OCD_RWD                                0x0028  /* R/W Access Data */
-#define OCD_WT                         0x002c  /* Watchpoint Trigger */
-#define OCD_DTC                                0x0034  /* Data Trace Control */
-#define OCD_DTSA0                      0x0038  /* DT Start Addr Channel 0 */
-#define OCD_DTSA1                      0x003c  /* DT Start Addr Channel 1 */
-#define OCD_DTEA0                      0x0048  /* DT End Addr Channel 0 */
-#define OCD_DTEA1                      0x004c  /* DT End Addr Channel 1 */
-#define OCD_BWC0A                      0x0058  /* PC BP/WP Control 0A */
-#define OCD_BWC0B                      0x005c  /* PC BP/WP Control 0B */
-#define OCD_BWC1A                      0x0060  /* PC BP/WP Control 1A */
-#define OCD_BWC1B                      0x0064  /* PC BP/WP Control 1B */
-#define OCD_BWC2A                      0x0068  /* PC BP/WP Control 2A */
-#define OCD_BWC2B                      0x006c  /* PC BP/WP Control 2B */
-#define OCD_BWC3A                      0x0070  /* Data BP/WP Control 3A */
-#define OCD_BWC3B                      0x0074  /* Data BP/WP Control 3B */
-#define OCD_BWA0A                      0x0078  /* PC BP/WP Address 0A */
-#define OCD_BWA0B                      0x007c  /* PC BP/WP Address 0B */
-#define OCD_BWA1A                      0x0080  /* PC BP/WP Address 1A */
-#define OCD_BWA1B                      0x0084  /* PC BP/WP Address 1B */
-#define OCD_BWA2A                      0x0088  /* PC BP/WP Address 2A */
-#define OCD_BWA2B                      0x008c  /* PC BP/WP Address 2B */
-#define OCD_BWA3A                      0x0090  /* Data BP/WP Address 3A */
-#define OCD_BWA3B                      0x0094  /* Data BP/WP Address 3B */
-#define OCD_NXCFG                      0x0100  /* Nexus Configuration */
-#define OCD_DINST                      0x0104  /* Debug Instruction */
-#define OCD_DPC                                0x0108  /* Debug Program Counter */
-#define OCD_CPUCM                      0x010c  /* CPU Control Mask */
-#define OCD_DCCPU                      0x0110  /* Debug Comm CPU */
-#define OCD_DCEMU                      0x0114  /* Debug Comm Emulator */
-#define OCD_DCSR                       0x0118  /* Debug Comm Status */
-#define OCD_PID                                0x011c  /* Ownership Trace PID */
-#define OCD_EPC0                       0x0120  /* Event Pair Control 0 */
-#define OCD_EPC1                       0x0124  /* Event Pair Control 1 */
-#define OCD_EPC2                       0x0128  /* Event Pair Control 2 */
-#define OCD_EPC3                       0x012c  /* Event Pair Control 3 */
-#define OCD_AXC                                0x0130  /* AUX port Control */
-
-/* Bits in DID */
-#define OCD_DID_MID_START              1
-#define OCD_DID_MID_SIZE               11
-#define OCD_DID_PN_START               12
-#define OCD_DID_PN_SIZE                        16
-#define OCD_DID_RN_START               28
-#define OCD_DID_RN_SIZE                        4
-
-/* Bits in DC */
-#define OCD_DC_TM_START                        0
-#define OCD_DC_TM_SIZE                 2
-#define OCD_DC_EIC_START               3
-#define OCD_DC_EIC_SIZE                        2
-#define OCD_DC_OVC_START               5
-#define OCD_DC_OVC_SIZE                        3
-#define OCD_DC_SS_BIT                  8
-#define OCD_DC_DBR_BIT                 12
-#define OCD_DC_DBE_BIT                 13
-#define OCD_DC_EOS_START               20
-#define OCD_DC_EOS_SIZE                        2
-#define OCD_DC_SQA_BIT                 22
-#define OCD_DC_IRP_BIT                 23
-#define OCD_DC_IFM_BIT                 24
-#define OCD_DC_TOZ_BIT                 25
-#define OCD_DC_TSR_BIT                 26
-#define OCD_DC_RID_BIT                 27
-#define OCD_DC_ORP_BIT                 28
-#define OCD_DC_MM_BIT                  29
-#define OCD_DC_RES_BIT                 30
-#define OCD_DC_ABORT_BIT               31
-
-/* Bits in DS */
-#define OCD_DS_SSS_BIT                 0
-#define OCD_DS_SWB_BIT                 1
-#define OCD_DS_HWB_BIT                 2
-#define OCD_DS_HWE_BIT                 3
-#define OCD_DS_STP_BIT                 4
-#define OCD_DS_DBS_BIT                 5
-#define OCD_DS_BP_START                        8
-#define OCD_DS_BP_SIZE                 8
-#define OCD_DS_INC_BIT                 24
-#define OCD_DS_BOZ_BIT                 25
-#define OCD_DS_DBA_BIT                 26
-#define OCD_DS_EXB_BIT                 27
-#define OCD_DS_NTBF_BIT                        28
-
-/* Bits in RWCS */
-#define OCD_RWCS_DV_BIT                        0
-#define OCD_RWCS_ERR_BIT               1
-#define OCD_RWCS_CNT_START             2
-#define OCD_RWCS_CNT_SIZE              14
-#define OCD_RWCS_CRC_BIT               19
-#define OCD_RWCS_NTBC_START            20
-#define OCD_RWCS_NTBC_SIZE             2
-#define OCD_RWCS_NTE_BIT               22
-#define OCD_RWCS_NTAP_BIT              23
-#define OCD_RWCS_WRAPPED_BIT           24
-#define OCD_RWCS_CCTRL_START           25
-#define OCD_RWCS_CCTRL_SIZE            2
-#define OCD_RWCS_SZ_START              27
-#define OCD_RWCS_SZ_SIZE               3
-#define OCD_RWCS_RW_BIT                        30
-#define OCD_RWCS_AC_BIT                        31
-
-/* Bits in RWA */
-#define OCD_RWA_RWA_START              0
-#define OCD_RWA_RWA_SIZE               32
-
-/* Bits in RWD */
-#define OCD_RWD_RWD_START              0
-#define OCD_RWD_RWD_SIZE               32
-
-/* Bits in WT */
-#define OCD_WT_DTE_START               20
-#define OCD_WT_DTE_SIZE                        3
-#define OCD_WT_DTS_START               23
-#define OCD_WT_DTS_SIZE                        3
-#define OCD_WT_PTE_START               26
-#define OCD_WT_PTE_SIZE                        3
-#define OCD_WT_PTS_START               29
-#define OCD_WT_PTS_SIZE                        3
-
-/* Bits in DTC */
-#define OCD_DTC_T0WP_BIT               0
-#define OCD_DTC_T1WP_BIT               1
-#define OCD_DTC_ASID0EN_BIT            2
-#define OCD_DTC_ASID0_START            3
-#define OCD_DTC_ASID0_SIZE             8
-#define OCD_DTC_ASID1EN_BIT            11
-#define OCD_DTC_ASID1_START            12
-#define OCD_DTC_ASID1_SIZE             8
-#define OCD_DTC_RWT1_START             28
-#define OCD_DTC_RWT1_SIZE              2
-#define OCD_DTC_RWT0_START             30
-#define OCD_DTC_RWT0_SIZE              2
-
-/* Bits in DTSA0 */
-#define OCD_DTSA0_DTSA_START           0
-#define OCD_DTSA0_DTSA_SIZE            32
-
-/* Bits in DTSA1 */
-#define OCD_DTSA1_DTSA_START           0
-#define OCD_DTSA1_DTSA_SIZE            32
-
-/* Bits in DTEA0 */
-#define OCD_DTEA0_DTEA_START           0
-#define OCD_DTEA0_DTEA_SIZE            32
-
-/* Bits in DTEA1 */
-#define OCD_DTEA1_DTEA_START           0
-#define OCD_DTEA1_DTEA_SIZE            32
-
-/* Bits in BWC0A */
-#define OCD_BWC0A_ASIDEN_BIT           0
-#define OCD_BWC0A_ASID_START           1
-#define OCD_BWC0A_ASID_SIZE            8
-#define OCD_BWC0A_EOC_BIT              14
-#define OCD_BWC0A_AME_BIT              25
-#define OCD_BWC0A_BWE_START            30
-#define OCD_BWC0A_BWE_SIZE             2
-
-/* Bits in BWC0B */
-#define OCD_BWC0B_ASIDEN_BIT           0
-#define OCD_BWC0B_ASID_START           1
-#define OCD_BWC0B_ASID_SIZE            8
-#define OCD_BWC0B_EOC_BIT              14
-#define OCD_BWC0B_AME_BIT              25
-#define OCD_BWC0B_BWE_START            30
-#define OCD_BWC0B_BWE_SIZE             2
-
-/* Bits in BWC1A */
-#define OCD_BWC1A_ASIDEN_BIT           0
-#define OCD_BWC1A_ASID_START           1
-#define OCD_BWC1A_ASID_SIZE            8
-#define OCD_BWC1A_EOC_BIT              14
-#define OCD_BWC1A_AME_BIT              25
-#define OCD_BWC1A_BWE_START            30
-#define OCD_BWC1A_BWE_SIZE             2
-
-/* Bits in BWC1B */
-#define OCD_BWC1B_ASIDEN_BIT           0
-#define OCD_BWC1B_ASID_START           1
-#define OCD_BWC1B_ASID_SIZE            8
-#define OCD_BWC1B_EOC_BIT              14
-#define OCD_BWC1B_AME_BIT              25
-#define OCD_BWC1B_BWE_START            30
-#define OCD_BWC1B_BWE_SIZE             2
-
-/* Bits in BWC2A */
-#define OCD_BWC2A_ASIDEN_BIT           0
-#define OCD_BWC2A_ASID_START           1
-#define OCD_BWC2A_ASID_SIZE            8
-#define OCD_BWC2A_EOC_BIT              14
-#define OCD_BWC2A_AMB_START            20
-#define OCD_BWC2A_AMB_SIZE             5
-#define OCD_BWC2A_AME_BIT              25
-#define OCD_BWC2A_BWE_START            30
-#define OCD_BWC2A_BWE_SIZE             2
-
-/* Bits in BWC2B */
-#define OCD_BWC2B_ASIDEN_BIT           0
-#define OCD_BWC2B_ASID_START           1
-#define OCD_BWC2B_ASID_SIZE            8
-#define OCD_BWC2B_EOC_BIT              14
-#define OCD_BWC2B_AME_BIT              25
-#define OCD_BWC2B_BWE_START            30
-#define OCD_BWC2B_BWE_SIZE             2
-
-/* Bits in BWC3A */
-#define OCD_BWC3A_ASIDEN_BIT           0
-#define OCD_BWC3A_ASID_START           1
-#define OCD_BWC3A_ASID_SIZE            8
-#define OCD_BWC3A_SIZE_START           9
-#define OCD_BWC3A_SIZE_SIZE            3
-#define OCD_BWC3A_EOC_BIT              14
-#define OCD_BWC3A_BWO_START            16
-#define OCD_BWC3A_BWO_SIZE             2
-#define OCD_BWC3A_BME_START            20
-#define OCD_BWC3A_BME_SIZE             4
-#define OCD_BWC3A_BRW_START            28
-#define OCD_BWC3A_BRW_SIZE             2
-#define OCD_BWC3A_BWE_START            30
-#define OCD_BWC3A_BWE_SIZE             2
-
-/* Bits in BWC3B */
-#define OCD_BWC3B_ASIDEN_BIT           0
-#define OCD_BWC3B_ASID_START           1
-#define OCD_BWC3B_ASID_SIZE            8
-#define OCD_BWC3B_SIZE_START           9
-#define OCD_BWC3B_SIZE_SIZE            3
-#define OCD_BWC3B_EOC_BIT              14
-#define OCD_BWC3B_BWO_START            16
-#define OCD_BWC3B_BWO_SIZE             2
-#define OCD_BWC3B_BME_START            20
-#define OCD_BWC3B_BME_SIZE             4
-#define OCD_BWC3B_BRW_START            28
-#define OCD_BWC3B_BRW_SIZE             2
-#define OCD_BWC3B_BWE_START            30
-#define OCD_BWC3B_BWE_SIZE             2
-
-/* Bits in BWA0A */
-#define OCD_BWA0A_BWA_START            0
-#define OCD_BWA0A_BWA_SIZE             32
-
-/* Bits in BWA0B */
-#define OCD_BWA0B_BWA_START            0
-#define OCD_BWA0B_BWA_SIZE             32
-
-/* Bits in BWA1A */
-#define OCD_BWA1A_BWA_START            0
-#define OCD_BWA1A_BWA_SIZE             32
-
-/* Bits in BWA1B */
-#define OCD_BWA1B_BWA_START            0
-#define OCD_BWA1B_BWA_SIZE             32
-
-/* Bits in BWA2A */
-#define OCD_BWA2A_BWA_START            0
-#define OCD_BWA2A_BWA_SIZE             32
-
-/* Bits in BWA2B */
-#define OCD_BWA2B_BWA_START            0
-#define OCD_BWA2B_BWA_SIZE             32
-
-/* Bits in BWA3A */
-#define OCD_BWA3A_BWA_START            0
-#define OCD_BWA3A_BWA_SIZE             32
-
-/* Bits in BWA3B */
-#define OCD_BWA3B_BWA_START            0
-#define OCD_BWA3B_BWA_SIZE             32
-
-/* Bits in NXCFG */
-#define OCD_NXCFG_NXARCH_START         0
-#define OCD_NXCFG_NXARCH_SIZE          4
-#define OCD_NXCFG_NXOCD_START          4
-#define OCD_NXCFG_NXOCD_SIZE           4
-#define OCD_NXCFG_NXPCB_START          8
-#define OCD_NXCFG_NXPCB_SIZE           4
-#define OCD_NXCFG_NXDB_START           12
-#define OCD_NXCFG_NXDB_SIZE            4
-#define OCD_NXCFG_MXMSEO_BIT           16
-#define OCD_NXCFG_NXMDO_START          17
-#define OCD_NXCFG_NXMDO_SIZE           4
-#define OCD_NXCFG_NXPT_BIT             21
-#define OCD_NXCFG_NXOT_BIT             22
-#define OCD_NXCFG_NXDWT_BIT            23
-#define OCD_NXCFG_NXDRT_BIT            24
-#define OCD_NXCFG_NXDTC_START          25
-#define OCD_NXCFG_NXDTC_SIZE           3
-#define OCD_NXCFG_NXDMA_BIT            28
-
-/* Bits in DINST */
-#define OCD_DINST_DINST_START          0
-#define OCD_DINST_DINST_SIZE           32
-
-/* Bits in CPUCM */
-#define OCD_CPUCM_BEM_BIT              1
-#define OCD_CPUCM_FEM_BIT              2
-#define OCD_CPUCM_REM_BIT              3
-#define OCD_CPUCM_IBEM_BIT             4
-#define OCD_CPUCM_IEEM_BIT             5
-
-/* Bits in DCCPU */
-#define OCD_DCCPU_DATA_START           0
-#define OCD_DCCPU_DATA_SIZE            32
-
-/* Bits in DCEMU */
-#define OCD_DCEMU_DATA_START           0
-#define OCD_DCEMU_DATA_SIZE            32
-
-/* Bits in DCSR */
-#define OCD_DCSR_CPUD_BIT              0
-#define OCD_DCSR_EMUD_BIT              1
-
-/* Bits in PID */
-#define OCD_PID_PROCESS_START          0
-#define OCD_PID_PROCESS_SIZE           32
-
-/* Bits in EPC0 */
-#define OCD_EPC0_RNG_START             0
-#define OCD_EPC0_RNG_SIZE              2
-#define OCD_EPC0_CE_BIT                        4
-#define OCD_EPC0_ECNT_START            16
-#define OCD_EPC0_ECNT_SIZE             16
-
-/* Bits in EPC1 */
-#define OCD_EPC1_RNG_START             0
-#define OCD_EPC1_RNG_SIZE              2
-#define OCD_EPC1_ATB_BIT               5
-#define OCD_EPC1_AM_BIT                        6
-
-/* Bits in EPC2 */
-#define OCD_EPC2_RNG_START             0
-#define OCD_EPC2_RNG_SIZE              2
-#define OCD_EPC2_DB_START              2
-#define OCD_EPC2_DB_SIZE               2
-
-/* Bits in EPC3 */
-#define OCD_EPC3_RNG_START             0
-#define OCD_EPC3_RNG_SIZE              2
-#define OCD_EPC3_DWE_BIT               2
-
-/* Bits in AXC */
-#define OCD_AXC_DIV_START              0
-#define OCD_AXC_DIV_SIZE               4
-#define OCD_AXC_AXE_BIT                        8
-#define OCD_AXC_AXS_BIT                        9
-#define OCD_AXC_DDR_BIT                        10
-#define OCD_AXC_LS_BIT                 11
-#define OCD_AXC_REX_BIT                        12
-#define OCD_AXC_REXTEN_BIT             13
-
-/* Constants for DC:EIC */
-#define OCD_EIC_PROGRAM_AND_DATA_TRACE 0
-#define OCD_EIC_BREAKPOINT             1
-#define OCD_EIC_NOP                    2
-
-/* Constants for DC:OVC */
-#define OCD_OVC_OVERRUN                        0
-#define OCD_OVC_DELAY_CPU_BTM          1
-#define OCD_OVC_DELAY_CPU_DTM          2
-#define OCD_OVC_DELAY_CPU_BTM_DTM      3
-
-/* Constants for DC:EOS */
-#define OCD_EOS_NOP                    0
-#define OCD_EOS_DEBUG_MODE             1
-#define OCD_EOS_BREAKPOINT_WATCHPOINT  2
-#define OCD_EOS_THQ                    3
-
-/* Constants for RWCS:NTBC */
-#define OCD_NTBC_OVERWRITE             0
-#define OCD_NTBC_DISABLE               1
-#define OCD_NTBC_BREAKPOINT            2
-
-/* Constants for RWCS:CCTRL */
-#define OCD_CCTRL_AUTO                 0
-#define OCD_CCTRL_CACHED               1
-#define OCD_CCTRL_UNCACHED             2
-
-/* Constants for RWCS:SZ */
-#define OCD_SZ_BYTE                    0
-#define OCD_SZ_HALFWORD                        1
-#define OCD_SZ_WORD                    2
-
-/* Constants for WT:PTS */
-#define OCD_PTS_DISABLED               0
-#define OCD_PTS_PROGRAM_0B             1
-#define OCD_PTS_PROGRAM_1A             2
-#define OCD_PTS_PROGRAM_1B             3
-#define OCD_PTS_PROGRAM_2A             4
-#define OCD_PTS_PROGRAM_2B             5
-#define OCD_PTS_DATA_3A                        6
-#define OCD_PTS_DATA_3B                        7
-
-/* Constants for DTC:RWT1 */
-#define OCD_RWT1_NO_TRACE              0
-#define OCD_RWT1_DATA_READ             1
-#define OCD_RWT1_DATA_WRITE            2
-#define OCD_RWT1_DATA_READ_WRITE       3
-
-/* Constants for DTC:RWT0 */
-#define OCD_RWT0_NO_TRACE              0
-#define OCD_RWT0_DATA_READ             1
-#define OCD_RWT0_DATA_WRITE            2
-#define OCD_RWT0_DATA_READ_WRITE       3
-
-/* Constants for BWC0A:BWE */
-#define OCD_BWE_DISABLED               0
-#define OCD_BWE_BREAKPOINT_ENABLED     1
-#define OCD_BWE_WATCHPOINT_ENABLED     3
-
-/* Constants for BWC0B:BWE */
-#define OCD_BWE_DISABLED               0
-#define OCD_BWE_BREAKPOINT_ENABLED     1
-#define OCD_BWE_WATCHPOINT_ENABLED     3
-
-/* Constants for BWC1A:BWE */
-#define OCD_BWE_DISABLED               0
-#define OCD_BWE_BREAKPOINT_ENABLED     1
-#define OCD_BWE_WATCHPOINT_ENABLED     3
-
-/* Constants for BWC1B:BWE */
-#define OCD_BWE_DISABLED               0
-#define OCD_BWE_BREAKPOINT_ENABLED     1
-#define OCD_BWE_WATCHPOINT_ENABLED     3
-
-/* Constants for BWC2A:BWE */
-#define OCD_BWE_DISABLED               0
-#define OCD_BWE_BREAKPOINT_ENABLED     1
-#define OCD_BWE_WATCHPOINT_ENABLED     3
-
-/* Constants for BWC2B:BWE */
-#define OCD_BWE_DISABLED               0
-#define OCD_BWE_BREAKPOINT_ENABLED     1
-#define OCD_BWE_WATCHPOINT_ENABLED     3
-
-/* Constants for BWC3A:SIZE */
-#define OCD_SIZE_BYTE_ACCESS           4
-#define OCD_SIZE_HALFWORD_ACCESS       5
-#define OCD_SIZE_WORD_ACCESS           6
-#define OCD_SIZE_DOUBLE_WORD_ACCESS    7
-
-/* Constants for BWC3A:BRW */
-#define OCD_BRW_READ_BREAK             0
-#define OCD_BRW_WRITE_BREAK            1
-#define OCD_BRW_ANY_ACCES_BREAK                2
-
-/* Constants for BWC3A:BWE */
-#define OCD_BWE_DISABLED               0
-#define OCD_BWE_BREAKPOINT_ENABLED     1
-#define OCD_BWE_WATCHPOINT_ENABLED     3
-
-/* Constants for BWC3B:SIZE */
-#define OCD_SIZE_BYTE_ACCESS           4
-#define OCD_SIZE_HALFWORD_ACCESS       5
-#define OCD_SIZE_WORD_ACCESS           6
-#define OCD_SIZE_DOUBLE_WORD_ACCESS    7
-
-/* Constants for BWC3B:BRW */
-#define OCD_BRW_READ_BREAK             0
-#define OCD_BRW_WRITE_BREAK            1
-#define OCD_BRW_ANY_ACCES_BREAK                2
-
-/* Constants for BWC3B:BWE */
-#define OCD_BWE_DISABLED               0
-#define OCD_BWE_BREAKPOINT_ENABLED     1
-#define OCD_BWE_WATCHPOINT_ENABLED     3
-
-/* Constants for EPC0:RNG */
-#define OCD_RNG_DISABLED               0
-#define OCD_RNG_EXCLUSIVE              1
-#define OCD_RNG_INCLUSIVE              2
-
-/* Constants for EPC1:RNG */
-#define OCD_RNG_DISABLED               0
-#define OCD_RNG_EXCLUSIVE              1
-#define OCD_RNG_INCLUSIVE              2
-
-/* Constants for EPC2:RNG */
-#define OCD_RNG_DISABLED               0
-#define OCD_RNG_EXCLUSIVE              1
-#define OCD_RNG_INCLUSIVE              2
-
-/* Constants for EPC2:DB */
-#define OCD_DB_DISABLED                        0
-#define OCD_DB_CHAINED_B               1
-#define OCD_DB_CHAINED_A               2
-#define OCD_DB_AHAINED_A_AND_B         3
-
-/* Constants for EPC3:RNG */
-#define OCD_RNG_DISABLED               0
-#define OCD_RNG_EXCLUSIVE              1
-#define OCD_RNG_INCLUSIVE              2
-
-#ifndef __ASSEMBLER__
-
-/* Register access macros */
-static inline unsigned long __ocd_read(unsigned int reg)
-{
-       return __builtin_mfdr(reg);
-}
-
-static inline void __ocd_write(unsigned int reg, unsigned long value)
-{
-       __builtin_mtdr(reg, value);
-}
-
-#define ocd_read(reg)                  __ocd_read(OCD_##reg)
-#define ocd_write(reg, value)          __ocd_write(OCD_##reg, value)
-
-struct task_struct;
-
-void ocd_enable(struct task_struct *child);
-void ocd_disable(struct task_struct *child);
-
-#endif /* !__ASSEMBLER__ */
-
-#endif /* __ASM_AVR32_OCD_H */
diff --git a/arch/avr32/include/asm/page.h b/arch/avr32/include/asm/page.h
deleted file mode 100644 (file)
index c5d2a3e..0000000
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_PAGE_H
-#define __ASM_AVR32_PAGE_H
-
-#include <linux/const.h>
-
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT     12
-#define PAGE_SIZE      (_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK      (~(PAGE_SIZE-1))
-#define PTE_MASK       PAGE_MASK
-
-#ifndef __ASSEMBLY__
-
-#include <asm/addrspace.h>
-
-extern void clear_page(void *to);
-extern void copy_page(void *to, void *from);
-
-#define clear_user_page(page, vaddr, pg)       clear_page(page)
-#define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)
-
-/*
- * These are used to make use of C type-checking..
- */
-typedef struct { unsigned long pte; } pte_t;
-typedef struct { unsigned long pgd; } pgd_t;
-typedef struct { unsigned long pgprot; } pgprot_t;
-typedef struct page *pgtable_t;
-
-#define pte_val(x)             ((x).pte)
-#define pgd_val(x)             ((x).pgd)
-#define pgprot_val(x)          ((x).pgprot)
-
-#define __pte(x)               ((pte_t) { (x) })
-#define __pgd(x)               ((pgd_t) { (x) })
-#define __pgprot(x)            ((pgprot_t) { (x) })
-
-/* FIXME: These should be removed soon */
-extern unsigned long memory_start, memory_end;
-
-/* Pure 2^n version of get_order */
-static inline int get_order(unsigned long size)
-{
-       unsigned lz;
-
-       size = (size - 1) >> PAGE_SHIFT;
-       asm("clz %0, %1" : "=r"(lz) : "r"(size));
-       return 32 - lz;
-}
-
-#endif /* !__ASSEMBLY__ */
-
-/*
- * The hardware maps the virtual addresses 0x80000000 -> 0x9fffffff
- * permanently to the physical addresses 0x00000000 -> 0x1fffffff when
- * segmentation is enabled. We want to make use of this in order to
- * minimize TLB pressure.
- */
-#define PAGE_OFFSET            (0x80000000UL)
-
-/*
- * ALSA uses virt_to_page() on DMA pages, which I'm not entirely sure
- * is a good idea. Anyway, we can't simply subtract PAGE_OFFSET here
- * in that case, so we'll have to mask out the three most significant
- * bits of the address instead...
- *
- * What's the difference between __pa() and virt_to_phys() anyway?
- */
-#define __pa(x)                PHYSADDR(x)
-#define __va(x)                ((void *)(P1SEGADDR(x)))
-
-#define MAP_NR(addr)   (((unsigned long)(addr) - PAGE_OFFSET) >> PAGE_SHIFT)
-
-#define phys_to_page(phys)     (pfn_to_page(phys >> PAGE_SHIFT))
-#define page_to_phys(page)     (page_to_pfn(page) << PAGE_SHIFT)
-
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-
-#define ARCH_PFN_OFFSET                (CONFIG_PHYS_OFFSET >> PAGE_SHIFT)
-
-#define pfn_valid(pfn)         ((pfn) >= ARCH_PFN_OFFSET && (pfn) < (ARCH_PFN_OFFSET + max_mapnr))
-#endif /* CONFIG_NEED_MULTIPLE_NODES */
-
-#define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
-#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
-
-#define VM_DATA_DEFAULT_FLAGS  (VM_READ | VM_WRITE |   \
-                                VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
-/*
- * Memory above this physical address will be considered highmem.
- */
-#define HIGHMEM_START          0x20000000UL
-
-#include <asm-generic/memory_model.h>
-
-#endif /* __ASM_AVR32_PAGE_H */
diff --git a/arch/avr32/include/asm/pci.h b/arch/avr32/include/asm/pci.h
deleted file mode 100644 (file)
index 0f5f134..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __ASM_AVR32_PCI_H__
-#define __ASM_AVR32_PCI_H__
-
-/* We don't support PCI yet, but some drivers require this file anyway */
-
-#define PCI_DMA_BUS_IS_PHYS    (1)
-
-#endif /* __ASM_AVR32_PCI_H__ */
diff --git a/arch/avr32/include/asm/pgalloc.h b/arch/avr32/include/asm/pgalloc.h
deleted file mode 100644 (file)
index db039cb..0000000
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_PGALLOC_H
-#define __ASM_AVR32_PGALLOC_H
-
-#include <linux/mm.h>
-#include <linux/quicklist.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-
-#define QUICK_PGD      0       /* Preserve kernel mappings over free */
-#define QUICK_PT       1       /* Zero on free */
-
-static inline void pmd_populate_kernel(struct mm_struct *mm,
-                                      pmd_t *pmd, pte_t *pte)
-{
-       set_pmd(pmd, __pmd((unsigned long)pte));
-}
-
-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
-                                   pgtable_t pte)
-{
-       set_pmd(pmd, __pmd((unsigned long)page_address(pte)));
-}
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
-static inline void pgd_ctor(void *x)
-{
-       pgd_t *pgd = x;
-
-       memcpy(pgd + USER_PTRS_PER_PGD,
-               swapper_pg_dir + USER_PTRS_PER_PGD,
-               (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
-}
-
-/*
- * Allocate and free page tables
- */
-static inline pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-       return quicklist_alloc(QUICK_PGD, GFP_KERNEL, pgd_ctor);
-}
-
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-       quicklist_free(QUICK_PGD, NULL, pgd);
-}
-
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
-                                         unsigned long address)
-{
-       return quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL);
-}
-
-static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
-                                        unsigned long address)
-{
-       struct page *page;
-       void *pg;
-
-       pg = quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL);
-       if (!pg)
-               return NULL;
-
-       page = virt_to_page(pg);
-       if (!pgtable_page_ctor(page)) {
-               quicklist_free(QUICK_PT, NULL, pg);
-               return NULL;
-       }
-
-       return page;
-}
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-       quicklist_free(QUICK_PT, NULL, pte);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
-{
-       pgtable_page_dtor(pte);
-       quicklist_free_page(QUICK_PT, NULL, pte);
-}
-
-#define __pte_free_tlb(tlb,pte,addr)                   \
-do {                                                   \
-       pgtable_page_dtor(pte);                         \
-       tlb_remove_page((tlb), pte);                    \
-} while (0)
-
-static inline void check_pgt_cache(void)
-{
-       quicklist_trim(QUICK_PGD, NULL, 25, 16);
-       quicklist_trim(QUICK_PT, NULL, 25, 16);
-}
-
-#endif /* __ASM_AVR32_PGALLOC_H */
diff --git a/arch/avr32/include/asm/pgtable-2level.h b/arch/avr32/include/asm/pgtable-2level.h
deleted file mode 100644 (file)
index d5b1c63..0000000
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_PGTABLE_2LEVEL_H
-#define __ASM_AVR32_PGTABLE_2LEVEL_H
-
-#define __ARCH_USE_5LEVEL_HACK
-#include <asm-generic/pgtable-nopmd.h>
-
-/*
- * Traditional 2-level paging structure
- */
-#define PGDIR_SHIFT    22
-#define PTRS_PER_PGD   1024
-
-#define PTRS_PER_PTE   1024
-
-#ifndef __ASSEMBLY__
-#define pte_ERROR(e) \
-       printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
-#define pgd_ERROR(e) \
-       printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
-
-/*
- * Certain architectures need to do special things when PTEs
- * within a page table are directly modified.  Thus, the following
- * hook is made available.
- */
-#define set_pte(pteptr, pteval) (*(pteptr) = pteval)
-#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep, pteval)
-
-/*
- * (pmds are folded into pgds so this doesn't get actually called,
- * but the define is needed for a generic inline function.)
- */
-#define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval)
-
-#define pte_pfn(x)             ((unsigned long)(((x).pte >> PAGE_SHIFT)))
-#define pfn_pte(pfn, prot)     __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
-#define pfn_pmd(pfn, prot)     __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
-
-#endif /* !__ASSEMBLY__ */
-
-#endif /* __ASM_AVR32_PGTABLE_2LEVEL_H */
diff --git a/arch/avr32/include/asm/pgtable.h b/arch/avr32/include/asm/pgtable.h
deleted file mode 100644 (file)
index 3580066..0000000
+++ /dev/null
@@ -1,347 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_PGTABLE_H
-#define __ASM_AVR32_PGTABLE_H
-
-#include <asm/addrspace.h>
-
-#ifndef __ASSEMBLY__
-#include <linux/sched.h>
-
-#endif /* !__ASSEMBLY__ */
-
-/*
- * Use two-level page tables just as the i386 (without PAE)
- */
-#include <asm/pgtable-2level.h>
-
-/*
- * The following code might need some cleanup when the values are
- * final...
- */
-#define PMD_SIZE       (1UL << PMD_SHIFT)
-#define PMD_MASK       (~(PMD_SIZE-1))
-#define PGDIR_SIZE     (1UL << PGDIR_SHIFT)
-#define PGDIR_MASK     (~(PGDIR_SIZE-1))
-
-#define USER_PTRS_PER_PGD      (TASK_SIZE / PGDIR_SIZE)
-#define FIRST_USER_ADDRESS     0UL
-
-#ifndef __ASSEMBLY__
-extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
-extern void paging_init(void);
-
-/*
- * ZERO_PAGE is a global shared page that is always zero: used for
- * zero-mapped memory areas etc.
- */
-extern struct page *empty_zero_page;
-#define ZERO_PAGE(vaddr) (empty_zero_page)
-
-/*
- * Just any arbitrary offset to the start of the vmalloc VM area: the
- * current 8 MiB value just means that there will be a 8 MiB "hole"
- * after the uncached physical memory (P2 segment) until the vmalloc
- * area starts. That means that any out-of-bounds memory accesses will
- * hopefully be caught; we don't know if the end of the P1/P2 segments
- * are actually used for anything, but it is anyway safer to let the
- * MMU catch these kinds of errors than to rely on the memory bus.
- *
- * A "hole" of the same size is added to the end of the P3 segment as
- * well. It might seem wasteful to use 16 MiB of virtual address space
- * on this, but we do have 512 MiB of it...
- *
- * The vmalloc() routines leave a hole of 4 KiB between each vmalloced
- * area for the same reason.
- */
-#define VMALLOC_OFFSET (8 * 1024 * 1024)
-#define VMALLOC_START  (P3SEG + VMALLOC_OFFSET)
-#define VMALLOC_END    (P4SEG - VMALLOC_OFFSET)
-#endif /* !__ASSEMBLY__ */
-
-/*
- * Page flags. Some of these flags are not directly supported by
- * hardware, so we have to emulate them.
- */
-#define _TLBEHI_BIT_VALID      9
-#define _TLBEHI_VALID          (1 << _TLBEHI_BIT_VALID)
-
-#define _PAGE_BIT_WT           0  /* W-bit   : write-through */
-#define _PAGE_BIT_DIRTY                1  /* D-bit   : page changed */
-#define _PAGE_BIT_SZ0          2  /* SZ0-bit : Size of page */
-#define _PAGE_BIT_SZ1          3  /* SZ1-bit : Size of page */
-#define _PAGE_BIT_EXECUTE      4  /* X-bit   : execute access allowed */
-#define _PAGE_BIT_RW           5  /* AP0-bit : write access allowed */
-#define _PAGE_BIT_USER         6  /* AP1-bit : user space access allowed */
-#define _PAGE_BIT_BUFFER       7  /* B-bit   : bufferable */
-#define _PAGE_BIT_GLOBAL       8  /* G-bit   : global (ignore ASID) */
-#define _PAGE_BIT_CACHABLE     9  /* C-bit   : cachable */
-
-/* If we drop support for 1K pages, we get two extra bits */
-#define _PAGE_BIT_PRESENT      10
-#define _PAGE_BIT_ACCESSED     11 /* software: page was accessed */
-
-#define _PAGE_WT               (1 << _PAGE_BIT_WT)
-#define _PAGE_DIRTY            (1 << _PAGE_BIT_DIRTY)
-#define _PAGE_EXECUTE          (1 << _PAGE_BIT_EXECUTE)
-#define _PAGE_RW               (1 << _PAGE_BIT_RW)
-#define _PAGE_USER             (1 << _PAGE_BIT_USER)
-#define _PAGE_BUFFER           (1 << _PAGE_BIT_BUFFER)
-#define _PAGE_GLOBAL           (1 << _PAGE_BIT_GLOBAL)
-#define _PAGE_CACHABLE         (1 << _PAGE_BIT_CACHABLE)
-
-/* Software flags */
-#define _PAGE_ACCESSED         (1 << _PAGE_BIT_ACCESSED)
-#define _PAGE_PRESENT          (1 << _PAGE_BIT_PRESENT)
-
-/*
- * Page types, i.e. sizes. _PAGE_TYPE_NONE corresponds to what is
- * usually called _PAGE_PROTNONE on other architectures.
- *
- * XXX: Find out if _PAGE_PROTNONE is equivalent with !_PAGE_USER. If
- * so, we can encode all possible page sizes (although we can't really
- * support 1K pages anyway due to the _PAGE_PRESENT and _PAGE_ACCESSED
- * bits)
- *
- */
-#define _PAGE_TYPE_MASK                ((1 << _PAGE_BIT_SZ0) | (1 << _PAGE_BIT_SZ1))
-#define _PAGE_TYPE_NONE                (0 << _PAGE_BIT_SZ0)
-#define _PAGE_TYPE_SMALL       (1 << _PAGE_BIT_SZ0)
-#define _PAGE_TYPE_MEDIUM      (2 << _PAGE_BIT_SZ0)
-#define _PAGE_TYPE_LARGE       (3 << _PAGE_BIT_SZ0)
-
-/*
- * Mask which drop software flags. We currently can't handle more than
- * 512 MiB of physical memory, so we can use bits 29-31 for other
- * stuff.  With a fixed 4K page size, we can use bits 10-11 as well as
- * bits 2-3 (SZ)
- */
-#define _PAGE_FLAGS_HARDWARE_MASK      0xfffff3ff
-
-#define _PAGE_FLAGS_CACHE_MASK (_PAGE_CACHABLE | _PAGE_BUFFER | _PAGE_WT)
-
-/* Flags that may be modified by software */
-#define _PAGE_CHG_MASK         (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY \
-                                | _PAGE_FLAGS_CACHE_MASK)
-
-#define _PAGE_FLAGS_READ       (_PAGE_CACHABLE | _PAGE_BUFFER)
-#define _PAGE_FLAGS_WRITE      (_PAGE_FLAGS_READ | _PAGE_RW | _PAGE_DIRTY)
-
-#define _PAGE_NORMAL(x)        __pgprot((x) | _PAGE_PRESENT | _PAGE_TYPE_SMALL \
-                                | _PAGE_ACCESSED)
-
-#define PAGE_NONE      (_PAGE_ACCESSED | _PAGE_TYPE_NONE)
-#define PAGE_READ      (_PAGE_FLAGS_READ | _PAGE_USER)
-#define PAGE_EXEC      (_PAGE_FLAGS_READ | _PAGE_EXECUTE | _PAGE_USER)
-#define PAGE_WRITE     (_PAGE_FLAGS_WRITE | _PAGE_USER)
-#define PAGE_KERNEL    _PAGE_NORMAL(_PAGE_FLAGS_WRITE | _PAGE_EXECUTE | _PAGE_GLOBAL)
-#define PAGE_KERNEL_RO _PAGE_NORMAL(_PAGE_FLAGS_READ | _PAGE_EXECUTE | _PAGE_GLOBAL)
-
-#define _PAGE_P(x)     _PAGE_NORMAL((x) & ~(_PAGE_RW | _PAGE_DIRTY))
-#define _PAGE_S(x)     _PAGE_NORMAL(x)
-
-#define PAGE_COPY      _PAGE_P(PAGE_WRITE | PAGE_READ)
-#define PAGE_SHARED    _PAGE_S(PAGE_WRITE | PAGE_READ)
-
-#ifndef __ASSEMBLY__
-/*
- * The hardware supports flags for write- and execute access. Read is
- * always allowed if the page is loaded into the TLB, so the "-w-",
- * "--x" and "-wx" mappings are implemented as "rw-", "r-x" and "rwx",
- * respectively.
- *
- * The "---" case is handled by software; the page will simply not be
- * loaded into the TLB if the page type is _PAGE_TYPE_NONE.
- */
-
-#define __P000 __pgprot(PAGE_NONE)
-#define __P001 _PAGE_P(PAGE_READ)
-#define __P010 _PAGE_P(PAGE_WRITE)
-#define __P011 _PAGE_P(PAGE_WRITE | PAGE_READ)
-#define __P100 _PAGE_P(PAGE_EXEC)
-#define __P101 _PAGE_P(PAGE_EXEC | PAGE_READ)
-#define __P110 _PAGE_P(PAGE_EXEC | PAGE_WRITE)
-#define __P111 _PAGE_P(PAGE_EXEC | PAGE_WRITE | PAGE_READ)
-
-#define __S000 __pgprot(PAGE_NONE)
-#define __S001 _PAGE_S(PAGE_READ)
-#define __S010 _PAGE_S(PAGE_WRITE)
-#define __S011 _PAGE_S(PAGE_WRITE | PAGE_READ)
-#define __S100 _PAGE_S(PAGE_EXEC)
-#define __S101 _PAGE_S(PAGE_EXEC | PAGE_READ)
-#define __S110 _PAGE_S(PAGE_EXEC | PAGE_WRITE)
-#define __S111 _PAGE_S(PAGE_EXEC | PAGE_WRITE | PAGE_READ)
-
-#define pte_none(x)    (!pte_val(x))
-#define pte_present(x) (pte_val(x) & _PAGE_PRESENT)
-
-#define pte_clear(mm,addr,xp)                                  \
-       do {                                                    \
-               set_pte_at(mm, addr, xp, __pte(0));             \
-       } while (0)
-
-/*
- * The following only work if pte_present() is true.
- * Undefined behaviour if not..
- */
-static inline int pte_write(pte_t pte)
-{
-       return pte_val(pte) & _PAGE_RW;
-}
-static inline int pte_dirty(pte_t pte)
-{
-       return pte_val(pte) & _PAGE_DIRTY;
-}
-static inline int pte_young(pte_t pte)
-{
-       return pte_val(pte) & _PAGE_ACCESSED;
-}
-static inline int pte_special(pte_t pte)
-{
-       return 0;
-}
-
-/* Mutator functions for PTE bits */
-static inline pte_t pte_wrprotect(pte_t pte)
-{
-       set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_RW));
-       return pte;
-}
-static inline pte_t pte_mkclean(pte_t pte)
-{
-       set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_DIRTY));
-       return pte;
-}
-static inline pte_t pte_mkold(pte_t pte)
-{
-       set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_ACCESSED));
-       return pte;
-}
-static inline pte_t pte_mkwrite(pte_t pte)
-{
-       set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW));
-       return pte;
-}
-static inline pte_t pte_mkdirty(pte_t pte)
-{
-       set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY));
-       return pte;
-}
-static inline pte_t pte_mkyoung(pte_t pte)
-{
-       set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED));
-       return pte;
-}
-static inline pte_t pte_mkspecial(pte_t pte)
-{
-       return pte;
-}
-
-#define pmd_none(x)    (!pmd_val(x))
-#define pmd_present(x) (pmd_val(x))
-
-static inline void pmd_clear(pmd_t *pmdp)
-{
-       set_pmd(pmdp, __pmd(0));
-}
-
-#define        pmd_bad(x)      (pmd_val(x) & ~PAGE_MASK)
-
-/*
- * Permanent address of a page. We don't support highmem, so this is
- * trivial.
- */
-#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
-#define pte_page(x)    (pfn_to_page(pte_pfn(x)))
-
-/*
- * Mark the prot value as uncacheable and unbufferable
- */
-#define pgprot_noncached(prot)                                         \
-       __pgprot(pgprot_val(prot) & ~(_PAGE_BUFFER | _PAGE_CACHABLE))
-
-/*
- * Mark the prot value as uncacheable but bufferable
- */
-#define pgprot_writecombine(prot)                                      \
-       __pgprot((pgprot_val(prot) & ~_PAGE_CACHABLE) | _PAGE_BUFFER)
-
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- *
- * extern pte_t mk_pte(struct page *page, pgprot_t pgprot)
- */
-#define mk_pte(page, pgprot)   pfn_pte(page_to_pfn(page), (pgprot))
-
-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
-{
-       set_pte(&pte, __pte((pte_val(pte) & _PAGE_CHG_MASK)
-                           | pgprot_val(newprot)));
-       return pte;
-}
-
-#define page_pte(page) page_pte_prot(page, __pgprot(0))
-
-#define pmd_page_vaddr(pmd)    pmd_val(pmd)
-#define pmd_page(pmd)          (virt_to_page(pmd_val(pmd)))
-
-/* to find an entry in a page-table-directory. */
-#define pgd_index(address)     (((address) >> PGDIR_SHIFT)     \
-                                & (PTRS_PER_PGD - 1))
-#define pgd_offset(mm, address)        ((mm)->pgd + pgd_index(address))
-
-/* to find an entry in a kernel page-table-directory */
-#define pgd_offset_k(address)  pgd_offset(&init_mm, address)
-
-/* Find an entry in the third-level page table.. */
-#define pte_index(address)                             \
-       ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-#define pte_offset(dir, address)                                       \
-       ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address))
-#define pte_offset_kernel(dir, address)                                        \
-       ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address))
-#define pte_offset_map(dir, address) pte_offset_kernel(dir, address)
-#define pte_unmap(pte)         do { } while (0)
-
-struct vm_area_struct;
-extern void update_mmu_cache(struct vm_area_struct * vma,
-                            unsigned long address, pte_t *ptep);
-
-/*
- * Encode and decode a swap entry
- *
- * Constraints:
- *   _PAGE_TYPE_* at bits 2-3 (for emulating _PAGE_PROTNONE)
- *   _PAGE_PRESENT at bit 10
- *
- * We encode the type into bits 4-9 and offset into bits 11-31. This
- * gives us a 21 bits offset, or 2**21 * 4K = 8G usable swap space per
- * device, and 64 possible types.
- *
- * NOTE: We should set ZEROs at the position of _PAGE_PRESENT
- *       and _PAGE_PROTNONE bits
- */
-#define __swp_type(x)          (((x).val >> 4) & 0x3f)
-#define __swp_offset(x)                ((x).val >> 11)
-#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 4) | ((offset) << 11) })
-#define __pte_to_swp_entry(pte)        ((swp_entry_t) { pte_val(pte) })
-#define __swp_entry_to_pte(x)  ((pte_t) { (x).val })
-
-typedef pte_t *pte_addr_t;
-
-#define kern_addr_valid(addr)  (1)
-
-/* No page table caches to initialize (?) */
-#define pgtable_cache_init()   do { } while(0)
-
-#include <asm-generic/pgtable.h>
-
-#endif /* !__ASSEMBLY__ */
-
-#endif /* __ASM_AVR32_PGTABLE_H */
diff --git a/arch/avr32/include/asm/processor.h b/arch/avr32/include/asm/processor.h
deleted file mode 100644 (file)
index 972adcc..0000000
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_PROCESSOR_H
-#define __ASM_AVR32_PROCESSOR_H
-
-#include <asm/page.h>
-#include <asm/cache.h>
-
-#define TASK_SIZE      0x80000000
-
-#ifdef __KERNEL__
-#define STACK_TOP      TASK_SIZE
-#define STACK_TOP_MAX  STACK_TOP
-#endif
-
-#ifndef __ASSEMBLY__
-
-static inline void *current_text_addr(void)
-{
-       register void *pc asm("pc");
-       return pc;
-}
-
-enum arch_type {
-       ARCH_AVR32A,
-       ARCH_AVR32B,
-       ARCH_MAX
-};
-
-enum cpu_type {
-       CPU_MORGAN,
-       CPU_AT32AP,
-       CPU_MAX
-};
-
-enum tlb_config {
-       TLB_NONE,
-       TLB_SPLIT,
-       TLB_UNIFIED,
-       TLB_INVALID
-};
-
-#define AVR32_FEATURE_RMW      (1 << 0)
-#define AVR32_FEATURE_DSP      (1 << 1)
-#define AVR32_FEATURE_SIMD     (1 << 2)
-#define AVR32_FEATURE_OCD      (1 << 3)
-#define AVR32_FEATURE_PCTR     (1 << 4)
-#define AVR32_FEATURE_JAVA     (1 << 5)
-#define AVR32_FEATURE_FPU      (1 << 6)
-
-struct avr32_cpuinfo {
-       struct clk *clk;
-       unsigned long loops_per_jiffy;
-       enum arch_type arch_type;
-       enum cpu_type cpu_type;
-       unsigned short arch_revision;
-       unsigned short cpu_revision;
-       enum tlb_config tlb_config;
-       unsigned long features;
-       u32 device_id;
-
-       struct cache_info icache;
-       struct cache_info dcache;
-};
-
-static inline unsigned int avr32_get_manufacturer_id(struct avr32_cpuinfo *cpu)
-{
-       return (cpu->device_id >> 1) & 0x7f;
-}
-static inline unsigned int avr32_get_product_number(struct avr32_cpuinfo *cpu)
-{
-       return (cpu->device_id >> 12) & 0xffff;
-}
-static inline unsigned int avr32_get_chip_revision(struct avr32_cpuinfo *cpu)
-{
-       return (cpu->device_id >> 28) & 0x0f;
-}
-
-extern struct avr32_cpuinfo boot_cpu_data;
-
-/* No SMP support so far */
-#define current_cpu_data boot_cpu_data
-
-/* This decides where the kernel will search for a free chunk of vm
- * space during mmap's
- */
-#define TASK_UNMAPPED_BASE     (PAGE_ALIGN(TASK_SIZE / 3))
-
-#define cpu_relax()            barrier()
-#define cpu_sync_pipeline()    asm volatile("sub pc, -2" : : : "memory")
-
-struct cpu_context {
-       unsigned long sr;
-       unsigned long pc;
-       unsigned long ksp;      /* Kernel stack pointer */
-       unsigned long r7;
-       unsigned long r6;
-       unsigned long r5;
-       unsigned long r4;
-       unsigned long r3;
-       unsigned long r2;
-       unsigned long r1;
-       unsigned long r0;
-};
-
-/* This struct contains the CPU context as stored by switch_to() */
-struct thread_struct {
-       struct cpu_context cpu_context;
-       unsigned long single_step_addr;
-       u16 single_step_insn;
-};
-
-#define INIT_THREAD {                                          \
-       .cpu_context = {                                        \
-               .ksp = sizeof(init_stack) + (long)&init_stack,  \
-       },                                                      \
-}
-
-/*
- * Do necessary setup to start up a newly executed thread.
- */
-#define start_thread(regs, new_pc, new_sp)      \
-       do {                                     \
-               memset(regs, 0, sizeof(*regs));  \
-               regs->sr = MODE_USER;            \
-               regs->pc = new_pc & ~1;          \
-               regs->sp = new_sp;               \
-       } while(0)
-
-struct task_struct;
-
-/* Free all resources held by a thread */
-extern void release_thread(struct task_struct *);
-
-/* Return saved PC of a blocked thread */
-#define thread_saved_pc(tsk)    ((tsk)->thread.cpu_context.pc)
-
-struct pt_regs;
-extern unsigned long get_wchan(struct task_struct *p);
-extern void show_regs_log_lvl(struct pt_regs *regs, const char *log_lvl);
-extern void show_stack_log_lvl(struct task_struct *tsk, unsigned long sp,
-                              struct pt_regs *regs, const char *log_lvl);
-
-#define task_pt_regs(p) \
-       ((struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1)
-
-#define KSTK_EIP(tsk)  ((tsk)->thread.cpu_context.pc)
-#define KSTK_ESP(tsk)  ((tsk)->thread.cpu_context.ksp)
-
-#define ARCH_HAS_PREFETCH
-
-static inline void prefetch(const void *x)
-{
-       const char *c = x;
-       asm volatile("pref %0" : : "r"(c));
-}
-#define PREFETCH_STRIDE        L1_CACHE_BYTES
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* __ASM_AVR32_PROCESSOR_H */
diff --git a/arch/avr32/include/asm/ptrace.h b/arch/avr32/include/asm/ptrace.h
deleted file mode 100644 (file)
index 630e4f9..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_PTRACE_H
-#define __ASM_AVR32_PTRACE_H
-
-#include <uapi/asm/ptrace.h>
-
-#ifndef __ASSEMBLY__
-
-#include <asm/ocd.h>
-
-#define arch_has_single_step()         (1)
-
-#define arch_ptrace_attach(child)       ocd_enable(child)
-
-#define user_mode(regs)                 (((regs)->sr & MODE_MASK) == MODE_USER)
-#define instruction_pointer(regs)       ((regs)->pc)
-#define profile_pc(regs)                instruction_pointer(regs)
-#define user_stack_pointer(regs)       ((regs)->sp)
-
-static __inline__ int valid_user_regs(struct pt_regs *regs)
-{
-       /*
-        * Some of the Java bits might be acceptable if/when we
-        * implement some support for that stuff...
-        */
-       if ((regs->sr & 0xffff0000) == 0)
-               return 1;
-
-       /*
-        * Force status register flags to be sane and report this
-        * illegal behaviour...
-        */
-       regs->sr &= 0x0000ffff;
-       return 0;
-}
-
-
-#endif /* ! __ASSEMBLY__ */
-#endif /* __ASM_AVR32_PTRACE_H */
diff --git a/arch/avr32/include/asm/serial.h b/arch/avr32/include/asm/serial.h
deleted file mode 100644 (file)
index 5ecaebc..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef _ASM_SERIAL_H
-#define _ASM_SERIAL_H
-
-/*
- * This assumes you have a 1.8432 MHz clock for your UART.
- *
- * It'd be nice if someone built a serial card with a 24.576 MHz
- * clock, since the 16550A is capable of handling a top speed of 1.5
- * megabits/second; but this requires the faster clock.
- */
-#define BASE_BAUD (1843200 / 16)
-
-#endif /* _ASM_SERIAL_H */
diff --git a/arch/avr32/include/asm/setup.h b/arch/avr32/include/asm/setup.h
deleted file mode 100644 (file)
index 73490ae..0000000
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * Based on linux/include/asm-arm/setup.h
- *   Copyright (C) 1997-1999 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_SETUP_H__
-#define __ASM_AVR32_SETUP_H__
-
-#include <uapi/asm/setup.h>
-
-
-/* Magic number indicating that a tag table is present */
-#define ATAG_MAGIC     0xa2a25441
-
-#ifndef __ASSEMBLY__
-
-/*
- * Generic memory range, used by several tags.
- *
- *   addr is always physical.
- *   size is measured in bytes.
- *   next is for use by the OS, e.g. for grouping regions into
- *        linked lists.
- */
-struct tag_mem_range {
-       u32                     addr;
-       u32                     size;
-       struct tag_mem_range *  next;
-};
-
-/* The list ends with an ATAG_NONE node. */
-#define ATAG_NONE      0x00000000
-
-struct tag_header {
-       u32 size;
-       u32 tag;
-};
-
-/* The list must start with an ATAG_CORE node */
-#define ATAG_CORE      0x54410001
-
-struct tag_core {
-       u32 flags;
-       u32 pagesize;
-       u32 rootdev;
-};
-
-/* it is allowed to have multiple ATAG_MEM nodes */
-#define ATAG_MEM       0x54410002
-/* ATAG_MEM uses tag_mem_range */
-
-/* command line: \0 terminated string */
-#define ATAG_CMDLINE   0x54410003
-
-struct tag_cmdline {
-       char    cmdline[1];     /* this is the minimum size */
-};
-
-/* Ramdisk image (may be compressed) */
-#define ATAG_RDIMG     0x54410004
-/* ATAG_RDIMG uses tag_mem_range */
-
-/* Information about various clocks present in the system */
-#define ATAG_CLOCK     0x54410005
-
-struct tag_clock {
-       u32     clock_id;       /* Which clock are we talking about? */
-       u32     clock_flags;    /* Special features */
-       u64     clock_hz;       /* Clock speed in Hz */
-};
-
-/* The clock types we know about */
-#define CLOCK_BOOTCPU  0
-
-/* Memory reserved for the system (e.g. the bootloader) */
-#define ATAG_RSVD_MEM  0x54410006
-/* ATAG_RSVD_MEM uses tag_mem_range */
-
-/* Ethernet information */
-
-#define ATAG_ETHERNET  0x54410007
-
-struct tag_ethernet {
-       u8      mac_index;
-       u8      mii_phy_addr;
-       u8      hw_address[6];
-};
-
-#define ETH_INVALID_PHY        0xff
-
-/* board information */
-#define ATAG_BOARDINFO 0x54410008
-
-struct tag_boardinfo {
-       u32     board_number;
-};
-
-struct tag {
-       struct tag_header hdr;
-       union {
-               struct tag_core core;
-               struct tag_mem_range mem_range;
-               struct tag_cmdline cmdline;
-               struct tag_clock clock;
-               struct tag_ethernet ethernet;
-               struct tag_boardinfo boardinfo;
-       } u;
-};
-
-struct tagtable {
-       u32     tag;
-       int     (*parse)(struct tag *);
-};
-
-#define __tag __used __attribute__((__section__(".taglist.init")))
-#define __tagtable(tag, fn)                                            \
-       static struct tagtable __tagtable_##fn __tag = { tag, fn }
-
-#define tag_member_present(tag,member)                                 \
-       ((unsigned long)(&((struct tag *)0L)->member + 1)               \
-        <= (tag)->hdr.size * 4)
-
-#define tag_next(t)    ((struct tag *)((u32 *)(t) + (t)->hdr.size))
-#define tag_size(type) ((sizeof(struct tag_header) + sizeof(struct type)) >> 2)
-
-#define for_each_tag(t,base)                                           \
-       for (t = base; t->hdr.size; t = tag_next(t))
-
-extern struct tag *bootloader_tags;
-
-extern resource_size_t fbmem_start;
-extern resource_size_t fbmem_size;
-extern u32 board_number;
-
-void setup_processor(void);
-
-#endif /* !__ASSEMBLY__ */
-
-#endif /* __ASM_AVR32_SETUP_H__ */
diff --git a/arch/avr32/include/asm/shmparam.h b/arch/avr32/include/asm/shmparam.h
deleted file mode 100644 (file)
index 3681266..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __ASM_AVR32_SHMPARAM_H
-#define __ASM_AVR32_SHMPARAM_H
-
-#define SHMLBA PAGE_SIZE       /* attach addr a multiple of this */
-
-#endif /* __ASM_AVR32_SHMPARAM_H */
diff --git a/arch/avr32/include/asm/signal.h b/arch/avr32/include/asm/signal.h
deleted file mode 100644 (file)
index d875eb6..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_SIGNAL_H
-#define __ASM_AVR32_SIGNAL_H
-
-#include <uapi/asm/signal.h>
-
-/* Most things should be clean enough to redefine this at will, if care
-   is taken to make libc match.  */
-
-#define _NSIG          64
-#define _NSIG_BPW      32
-#define _NSIG_WORDS    (_NSIG / _NSIG_BPW)
-
-typedef unsigned long old_sigset_t;            /* at least 32 bits */
-
-typedef struct {
-       unsigned long sig[_NSIG_WORDS];
-} sigset_t;
-
-#define __ARCH_HAS_SA_RESTORER
-
-#include <asm/sigcontext.h>
-#undef __HAVE_ARCH_SIG_BITOPS
-
-#endif
diff --git a/arch/avr32/include/asm/string.h b/arch/avr32/include/asm/string.h
deleted file mode 100644 (file)
index c91a623..0000000
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_STRING_H
-#define __ASM_AVR32_STRING_H
-
-#define __HAVE_ARCH_MEMSET
-extern void *memset(void *b, int c, size_t len);
-
-#define __HAVE_ARCH_MEMCPY
-extern void *memcpy(void *to, const void *from, size_t len);
-
-#endif /* __ASM_AVR32_STRING_H */
diff --git a/arch/avr32/include/asm/switch_to.h b/arch/avr32/include/asm/switch_to.h
deleted file mode 100644 (file)
index 6f00581..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_SWITCH_TO_H
-#define __ASM_AVR32_SWITCH_TO_H
-
-/*
- * Help PathFinder and other Nexus-compliant debuggers keep track of
- * the current PID by emitting an Ownership Trace Message each time we
- * switch task.
- */
-#ifdef CONFIG_OWNERSHIP_TRACE
-#include <asm/ocd.h>
-#define ocd_switch(prev, next)                         \
-       do {                                            \
-               ocd_write(PID, prev->pid);              \
-               ocd_write(PID, next->pid);              \
-       } while(0)
-#else
-#define ocd_switch(prev, next)
-#endif
-
-/*
- * switch_to(prev, next, last) should switch from task `prev' to task
- * `next'. `prev' will never be the same as `next'.
- *
- * We just delegate everything to the __switch_to assembly function,
- * which is implemented in arch/avr32/kernel/switch_to.S
- *
- * mb() tells GCC not to cache `current' across this call.
- */
-struct cpu_context;
-struct task_struct;
-extern struct task_struct *__switch_to(struct task_struct *,
-                                      struct cpu_context *,
-                                      struct cpu_context *);
-#define switch_to(prev, next, last)                                    \
-       do {                                                            \
-               ocd_switch(prev, next);                                 \
-               last = __switch_to(prev, &prev->thread.cpu_context + 1, \
-                                  &next->thread.cpu_context);          \
-       } while (0)
-
-
-#endif /* __ASM_AVR32_SWITCH_TO_H */
diff --git a/arch/avr32/include/asm/syscalls.h b/arch/avr32/include/asm/syscalls.h
deleted file mode 100644 (file)
index 244f2ac..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * syscalls.h - Linux syscall interfaces (arch-specific)
- *
- * Copyright (c) 2008 Jaswinder Singh
- *
- * This file is released under the GPLv2.
- * See the file COPYING for more details.
- */
-
-#ifndef _ASM_AVR32_SYSCALLS_H
-#define _ASM_AVR32_SYSCALLS_H
-
-#include <linux/compiler.h>
-#include <linux/linkage.h>
-#include <linux/types.h>
-#include <linux/signal.h>
-
-/* mm/cache.c */
-asmlinkage int sys_cacheflush(int, void __user *, size_t);
-
-#endif /* _ASM_AVR32_SYSCALLS_H */
diff --git a/arch/avr32/include/asm/sysreg.h b/arch/avr32/include/asm/sysreg.h
deleted file mode 100644 (file)
index d4e0950..0000000
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * AVR32 System Registers
- *
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_SYSREG_H
-#define __ASM_AVR32_SYSREG_H
-
-/* sysreg register offsets */
-#define SYSREG_SR                              0x0000
-#define SYSREG_EVBA                            0x0004
-#define SYSREG_ACBA                            0x0008
-#define SYSREG_CPUCR                           0x000c
-#define SYSREG_ECR                             0x0010
-#define SYSREG_RSR_SUP                         0x0014
-#define SYSREG_RSR_INT0                                0x0018
-#define SYSREG_RSR_INT1                                0x001c
-#define SYSREG_RSR_INT2                                0x0020
-#define SYSREG_RSR_INT3                                0x0024
-#define SYSREG_RSR_EX                          0x0028
-#define SYSREG_RSR_NMI                         0x002c
-#define SYSREG_RSR_DBG                         0x0030
-#define SYSREG_RAR_SUP                         0x0034
-#define SYSREG_RAR_INT0                                0x0038
-#define SYSREG_RAR_INT1                                0x003c
-#define SYSREG_RAR_INT2                                0x0040
-#define SYSREG_RAR_INT3                                0x0044
-#define SYSREG_RAR_EX                          0x0048
-#define SYSREG_RAR_NMI                         0x004c
-#define SYSREG_RAR_DBG                         0x0050
-#define SYSREG_JECR                            0x0054
-#define SYSREG_JOSP                            0x0058
-#define SYSREG_JAVA_LV0                                0x005c
-#define SYSREG_JAVA_LV1                                0x0060
-#define SYSREG_JAVA_LV2                                0x0064
-#define SYSREG_JAVA_LV3                                0x0068
-#define SYSREG_JAVA_LV4                                0x006c
-#define SYSREG_JAVA_LV5                                0x0070
-#define SYSREG_JAVA_LV6                                0x0074
-#define SYSREG_JAVA_LV7                                0x0078
-#define SYSREG_JTBA                            0x007c
-#define SYSREG_JBCR                            0x0080
-#define SYSREG_CONFIG0                         0x0100
-#define SYSREG_CONFIG1                         0x0104
-#define SYSREG_COUNT                           0x0108
-#define SYSREG_COMPARE                         0x010c
-#define SYSREG_TLBEHI                          0x0110
-#define SYSREG_TLBELO                          0x0114
-#define SYSREG_PTBR                            0x0118
-#define SYSREG_TLBEAR                          0x011c
-#define SYSREG_MMUCR                           0x0120
-#define SYSREG_TLBARLO                         0x0124
-#define SYSREG_TLBARHI                         0x0128
-#define SYSREG_PCCNT                           0x012c
-#define SYSREG_PCNT0                           0x0130
-#define SYSREG_PCNT1                           0x0134
-#define SYSREG_PCCR                            0x0138
-#define SYSREG_BEAR                            0x013c
-#define SYSREG_SABAL                           0x0300
-#define SYSREG_SABAH                           0x0304
-#define SYSREG_SABD                            0x0308
-
-/* Bitfields in SR */
-#define SYSREG_SR_C_OFFSET                     0
-#define SYSREG_SR_C_SIZE                       1
-#define SYSREG_Z_OFFSET                                1
-#define SYSREG_Z_SIZE                          1
-#define SYSREG_SR_N_OFFSET                     2
-#define SYSREG_SR_N_SIZE                       1
-#define SYSREG_SR_V_OFFSET                     3
-#define SYSREG_SR_V_SIZE                       1
-#define SYSREG_Q_OFFSET                                4
-#define SYSREG_Q_SIZE                          1
-#define SYSREG_L_OFFSET                                5
-#define SYSREG_L_SIZE                          1
-#define SYSREG_T_OFFSET                                14
-#define SYSREG_T_SIZE                          1
-#define SYSREG_SR_R_OFFSET                     15
-#define SYSREG_SR_R_SIZE                       1
-#define SYSREG_GM_OFFSET                       16
-#define SYSREG_GM_SIZE                         1
-#define SYSREG_I0M_OFFSET                      17
-#define SYSREG_I0M_SIZE                                1
-#define SYSREG_I1M_OFFSET                      18
-#define SYSREG_I1M_SIZE                                1
-#define SYSREG_I2M_OFFSET                      19
-#define SYSREG_I2M_SIZE                                1
-#define SYSREG_I3M_OFFSET                      20
-#define SYSREG_I3M_SIZE                                1
-#define SYSREG_EM_OFFSET                       21
-#define SYSREG_EM_SIZE                         1
-#define SYSREG_MODE_OFFSET                     22
-#define SYSREG_MODE_SIZE                       3
-#define SYSREG_M0_OFFSET                       22
-#define SYSREG_M0_SIZE                         1
-#define SYSREG_M1_OFFSET                       23
-#define SYSREG_M1_SIZE                         1
-#define SYSREG_M2_OFFSET                       24
-#define SYSREG_M2_SIZE                         1
-#define SYSREG_SR_D_OFFSET                     26
-#define SYSREG_SR_D_SIZE                       1
-#define SYSREG_DM_OFFSET                       27
-#define SYSREG_DM_SIZE                         1
-#define SYSREG_SR_J_OFFSET                     28
-#define SYSREG_SR_J_SIZE                       1
-#define SYSREG_H_OFFSET                                29
-#define SYSREG_H_SIZE                          1
-
-/* Bitfields in CPUCR */
-#define SYSREG_BI_OFFSET                       0
-#define SYSREG_BI_SIZE                         1
-#define SYSREG_BE_OFFSET                       1
-#define SYSREG_BE_SIZE                         1
-#define SYSREG_FE_OFFSET                       2
-#define SYSREG_FE_SIZE                         1
-#define SYSREG_RE_OFFSET                       3
-#define SYSREG_RE_SIZE                         1
-#define SYSREG_IBE_OFFSET                      4
-#define SYSREG_IBE_SIZE                                1
-#define SYSREG_IEE_OFFSET                      5
-#define SYSREG_IEE_SIZE                                1
-
-/* Bitfields in CONFIG0 */
-#define SYSREG_CONFIG0_R_OFFSET                        0
-#define SYSREG_CONFIG0_R_SIZE                  1
-#define SYSREG_CONFIG0_D_OFFSET                        1
-#define SYSREG_CONFIG0_D_SIZE                  1
-#define SYSREG_CONFIG0_S_OFFSET                        2
-#define SYSREG_CONFIG0_S_SIZE                  1
-#define SYSREG_CONFIG0_O_OFFSET                        3
-#define SYSREG_CONFIG0_O_SIZE                  1
-#define SYSREG_CONFIG0_P_OFFSET                        4
-#define SYSREG_CONFIG0_P_SIZE                  1
-#define SYSREG_CONFIG0_J_OFFSET                        5
-#define SYSREG_CONFIG0_J_SIZE                  1
-#define SYSREG_CONFIG0_F_OFFSET                        6
-#define SYSREG_CONFIG0_F_SIZE                  1
-#define SYSREG_MMUT_OFFSET                     7
-#define SYSREG_MMUT_SIZE                       3
-#define SYSREG_AR_OFFSET                       10
-#define SYSREG_AR_SIZE                         3
-#define SYSREG_AT_OFFSET                       13
-#define SYSREG_AT_SIZE                         3
-#define SYSREG_PROCESSORREVISION_OFFSET                16
-#define SYSREG_PROCESSORREVISION_SIZE          8
-#define SYSREG_PROCESSORID_OFFSET              24
-#define SYSREG_PROCESSORID_SIZE                        8
-
-/* Bitfields in CONFIG1 */
-#define SYSREG_DASS_OFFSET                     0
-#define SYSREG_DASS_SIZE                       3
-#define SYSREG_DLSZ_OFFSET                     3
-#define SYSREG_DLSZ_SIZE                       3
-#define SYSREG_DSET_OFFSET                     6
-#define SYSREG_DSET_SIZE                       4
-#define SYSREG_IASS_OFFSET                     10
-#define SYSREG_IASS_SIZE                       3
-#define SYSREG_ILSZ_OFFSET                     13
-#define SYSREG_ILSZ_SIZE                       3
-#define SYSREG_ISET_OFFSET                     16
-#define SYSREG_ISET_SIZE                       4
-#define SYSREG_DMMUSZ_OFFSET                   20
-#define SYSREG_DMMUSZ_SIZE                     6
-#define SYSREG_IMMUSZ_OFFSET                   26
-#define SYSREG_IMMUSZ_SIZE                     6
-
-/* Bitfields in TLBEHI */
-#define SYSREG_ASID_OFFSET                     0
-#define SYSREG_ASID_SIZE                       8
-#define SYSREG_TLBEHI_I_OFFSET                 8
-#define SYSREG_TLBEHI_I_SIZE                   1
-#define SYSREG_TLBEHI_V_OFFSET                 9
-#define SYSREG_TLBEHI_V_SIZE                   1
-#define SYSREG_VPN_OFFSET                      10
-#define SYSREG_VPN_SIZE                                22
-
-/* Bitfields in TLBELO */
-#define SYSREG_W_OFFSET                                0
-#define SYSREG_W_SIZE                          1
-#define SYSREG_TLBELO_D_OFFSET                 1
-#define SYSREG_TLBELO_D_SIZE                   1
-#define SYSREG_SZ_OFFSET                       2
-#define SYSREG_SZ_SIZE                         2
-#define SYSREG_AP_OFFSET                       4
-#define SYSREG_AP_SIZE                         3
-#define SYSREG_B_OFFSET                                7
-#define SYSREG_B_SIZE                          1
-#define SYSREG_G_OFFSET                                8
-#define SYSREG_G_SIZE                          1
-#define SYSREG_TLBELO_C_OFFSET                 9
-#define SYSREG_TLBELO_C_SIZE                   1
-#define SYSREG_PFN_OFFSET                      10
-#define SYSREG_PFN_SIZE                                22
-
-/* Bitfields in MMUCR */
-#define SYSREG_E_OFFSET                                0
-#define SYSREG_E_SIZE                          1
-#define SYSREG_M_OFFSET                                1
-#define SYSREG_M_SIZE                          1
-#define SYSREG_MMUCR_I_OFFSET                  2
-#define SYSREG_MMUCR_I_SIZE                    1
-#define SYSREG_MMUCR_N_OFFSET                  3
-#define SYSREG_MMUCR_N_SIZE                    1
-#define SYSREG_MMUCR_S_OFFSET                  4
-#define SYSREG_MMUCR_S_SIZE                    1
-#define SYSREG_DLA_OFFSET                      8
-#define SYSREG_DLA_SIZE                                6
-#define SYSREG_DRP_OFFSET                      14
-#define SYSREG_DRP_SIZE                                6
-#define SYSREG_ILA_OFFSET                      20
-#define SYSREG_ILA_SIZE                                6
-#define SYSREG_IRP_OFFSET                      26
-#define SYSREG_IRP_SIZE                                6
-
-/* Bitfields in PCCR */
-#define SYSREG_PCCR_E_OFFSET                   0
-#define SYSREG_PCCR_E_SIZE                     1
-#define SYSREG_PCCR_R_OFFSET                   1
-#define SYSREG_PCCR_R_SIZE                     1
-#define SYSREG_PCCR_C_OFFSET                   2
-#define SYSREG_PCCR_C_SIZE                     1
-#define SYSREG_PCCR_S_OFFSET                   3
-#define SYSREG_PCCR_S_SIZE                     1
-#define SYSREG_IEC_OFFSET                      4
-#define SYSREG_IEC_SIZE                                1
-#define SYSREG_IE0_OFFSET                      5
-#define SYSREG_IE0_SIZE                                1
-#define SYSREG_IE1_OFFSET                      6
-#define SYSREG_IE1_SIZE                                1
-#define SYSREG_FC_OFFSET                       8
-#define SYSREG_FC_SIZE                         1
-#define SYSREG_F0_OFFSET                       9
-#define SYSREG_F0_SIZE                         1
-#define SYSREG_F1_OFFSET                       10
-#define SYSREG_F1_SIZE                         1
-#define SYSREG_CONF0_OFFSET                    12
-#define SYSREG_CONF0_SIZE                      6
-#define SYSREG_CONF1_OFFSET                    18
-#define SYSREG_CONF1_SIZE                      6
-
-/* Constants for ECR */
-#define ECR_UNRECOVERABLE                      0
-#define ECR_TLB_MULTIPLE                       1
-#define ECR_BUS_ERROR_WRITE                    2
-#define ECR_BUS_ERROR_READ                     3
-#define ECR_NMI                                        4
-#define ECR_ADDR_ALIGN_X                       5
-#define ECR_PROTECTION_X                       6
-#define ECR_DEBUG                              7
-#define ECR_ILLEGAL_OPCODE                     8
-#define ECR_UNIMPL_INSTRUCTION                 9
-#define ECR_PRIVILEGE_VIOLATION                        10
-#define ECR_FPE                                        11
-#define ECR_COPROC_ABSENT                      12
-#define ECR_ADDR_ALIGN_R                       13
-#define ECR_ADDR_ALIGN_W                       14
-#define ECR_PROTECTION_R                       15
-#define ECR_PROTECTION_W                       16
-#define ECR_DTLB_MODIFIED                      17
-#define ECR_TLB_MISS_X                         20
-#define ECR_TLB_MISS_R                         24
-#define ECR_TLB_MISS_W                         28
-
-/* Bit manipulation macros */
-#define SYSREG_BIT(name)                               \
-       (1 << SYSREG_##name##_OFFSET)
-#define SYSREG_BF(name,value)                          \
-       (((value) & ((1 << SYSREG_##name##_SIZE) - 1))  \
-        << SYSREG_##name##_OFFSET)
-#define SYSREG_BFEXT(name,value)\
-       (((value) >> SYSREG_##name##_OFFSET)            \
-        & ((1 << SYSREG_##name##_SIZE) - 1))
-#define SYSREG_BFINS(name,value,old)                   \
-       (((old) & ~(((1 << SYSREG_##name##_SIZE) - 1)   \
-                   << SYSREG_##name##_OFFSET))         \
-        | SYSREG_BF(name,value))
-
-/* Register access macros */
-#ifdef __CHECKER__
-extern unsigned long __builtin_mfsr(unsigned long reg);
-extern void __builtin_mtsr(unsigned long reg, unsigned long value);
-#endif
-
-#define sysreg_read(reg)               __builtin_mfsr(SYSREG_##reg)
-#define sysreg_write(reg, value)       __builtin_mtsr(SYSREG_##reg, value)
-
-#endif /* __ASM_AVR32_SYSREG_H */
diff --git a/arch/avr32/include/asm/termios.h b/arch/avr32/include/asm/termios.h
deleted file mode 100644 (file)
index 9d59437..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_TERMIOS_H
-#define __ASM_AVR32_TERMIOS_H
-
-#include <uapi/asm/termios.h>
-
-/*     intr=^C         quit=^\         erase=del       kill=^U
-       eof=^D          vtime=\0        vmin=\1         sxtc=\0
-       start=^Q        stop=^S         susp=^Z         eol=\0
-       reprint=^R      discard=^U      werase=^W       lnext=^V
-       eol2=\0
-*/
-#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0"
-
-#include <asm-generic/termios-base.h>
-
-#endif /* __ASM_AVR32_TERMIOS_H */
diff --git a/arch/avr32/include/asm/thread_info.h b/arch/avr32/include/asm/thread_info.h
deleted file mode 100644 (file)
index d4d3079..0000000
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_THREAD_INFO_H
-#define __ASM_AVR32_THREAD_INFO_H
-
-#include <asm/page.h>
-
-#define THREAD_SIZE_ORDER      1
-#define THREAD_SIZE            (PAGE_SIZE << THREAD_SIZE_ORDER)
-
-#ifndef __ASSEMBLY__
-#include <asm/types.h>
-
-struct task_struct;
-
-struct thread_info {
-       struct task_struct      *task;          /* main task structure */
-       unsigned long           flags;          /* low level flags */
-       __u32                   cpu;
-       __s32                   preempt_count;  /* 0 => preemptable, <0 => BUG */
-       __u32                   rar_saved;      /* return address... */
-       __u32                   rsr_saved;      /* ...and status register
-                                                  saved by debug handler
-                                                  when setting up
-                                                  trampoline */
-       __u8                    supervisor_stack[0];
-};
-
-#define INIT_THREAD_INFO(tsk)                                          \
-{                                                                      \
-       .task           = &tsk,                                         \
-       .flags          = 0,                                            \
-       .cpu            = 0,                                            \
-       .preempt_count  = INIT_PREEMPT_COUNT,                           \
-}
-
-#define init_thread_info       (init_thread_union.thread_info)
-#define init_stack             (init_thread_union.stack)
-
-/*
- * Get the thread information struct from C.
- * We do the usual trick and use the lower end of the stack for this
- */
-static inline struct thread_info *current_thread_info(void)
-{
-       unsigned long addr = ~(THREAD_SIZE - 1);
-
-       asm("and %0, sp" : "=r"(addr) : "0"(addr));
-       return (struct thread_info *)addr;
-}
-
-#define get_thread_info(ti) get_task_struct((ti)->task)
-#define put_thread_info(ti) put_task_struct((ti)->task)
-
-#endif /* !__ASSEMBLY__ */
-
-/*
- * Thread information flags
- * - these are process state flags that various assembly files may need to access
- * - pending work-to-be-done flags are in LSW
- * - other flags in MSW
- */
-#define TIF_SYSCALL_TRACE       0       /* syscall trace active */
-#define TIF_SIGPENDING          1       /* signal pending */
-#define TIF_NEED_RESCHED        2       /* rescheduling necessary */
-#define TIF_BREAKPOINT         4       /* enter monitor mode on return */
-#define TIF_SINGLE_STEP                5       /* single step in progress */
-#define TIF_MEMDIE             6       /* is terminating due to OOM killer */
-#define TIF_RESTORE_SIGMASK    7       /* restore signal mask in do_signal */
-#define TIF_CPU_GOING_TO_SLEEP 8       /* CPU is entering sleep 0 mode */
-#define TIF_NOTIFY_RESUME      9       /* callback before returning to user */
-#define TIF_DEBUG              30      /* debugging enabled */
-#define TIF_USERSPACE          31      /* true if FS sets userspace */
-
-#define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
-#define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
-#define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
-#define _TIF_BREAKPOINT                (1 << TIF_BREAKPOINT)
-#define _TIF_SINGLE_STEP       (1 << TIF_SINGLE_STEP)
-#define _TIF_MEMDIE            (1 << TIF_MEMDIE)
-#define _TIF_CPU_GOING_TO_SLEEP (1 << TIF_CPU_GOING_TO_SLEEP)
-#define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
-
-/* Note: The masks below must never span more than 16 bits! */
-
-/* work to do on interrupt/exception return */
-#define _TIF_WORK_MASK                         \
-       (_TIF_SIGPENDING                        \
-        | _TIF_NOTIFY_RESUME                   \
-        | _TIF_NEED_RESCHED                    \
-        | _TIF_BREAKPOINT)
-
-/* work to do on any return to userspace */
-#define _TIF_ALLWORK_MASK      (_TIF_WORK_MASK | _TIF_SYSCALL_TRACE)
-/* work to do on return from debug mode */
-#define _TIF_DBGWORK_MASK      (_TIF_WORK_MASK & ~_TIF_BREAKPOINT)
-
-#endif /* __ASM_AVR32_THREAD_INFO_H */
diff --git a/arch/avr32/include/asm/timex.h b/arch/avr32/include/asm/timex.h
deleted file mode 100644 (file)
index 187dcf3..0000000
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_TIMEX_H
-#define __ASM_AVR32_TIMEX_H
-
-/*
- * This is the frequency of the timer used for Linux's timer interrupt.
- * The value should be defined as accurate as possible or under certain
- * circumstances Linux timekeeping might become inaccurate or fail.
- *
- * For many system the exact clockrate of the timer isn't known but due to
- * the way this value is used we can get away with a wrong value as long
- * as this value is:
- *
- *  - a multiple of HZ
- *  - a divisor of the actual rate
- *
- * 500000 is a good such cheat value.
- *
- * The obscure number 1193182 is the same as used by the original i8254
- * time in legacy PC hardware; the chip is never found in AVR32 systems.
- */
-#define CLOCK_TICK_RATE                500000  /* Underlying HZ */
-
-typedef unsigned long cycles_t;
-
-static inline cycles_t get_cycles (void)
-{
-       return 0;
-}
-
-#define ARCH_HAS_READ_CURRENT_TIMER
-
-#endif /* __ASM_AVR32_TIMEX_H */
diff --git a/arch/avr32/include/asm/tlb.h b/arch/avr32/include/asm/tlb.h
deleted file mode 100644 (file)
index 5c55f9c..0000000
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_TLB_H
-#define __ASM_AVR32_TLB_H
-
-#define tlb_start_vma(tlb, vma) \
-       flush_cache_range(vma, vma->vm_start, vma->vm_end)
-
-#define tlb_end_vma(tlb, vma) \
-       flush_tlb_range(vma, vma->vm_start, vma->vm_end)
-
-#define __tlb_remove_tlb_entry(tlb, pte, address) do { } while(0)
-
-/*
- * Flush whole TLB for MM
- */
-#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
-
-#include <asm-generic/tlb.h>
-
-/*
- * For debugging purposes
- */
-extern void show_dtlb_entry(unsigned int index);
-extern void dump_dtlb(void);
-
-#endif /* __ASM_AVR32_TLB_H */
diff --git a/arch/avr32/include/asm/tlbflush.h b/arch/avr32/include/asm/tlbflush.h
deleted file mode 100644 (file)
index bf90a78..0000000
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_TLBFLUSH_H
-#define __ASM_AVR32_TLBFLUSH_H
-
-#include <asm/mmu.h>
-
-/*
- * TLB flushing:
- *
- *  - flush_tlb() flushes the current mm struct TLBs
- *  - flush_tlb_all() flushes all processes' TLB entries
- *  - flush_tlb_mm(mm) flushes the specified mm context TLBs
- *  - flush_tlb_page(vma, vmaddr) flushes one page
- *  - flush_tlb_range(vma, start, end) flushes a range of pages
- *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
- */
-extern void flush_tlb(void);
-extern void flush_tlb_all(void);
-extern void flush_tlb_mm(struct mm_struct *mm);
-extern void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
-                           unsigned long end);
-extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long page);
-
-extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
-
-#endif /* __ASM_AVR32_TLBFLUSH_H */
diff --git a/arch/avr32/include/asm/traps.h b/arch/avr32/include/asm/traps.h
deleted file mode 100644 (file)
index 6a8fb94..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_TRAPS_H
-#define __ASM_AVR32_TRAPS_H
-
-#include <linux/list.h>
-
-struct undef_hook {
-       struct list_head node;
-       u32 insn_mask;
-       u32 insn_val;
-       int (*fn)(struct pt_regs *regs, u32 insn);
-};
-
-void register_undef_hook(struct undef_hook *hook);
-void unregister_undef_hook(struct undef_hook *hook);
-
-#endif /* __ASM_AVR32_TRAPS_H */
diff --git a/arch/avr32/include/asm/types.h b/arch/avr32/include/asm/types.h
deleted file mode 100644 (file)
index 5932405..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_TYPES_H
-#define __ASM_AVR32_TYPES_H
-
-#include <uapi/asm/types.h>
-
-/*
- * These aren't exported outside the kernel to avoid name space clashes
- */
-
-#define BITS_PER_LONG 32
-
-#endif /* __ASM_AVR32_TYPES_H */
diff --git a/arch/avr32/include/asm/uaccess.h b/arch/avr32/include/asm/uaccess.h
deleted file mode 100644 (file)
index b1ec1fa..0000000
+++ /dev/null
@@ -1,337 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_UACCESS_H
-#define __ASM_AVR32_UACCESS_H
-
-#include <linux/errno.h>
-#include <linux/sched.h>
-
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
-typedef struct {
-       unsigned int is_user_space;
-} mm_segment_t;
-
-/*
- * The fs value determines whether argument validity checking should be
- * performed or not.  If get_fs() == USER_DS, checking is performed, with
- * get_fs() == KERNEL_DS, checking is bypassed.
- *
- * For historical reasons (Data Segment Register?), these macros are misnamed.
- */
-#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
-#define segment_eq(a, b)       ((a).is_user_space == (b).is_user_space)
-
-#define USER_ADDR_LIMIT 0x80000000
-
-#define KERNEL_DS      MAKE_MM_SEG(0)
-#define USER_DS                MAKE_MM_SEG(1)
-
-#define get_ds()       (KERNEL_DS)
-
-static inline mm_segment_t get_fs(void)
-{
-       return MAKE_MM_SEG(test_thread_flag(TIF_USERSPACE));
-}
-
-static inline void set_fs(mm_segment_t s)
-{
-       if (s.is_user_space)
-               set_thread_flag(TIF_USERSPACE);
-       else
-               clear_thread_flag(TIF_USERSPACE);
-}
-
-/*
- * Test whether a block of memory is a valid user space address.
- * Returns 0 if the range is valid, nonzero otherwise.
- *
- * We do the following checks:
- *   1. Is the access from kernel space?
- *   2. Does (addr + size) set the carry bit?
- *   3. Is (addr + size) a negative number (i.e. >= 0x80000000)?
- *
- * If yes on the first check, access is granted.
- * If no on any of the others, access is denied.
- */
-#define __range_ok(addr, size)                                         \
-       (test_thread_flag(TIF_USERSPACE)                                \
-        && (((unsigned long)(addr) >= 0x80000000)                      \
-            || ((unsigned long)(size) > 0x80000000)                    \
-            || (((unsigned long)(addr) + (unsigned long)(size)) > 0x80000000)))
-
-#define access_ok(type, addr, size) (likely(__range_ok(addr, size) == 0))
-
-/* Generic arbitrary sized copy. Return the number of bytes NOT copied */
-extern __kernel_size_t __copy_user(void *to, const void *from,
-                                  __kernel_size_t n);
-
-extern __kernel_size_t copy_to_user(void __user *to, const void *from,
-                                   __kernel_size_t n);
-extern __kernel_size_t ___copy_from_user(void *to, const void __user *from,
-                                     __kernel_size_t n);
-
-static inline __kernel_size_t __copy_to_user(void __user *to, const void *from,
-                                            __kernel_size_t n)
-{
-       return __copy_user((void __force *)to, from, n);
-}
-static inline __kernel_size_t __copy_from_user(void *to,
-                                              const void __user *from,
-                                              __kernel_size_t n)
-{
-       return __copy_user(to, (const void __force *)from, n);
-}
-static inline __kernel_size_t copy_from_user(void *to,
-                                              const void __user *from,
-                                              __kernel_size_t n)
-{
-       size_t res = ___copy_from_user(to, from, n);
-       if (unlikely(res))
-               memset(to + (n - res), 0, res);
-       return res;
-}
-
-#define __copy_to_user_inatomic __copy_to_user
-#define __copy_from_user_inatomic __copy_from_user
-
-/*
- * put_user: - Write a simple value into user space.
- * @x:   Value to copy to user space.
- * @ptr: Destination address, in user space.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * This macro copies a single simple value from kernel space to user
- * space.  It supports simple types like char and int, but not larger
- * data types like structures or arrays.
- *
- * @ptr must have pointer-to-simple-variable type, and @x must be assignable
- * to the result of dereferencing @ptr.
- *
- * Returns zero on success, or -EFAULT on error.
- */
-#define put_user(x, ptr)       \
-       __put_user_check((x), (ptr), sizeof(*(ptr)))
-
-/*
- * get_user: - Get a simple variable from user space.
- * @x:   Variable to store result.
- * @ptr: Source address, in user space.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * This macro copies a single simple variable from user space to kernel
- * space.  It supports simple types like char and int, but not larger
- * data types like structures or arrays.
- *
- * @ptr must have pointer-to-simple-variable type, and the result of
- * dereferencing @ptr must be assignable to @x without a cast.
- *
- * Returns zero on success, or -EFAULT on error.
- * On error, the variable @x is set to zero.
- */
-#define get_user(x, ptr) \
-       __get_user_check((x), (ptr), sizeof(*(ptr)))
-
-/*
- * __put_user: - Write a simple value into user space, with less checking.
- * @x:   Value to copy to user space.
- * @ptr: Destination address, in user space.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * This macro copies a single simple value from kernel space to user
- * space.  It supports simple types like char and int, but not larger
- * data types like structures or arrays.
- *
- * @ptr must have pointer-to-simple-variable type, and @x must be assignable
- * to the result of dereferencing @ptr.
- *
- * Caller must check the pointer with access_ok() before calling this
- * function.
- *
- * Returns zero on success, or -EFAULT on error.
- */
-#define __put_user(x, ptr) \
-       __put_user_nocheck((x), (ptr), sizeof(*(ptr)))
-
-/*
- * __get_user: - Get a simple variable from user space, with less checking.
- * @x:   Variable to store result.
- * @ptr: Source address, in user space.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * This macro copies a single simple variable from user space to kernel
- * space.  It supports simple types like char and int, but not larger
- * data types like structures or arrays.
- *
- * @ptr must have pointer-to-simple-variable type, and the result of
- * dereferencing @ptr must be assignable to @x without a cast.
- *
- * Caller must check the pointer with access_ok() before calling this
- * function.
- *
- * Returns zero on success, or -EFAULT on error.
- * On error, the variable @x is set to zero.
- */
-#define __get_user(x, ptr) \
-       __get_user_nocheck((x), (ptr), sizeof(*(ptr)))
-
-extern int __get_user_bad(void);
-extern int __put_user_bad(void);
-
-#define __get_user_nocheck(x, ptr, size)                               \
-({                                                                     \
-       unsigned long __gu_val = 0;                                     \
-       int __gu_err = 0;                                               \
-                                                                       \
-       switch (size) {                                                 \
-       case 1: __get_user_asm("ub", __gu_val, ptr, __gu_err); break;   \
-       case 2: __get_user_asm("uh", __gu_val, ptr, __gu_err); break;   \
-       case 4: __get_user_asm("w", __gu_val, ptr, __gu_err); break;    \
-       default: __gu_err = __get_user_bad(); break;                    \
-       }                                                               \
-                                                                       \
-       x = (__force typeof(*(ptr)))__gu_val;                           \
-       __gu_err;                                                       \
-})
-
-#define __get_user_check(x, ptr, size)                                 \
-({                                                                     \
-       unsigned long __gu_val = 0;                                     \
-       const typeof(*(ptr)) __user * __gu_addr = (ptr);                \
-       int __gu_err = 0;                                               \
-                                                                       \
-       if (access_ok(VERIFY_READ, __gu_addr, size)) {                  \
-               switch (size) {                                         \
-               case 1:                                                 \
-                       __get_user_asm("ub", __gu_val, __gu_addr,       \
-                                      __gu_err);                       \
-                       break;                                          \
-               case 2:                                                 \
-                       __get_user_asm("uh", __gu_val, __gu_addr,       \
-                                      __gu_err);                       \
-                       break;                                          \
-               case 4:                                                 \
-                       __get_user_asm("w", __gu_val, __gu_addr,        \
-                                      __gu_err);                       \
-                       break;                                          \
-               default:                                                \
-                       __gu_err = __get_user_bad();                    \
-                       break;                                          \
-               }                                                       \
-       } else {                                                        \
-               __gu_err = -EFAULT;                                     \
-       }                                                               \
-       x = (__force typeof(*(ptr)))__gu_val;                           \
-       __gu_err;                                                       \
-})
-
-#define __get_user_asm(suffix, __gu_val, ptr, __gu_err)                        \
-       asm volatile(                                                   \
-               "1:     ld." suffix "   %1, %3                  \n"     \
-               "2:                                             \n"     \
-               "       .subsection 1                           \n"     \
-               "3:     mov     %0, %4                          \n"     \
-               "       rjmp    2b                              \n"     \
-               "       .subsection 0                           \n"     \
-               "       .section __ex_table, \"a\"              \n"     \
-               "       .long   1b, 3b                          \n"     \
-               "       .previous                               \n"     \
-               : "=r"(__gu_err), "=r"(__gu_val)                        \
-               : "0"(__gu_err), "m"(*(ptr)), "i"(-EFAULT))
-
-#define __put_user_nocheck(x, ptr, size)                               \
-({                                                                     \
-       typeof(*(ptr)) __pu_val;                                        \
-       int __pu_err = 0;                                               \
-                                                                       \
-       __pu_val = (x);                                                 \
-       switch (size) {                                                 \
-       case 1: __put_user_asm("b", ptr, __pu_val, __pu_err); break;    \
-       case 2: __put_user_asm("h", ptr, __pu_val, __pu_err); break;    \
-       case 4: __put_user_asm("w", ptr, __pu_val, __pu_err); break;    \
-       case 8: __put_user_asm("d", ptr, __pu_val, __pu_err); break;    \
-       default: __pu_err = __put_user_bad(); break;                    \
-       }                                                               \
-       __pu_err;                                                       \
-})
-
-#define __put_user_check(x, ptr, size)                                 \
-({                                                                     \
-       typeof(*(ptr)) __pu_val;                                        \
-       typeof(*(ptr)) __user *__pu_addr = (ptr);                       \
-       int __pu_err = 0;                                               \
-                                                                       \
-       __pu_val = (x);                                                 \
-       if (access_ok(VERIFY_WRITE, __pu_addr, size)) {                 \
-               switch (size) {                                         \
-               case 1:                                                 \
-                       __put_user_asm("b", __pu_addr, __pu_val,        \
-                                      __pu_err);                       \
-                       break;                                          \
-               case 2:                                                 \
-                       __put_user_asm("h", __pu_addr, __pu_val,        \
-                                      __pu_err);                       \
-                       break;                                          \
-               case 4:                                                 \
-                       __put_user_asm("w", __pu_addr, __pu_val,        \
-                                      __pu_err);                       \
-                       break;                                          \
-               case 8:                                                 \
-                       __put_user_asm("d", __pu_addr, __pu_val,        \
-                                      __pu_err);                       \
-                       break;                                          \
-               default:                                                \
-                       __pu_err = __put_user_bad();                    \
-                       break;                                          \
-               }                                                       \
-       } else {                                                        \
-               __pu_err = -EFAULT;                                     \
-       }                                                               \
-       __pu_err;                                                       \
-})
-
-#define __put_user_asm(suffix, ptr, __pu_val, __gu_err)                        \
-       asm volatile(                                                   \
-               "1:     st." suffix "   %1, %3                  \n"     \
-               "2:                                             \n"     \
-               "       .subsection 1                           \n"     \
-               "3:     mov     %0, %4                          \n"     \
-               "       rjmp    2b                              \n"     \
-               "       .subsection 0                           \n"     \
-               "       .section __ex_table, \"a\"              \n"     \
-               "       .long   1b, 3b                          \n"     \
-               "       .previous                               \n"     \
-               : "=r"(__gu_err), "=m"(*(ptr))                          \
-               : "0"(__gu_err), "r"(__pu_val), "i"(-EFAULT))
-
-extern __kernel_size_t clear_user(void __user *addr, __kernel_size_t size);
-extern __kernel_size_t __clear_user(void __user *addr, __kernel_size_t size);
-
-extern long strncpy_from_user(char *dst, const char __user *src, long count);
-extern long __strncpy_from_user(char *dst, const char __user *src, long count);
-
-extern long strnlen_user(const char __user *__s, long __n);
-extern long __strnlen_user(const char __user *__s, long __n);
-
-#define strlen_user(s) strnlen_user(s, ~0UL >> 1)
-
-struct exception_table_entry
-{
-       unsigned long insn, fixup;
-};
-
-#endif /* __ASM_AVR32_UACCESS_H */
diff --git a/arch/avr32/include/asm/ucontext.h b/arch/avr32/include/asm/ucontext.h
deleted file mode 100644 (file)
index ac7259c..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef __ASM_AVR32_UCONTEXT_H
-#define __ASM_AVR32_UCONTEXT_H
-
-struct ucontext {
-       unsigned long           uc_flags;
-       struct ucontext *       uc_link;
-       stack_t                 uc_stack;
-       struct sigcontext       uc_mcontext;
-       sigset_t                uc_sigmask;
-};
-
-#endif /* __ASM_AVR32_UCONTEXT_H */
diff --git a/arch/avr32/include/asm/unaligned.h b/arch/avr32/include/asm/unaligned.h
deleted file mode 100644 (file)
index 0418772..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef _ASM_AVR32_UNALIGNED_H
-#define _ASM_AVR32_UNALIGNED_H
-
-/*
- * AVR32 can handle some unaligned accesses, depending on the
- * implementation.  The AVR32 AP implementation can handle unaligned
- * words, but halfwords must be halfword-aligned, and doublewords must
- * be word-aligned.
- *
- * However, swapped word loads must be word-aligned so we can't
- * optimize word loads in general.
- */
-
-#include <linux/unaligned/be_struct.h>
-#include <linux/unaligned/le_byteshift.h>
-#include <linux/unaligned/generic.h>
-
-#define get_unaligned  __get_unaligned_be
-#define put_unaligned  __put_unaligned_be
-
-#endif /* _ASM_AVR32_UNALIGNED_H */
diff --git a/arch/avr32/include/asm/unistd.h b/arch/avr32/include/asm/unistd.h
deleted file mode 100644 (file)
index 2011bee..0000000
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_UNISTD_H
-#define __ASM_AVR32_UNISTD_H
-
-#include <uapi/asm/unistd.h>
-
-#define NR_syscalls            321
-
-/* Old stuff */
-#define __IGNORE_uselib
-#define __IGNORE_mmap
-
-/* NUMA stuff */
-#define __IGNORE_mbind
-#define __IGNORE_get_mempolicy
-#define __IGNORE_set_mempolicy
-#define __IGNORE_migrate_pages
-#define __IGNORE_move_pages
-
-/* SMP stuff */
-#define __IGNORE_getcpu
-
-#define __ARCH_WANT_STAT64
-#define __ARCH_WANT_SYS_ALARM
-#define __ARCH_WANT_SYS_GETHOSTNAME
-#define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_TIME
-#define __ARCH_WANT_SYS_UTIME
-#define __ARCH_WANT_SYS_WAITPID
-#define __ARCH_WANT_SYS_FADVISE64
-#define __ARCH_WANT_SYS_GETPGRP
-#define __ARCH_WANT_SYS_LLSEEK
-#define __ARCH_WANT_SYS_GETPGRP
-#define __ARCH_WANT_SYS_FORK
-#define __ARCH_WANT_SYS_VFORK
-#define __ARCH_WANT_SYS_CLONE
-
-#endif /* __ASM_AVR32_UNISTD_H */
diff --git a/arch/avr32/include/asm/user.h b/arch/avr32/include/asm/user.h
deleted file mode 100644 (file)
index 7e9152f..0000000
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Note: We may not need these definitions for AVR32, as we don't
- * support a.out.
- */
-#ifndef __ASM_AVR32_USER_H
-#define __ASM_AVR32_USER_H
-
-#include <linux/types.h>
-#include <asm/ptrace.h>
-#include <asm/page.h>
-
-/*
- * Core file format: The core file is written in such a way that gdb
- * can understand it and provide useful information to the user (under
- * linux we use the `trad-core' bfd).  The file contents are as follows:
- *
- *  upage: 1 page consisting of a user struct that tells gdb
- *     what is present in the file.  Directly after this is a
- *     copy of the task_struct, which is currently not used by gdb,
- *     but it may come in handy at some point.  All of the registers
- *     are stored as part of the upage.  The upage should always be
- *     only one page long.
- *  data: The data segment follows next.  We use current->end_text to
- *     current->brk to pick up all of the user variables, plus any memory
- *     that may have been sbrk'ed.  No attempt is made to determine if a
- *     page is demand-zero or if a page is totally unused, we just cover
- *     the entire range.  All of the addresses are rounded in such a way
- *     that an integral number of pages is written.
- *  stack: We need the stack information in order to get a meaningful
- *     backtrace.  We need to write the data from usp to
- *     current->start_stack, so we round each of these in order to be able
- *     to write an integer number of pages.
- */
-
-struct user_fpu_struct {
-       /* We have no FPU (yet) */
-};
-
-struct user {
-       struct pt_regs  regs;                   /* entire machine state */
-       size_t          u_tsize;                /* text size (pages) */
-       size_t          u_dsize;                /* data size (pages) */
-       size_t          u_ssize;                /* stack size (pages) */
-       unsigned long   start_code;             /* text starting address */
-       unsigned long   start_data;             /* data starting address */
-       unsigned long   start_stack;            /* stack starting address */
-       long int        signal;                 /* signal causing core dump */
-       unsigned long   u_ar0;                  /* help gdb find registers */
-       unsigned long   magic;                  /* identifies a core file */
-       char            u_comm[32];             /* user command name */
-};
-
-#define NBPG                   PAGE_SIZE
-#define UPAGES                 1
-#define HOST_TEXT_START_ADDR   (u.start_code)
-#define HOST_DATA_START_ADDR   (u.start_data)
-#define HOST_STACK_END_ADDR    (u.start_stack + u.u_ssize * NBPG)
-
-#endif /* __ASM_AVR32_USER_H */
diff --git a/arch/avr32/include/uapi/asm/Kbuild b/arch/avr32/include/uapi/asm/Kbuild
deleted file mode 100644 (file)
index 08d8a3d..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-# UAPI Header export list
-include include/uapi/asm-generic/Kbuild.asm
-
-header-y += auxvec.h
-header-y += byteorder.h
-header-y += cachectl.h
-header-y += msgbuf.h
-header-y += param.h
-header-y += posix_types.h
-header-y += ptrace.h
-header-y += sembuf.h
-header-y += setup.h
-header-y += shmbuf.h
-header-y += sigcontext.h
-header-y += signal.h
-header-y += socket.h
-header-y += sockios.h
-header-y += stat.h
-header-y += swab.h
-header-y += termbits.h
-header-y += termios.h
-header-y += types.h
-header-y += unistd.h
-generic-y += bitsperlong.h
-generic-y += errno.h
-generic-y += fcntl.h
-generic-y += ioctl.h
-generic-y += ioctls.h
-generic-y += ipcbuf.h
-generic-y += kvm_para.h
-generic-y += mman.h
-generic-y += param.h
-generic-y += poll.h
-generic-y += resource.h
-generic-y += siginfo.h
-generic-y += statfs.h
diff --git a/arch/avr32/include/uapi/asm/auxvec.h b/arch/avr32/include/uapi/asm/auxvec.h
deleted file mode 100644 (file)
index 4f02da3..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef _UAPI__ASM_AVR32_AUXVEC_H
-#define _UAPI__ASM_AVR32_AUXVEC_H
-
-#endif /* _UAPI__ASM_AVR32_AUXVEC_H */
diff --git a/arch/avr32/include/uapi/asm/byteorder.h b/arch/avr32/include/uapi/asm/byteorder.h
deleted file mode 100644 (file)
index 71242f0..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- * AVR32 endian-conversion functions.
- */
-#ifndef _UAPI__ASM_AVR32_BYTEORDER_H
-#define _UAPI__ASM_AVR32_BYTEORDER_H
-
-#include <linux/byteorder/big_endian.h>
-
-#endif /* _UAPI__ASM_AVR32_BYTEORDER_H */
diff --git a/arch/avr32/include/uapi/asm/cachectl.h b/arch/avr32/include/uapi/asm/cachectl.h
deleted file mode 100644 (file)
index 573a958..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef _UAPI__ASM_AVR32_CACHECTL_H
-#define _UAPI__ASM_AVR32_CACHECTL_H
-
-/*
- * Operations that can be performed through the cacheflush system call
- */
-
-/* Clean the data cache, then invalidate the icache */
-#define CACHE_IFLUSH   0
-
-#endif /* _UAPI__ASM_AVR32_CACHECTL_H */
diff --git a/arch/avr32/include/uapi/asm/msgbuf.h b/arch/avr32/include/uapi/asm/msgbuf.h
deleted file mode 100644 (file)
index 9eae6ef..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef _UAPI__ASM_AVR32_MSGBUF_H
-#define _UAPI__ASM_AVR32_MSGBUF_H
-
-/*
- * The msqid64_ds structure for i386 architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
- * - 2 miscellaneous 32-bit values
- */
-
-struct msqid64_ds {
-       struct ipc64_perm msg_perm;
-       __kernel_time_t msg_stime;      /* last msgsnd time */
-       unsigned long   __unused1;
-       __kernel_time_t msg_rtime;      /* last msgrcv time */
-       unsigned long   __unused2;
-       __kernel_time_t msg_ctime;      /* last change time */
-       unsigned long   __unused3;
-       unsigned long  msg_cbytes;      /* current number of bytes on queue */
-       unsigned long  msg_qnum;        /* number of messages in queue */
-       unsigned long  msg_qbytes;      /* max number of bytes on queue */
-       __kernel_pid_t msg_lspid;       /* pid of last msgsnd */
-       __kernel_pid_t msg_lrpid;       /* last receive pid */
-       unsigned long  __unused4;
-       unsigned long  __unused5;
-};
-
-#endif /* _UAPI__ASM_AVR32_MSGBUF_H */
diff --git a/arch/avr32/include/uapi/asm/posix_types.h b/arch/avr32/include/uapi/asm/posix_types.h
deleted file mode 100644 (file)
index 5b813a8..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef _UAPI__ASM_AVR32_POSIX_TYPES_H
-#define _UAPI__ASM_AVR32_POSIX_TYPES_H
-
-/*
- * This file is generally used by user-level software, so you need to
- * be a little careful about namespace pollution etc.  Also, we cannot
- * assume GCC is being used.
- */
-
-typedef unsigned short  __kernel_mode_t;
-#define __kernel_mode_t __kernel_mode_t
-
-typedef unsigned short  __kernel_ipc_pid_t;
-#define __kernel_ipc_pid_t __kernel_ipc_pid_t
-
-typedef unsigned long  __kernel_size_t;
-typedef long           __kernel_ssize_t;
-typedef int             __kernel_ptrdiff_t;
-#define __kernel_size_t __kernel_size_t
-
-typedef unsigned short  __kernel_old_uid_t;
-typedef unsigned short  __kernel_old_gid_t;
-#define __kernel_old_uid_t __kernel_old_uid_t
-
-typedef unsigned short  __kernel_old_dev_t;
-#define __kernel_old_dev_t __kernel_old_dev_t
-
-#include <asm-generic/posix_types.h>
-
-#endif /* _UAPI__ASM_AVR32_POSIX_TYPES_H */
diff --git a/arch/avr32/include/uapi/asm/ptrace.h b/arch/avr32/include/uapi/asm/ptrace.h
deleted file mode 100644 (file)
index fe8c162..0000000
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef _UAPI__ASM_AVR32_PTRACE_H
-#define _UAPI__ASM_AVR32_PTRACE_H
-
-#define PTRACE_GETREGS         12
-#define PTRACE_SETREGS         13
-
-/*
- * Status Register bits
- */
-#define SR_H           0x20000000
-#define SR_J           0x10000000
-#define SR_DM          0x08000000
-#define SR_D           0x04000000
-#define MODE_NMI       0x01c00000
-#define MODE_EXCEPTION 0x01800000
-#define MODE_INT3      0x01400000
-#define MODE_INT2      0x01000000
-#define MODE_INT1      0x00c00000
-#define MODE_INT0      0x00800000
-#define MODE_SUPERVISOR        0x00400000
-#define MODE_USER      0x00000000
-#define MODE_MASK      0x01c00000
-#define SR_EM          0x00200000
-#define SR_I3M         0x00100000
-#define SR_I2M         0x00080000
-#define SR_I1M         0x00040000
-#define SR_I0M         0x00020000
-#define SR_GM          0x00010000
-
-#define SR_H_BIT       29
-#define SR_J_BIT       28
-#define SR_DM_BIT      27
-#define SR_D_BIT       26
-#define MODE_SHIFT     22
-#define SR_EM_BIT      21
-#define SR_I3M_BIT     20
-#define SR_I2M_BIT     19
-#define SR_I1M_BIT     18
-#define SR_I0M_BIT     17
-#define SR_GM_BIT      16
-
-/* The user-visible part */
-#define SR_L           0x00000020
-#define SR_Q           0x00000010
-#define SR_V           0x00000008
-#define SR_N           0x00000004
-#define SR_Z           0x00000002
-#define SR_C           0x00000001
-
-#define SR_L_BIT       5
-#define SR_Q_BIT       4
-#define SR_V_BIT       3
-#define SR_N_BIT       2
-#define SR_Z_BIT       1
-#define SR_C_BIT       0
-
-/*
- * The order is defined by the stmts instruction. r0 is stored first,
- * so it gets the highest address.
- *
- * Registers 0-12 are general-purpose registers (r12 is normally used for
- * the function return value).
- * Register 13 is the stack pointer
- * Register 14 is the link register
- * Register 15 is the program counter (retrieved from the RAR sysreg)
- */
-#define FRAME_SIZE_FULL 72
-#define REG_R12_ORIG   68
-#define REG_R0         64
-#define REG_R1         60
-#define REG_R2         56
-#define REG_R3         52
-#define REG_R4         48
-#define REG_R5         44
-#define REG_R6         40
-#define REG_R7         36
-#define REG_R8         32
-#define REG_R9         28
-#define REG_R10                24
-#define REG_R11                20
-#define REG_R12                16
-#define REG_SP         12
-#define REG_LR          8
-
-#define FRAME_SIZE_MIN  8
-#define REG_PC          4
-#define REG_SR          0
-
-#ifndef __ASSEMBLY__
-struct pt_regs {
-       /* These are always saved */
-       unsigned long sr;
-       unsigned long pc;
-
-       /* These are sometimes saved */
-       unsigned long lr;
-       unsigned long sp;
-       unsigned long r12;
-       unsigned long r11;
-       unsigned long r10;
-       unsigned long r9;
-       unsigned long r8;
-       unsigned long r7;
-       unsigned long r6;
-       unsigned long r5;
-       unsigned long r4;
-       unsigned long r3;
-       unsigned long r2;
-       unsigned long r1;
-       unsigned long r0;
-
-       /* Only saved on system call */
-       unsigned long r12_orig;
-};
-
-
-#endif /* ! __ASSEMBLY__ */
-
-#endif /* _UAPI__ASM_AVR32_PTRACE_H */
diff --git a/arch/avr32/include/uapi/asm/sembuf.h b/arch/avr32/include/uapi/asm/sembuf.h
deleted file mode 100644 (file)
index 6c6f7cf..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef _UAPI__ASM_AVR32_SEMBUF_H
-#define _UAPI__ASM_AVR32_SEMBUF_H
-
-/*
-* The semid64_ds structure for AVR32 architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
- * - 2 miscellaneous 32-bit values
- */
-
-struct semid64_ds {
-        struct ipc64_perm sem_perm;             /* permissions .. see ipc.h */
-        __kernel_time_t sem_otime;              /* last semop time */
-        unsigned long   __unused1;
-        __kernel_time_t sem_ctime;              /* last change time */
-        unsigned long   __unused2;
-        unsigned long   sem_nsems;              /* no. of semaphores in array */
-        unsigned long   __unused3;
-        unsigned long   __unused4;
-};
-
-#endif /* _UAPI__ASM_AVR32_SEMBUF_H */
diff --git a/arch/avr32/include/uapi/asm/setup.h b/arch/avr32/include/uapi/asm/setup.h
deleted file mode 100644 (file)
index a654df7..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * Based on linux/include/asm-arm/setup.h
- *   Copyright (C) 1997-1999 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef _UAPI__ASM_AVR32_SETUP_H__
-#define _UAPI__ASM_AVR32_SETUP_H__
-
-#define COMMAND_LINE_SIZE 256
-
-#endif /* _UAPI__ASM_AVR32_SETUP_H__ */
diff --git a/arch/avr32/include/uapi/asm/shmbuf.h b/arch/avr32/include/uapi/asm/shmbuf.h
deleted file mode 100644 (file)
index b94cf8b..0000000
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef _UAPI__ASM_AVR32_SHMBUF_H
-#define _UAPI__ASM_AVR32_SHMBUF_H
-
-/*
- * The shmid64_ds structure for i386 architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
- * - 2 miscellaneous 32-bit values
- */
-
-struct shmid64_ds {
-       struct ipc64_perm       shm_perm;       /* operation perms */
-       size_t                  shm_segsz;      /* size of segment (bytes) */
-       __kernel_time_t         shm_atime;      /* last attach time */
-       unsigned long           __unused1;
-       __kernel_time_t         shm_dtime;      /* last detach time */
-       unsigned long           __unused2;
-       __kernel_time_t         shm_ctime;      /* last change time */
-       unsigned long           __unused3;
-       __kernel_pid_t          shm_cpid;       /* pid of creator */
-       __kernel_pid_t          shm_lpid;       /* pid of last operator */
-       unsigned long           shm_nattch;     /* no. of current attaches */
-       unsigned long           __unused4;
-       unsigned long           __unused5;
-};
-
-struct shminfo64 {
-       unsigned long   shmmax;
-       unsigned long   shmmin;
-       unsigned long   shmmni;
-       unsigned long   shmseg;
-       unsigned long   shmall;
-       unsigned long   __unused1;
-       unsigned long   __unused2;
-       unsigned long   __unused3;
-       unsigned long   __unused4;
-};
-
-#endif /* _UAPI__ASM_AVR32_SHMBUF_H */
diff --git a/arch/avr32/include/uapi/asm/sigcontext.h b/arch/avr32/include/uapi/asm/sigcontext.h
deleted file mode 100644 (file)
index 27e56bf..0000000
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef _UAPI__ASM_AVR32_SIGCONTEXT_H
-#define _UAPI__ASM_AVR32_SIGCONTEXT_H
-
-struct sigcontext {
-       unsigned long   oldmask;
-
-       /* CPU registers */
-       unsigned long   sr;
-       unsigned long   pc;
-       unsigned long   lr;
-       unsigned long   sp;
-       unsigned long   r12;
-       unsigned long   r11;
-       unsigned long   r10;
-       unsigned long   r9;
-       unsigned long   r8;
-       unsigned long   r7;
-       unsigned long   r6;
-       unsigned long   r5;
-       unsigned long   r4;
-       unsigned long   r3;
-       unsigned long   r2;
-       unsigned long   r1;
-       unsigned long   r0;
-};
-
-#endif /* _UAPI__ASM_AVR32_SIGCONTEXT_H */
diff --git a/arch/avr32/include/uapi/asm/signal.h b/arch/avr32/include/uapi/asm/signal.h
deleted file mode 100644 (file)
index ffe8c77..0000000
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef _UAPI__ASM_AVR32_SIGNAL_H
-#define _UAPI__ASM_AVR32_SIGNAL_H
-
-#include <linux/types.h>
-
-/* Avoid too many header ordering problems.  */
-struct siginfo;
-
-#ifndef __KERNEL__
-/* Here we must cater to libcs that poke about in kernel headers.  */
-
-#define NSIG           32
-typedef unsigned long sigset_t;
-
-#endif /* __KERNEL__ */
-
-#define SIGHUP          1
-#define SIGINT          2
-#define SIGQUIT                 3
-#define SIGILL          4
-#define SIGTRAP                 5
-#define SIGABRT                 6
-#define SIGIOT          6
-#define SIGBUS          7
-#define SIGFPE          8
-#define SIGKILL                 9
-#define SIGUSR1                10
-#define SIGSEGV                11
-#define SIGUSR2                12
-#define SIGPIPE                13
-#define SIGALRM                14
-#define SIGTERM                15
-#define SIGSTKFLT      16
-#define SIGCHLD                17
-#define SIGCONT                18
-#define SIGSTOP                19
-#define SIGTSTP                20
-#define SIGTTIN                21
-#define SIGTTOU                22
-#define SIGURG         23
-#define SIGXCPU                24
-#define SIGXFSZ                25
-#define SIGVTALRM      26
-#define SIGPROF                27
-#define SIGWINCH       28
-#define SIGIO          29
-#define SIGPOLL                SIGIO
-/*
-#define SIGLOST                29
-*/
-#define SIGPWR         30
-#define SIGSYS         31
-#define        SIGUNUSED       31
-
-/* These should not be considered constants from userland.  */
-#define SIGRTMIN       32
-#define SIGRTMAX       (_NSIG-1)
-
-/*
- * SA_FLAGS values:
- *
- * SA_NOCLDSTOP                flag to turn off SIGCHLD when children stop.
- * SA_NOCLDWAIT                flag on SIGCHLD to inhibit zombies.
- * SA_SIGINFO          deliver the signal with SIGINFO structs
- * SA_ONSTACK          indicates that a registered stack_t will be used.
- * SA_RESTART          flag to get restarting signals (which were the default long ago)
- * SA_NODEFER          prevents the current signal from being masked in the handler.
- * SA_RESETHAND                clears the handler when the signal is delivered.
- *
- * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single
- * Unix names RESETHAND and NODEFER respectively.
- */
-#define SA_NOCLDSTOP   0x00000001
-#define SA_NOCLDWAIT   0x00000002
-#define SA_SIGINFO     0x00000004
-#define SA_RESTORER    0x04000000
-#define SA_ONSTACK     0x08000000
-#define SA_RESTART     0x10000000
-#define SA_NODEFER     0x40000000
-#define SA_RESETHAND   0x80000000
-
-#define SA_NOMASK      SA_NODEFER
-#define SA_ONESHOT     SA_RESETHAND
-
-#define MINSIGSTKSZ    2048
-#define SIGSTKSZ       8192
-
-#include <asm-generic/signal-defs.h>
-
-#ifndef __KERNEL__
-/* Here we must cater to libcs that poke about in kernel headers.  */
-
-struct sigaction {
-       union {
-               __sighandler_t _sa_handler;
-               void (*_sa_sigaction)(int, struct siginfo *, void *);
-       } _u;
-       sigset_t sa_mask;
-       unsigned long sa_flags;
-       void (*sa_restorer)(void);
-};
-
-#define sa_handler     _u._sa_handler
-#define sa_sigaction   _u._sa_sigaction
-
-#endif /* __KERNEL__ */
-
-typedef struct sigaltstack {
-       void __user *ss_sp;
-       int ss_flags;
-       size_t ss_size;
-} stack_t;
-
-#endif /* _UAPI__ASM_AVR32_SIGNAL_H */
diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
deleted file mode 100644 (file)
index 5a65042..0000000
+++ /dev/null
@@ -1,95 +0,0 @@
-#ifndef _UAPI__ASM_AVR32_SOCKET_H
-#define _UAPI__ASM_AVR32_SOCKET_H
-
-#include <asm/sockios.h>
-
-/* For setsockopt(2) */
-#define SOL_SOCKET     1
-
-#define SO_DEBUG       1
-#define SO_REUSEADDR   2
-#define SO_TYPE                3
-#define SO_ERROR       4
-#define SO_DONTROUTE   5
-#define SO_BROADCAST   6
-#define SO_SNDBUF      7
-#define SO_RCVBUF      8
-#define SO_SNDBUFFORCE 32
-#define SO_RCVBUFFORCE 33
-#define SO_KEEPALIVE   9
-#define SO_OOBINLINE   10
-#define SO_NO_CHECK    11
-#define SO_PRIORITY    12
-#define SO_LINGER      13
-#define SO_BSDCOMPAT   14
-#define SO_REUSEPORT   15
-#define SO_PASSCRED    16
-#define SO_PEERCRED    17
-#define SO_RCVLOWAT    18
-#define SO_SNDLOWAT    19
-#define SO_RCVTIMEO    20
-#define SO_SNDTIMEO    21
-
-/* Security levels - as per NRL IPv6 - don't actually do anything */
-#define SO_SECURITY_AUTHENTICATION             22
-#define SO_SECURITY_ENCRYPTION_TRANSPORT       23
-#define SO_SECURITY_ENCRYPTION_NETWORK         24
-
-#define SO_BINDTODEVICE        25
-
-/* Socket filtering */
-#define SO_ATTACH_FILTER        26
-#define SO_DETACH_FILTER        27
-#define SO_GET_FILTER          SO_ATTACH_FILTER
-
-#define SO_PEERNAME            28
-#define SO_TIMESTAMP           29
-#define SCM_TIMESTAMP          SO_TIMESTAMP
-
-#define SO_ACCEPTCONN          30
-
-#define SO_PEERSEC             31
-#define SO_PASSSEC             34
-#define SO_TIMESTAMPNS         35
-#define SCM_TIMESTAMPNS                SO_TIMESTAMPNS
-
-#define SO_MARK                        36
-
-#define SO_TIMESTAMPING                37
-#define SCM_TIMESTAMPING       SO_TIMESTAMPING
-
-#define SO_PROTOCOL            38
-#define SO_DOMAIN              39
-
-#define SO_RXQ_OVFL             40
-
-#define SO_WIFI_STATUS         41
-#define SCM_WIFI_STATUS                SO_WIFI_STATUS
-#define SO_PEEK_OFF            42
-
-/* Instruct lower device to use last 4-bytes of skb data as FCS */
-#define SO_NOFCS               43
-
-#define SO_LOCK_FILTER         44
-
-#define SO_SELECT_ERR_QUEUE    45
-
-#define SO_BUSY_POLL           46
-
-#define SO_MAX_PACING_RATE     47
-
-#define SO_BPF_EXTENSIONS      48
-
-#define SO_INCOMING_CPU                49
-
-#define SO_ATTACH_BPF          50
-#define SO_DETACH_BPF          SO_DETACH_FILTER
-
-#define SO_ATTACH_REUSEPORT_CBPF       51
-#define SO_ATTACH_REUSEPORT_EBPF       52
-
-#define SO_CNX_ADVICE          53
-
-#define SCM_TIMESTAMPING_OPT_STATS     54
-
-#endif /* _UAPI__ASM_AVR32_SOCKET_H */
diff --git a/arch/avr32/include/uapi/asm/sockios.h b/arch/avr32/include/uapi/asm/sockios.h
deleted file mode 100644 (file)
index d047854..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef _UAPI__ASM_AVR32_SOCKIOS_H
-#define _UAPI__ASM_AVR32_SOCKIOS_H
-
-/* Socket-level I/O control calls. */
-#define FIOSETOWN      0x8901
-#define SIOCSPGRP      0x8902
-#define FIOGETOWN      0x8903
-#define SIOCGPGRP      0x8904
-#define SIOCATMARK     0x8905
-#define SIOCGSTAMP     0x8906          /* Get stamp (timeval) */
-#define SIOCGSTAMPNS   0x8907          /* Get stamp (timespec) */
-
-#endif /* _UAPI__ASM_AVR32_SOCKIOS_H */
diff --git a/arch/avr32/include/uapi/asm/stat.h b/arch/avr32/include/uapi/asm/stat.h
deleted file mode 100644 (file)
index c06acef..0000000
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef _UAPI__ASM_AVR32_STAT_H
-#define _UAPI__ASM_AVR32_STAT_H
-
-struct __old_kernel_stat {
-        unsigned short st_dev;
-        unsigned short st_ino;
-        unsigned short st_mode;
-        unsigned short st_nlink;
-        unsigned short st_uid;
-        unsigned short st_gid;
-        unsigned short st_rdev;
-        unsigned long  st_size;
-        unsigned long  st_atime;
-        unsigned long  st_mtime;
-        unsigned long  st_ctime;
-};
-
-struct stat {
-        unsigned long st_dev;
-        unsigned long st_ino;
-        unsigned short st_mode;
-        unsigned short st_nlink;
-        unsigned short st_uid;
-        unsigned short st_gid;
-        unsigned long  st_rdev;
-        unsigned long  st_size;
-        unsigned long  st_blksize;
-        unsigned long  st_blocks;
-        unsigned long  st_atime;
-        unsigned long  st_atime_nsec;
-        unsigned long  st_mtime;
-        unsigned long  st_mtime_nsec;
-        unsigned long  st_ctime;
-        unsigned long  st_ctime_nsec;
-        unsigned long  __unused4;
-        unsigned long  __unused5;
-};
-
-#define STAT_HAVE_NSEC 1
-
-struct stat64 {
-       unsigned long long st_dev;
-
-       unsigned long long st_ino;
-       unsigned int    st_mode;
-       unsigned int    st_nlink;
-
-       unsigned long   st_uid;
-       unsigned long   st_gid;
-
-       unsigned long long st_rdev;
-
-       long long       st_size;
-       unsigned long   __pad1;         /* align 64-bit st_blocks */
-       unsigned long   st_blksize;
-
-       unsigned long long st_blocks;   /* Number 512-byte blocks allocated. */
-
-       unsigned long   st_atime;
-       unsigned long   st_atime_nsec;
-
-       unsigned long   st_mtime;
-       unsigned long   st_mtime_nsec;
-
-       unsigned long   st_ctime;
-       unsigned long   st_ctime_nsec;
-
-       unsigned long   __unused1;
-       unsigned long   __unused2;
-};
-
-#endif /* _UAPI__ASM_AVR32_STAT_H */
diff --git a/arch/avr32/include/uapi/asm/swab.h b/arch/avr32/include/uapi/asm/swab.h
deleted file mode 100644 (file)
index 1a03549..0000000
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * AVR32 byteswapping functions.
- */
-#ifndef _UAPI__ASM_AVR32_SWAB_H
-#define _UAPI__ASM_AVR32_SWAB_H
-
-#include <linux/types.h>
-#include <linux/compiler.h>
-
-#define __SWAB_64_THRU_32__
-
-#ifdef __CHECKER__
-extern unsigned long __builtin_bswap_32(unsigned long x);
-extern unsigned short __builtin_bswap_16(unsigned short x);
-#endif
-
-/*
- * avr32-linux-gcc versions earlier than 4.2 improperly sign-extends
- * the result.
- */
-#if !(__GNUC__ == 4 && __GNUC_MINOR__ < 2)
-static inline __attribute_const__ __u16 __arch_swab16(__u16 val)
-{
-       return __builtin_bswap_16(val);
-}
-#define __arch_swab16 __arch_swab16
-
-static inline __attribute_const__ __u32 __arch_swab32(__u32 val)
-{
-       return __builtin_bswap_32(val);
-}
-#define __arch_swab32 __arch_swab32
-#endif
-
-#endif /* _UAPI__ASM_AVR32_SWAB_H */
diff --git a/arch/avr32/include/uapi/asm/termbits.h b/arch/avr32/include/uapi/asm/termbits.h
deleted file mode 100644 (file)
index 32789cc..0000000
+++ /dev/null
@@ -1,196 +0,0 @@
-#ifndef _UAPI__ASM_AVR32_TERMBITS_H
-#define _UAPI__ASM_AVR32_TERMBITS_H
-
-#include <linux/posix_types.h>
-
-typedef unsigned char  cc_t;
-typedef unsigned int   speed_t;
-typedef unsigned int   tcflag_t;
-
-#define NCCS 19
-struct termios {
-       tcflag_t c_iflag;               /* input mode flags */
-       tcflag_t c_oflag;               /* output mode flags */
-       tcflag_t c_cflag;               /* control mode flags */
-       tcflag_t c_lflag;               /* local mode flags */
-       cc_t c_line;                    /* line discipline */
-       cc_t c_cc[NCCS];                /* control characters */
-};
-
-struct termios2 {
-       tcflag_t c_iflag;               /* input mode flags */
-       tcflag_t c_oflag;               /* output mode flags */
-       tcflag_t c_cflag;               /* control mode flags */
-       tcflag_t c_lflag;               /* local mode flags */
-       cc_t c_line;                    /* line discipline */
-       cc_t c_cc[NCCS];                /* control characters */
-       speed_t c_ispeed;               /* input speed */
-       speed_t c_ospeed;               /* output speed */
-};
-
-struct ktermios {
-       tcflag_t c_iflag;               /* input mode flags */
-       tcflag_t c_oflag;               /* output mode flags */
-       tcflag_t c_cflag;               /* control mode flags */
-       tcflag_t c_lflag;               /* local mode flags */
-       cc_t c_line;                    /* line discipline */
-       cc_t c_cc[NCCS];                /* control characters */
-       speed_t c_ispeed;               /* input speed */
-       speed_t c_ospeed;               /* output speed */
-};
-
-/* c_cc characters */
-#define VINTR 0
-#define VQUIT 1
-#define VERASE 2
-#define VKILL 3
-#define VEOF 4
-#define VTIME 5
-#define VMIN 6
-#define VSWTC 7
-#define VSTART 8
-#define VSTOP 9
-#define VSUSP 10
-#define VEOL 11
-#define VREPRINT 12
-#define VDISCARD 13
-#define VWERASE 14
-#define VLNEXT 15
-#define VEOL2 16
-
-/* c_iflag bits */
-#define IGNBRK 0000001
-#define BRKINT 0000002
-#define IGNPAR 0000004
-#define PARMRK 0000010
-#define INPCK  0000020
-#define ISTRIP 0000040
-#define INLCR  0000100
-#define IGNCR  0000200
-#define ICRNL  0000400
-#define IUCLC  0001000
-#define IXON   0002000
-#define IXANY  0004000
-#define IXOFF  0010000
-#define IMAXBEL        0020000
-#define IUTF8  0040000
-
-/* c_oflag bits */
-#define OPOST  0000001
-#define OLCUC  0000002
-#define ONLCR  0000004
-#define OCRNL  0000010
-#define ONOCR  0000020
-#define ONLRET 0000040
-#define OFILL  0000100
-#define OFDEL  0000200
-#define NLDLY  0000400
-#define   NL0  0000000
-#define   NL1  0000400
-#define CRDLY  0003000
-#define   CR0  0000000
-#define   CR1  0001000
-#define   CR2  0002000
-#define   CR3  0003000
-#define TABDLY 0014000
-#define   TAB0 0000000
-#define   TAB1 0004000
-#define   TAB2 0010000
-#define   TAB3 0014000
-#define   XTABS        0014000
-#define BSDLY  0020000
-#define   BS0  0000000
-#define   BS1  0020000
-#define VTDLY  0040000
-#define   VT0  0000000
-#define   VT1  0040000
-#define FFDLY  0100000
-#define   FF0  0000000
-#define   FF1  0100000
-
-/* c_cflag bit meaning */
-#define CBAUD  0010017
-#define  B0    0000000         /* hang up */
-#define  B50   0000001
-#define  B75   0000002
-#define  B110  0000003
-#define  B134  0000004
-#define  B150  0000005
-#define  B200  0000006
-#define  B300  0000007
-#define  B600  0000010
-#define  B1200 0000011
-#define  B1800 0000012
-#define  B2400 0000013
-#define  B4800 0000014
-#define  B9600 0000015
-#define  B19200        0000016
-#define  B38400        0000017
-#define EXTA B19200
-#define EXTB B38400
-#define CSIZE  0000060
-#define   CS5  0000000
-#define   CS6  0000020
-#define   CS7  0000040
-#define   CS8  0000060
-#define CSTOPB 0000100
-#define CREAD  0000200
-#define PARENB 0000400
-#define PARODD 0001000
-#define HUPCL  0002000
-#define CLOCAL 0004000
-#define CBAUDEX 0010000
-#define    B57600 0010001
-#define   B115200 0010002
-#define   B230400 0010003
-#define   B460800 0010004
-#define   B500000 0010005
-#define   B576000 0010006
-#define   B921600 0010007
-#define  B1000000 0010010
-#define  B1152000 0010011
-#define  B1500000 0010012
-#define  B2000000 0010013
-#define  B2500000 0010014
-#define  B3000000 0010015
-#define  B3500000 0010016
-#define  B4000000 0010017
-#define CIBAUD   002003600000  /* input baud rate (not used) */
-#define CMSPAR   010000000000          /* mark or space (stick) parity */
-#define CRTSCTS          020000000000          /* flow control */
-
-/* c_lflag bits */
-#define ISIG   0000001
-#define ICANON 0000002
-#define XCASE  0000004
-#define ECHO   0000010
-#define ECHOE  0000020
-#define ECHOK  0000040
-#define ECHONL 0000100
-#define NOFLSH 0000200
-#define TOSTOP 0000400
-#define ECHOCTL        0001000
-#define ECHOPRT        0002000
-#define ECHOKE 0004000
-#define FLUSHO 0010000
-#define PENDIN 0040000
-#define IEXTEN 0100000
-#define EXTPROC        0200000
-
-/* tcflow() and TCXONC use these */
-#define        TCOOFF          0
-#define        TCOON           1
-#define        TCIOFF          2
-#define        TCION           3
-
-/* tcflush() and TCFLSH use these */
-#define        TCIFLUSH        0
-#define        TCOFLUSH        1
-#define        TCIOFLUSH       2
-
-/* tcsetattr uses these */
-#define        TCSANOW         0
-#define        TCSADRAIN       1
-#define        TCSAFLUSH       2
-
-#endif /* _UAPI__ASM_AVR32_TERMBITS_H */
diff --git a/arch/avr32/include/uapi/asm/termios.h b/arch/avr32/include/uapi/asm/termios.h
deleted file mode 100644 (file)
index c8a0081..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef _UAPI__ASM_AVR32_TERMIOS_H
-#define _UAPI__ASM_AVR32_TERMIOS_H
-
-#include <asm/termbits.h>
-#include <asm/ioctls.h>
-
-struct winsize {
-       unsigned short ws_row;
-       unsigned short ws_col;
-       unsigned short ws_xpixel;
-       unsigned short ws_ypixel;
-};
-
-#define NCC 8
-struct termio {
-       unsigned short c_iflag;         /* input mode flags */
-       unsigned short c_oflag;         /* output mode flags */
-       unsigned short c_cflag;         /* control mode flags */
-       unsigned short c_lflag;         /* local mode flags */
-       unsigned char c_line;           /* line discipline */
-       unsigned char c_cc[NCC];        /* control characters */
-};
-
-/* modem lines */
-#define TIOCM_LE       0x001
-#define TIOCM_DTR      0x002
-#define TIOCM_RTS      0x004
-#define TIOCM_ST       0x008
-#define TIOCM_SR       0x010
-#define TIOCM_CTS      0x020
-#define TIOCM_CAR      0x040
-#define TIOCM_RNG      0x080
-#define TIOCM_DSR      0x100
-#define TIOCM_CD       TIOCM_CAR
-#define TIOCM_RI       TIOCM_RNG
-#define TIOCM_OUT1     0x2000
-#define TIOCM_OUT2     0x4000
-#define TIOCM_LOOP     0x8000
-
-/* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */
-
-#endif /* _UAPI__ASM_AVR32_TERMIOS_H */
diff --git a/arch/avr32/include/uapi/asm/types.h b/arch/avr32/include/uapi/asm/types.h
deleted file mode 100644 (file)
index 7c986c4..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef _UAPI__ASM_AVR32_TYPES_H
-#define _UAPI__ASM_AVR32_TYPES_H
-
-#include <asm-generic/int-ll64.h>
-
-#endif /* _UAPI__ASM_AVR32_TYPES_H */
diff --git a/arch/avr32/include/uapi/asm/unistd.h b/arch/avr32/include/uapi/asm/unistd.h
deleted file mode 100644 (file)
index 236505d..0000000
+++ /dev/null
@@ -1,347 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef _UAPI__ASM_AVR32_UNISTD_H
-#define _UAPI__ASM_AVR32_UNISTD_H
-
-/*
- * This file contains the system call numbers.
- */
-
-#define __NR_restart_syscall    0
-#define __NR_exit               1
-#define __NR_fork               2
-#define __NR_read               3
-#define __NR_write              4
-#define __NR_open               5
-#define __NR_close              6
-#define __NR_umask              7
-#define __NR_creat              8
-#define __NR_link               9
-#define __NR_unlink             10
-#define __NR_execve             11
-#define __NR_chdir              12
-#define __NR_time               13
-#define __NR_mknod              14
-#define __NR_chmod              15
-#define __NR_chown              16
-#define __NR_lchown             17
-#define __NR_lseek              18
-#define __NR__llseek            19
-#define __NR_getpid             20
-#define __NR_mount              21
-#define __NR_umount2            22
-#define __NR_setuid             23
-#define __NR_getuid             24
-#define __NR_stime              25
-#define __NR_ptrace             26
-#define __NR_alarm              27
-#define __NR_pause              28
-#define __NR_utime              29
-#define __NR_stat               30
-#define __NR_fstat              31
-#define __NR_lstat              32
-#define __NR_access             33
-#define __NR_chroot             34
-#define __NR_sync               35
-#define __NR_fsync              36
-#define __NR_kill               37
-#define __NR_rename             38
-#define __NR_mkdir              39
-#define __NR_rmdir              40
-#define __NR_dup                41
-#define __NR_pipe               42
-#define __NR_times              43
-#define __NR_clone              44
-#define __NR_brk                45
-#define __NR_setgid             46
-#define __NR_getgid             47
-#define __NR_getcwd             48
-#define __NR_geteuid            49
-#define __NR_getegid            50
-#define __NR_acct               51
-#define __NR_setfsuid           52
-#define __NR_setfsgid           53
-#define __NR_ioctl              54
-#define __NR_fcntl              55
-#define __NR_setpgid            56
-#define __NR_mremap             57
-#define __NR_setresuid          58
-#define __NR_getresuid          59
-#define __NR_setreuid           60
-#define __NR_setregid           61
-#define __NR_ustat              62
-#define __NR_dup2               63
-#define __NR_getppid            64
-#define __NR_getpgrp            65
-#define __NR_setsid             66
-#define __NR_rt_sigaction       67
-#define __NR_rt_sigreturn       68
-#define __NR_rt_sigprocmask     69
-#define __NR_rt_sigpending      70
-#define __NR_rt_sigtimedwait    71
-#define __NR_rt_sigqueueinfo    72
-#define __NR_rt_sigsuspend      73
-#define __NR_sethostname        74
-#define __NR_setrlimit          75
-#define __NR_getrlimit          76 /* SuS compliant getrlimit */
-#define __NR_getrusage          77
-#define __NR_gettimeofday       78
-#define __NR_settimeofday       79
-#define __NR_getgroups          80
-#define __NR_setgroups          81
-#define __NR_select             82
-#define __NR_symlink            83
-#define __NR_fchdir             84
-#define __NR_readlink           85
-#define __NR_pread              86
-#define __NR_pwrite             87
-#define __NR_swapon             88
-#define __NR_reboot             89
-#define __NR_mmap2              90
-#define __NR_munmap             91
-#define __NR_truncate           92
-#define __NR_ftruncate          93
-#define __NR_fchmod             94
-#define __NR_fchown             95
-#define __NR_getpriority        96
-#define __NR_setpriority        97
-#define __NR_wait4              98
-#define __NR_statfs             99
-#define __NR_fstatfs            100
-#define __NR_vhangup            101
-#define __NR_sigaltstack        102
-#define __NR_syslog             103
-#define __NR_setitimer          104
-#define __NR_getitimer          105
-#define __NR_swapoff            106
-#define __NR_sysinfo            107
-/* 108 was __NR_ipc for a little while */
-#define __NR_sendfile           109
-#define __NR_setdomainname      110
-#define __NR_uname              111
-#define __NR_adjtimex           112
-#define __NR_mprotect           113
-#define __NR_vfork              114
-#define __NR_init_module        115
-#define __NR_delete_module      116
-#define __NR_quotactl           117
-#define __NR_getpgid            118
-#define __NR_bdflush            119
-#define __NR_sysfs              120
-#define __NR_personality        121
-#define __NR_afs_syscall        122 /* Syscall for Andrew File System */
-#define __NR_getdents           123
-#define __NR_flock              124
-#define __NR_msync              125
-#define __NR_readv              126
-#define __NR_writev             127
-#define __NR_getsid             128
-#define __NR_fdatasync          129
-#define __NR__sysctl            130
-#define __NR_mlock              131
-#define __NR_munlock            132
-#define __NR_mlockall           133
-#define __NR_munlockall         134
-#define __NR_sched_setparam     135
-#define __NR_sched_getparam     136
-#define __NR_sched_setscheduler 137
-#define __NR_sched_getscheduler 138
-#define __NR_sched_yield        139
-#define __NR_sched_get_priority_max     140
-#define __NR_sched_get_priority_min     141
-#define __NR_sched_rr_get_interval      142
-#define __NR_nanosleep          143
-#define __NR_poll               144
-#define __NR_nfsservctl         145
-#define __NR_setresgid          146
-#define __NR_getresgid          147
-#define __NR_prctl              148
-#define __NR_socket             149
-#define __NR_bind               150
-#define __NR_connect            151
-#define __NR_listen             152
-#define __NR_accept             153
-#define __NR_getsockname        154
-#define __NR_getpeername        155
-#define __NR_socketpair         156
-#define __NR_send               157
-#define __NR_recv               158
-#define __NR_sendto             159
-#define __NR_recvfrom           160
-#define __NR_shutdown           161
-#define __NR_setsockopt         162
-#define __NR_getsockopt         163
-#define __NR_sendmsg            164
-#define __NR_recvmsg            165
-#define __NR_truncate64         166
-#define __NR_ftruncate64        167
-#define __NR_stat64             168
-#define __NR_lstat64            169
-#define __NR_fstat64            170
-#define __NR_pivot_root         171
-#define __NR_mincore            172
-#define __NR_madvise            173
-#define __NR_getdents64         174
-#define __NR_fcntl64            175
-#define __NR_gettid             176
-#define __NR_readahead          177
-#define __NR_setxattr           178
-#define __NR_lsetxattr          179
-#define __NR_fsetxattr          180
-#define __NR_getxattr           181
-#define __NR_lgetxattr          182
-#define __NR_fgetxattr          183
-#define __NR_listxattr          184
-#define __NR_llistxattr         185
-#define __NR_flistxattr         186
-#define __NR_removexattr        187
-#define __NR_lremovexattr       188
-#define __NR_fremovexattr       189
-#define __NR_tkill              190
-#define __NR_sendfile64         191
-#define __NR_futex              192
-#define __NR_sched_setaffinity  193
-#define __NR_sched_getaffinity  194
-#define __NR_capget             195
-#define __NR_capset             196
-#define __NR_io_setup           197
-#define __NR_io_destroy         198
-#define __NR_io_getevents       199
-#define __NR_io_submit          200
-#define __NR_io_cancel          201
-#define __NR_fadvise64          202
-#define __NR_exit_group         203
-#define __NR_lookup_dcookie     204
-#define __NR_epoll_create       205
-#define __NR_epoll_ctl          206
-#define __NR_epoll_wait         207
-#define __NR_remap_file_pages   208
-#define __NR_set_tid_address    209
-#define __NR_timer_create       210
-#define __NR_timer_settime      211
-#define __NR_timer_gettime      212
-#define __NR_timer_getoverrun   213
-#define __NR_timer_delete       214
-#define __NR_clock_settime      215
-#define __NR_clock_gettime      216
-#define __NR_clock_getres       217
-#define __NR_clock_nanosleep    218
-#define __NR_statfs64           219
-#define __NR_fstatfs64          220
-#define __NR_tgkill             221
-/* 222 reserved for tux */
-#define __NR_utimes             223
-#define __NR_fadvise64_64       224
-#define __NR_cacheflush         225
-#define __NR_vserver            226
-#define __NR_mq_open            227
-#define __NR_mq_unlink          228
-#define __NR_mq_timedsend       229
-#define __NR_mq_timedreceive    230
-#define __NR_mq_notify          231
-#define __NR_mq_getsetattr      232
-#define __NR_kexec_load         233
-#define __NR_waitid             234
-#define __NR_add_key            235
-#define __NR_request_key        236
-#define __NR_keyctl             237
-#define __NR_ioprio_set         238
-#define __NR_ioprio_get         239
-#define __NR_inotify_init       240
-#define __NR_inotify_add_watch  241
-#define __NR_inotify_rm_watch   242
-#define __NR_openat             243
-#define __NR_mkdirat            244
-#define __NR_mknodat            245
-#define __NR_fchownat           246
-#define __NR_futimesat          247
-#define __NR_fstatat64          248
-#define __NR_unlinkat           249
-#define __NR_renameat           250
-#define __NR_linkat             251
-#define __NR_symlinkat          252
-#define __NR_readlinkat         253
-#define __NR_fchmodat           254
-#define __NR_faccessat          255
-#define __NR_pselect6           256
-#define __NR_ppoll              257
-#define __NR_unshare            258
-#define __NR_set_robust_list    259
-#define __NR_get_robust_list    260
-#define __NR_splice             261
-#define __NR_sync_file_range    262
-#define __NR_tee                263
-#define __NR_vmsplice           264
-#define __NR_epoll_pwait        265
-#define __NR_msgget             266
-#define __NR_msgsnd             267
-#define __NR_msgrcv             268
-#define __NR_msgctl             269
-#define __NR_semget             270
-#define __NR_semop              271
-#define __NR_semctl             272
-#define __NR_semtimedop         273
-#define __NR_shmat              274
-#define __NR_shmget             275
-#define __NR_shmdt              276
-#define __NR_shmctl             277
-#define __NR_utimensat          278
-#define __NR_signalfd           279
-/* 280 was __NR_timerfd */
-#define __NR_eventfd            281
-/* 282 was half-implemented __NR_recvmmsg */
-#define __NR_setns              283
-#define __NR_pread64            284
-#define __NR_pwrite64           285
-#define __NR_timerfd_create     286
-#define __NR_fallocate          287
-#define __NR_timerfd_settime    288
-#define __NR_timerfd_gettime    289
-#define __NR_signalfd4          290
-#define __NR_eventfd2           291
-#define __NR_epoll_create1      292
-#define __NR_dup3               293
-#define __NR_pipe2              294
-#define __NR_inotify_init1      295
-#define __NR_preadv             296
-#define __NR_pwritev            297
-#define __NR_rt_tgsigqueueinfo  298
-#define __NR_perf_event_open    299
-#define __NR_recvmmsg           300
-#define __NR_fanotify_init      301
-#define __NR_fanotify_mark      302
-#define __NR_prlimit64          303
-#define __NR_name_to_handle_at  304
-#define __NR_open_by_handle_at  305
-#define __NR_clock_adjtime      306
-#define __NR_syncfs             307
-#define __NR_sendmmsg           308
-#define __NR_process_vm_readv   309
-#define __NR_process_vm_writev  310
-#define __NR_kcmp               311
-#define __NR_finit_module       312
-#define __NR_sched_setattr      313
-#define __NR_sched_getattr      314
-#define __NR_renameat2          315
-#define __NR_seccomp            316
-#define __NR_getrandom          317
-#define __NR_memfd_create       318
-#define __NR_bpf                319
-#define __NR_execveat           320
-#define __NR_accept4            321
-#define __NR_userfaultfd        322
-#define __NR_membarrier         323
-#define __NR_mlock2             324
-#define __NR_copy_file_range    325
-#define __NR_preadv2            326
-#define __NR_pwritev2           327
-#define __NR_pkey_mprotect      328
-#define __NR_pkey_alloc         329
-#define __NR_pkey_free          330
-
-#endif /* _UAPI__ASM_AVR32_UNISTD_H */
diff --git a/arch/avr32/kernel/.gitignore b/arch/avr32/kernel/.gitignore
deleted file mode 100644 (file)
index c5f676c..0000000
+++ /dev/null
@@ -1 +0,0 @@
-vmlinux.lds
diff --git a/arch/avr32/kernel/Makefile b/arch/avr32/kernel/Makefile
deleted file mode 100644 (file)
index 119a2e4..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-#
-# Makefile for the Linux/AVR32 kernel.
-#
-
-extra-y                                := head.o vmlinux.lds
-
-obj-$(CONFIG_SUBARCH_AVR32B)   += entry-avr32b.o
-obj-y                          += syscall_table.o syscall-stubs.o irq.o
-obj-y                          += setup.o traps.o ocd.o ptrace.o
-obj-y                          += signal.o process.o time.o
-obj-y                          += switch_to.o cpu.o
-obj-$(CONFIG_MODULES)          += module.o avr32_ksyms.o
-obj-$(CONFIG_KPROBES)          += kprobes.o
-obj-$(CONFIG_STACKTRACE)       += stacktrace.o
-obj-$(CONFIG_NMI_DEBUGGING)    += nmi_debug.o
diff --git a/arch/avr32/kernel/asm-offsets.c b/arch/avr32/kernel/asm-offsets.c
deleted file mode 100644 (file)
index 2c9764f..0000000
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Generate definitions needed by assembly language modules.
- * This code generates raw asm output which is post-processed
- * to extract and format the required data.
- */
-
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/thread_info.h>
-#include <linux/kbuild.h>
-
-void foo(void)
-{
-       OFFSET(TI_task, thread_info, task);
-       OFFSET(TI_flags, thread_info, flags);
-       OFFSET(TI_cpu, thread_info, cpu);
-       OFFSET(TI_preempt_count, thread_info, preempt_count);
-       OFFSET(TI_rar_saved, thread_info, rar_saved);
-       OFFSET(TI_rsr_saved, thread_info, rsr_saved);
-       BLANK();
-       OFFSET(TSK_active_mm, task_struct, active_mm);
-       BLANK();
-       OFFSET(MM_pgd, mm_struct, pgd);
-}
diff --git a/arch/avr32/kernel/avr32_ksyms.c b/arch/avr32/kernel/avr32_ksyms.c
deleted file mode 100644 (file)
index 0d05fd0..0000000
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Export AVR32-specific functions for loadable modules.
- *
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/delay.h>
-#include <linux/io.h>
-#include <linux/module.h>
-
-#include <asm/checksum.h>
-#include <linux/uaccess.h>
-
-/*
- * GCC functions
- */
-extern unsigned long long __avr32_lsl64(unsigned long long u, unsigned long b);
-extern unsigned long long __avr32_lsr64(unsigned long long u, unsigned long b);
-extern unsigned long long __avr32_asr64(unsigned long long u, unsigned long b);
-EXPORT_SYMBOL(__avr32_lsl64);
-EXPORT_SYMBOL(__avr32_lsr64);
-EXPORT_SYMBOL(__avr32_asr64);
-
-/*
- * String functions
- */
-EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(memcpy);
-
-EXPORT_SYMBOL(clear_page);
-EXPORT_SYMBOL(copy_page);
-
-/*
- * Userspace access stuff.
- */
-EXPORT_SYMBOL(___copy_from_user);
-EXPORT_SYMBOL(copy_to_user);
-EXPORT_SYMBOL(__copy_user);
-EXPORT_SYMBOL(strncpy_from_user);
-EXPORT_SYMBOL(__strncpy_from_user);
-EXPORT_SYMBOL(clear_user);
-EXPORT_SYMBOL(__clear_user);
-EXPORT_SYMBOL(strnlen_user);
-
-EXPORT_SYMBOL(csum_partial);
-EXPORT_SYMBOL(csum_partial_copy_generic);
-
-/* Delay loops (lib/delay.S) */
-EXPORT_SYMBOL(__ndelay);
-EXPORT_SYMBOL(__udelay);
-EXPORT_SYMBOL(__const_udelay);
-
-/* Bit operations (lib/findbit.S) */
-EXPORT_SYMBOL(find_first_zero_bit);
-EXPORT_SYMBOL(find_next_zero_bit);
-EXPORT_SYMBOL(find_first_bit);
-EXPORT_SYMBOL(find_next_bit);
-EXPORT_SYMBOL(find_next_bit_le);
-EXPORT_SYMBOL(find_next_zero_bit_le);
-
-/* I/O primitives (lib/io-*.S) */
-EXPORT_SYMBOL(__raw_readsb);
-EXPORT_SYMBOL(__raw_readsw);
-EXPORT_SYMBOL(__raw_readsl);
-EXPORT_SYMBOL(__raw_writesb);
-EXPORT_SYMBOL(__raw_writesw);
-EXPORT_SYMBOL(__raw_writesl);
diff --git a/arch/avr32/kernel/cpu.c b/arch/avr32/kernel/cpu.c
deleted file mode 100644 (file)
index 0341ae2..0000000
+++ /dev/null
@@ -1,410 +0,0 @@
-/*
- * Copyright (C) 2005-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/init.h>
-#include <linux/device.h>
-#include <linux/seq_file.h>
-#include <linux/cpu.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/param.h>
-#include <linux/errno.h>
-#include <linux/clk.h>
-
-#include <asm/setup.h>
-#include <asm/sysreg.h>
-
-static DEFINE_PER_CPU(struct cpu, cpu_devices);
-
-#ifdef CONFIG_PERFORMANCE_COUNTERS
-
-/*
- * XXX: If/when a SMP-capable implementation of AVR32 will ever be
- * made, we must make sure that the code executes on the correct CPU.
- */
-static ssize_t show_pc0event(struct device *dev,
-                       struct device_attribute *attr, char *buf)
-{
-       unsigned long pccr;
-
-       pccr = sysreg_read(PCCR);
-       return sprintf(buf, "0x%lx\n", (pccr >> 12) & 0x3f);
-}
-static ssize_t store_pc0event(struct device *dev,
-                       struct device_attribute *attr, const char *buf,
-                             size_t count)
-{
-       unsigned long val;
-       int ret;
-
-       ret = kstrtoul(buf, 0, &val);
-       if (ret)
-               return ret;
-       if (val > 0x3f)
-               return -EINVAL;
-       val = (val << 12) | (sysreg_read(PCCR) & 0xfffc0fff);
-       sysreg_write(PCCR, val);
-       return count;
-}
-static ssize_t show_pc0count(struct device *dev,
-                       struct device_attribute *attr, char *buf)
-{
-       unsigned long pcnt0;
-
-       pcnt0 = sysreg_read(PCNT0);
-       return sprintf(buf, "%lu\n", pcnt0);
-}
-static ssize_t store_pc0count(struct device *dev,
-                               struct device_attribute *attr,
-                               const char *buf, size_t count)
-{
-       unsigned long val;
-       int ret;
-
-       ret = kstrtoul(buf, 0, &val);
-       if (ret)
-               return ret;
-       sysreg_write(PCNT0, val);
-
-       return count;
-}
-
-static ssize_t show_pc1event(struct device *dev,
-                               struct device_attribute *attr, char *buf)
-{
-       unsigned long pccr;
-
-       pccr = sysreg_read(PCCR);
-       return sprintf(buf, "0x%lx\n", (pccr >> 18) & 0x3f);
-}
-static ssize_t store_pc1event(struct device *dev,
-                             struct device_attribute *attr, const char *buf,
-                             size_t count)
-{
-       unsigned long val;
-       int ret;
-
-       ret = kstrtoul(buf, 0, &val);
-       if (ret)
-               return ret;
-       if (val > 0x3f)
-               return -EINVAL;
-       val = (val << 18) | (sysreg_read(PCCR) & 0xff03ffff);
-       sysreg_write(PCCR, val);
-       return count;
-}
-static ssize_t show_pc1count(struct device *dev,
-                               struct device_attribute *attr, char *buf)
-{
-       unsigned long pcnt1;
-
-       pcnt1 = sysreg_read(PCNT1);
-       return sprintf(buf, "%lu\n", pcnt1);
-}
-static ssize_t store_pc1count(struct device *dev,
-                               struct device_attribute *attr, const char *buf,
-                             size_t count)
-{
-       unsigned long val;
-       int ret;
-
-       ret = kstrtoul(buf, 0, &val);
-       if (ret)
-               return ret;
-       sysreg_write(PCNT1, val);
-
-       return count;
-}
-
-static ssize_t show_pccycles(struct device *dev,
-                               struct device_attribute *attr, char *buf)
-{
-       unsigned long pccnt;
-
-       pccnt = sysreg_read(PCCNT);
-       return sprintf(buf, "%lu\n", pccnt);
-}
-static ssize_t store_pccycles(struct device *dev,
-                               struct device_attribute *attr, const char *buf,
-                             size_t count)
-{
-       unsigned long val;
-       int ret;
-
-       ret = kstrtoul(buf, 0, &val);
-       if (ret)
-               return ret;
-       sysreg_write(PCCNT, val);
-
-       return count;
-}
-
-static ssize_t show_pcenable(struct device *dev,
-                       struct device_attribute *attr, char *buf)
-{
-       unsigned long pccr;
-
-       pccr = sysreg_read(PCCR);
-       return sprintf(buf, "%c\n", (pccr & 1)?'1':'0');
-}
-static ssize_t store_pcenable(struct device *dev,
-                             struct device_attribute *attr, const char *buf,
-                             size_t count)
-{
-       unsigned long pccr, val;
-       int ret;
-
-       ret = kstrtoul(buf, 0, &val);
-       if (ret)
-               return ret;
-       if (val)
-               val = 1;
-
-       pccr = sysreg_read(PCCR);
-       pccr = (pccr & ~1UL) | val;
-       sysreg_write(PCCR, pccr);
-
-       return count;
-}
-
-static DEVICE_ATTR(pc0event, 0600, show_pc0event, store_pc0event);
-static DEVICE_ATTR(pc0count, 0600, show_pc0count, store_pc0count);
-static DEVICE_ATTR(pc1event, 0600, show_pc1event, store_pc1event);
-static DEVICE_ATTR(pc1count, 0600, show_pc1count, store_pc1count);
-static DEVICE_ATTR(pccycles, 0600, show_pccycles, store_pccycles);
-static DEVICE_ATTR(pcenable, 0600, show_pcenable, store_pcenable);
-
-#endif /* CONFIG_PERFORMANCE_COUNTERS */
-
-static int __init topology_init(void)
-{
-       int cpu;
-
-       for_each_possible_cpu(cpu) {
-               struct cpu *c = &per_cpu(cpu_devices, cpu);
-
-               register_cpu(c, cpu);
-
-#ifdef CONFIG_PERFORMANCE_COUNTERS
-               device_create_file(&c->dev, &dev_attr_pc0event);
-               device_create_file(&c->dev, &dev_attr_pc0count);
-               device_create_file(&c->dev, &dev_attr_pc1event);
-               device_create_file(&c->dev, &dev_attr_pc1count);
-               device_create_file(&c->dev, &dev_attr_pccycles);
-               device_create_file(&c->dev, &dev_attr_pcenable);
-#endif
-       }
-
-       return 0;
-}
-
-subsys_initcall(topology_init);
-
-struct chip_id_map {
-       u16     mid;
-       u16     pn;
-       const char *name;
-};
-
-static const struct chip_id_map chip_names[] = {
-       { .mid = 0x1f, .pn = 0x1e82, .name = "AT32AP700x" },
-};
-#define NR_CHIP_NAMES ARRAY_SIZE(chip_names)
-
-static const char *cpu_names[] = {
-       "Morgan",
-       "AP7",
-};
-#define NR_CPU_NAMES ARRAY_SIZE(cpu_names)
-
-static const char *arch_names[] = {
-       "AVR32A",
-       "AVR32B",
-};
-#define NR_ARCH_NAMES ARRAY_SIZE(arch_names)
-
-static const char *mmu_types[] = {
-       "No MMU",
-       "ITLB and DTLB",
-       "Shared TLB",
-       "MPU"
-};
-
-static const char *cpu_feature_flags[] = {
-       "rmw", "dsp", "simd", "ocd", "perfctr", "java", "fpu",
-};
-
-static const char *get_chip_name(struct avr32_cpuinfo *cpu)
-{
-       unsigned int i;
-       unsigned int mid = avr32_get_manufacturer_id(cpu);
-       unsigned int pn = avr32_get_product_number(cpu);
-
-       for (i = 0; i < NR_CHIP_NAMES; i++) {
-               if (chip_names[i].mid == mid && chip_names[i].pn == pn)
-                       return chip_names[i].name;
-       }
-
-       return "(unknown)";
-}
-
-void __init setup_processor(void)
-{
-       unsigned long config0, config1;
-       unsigned long features;
-       unsigned cpu_id, cpu_rev, arch_id, arch_rev, mmu_type;
-       unsigned device_id;
-       unsigned tmp;
-       unsigned i;
-
-       config0 = sysreg_read(CONFIG0);
-       config1 = sysreg_read(CONFIG1);
-       cpu_id = SYSREG_BFEXT(PROCESSORID, config0);
-       cpu_rev = SYSREG_BFEXT(PROCESSORREVISION, config0);
-       arch_id = SYSREG_BFEXT(AT, config0);
-       arch_rev = SYSREG_BFEXT(AR, config0);
-       mmu_type = SYSREG_BFEXT(MMUT, config0);
-
-       device_id = ocd_read(DID);
-
-       boot_cpu_data.arch_type = arch_id;
-       boot_cpu_data.cpu_type = cpu_id;
-       boot_cpu_data.arch_revision = arch_rev;
-       boot_cpu_data.cpu_revision = cpu_rev;
-       boot_cpu_data.tlb_config = mmu_type;
-       boot_cpu_data.device_id = device_id;
-
-       tmp = SYSREG_BFEXT(ILSZ, config1);
-       if (tmp) {
-               boot_cpu_data.icache.ways = 1 << SYSREG_BFEXT(IASS, config1);
-               boot_cpu_data.icache.sets = 1 << SYSREG_BFEXT(ISET, config1);
-               boot_cpu_data.icache.linesz = 1 << (tmp + 1);
-       }
-       tmp = SYSREG_BFEXT(DLSZ, config1);
-       if (tmp) {
-               boot_cpu_data.dcache.ways = 1 << SYSREG_BFEXT(DASS, config1);
-               boot_cpu_data.dcache.sets = 1 << SYSREG_BFEXT(DSET, config1);
-               boot_cpu_data.dcache.linesz = 1 << (tmp + 1);
-       }
-
-       if ((cpu_id >= NR_CPU_NAMES) || (arch_id >= NR_ARCH_NAMES)) {
-               printk ("Unknown CPU configuration (ID %02x, arch %02x), "
-                       "continuing anyway...\n",
-                       cpu_id, arch_id);
-               return;
-       }
-
-       printk ("CPU: %s chip revision %c\n", get_chip_name(&boot_cpu_data),
-                       avr32_get_chip_revision(&boot_cpu_data) + 'A');
-       printk ("CPU: %s [%02x] core revision %d (%s arch revision %d)\n",
-               cpu_names[cpu_id], cpu_id, cpu_rev,
-               arch_names[arch_id], arch_rev);
-       printk ("CPU: MMU configuration: %s\n", mmu_types[mmu_type]);
-
-       printk ("CPU: features:");
-       features = 0;
-       if (config0 & SYSREG_BIT(CONFIG0_R))
-               features |= AVR32_FEATURE_RMW;
-       if (config0 & SYSREG_BIT(CONFIG0_D))
-               features |= AVR32_FEATURE_DSP;
-       if (config0 & SYSREG_BIT(CONFIG0_S))
-               features |= AVR32_FEATURE_SIMD;
-       if (config0 & SYSREG_BIT(CONFIG0_O))
-               features |= AVR32_FEATURE_OCD;
-       if (config0 & SYSREG_BIT(CONFIG0_P))
-               features |= AVR32_FEATURE_PCTR;
-       if (config0 & SYSREG_BIT(CONFIG0_J))
-               features |= AVR32_FEATURE_JAVA;
-       if (config0 & SYSREG_BIT(CONFIG0_F))
-               features |= AVR32_FEATURE_FPU;
-
-       for (i = 0; i < ARRAY_SIZE(cpu_feature_flags); i++)
-               if (features & (1 << i))
-                       printk(" %s", cpu_feature_flags[i]);
-
-       printk("\n");
-       boot_cpu_data.features = features;
-}
-
-#ifdef CONFIG_PROC_FS
-static int c_show(struct seq_file *m, void *v)
-{
-       unsigned int icache_size, dcache_size;
-       unsigned int cpu = smp_processor_id();
-       unsigned int freq;
-       unsigned int i;
-
-       icache_size = boot_cpu_data.icache.ways *
-               boot_cpu_data.icache.sets *
-               boot_cpu_data.icache.linesz;
-       dcache_size = boot_cpu_data.dcache.ways *
-               boot_cpu_data.dcache.sets *
-               boot_cpu_data.dcache.linesz;
-
-       seq_printf(m, "processor\t: %d\n", cpu);
-
-       seq_printf(m, "chip type\t: %s revision %c\n",
-                       get_chip_name(&boot_cpu_data),
-                       avr32_get_chip_revision(&boot_cpu_data) + 'A');
-       if (boot_cpu_data.arch_type < NR_ARCH_NAMES)
-               seq_printf(m, "cpu arch\t: %s revision %d\n",
-                          arch_names[boot_cpu_data.arch_type],
-                          boot_cpu_data.arch_revision);
-       if (boot_cpu_data.cpu_type < NR_CPU_NAMES)
-               seq_printf(m, "cpu core\t: %s revision %d\n",
-                          cpu_names[boot_cpu_data.cpu_type],
-                          boot_cpu_data.cpu_revision);
-
-       freq = (clk_get_rate(boot_cpu_data.clk) + 500) / 1000;
-       seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, freq % 1000);
-
-       seq_printf(m, "i-cache\t\t: %dK (%u ways x %u sets x %u)\n",
-                  icache_size >> 10,
-                  boot_cpu_data.icache.ways,
-                  boot_cpu_data.icache.sets,
-                  boot_cpu_data.icache.linesz);
-       seq_printf(m, "d-cache\t\t: %dK (%u ways x %u sets x %u)\n",
-                  dcache_size >> 10,
-                  boot_cpu_data.dcache.ways,
-                  boot_cpu_data.dcache.sets,
-                  boot_cpu_data.dcache.linesz);
-
-       seq_printf(m, "features\t:");
-       for (i = 0; i < ARRAY_SIZE(cpu_feature_flags); i++)
-               if (boot_cpu_data.features & (1 << i))
-                       seq_printf(m, " %s", cpu_feature_flags[i]);
-
-       seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
-                  boot_cpu_data.loops_per_jiffy / (500000/HZ),
-                  (boot_cpu_data.loops_per_jiffy / (5000/HZ)) % 100);
-
-       return 0;
-}
-
-static void *c_start(struct seq_file *m, loff_t *pos)
-{
-       return *pos < 1 ? (void *)1 : NULL;
-}
-
-static void *c_next(struct seq_file *m, void *v, loff_t *pos)
-{
-       ++*pos;
-       return NULL;
-}
-
-static void c_stop(struct seq_file *m, void *v)
-{
-
-}
-
-const struct seq_operations cpuinfo_op = {
-       .start  = c_start,
-       .next   = c_next,
-       .stop   = c_stop,
-       .show   = c_show
-};
-#endif /* CONFIG_PROC_FS */
diff --git a/arch/avr32/kernel/entry-avr32b.S b/arch/avr32/kernel/entry-avr32b.S
deleted file mode 100644 (file)
index 7301f48..0000000
+++ /dev/null
@@ -1,877 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-/*
- * This file contains the low-level entry-points into the kernel, that is,
- * exception handlers, debug trap handlers, interrupt handlers and the
- * system call handler.
- */
-#include <linux/errno.h>
-
-#include <asm/asm.h>
-#include <asm/hardirq.h>
-#include <asm/irq.h>
-#include <asm/ocd.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/ptrace.h>
-#include <asm/sysreg.h>
-#include <asm/thread_info.h>
-#include <asm/unistd.h>
-
-#ifdef CONFIG_PREEMPT
-# define preempt_stop          mask_interrupts
-#else
-# define preempt_stop
-# define fault_resume_kernel   fault_restore_all
-#endif
-
-#define __MASK(x)      ((1 << (x)) - 1)
-#define IRQ_MASK       ((__MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) | \
-                        (__MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT))
-
-       .section .ex.text,"ax",@progbits
-       .align  2
-exception_vectors:
-       bral    handle_critical
-       .align  2
-       bral    handle_critical
-       .align  2
-       bral    do_bus_error_write
-       .align  2
-       bral    do_bus_error_read
-       .align  2
-       bral    do_nmi_ll
-       .align  2
-       bral    handle_address_fault
-       .align  2
-       bral    handle_protection_fault
-       .align  2
-       bral    handle_debug
-       .align  2
-       bral    do_illegal_opcode_ll
-       .align  2
-       bral    do_illegal_opcode_ll
-       .align  2
-       bral    do_illegal_opcode_ll
-       .align  2
-       bral    do_fpe_ll
-       .align  2
-       bral    do_illegal_opcode_ll
-       .align  2
-       bral    handle_address_fault
-       .align  2
-       bral    handle_address_fault
-       .align  2
-       bral    handle_protection_fault
-       .align  2
-       bral    handle_protection_fault
-       .align  2
-       bral    do_dtlb_modified
-
-#define        tlbmiss_save    pushm   r0-r3
-#define tlbmiss_restore        popm    r0-r3
-
-       .org    0x50
-       .global itlb_miss
-itlb_miss:
-       tlbmiss_save
-       rjmp    tlb_miss_common
-
-       .org    0x60
-dtlb_miss_read:
-       tlbmiss_save
-       rjmp    tlb_miss_common
-
-       .org    0x70
-dtlb_miss_write:
-       tlbmiss_save
-
-       .global tlb_miss_common
-       .align  2
-tlb_miss_common:
-       mfsr    r0, SYSREG_TLBEAR
-       mfsr    r1, SYSREG_PTBR
-
-       /*
-        * First level lookup: The PGD contains virtual pointers to
-        * the second-level page tables, but they may be NULL if not
-        * present.
-        */
-pgtbl_lookup:
-       lsr     r2, r0, PGDIR_SHIFT
-       ld.w    r3, r1[r2 << 2]
-       bfextu  r1, r0, PAGE_SHIFT, PGDIR_SHIFT - PAGE_SHIFT
-       cp.w    r3, 0
-       breq    page_table_not_present
-
-       /* Second level lookup */
-       ld.w    r2, r3[r1 << 2]
-       mfsr    r0, SYSREG_TLBARLO
-       bld     r2, _PAGE_BIT_PRESENT
-       brcc    page_not_present
-
-       /* Mark the page as accessed */
-       sbr     r2, _PAGE_BIT_ACCESSED
-       st.w    r3[r1 << 2], r2
-
-       /* Drop software flags */
-       andl    r2, _PAGE_FLAGS_HARDWARE_MASK & 0xffff
-       mtsr    SYSREG_TLBELO, r2
-
-       /* Figure out which entry we want to replace */
-       mfsr    r1, SYSREG_MMUCR
-       clz     r2, r0
-       brcc    1f
-       mov     r3, -1                  /* All entries have been accessed, */
-       mov     r2, 0                   /* so start at 0 */
-       mtsr    SYSREG_TLBARLO, r3      /* and reset TLBAR */
-
-1:     bfins   r1, r2, SYSREG_DRP_OFFSET, SYSREG_DRP_SIZE
-       mtsr    SYSREG_MMUCR, r1
-       tlbw
-
-       tlbmiss_restore
-       rete
-
-       /* The slow path of the TLB miss handler */
-       .align  2
-page_table_not_present:
-       /* Do we need to synchronize with swapper_pg_dir? */
-       bld     r0, 31
-       brcs    sync_with_swapper_pg_dir
-
-page_not_present:
-       tlbmiss_restore
-       sub     sp, 4
-       stmts   --sp, r0-lr
-       call    save_full_context_ex
-       mfsr    r12, SYSREG_ECR
-       mov     r11, sp
-       call    do_page_fault
-       rjmp    ret_from_exception
-
-       .align  2
-sync_with_swapper_pg_dir:
-       /*
-        * If swapper_pg_dir contains a non-NULL second-level page
-        * table pointer, copy it into the current PGD. If not, we
-        * must handle it as a full-blown page fault.
-        *
-        * Jumping back to pgtbl_lookup causes an unnecessary lookup,
-        * but it is guaranteed to be a cache hit, it won't happen
-        * very often, and we absolutely do not want to sacrifice any
-        * performance in the fast path in order to improve this.
-        */
-       mov     r1, lo(swapper_pg_dir)
-       orh     r1, hi(swapper_pg_dir)
-       ld.w    r3, r1[r2 << 2]
-       cp.w    r3, 0
-       breq    page_not_present
-       mfsr    r1, SYSREG_PTBR
-       st.w    r1[r2 << 2], r3
-       rjmp    pgtbl_lookup
-
-       /*
-        * We currently have two bytes left at this point until we
-        * crash into the system call handler...
-        *
-        * Don't worry, the assembler will let us know.
-        */
-
-
-       /* ---                    System Call                    --- */
-
-       .org    0x100
-system_call:
-#ifdef CONFIG_PREEMPT
-       mask_interrupts
-#endif
-       pushm   r12             /* r12_orig */
-       stmts   --sp, r0-lr
-
-       mfsr    r0, SYSREG_RAR_SUP
-       mfsr    r1, SYSREG_RSR_SUP
-#ifdef CONFIG_PREEMPT
-       unmask_interrupts
-#endif
-       zero_fp
-       stm     --sp, r0-r1
-
-       /* check for syscall tracing */
-       get_thread_info r0
-       ld.w    r1, r0[TI_flags]
-       bld     r1, TIF_SYSCALL_TRACE
-       brcs    syscall_trace_enter
-
-syscall_trace_cont:
-       cp.w    r8, NR_syscalls
-       brhs    syscall_badsys
-
-       lddpc   lr, syscall_table_addr
-       ld.w    lr, lr[r8 << 2]
-       mov     r8, r5          /* 5th argument (6th is pushed by stub) */
-       icall   lr
-
-       .global syscall_return
-syscall_return:
-       get_thread_info r0
-       mask_interrupts         /* make sure we don't miss an interrupt
-                                  setting need_resched or sigpending
-                                  between sampling and the rets */
-
-       /* Store the return value so that the correct value is loaded below */
-       stdsp   sp[REG_R12], r12
-
-       ld.w    r1, r0[TI_flags]
-       andl    r1, _TIF_ALLWORK_MASK, COH
-       brne    syscall_exit_work
-
-syscall_exit_cont:
-       popm    r8-r9
-       mtsr    SYSREG_RAR_SUP, r8
-       mtsr    SYSREG_RSR_SUP, r9
-       ldmts   sp++, r0-lr
-       sub     sp, -4          /* r12_orig */
-       rets
-
-       .align  2
-syscall_table_addr:
-       .long   sys_call_table
-
-syscall_badsys:
-       mov     r12, -ENOSYS
-       rjmp    syscall_return
-
-       .global ret_from_fork
-ret_from_fork:
-       call   schedule_tail
-       mov     r12, 0
-       rjmp    syscall_return
-
-       .global ret_from_kernel_thread
-ret_from_kernel_thread:
-       call   schedule_tail
-       mov     r12, r0
-       mov     lr, r2  /* syscall_return */
-       mov     pc, r1
-
-syscall_trace_enter:
-       pushm   r8-r12
-       call    syscall_trace
-       popm    r8-r12
-       rjmp    syscall_trace_cont
-
-syscall_exit_work:
-       bld     r1, TIF_SYSCALL_TRACE
-       brcc    1f
-       unmask_interrupts
-       call    syscall_trace
-       mask_interrupts
-       ld.w    r1, r0[TI_flags]
-
-1:     bld     r1, TIF_NEED_RESCHED
-       brcc    2f
-       unmask_interrupts
-       call    schedule
-       mask_interrupts
-       ld.w    r1, r0[TI_flags]
-       rjmp    1b
-
-2:     mov     r2, _TIF_SIGPENDING | _TIF_NOTIFY_RESUME
-       tst     r1, r2
-       breq    3f
-       unmask_interrupts
-       mov     r12, sp
-       mov     r11, r0
-       call    do_notify_resume
-       mask_interrupts
-       ld.w    r1, r0[TI_flags]
-       rjmp    1b
-
-3:     bld     r1, TIF_BREAKPOINT
-       brcc    syscall_exit_cont
-       rjmp    enter_monitor_mode
-
-       /* This function expects to find offending PC in SYSREG_RAR_EX */
-       .type   save_full_context_ex, @function
-       .align  2
-save_full_context_ex:
-       mfsr    r11, SYSREG_RAR_EX
-       sub     r9, pc, . - debug_trampoline
-       mfsr    r8, SYSREG_RSR_EX
-       cp.w    r9, r11
-       breq    3f
-       mov     r12, r8
-       andh    r8, (MODE_MASK >> 16), COH
-       brne    2f
-
-1:     pushm   r11, r12        /* PC and SR */
-       unmask_exceptions
-       ret     r12
-
-2:     sub     r10, sp, -(FRAME_SIZE_FULL - REG_LR)
-       stdsp   sp[4], r10      /* replace saved SP */
-       rjmp    1b
-
-       /*
-        * The debug handler set up a trampoline to make us
-        * automatically enter monitor mode upon return, but since
-        * we're saving the full context, we must assume that the
-        * exception handler might want to alter the return address
-        * and/or status register. So we need to restore the original
-        * context and enter monitor mode manually after the exception
-        * has been handled.
-        */
-3:     get_thread_info r8
-       ld.w    r11, r8[TI_rar_saved]
-       ld.w    r12, r8[TI_rsr_saved]
-       rjmp    1b
-       .size   save_full_context_ex, . - save_full_context_ex
-
-       /* Low-level exception handlers */
-handle_critical:
-       /*
-        * AT32AP700x errata:
-        *
-        * After a Java stack overflow or underflow trap, any CPU
-        * memory access may cause erratic behavior. This will happen
-        * when the four least significant bits of the JOSP system
-        * register contains any value between 9 and 15 (inclusive).
-        *
-        * Possible workarounds:
-        *   - Don't use the Java Extension Module
-        *   - Ensure that the stack overflow and underflow trap
-        *     handlers do not do any memory access or trigger any
-        *     exceptions before the overflow/underflow condition is
-        *     cleared (by incrementing or decrementing the JOSP)
-        *   - Make sure that JOSP does not contain any problematic
-        *     value before doing any exception or interrupt
-        *     processing.
-        *   - Set up a critical exception handler which writes a
-        *     known-to-be-safe value, e.g. 4, to JOSP before doing
-        *     any further processing.
-        *
-        * We'll use the last workaround for now since we cannot
-        * guarantee that user space processes don't use Java mode.
-        * Non-well-behaving userland will be terminated with extreme
-        * prejudice.
-        */
-#ifdef CONFIG_CPU_AT32AP700X
-       /*
-        * There's a chance we can't touch memory, so temporarily
-        * borrow PTBR to save the stack pointer while we fix things
-        * up...
-        */
-       mtsr    SYSREG_PTBR, sp
-       mov     sp, 4
-       mtsr    SYSREG_JOSP, sp
-       mfsr    sp, SYSREG_PTBR
-       sub     pc, -2
-
-       /* Push most of pt_regs on stack. We'll do the rest later */
-       sub     sp, 4
-       pushm   r0-r12
-
-       /* PTBR mirrors current_thread_info()->task->active_mm->pgd */
-       get_thread_info r0
-       ld.w    r1, r0[TI_task]
-       ld.w    r2, r1[TSK_active_mm]
-       ld.w    r3, r2[MM_pgd]
-       mtsr    SYSREG_PTBR, r3
-#else
-       sub     sp, 4
-       pushm   r0-r12
-#endif
-       sub     r0, sp, -(14 * 4)
-       mov     r1, lr
-       mfsr    r2, SYSREG_RAR_EX
-       mfsr    r3, SYSREG_RSR_EX
-       pushm   r0-r3
-
-       mfsr    r12, SYSREG_ECR
-       mov     r11, sp
-       call    do_critical_exception
-
-       /* We should never get here... */
-bad_return:
-       sub     r12, pc, (. - 1f)
-       lddpc   pc, 2f
-       .align  2
-1:     .asciz  "Return from critical exception!"
-2:     .long   panic
-
-       .align  1
-do_bus_error_write:
-       sub     sp, 4
-       stmts   --sp, r0-lr
-       call    save_full_context_ex
-       mov     r11, 1
-       rjmp    1f
-
-do_bus_error_read:
-       sub     sp, 4
-       stmts   --sp, r0-lr
-       call    save_full_context_ex
-       mov     r11, 0
-1:     mfsr    r12, SYSREG_BEAR
-       mov     r10, sp
-       call    do_bus_error
-       rjmp    ret_from_exception
-
-       .align  1
-do_nmi_ll:
-       sub     sp, 4
-       stmts   --sp, r0-lr
-       mfsr    r9, SYSREG_RSR_NMI
-       mfsr    r8, SYSREG_RAR_NMI
-       bfextu  r0, r9, MODE_SHIFT, 3
-       brne    2f
-
-1:     pushm   r8, r9  /* PC and SR */
-       mfsr    r12, SYSREG_ECR
-       mov     r11, sp
-       call    do_nmi
-       popm    r8-r9
-       mtsr    SYSREG_RAR_NMI, r8
-       tst     r0, r0
-       mtsr    SYSREG_RSR_NMI, r9
-       brne    3f
-
-       ldmts   sp++, r0-lr
-       sub     sp, -4          /* skip r12_orig */
-       rete
-
-2:     sub     r10, sp, -(FRAME_SIZE_FULL - REG_LR)
-       stdsp   sp[4], r10      /* replace saved SP */
-       rjmp    1b
-
-3:     popm    lr
-       sub     sp, -4          /* skip sp */
-       popm    r0-r12
-       sub     sp, -4          /* skip r12_orig */
-       rete
-
-handle_address_fault:
-       sub     sp, 4
-       stmts   --sp, r0-lr
-       call    save_full_context_ex
-       mfsr    r12, SYSREG_ECR
-       mov     r11, sp
-       call    do_address_exception
-       rjmp    ret_from_exception
-
-handle_protection_fault:
-       sub     sp, 4
-       stmts   --sp, r0-lr
-       call    save_full_context_ex
-       mfsr    r12, SYSREG_ECR
-       mov     r11, sp
-       call    do_page_fault
-       rjmp    ret_from_exception
-
-       .align  1
-do_illegal_opcode_ll:
-       sub     sp, 4
-       stmts   --sp, r0-lr
-       call    save_full_context_ex
-       mfsr    r12, SYSREG_ECR
-       mov     r11, sp
-       call    do_illegal_opcode
-       rjmp    ret_from_exception
-
-do_dtlb_modified:
-       pushm   r0-r3
-       mfsr    r1, SYSREG_TLBEAR
-       mfsr    r0, SYSREG_PTBR
-       lsr     r2, r1, PGDIR_SHIFT
-       ld.w    r0, r0[r2 << 2]
-       lsl     r1, (32 - PGDIR_SHIFT)
-       lsr     r1, (32 - PGDIR_SHIFT) + PAGE_SHIFT
-
-       /* Translate to virtual address in P1 */
-       andl    r0, 0xf000
-       sbr     r0, 31
-       add     r2, r0, r1 << 2
-       ld.w    r3, r2[0]
-       sbr     r3, _PAGE_BIT_DIRTY
-       mov     r0, r3
-       st.w    r2[0], r3
-
-       /* The page table is up-to-date. Update the TLB entry as well */
-       andl    r0, lo(_PAGE_FLAGS_HARDWARE_MASK)
-       mtsr    SYSREG_TLBELO, r0
-
-       /* MMUCR[DRP] is updated automatically, so let's go... */
-       tlbw
-
-       popm    r0-r3
-       rete
-
-do_fpe_ll:
-       sub     sp, 4
-       stmts   --sp, r0-lr
-       call    save_full_context_ex
-       unmask_interrupts
-       mov     r12, 26
-       mov     r11, sp
-       call    do_fpe
-       rjmp    ret_from_exception
-
-ret_from_exception:
-       mask_interrupts
-       lddsp   r4, sp[REG_SR]
-
-       andh    r4, (MODE_MASK >> 16), COH
-       brne    fault_resume_kernel
-
-       get_thread_info r0
-       ld.w    r1, r0[TI_flags]
-       andl    r1, _TIF_WORK_MASK, COH
-       brne    fault_exit_work
-
-fault_resume_user:
-       popm    r8-r9
-       mask_exceptions
-       mtsr    SYSREG_RAR_EX, r8
-       mtsr    SYSREG_RSR_EX, r9
-       ldmts   sp++, r0-lr
-       sub     sp, -4
-       rete
-
-fault_resume_kernel:
-#ifdef CONFIG_PREEMPT
-       get_thread_info r0
-       ld.w    r2, r0[TI_preempt_count]
-       cp.w    r2, 0
-       brne    1f
-       ld.w    r1, r0[TI_flags]
-       bld     r1, TIF_NEED_RESCHED
-       brcc    1f
-       lddsp   r4, sp[REG_SR]
-       bld     r4, SYSREG_GM_OFFSET
-       brcs    1f
-       call    preempt_schedule_irq
-1:
-#endif
-
-       popm    r8-r9
-       mask_exceptions
-       mfsr    r1, SYSREG_SR
-       mtsr    SYSREG_RAR_EX, r8
-       mtsr    SYSREG_RSR_EX, r9
-       popm    lr
-       sub     sp, -4          /* ignore SP */
-       popm    r0-r12
-       sub     sp, -4          /* ignore r12_orig */
-       rete
-
-irq_exit_work:
-       /* Switch to exception mode so that we can share the same code. */
-       mfsr    r8, SYSREG_SR
-       cbr     r8, SYSREG_M0_OFFSET
-       orh     r8, hi(SYSREG_BIT(M1) | SYSREG_BIT(M2))
-       mtsr    SYSREG_SR, r8
-       sub     pc, -2
-       get_thread_info r0
-       ld.w    r1, r0[TI_flags]
-
-fault_exit_work:
-       bld     r1, TIF_NEED_RESCHED
-       brcc    1f
-       unmask_interrupts
-       call    schedule
-       mask_interrupts
-       ld.w    r1, r0[TI_flags]
-       rjmp    fault_exit_work
-
-1:     mov     r2, _TIF_SIGPENDING | _TIF_NOTIFY_RESUME
-       tst     r1, r2
-       breq    2f
-       unmask_interrupts
-       mov     r12, sp
-       mov     r11, r0
-       call    do_notify_resume
-       mask_interrupts
-       ld.w    r1, r0[TI_flags]
-       rjmp    fault_exit_work
-
-2:     bld     r1, TIF_BREAKPOINT
-       brcc    fault_resume_user
-       rjmp    enter_monitor_mode
-
-       .section .kprobes.text, "ax", @progbits
-       .type   handle_debug, @function
-handle_debug:
-       sub     sp, 4           /* r12_orig */
-       stmts   --sp, r0-lr
-       mfsr    r8, SYSREG_RAR_DBG
-       mfsr    r9, SYSREG_RSR_DBG
-       unmask_exceptions
-       pushm   r8-r9
-       bfextu  r9, r9, SYSREG_MODE_OFFSET, SYSREG_MODE_SIZE
-       brne    debug_fixup_regs
-
-.Ldebug_fixup_cont:
-#ifdef CONFIG_TRACE_IRQFLAGS
-       call    trace_hardirqs_off
-#endif
-       mov     r12, sp
-       call    do_debug
-       mov     sp, r12
-
-       lddsp   r2, sp[REG_SR]
-       bfextu  r3, r2, SYSREG_MODE_OFFSET, SYSREG_MODE_SIZE
-       brne    debug_resume_kernel
-
-       get_thread_info r0
-       ld.w    r1, r0[TI_flags]
-       mov     r2, _TIF_DBGWORK_MASK
-       tst     r1, r2
-       brne    debug_exit_work
-
-       bld     r1, TIF_SINGLE_STEP
-       brcc    1f
-       mfdr    r4, OCD_DC
-       sbr     r4, OCD_DC_SS_BIT
-       mtdr    OCD_DC, r4
-
-1:     popm    r10,r11
-       mask_exceptions
-       mtsr    SYSREG_RSR_DBG, r11
-       mtsr    SYSREG_RAR_DBG, r10
-#ifdef CONFIG_TRACE_IRQFLAGS
-       call    trace_hardirqs_on
-1:
-#endif
-       ldmts   sp++, r0-lr
-       sub     sp, -4
-       retd
-       .size   handle_debug, . - handle_debug
-
-       /* Mode of the trapped context is in r9 */
-       .type   debug_fixup_regs, @function
-debug_fixup_regs:
-       mfsr    r8, SYSREG_SR
-       mov     r10, r8
-       bfins   r8, r9, SYSREG_MODE_OFFSET, SYSREG_MODE_SIZE
-       mtsr    SYSREG_SR, r8
-       sub     pc, -2
-       stdsp   sp[REG_LR], lr
-       mtsr    SYSREG_SR, r10
-       sub     pc, -2
-       sub     r8, sp, -FRAME_SIZE_FULL
-       stdsp   sp[REG_SP], r8
-       rjmp    .Ldebug_fixup_cont
-       .size   debug_fixup_regs, . - debug_fixup_regs
-
-       .type   debug_resume_kernel, @function
-debug_resume_kernel:
-       mask_exceptions
-       popm    r10, r11
-       mtsr    SYSREG_RAR_DBG, r10
-       mtsr    SYSREG_RSR_DBG, r11
-#ifdef CONFIG_TRACE_IRQFLAGS
-       bld     r11, SYSREG_GM_OFFSET
-       brcc    1f
-       call    trace_hardirqs_on
-1:
-#endif
-       mfsr    r2, SYSREG_SR
-       mov     r1, r2
-       bfins   r2, r3, SYSREG_MODE_OFFSET, SYSREG_MODE_SIZE
-       mtsr    SYSREG_SR, r2
-       sub     pc, -2
-       popm    lr
-       mtsr    SYSREG_SR, r1
-       sub     pc, -2
-       sub     sp, -4          /* skip SP */
-       popm    r0-r12
-       sub     sp, -4
-       retd
-       .size   debug_resume_kernel, . - debug_resume_kernel
-
-       .type   debug_exit_work, @function
-debug_exit_work:
-       /*
-        * We must return from Monitor Mode using a retd, and we must
-        * not schedule since that involves the D bit in SR getting
-        * cleared by something other than the debug hardware. This
-        * may cause undefined behaviour according to the Architecture
-        * manual.
-        *
-        * So we fix up the return address and status and return to a
-        * stub below in Exception mode. From there, we can follow the
-        * normal exception return path.
-        *
-        * The real return address and status registers are stored on
-        * the stack in the way the exception return path understands,
-        * so no need to fix anything up there.
-        */
-       sub     r8, pc, . - fault_exit_work
-       mtsr    SYSREG_RAR_DBG, r8
-       mov     r9, 0
-       orh     r9, hi(SR_EM | SR_GM | MODE_EXCEPTION)
-       mtsr    SYSREG_RSR_DBG, r9
-       sub     pc, -2
-       retd
-       .size   debug_exit_work, . - debug_exit_work
-
-       .set    rsr_int0,       SYSREG_RSR_INT0
-       .set    rsr_int1,       SYSREG_RSR_INT1
-       .set    rsr_int2,       SYSREG_RSR_INT2
-       .set    rsr_int3,       SYSREG_RSR_INT3
-       .set    rar_int0,       SYSREG_RAR_INT0
-       .set    rar_int1,       SYSREG_RAR_INT1
-       .set    rar_int2,       SYSREG_RAR_INT2
-       .set    rar_int3,       SYSREG_RAR_INT3
-
-       .macro  IRQ_LEVEL level
-       .type   irq_level\level, @function
-irq_level\level:
-       sub     sp, 4           /* r12_orig */
-       stmts   --sp,r0-lr
-       mfsr    r8, rar_int\level
-       mfsr    r9, rsr_int\level
-
-#ifdef CONFIG_PREEMPT
-       sub     r11, pc, (. - system_call)
-       cp.w    r11, r8
-       breq    4f
-#endif
-
-       pushm   r8-r9
-
-       mov     r11, sp
-       mov     r12, \level
-
-       call    do_IRQ
-
-       lddsp   r4, sp[REG_SR]
-       bfextu  r4, r4, SYSREG_M0_OFFSET, 3
-       cp.w    r4, MODE_SUPERVISOR >> SYSREG_M0_OFFSET
-       breq    2f
-       cp.w    r4, MODE_USER >> SYSREG_M0_OFFSET
-#ifdef CONFIG_PREEMPT
-       brne    3f
-#else
-       brne    1f
-#endif
-
-       get_thread_info r0
-       ld.w    r1, r0[TI_flags]
-       andl    r1, _TIF_WORK_MASK, COH
-       brne    irq_exit_work
-
-1:
-#ifdef CONFIG_TRACE_IRQFLAGS
-       call    trace_hardirqs_on
-#endif
-       popm    r8-r9
-       mtsr    rar_int\level, r8
-       mtsr    rsr_int\level, r9
-       ldmts   sp++,r0-lr
-       sub     sp, -4          /* ignore r12_orig */
-       rete
-
-#ifdef CONFIG_PREEMPT
-4:     mask_interrupts
-       mfsr    r8, rsr_int\level
-       sbr     r8, 16
-       mtsr    rsr_int\level, r8
-       ldmts   sp++, r0-lr
-       sub     sp, -4          /* ignore r12_orig */
-       rete
-#endif
-
-2:     get_thread_info r0
-       ld.w    r1, r0[TI_flags]
-       bld     r1, TIF_CPU_GOING_TO_SLEEP
-#ifdef CONFIG_PREEMPT
-       brcc    3f
-#else
-       brcc    1b
-#endif
-       sub     r1, pc, . - cpu_idle_skip_sleep
-       stdsp   sp[REG_PC], r1
-#ifdef CONFIG_PREEMPT
-3:     get_thread_info r0
-       ld.w    r2, r0[TI_preempt_count]
-       cp.w    r2, 0
-       brne    1b
-       ld.w    r1, r0[TI_flags]
-       bld     r1, TIF_NEED_RESCHED
-       brcc    1b
-       lddsp   r4, sp[REG_SR]
-       bld     r4, SYSREG_GM_OFFSET
-       brcs    1b
-       call    preempt_schedule_irq
-#endif
-       rjmp    1b
-       .endm
-
-       .section .irq.text,"ax",@progbits
-
-       .global irq_level0
-       .global irq_level1
-       .global irq_level2
-       .global irq_level3
-       IRQ_LEVEL 0
-       IRQ_LEVEL 1
-       IRQ_LEVEL 2
-       IRQ_LEVEL 3
-
-       .section .kprobes.text, "ax", @progbits
-       .type   enter_monitor_mode, @function
-enter_monitor_mode:
-       /*
-        * We need to enter monitor mode to do a single step. The
-        * monitor code will alter the return address so that we
-        * return directly to the user instead of returning here.
-        */
-       breakpoint
-       rjmp    breakpoint_failed
-
-       .size   enter_monitor_mode, . - enter_monitor_mode
-
-       .type   debug_trampoline, @function
-       .global debug_trampoline
-debug_trampoline:
-       /*
-        * Save the registers on the stack so that the monitor code
-        * can find them easily.
-        */
-       sub     sp, 4           /* r12_orig */
-       stmts   --sp, r0-lr
-       get_thread_info r0
-       ld.w    r8, r0[TI_rar_saved]
-       ld.w    r9, r0[TI_rsr_saved]
-       pushm   r8-r9
-
-       /*
-        * The monitor code will alter the return address so we don't
-        * return here.
-        */
-       breakpoint
-       rjmp    breakpoint_failed
-       .size   debug_trampoline, . - debug_trampoline
-
-       .type breakpoint_failed, @function
-breakpoint_failed:
-       /*
-        * Something went wrong. Perhaps the debug hardware isn't
-        * enabled?
-        */
-       lda.w   r12, msg_breakpoint_failed
-       mov     r11, sp
-       mov     r10, 9          /* SIGKILL */
-       call    die
-1:     rjmp    1b
-
-msg_breakpoint_failed:
-       .asciz  "Failed to enter Debug Mode"
diff --git a/arch/avr32/kernel/head.S b/arch/avr32/kernel/head.S
deleted file mode 100644 (file)
index 59eae6d..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Non-board-specific low-level startup code
- *
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/linkage.h>
-
-#include <asm/page.h>
-
-       .section .init.text,"ax"
-       .global kernel_entry
-kernel_entry:
-       /* Start the show */
-       lddpc   pc, kernel_start_addr
-
-       .align  2
-kernel_start_addr:
-       .long   start_kernel
diff --git a/arch/avr32/kernel/irq.c b/arch/avr32/kernel/irq.c
deleted file mode 100644 (file)
index 900e49b..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * Based on arch/i386/kernel/irq.c
- *   Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/kernel_stat.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/device.h>
-
-/* May be overridden by platform code */
-int __weak nmi_enable(void)
-{
-       return -ENOSYS;
-}
-
-void __weak nmi_disable(void)
-{
-
-}
diff --git a/arch/avr32/kernel/kprobes.c b/arch/avr32/kernel/kprobes.c
deleted file mode 100644 (file)
index a94ece4..0000000
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- *  Kernel Probes (KProbes)
- *
- * Copyright (C) 2005-2006 Atmel Corporation
- *
- * Based on arch/ppc64/kernel/kprobes.c
- *  Copyright (C) IBM Corporation, 2002, 2004
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/kprobes.h>
-#include <linux/ptrace.h>
-
-#include <asm/cacheflush.h>
-#include <linux/kdebug.h>
-#include <asm/ocd.h>
-
-DEFINE_PER_CPU(struct kprobe *, current_kprobe);
-static unsigned long kprobe_status;
-static struct pt_regs jprobe_saved_regs;
-
-struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}};
-
-int __kprobes arch_prepare_kprobe(struct kprobe *p)
-{
-       int ret = 0;
-
-       if ((unsigned long)p->addr & 0x01) {
-               printk("Attempt to register kprobe at an unaligned address\n");
-               ret = -EINVAL;
-       }
-
-       /* XXX: Might be a good idea to check if p->addr is a valid
-        * kernel address as well... */
-
-       if (!ret) {
-               pr_debug("copy kprobe at %p\n", p->addr);
-               memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
-               p->opcode = *p->addr;
-       }
-
-       return ret;
-}
-
-void __kprobes arch_arm_kprobe(struct kprobe *p)
-{
-       pr_debug("arming kprobe at %p\n", p->addr);
-       ocd_enable(NULL);
-       *p->addr = BREAKPOINT_INSTRUCTION;
-       flush_icache_range((unsigned long)p->addr,
-                          (unsigned long)p->addr + sizeof(kprobe_opcode_t));
-}
-
-void __kprobes arch_disarm_kprobe(struct kprobe *p)
-{
-       pr_debug("disarming kprobe at %p\n", p->addr);
-       ocd_disable(NULL);
-       *p->addr = p->opcode;
-       flush_icache_range((unsigned long)p->addr,
-                          (unsigned long)p->addr + sizeof(kprobe_opcode_t));
-}
-
-static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
-{
-       unsigned long dc;
-
-       pr_debug("preparing to singlestep over %p (PC=%08lx)\n",
-                p->addr, regs->pc);
-
-       BUG_ON(!(sysreg_read(SR) & SYSREG_BIT(SR_D)));
-
-       dc = ocd_read(DC);
-       dc |= 1 << OCD_DC_SS_BIT;
-       ocd_write(DC, dc);
-
-       /*
-        * We must run the instruction from its original location
-        * since it may actually reference PC.
-        *
-        * TODO: Do the instruction replacement directly in icache.
-        */
-       *p->addr = p->opcode;
-       flush_icache_range((unsigned long)p->addr,
-                          (unsigned long)p->addr + sizeof(kprobe_opcode_t));
-}
-
-static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs)
-{
-       unsigned long dc;
-
-       pr_debug("resuming execution at PC=%08lx\n", regs->pc);
-
-       dc = ocd_read(DC);
-       dc &= ~(1 << OCD_DC_SS_BIT);
-       ocd_write(DC, dc);
-
-       *p->addr = BREAKPOINT_INSTRUCTION;
-       flush_icache_range((unsigned long)p->addr,
-                          (unsigned long)p->addr + sizeof(kprobe_opcode_t));
-}
-
-static void __kprobes set_current_kprobe(struct kprobe *p)
-{
-       __this_cpu_write(current_kprobe, p);
-}
-
-static int __kprobes kprobe_handler(struct pt_regs *regs)
-{
-       struct kprobe *p;
-       void *addr = (void *)regs->pc;
-       int ret = 0;
-
-       pr_debug("kprobe_handler: kprobe_running=%p\n",
-                kprobe_running());
-
-       /*
-        * We don't want to be preempted for the entire
-        * duration of kprobe processing
-        */
-       preempt_disable();
-
-       /* Check that we're not recursing */
-       if (kprobe_running()) {
-               p = get_kprobe(addr);
-               if (p) {
-                       if (kprobe_status == KPROBE_HIT_SS) {
-                               printk("FIXME: kprobe hit while single-stepping!\n");
-                               goto no_kprobe;
-                       }
-
-                       printk("FIXME: kprobe hit while handling another kprobe\n");
-                       goto no_kprobe;
-               } else {
-                       p = kprobe_running();
-                       if (p->break_handler && p->break_handler(p, regs))
-                               goto ss_probe;
-               }
-               /* If it's not ours, can't be delete race, (we hold lock). */
-               goto no_kprobe;
-       }
-
-       p = get_kprobe(addr);
-       if (!p)
-               goto no_kprobe;
-
-       kprobe_status = KPROBE_HIT_ACTIVE;
-       set_current_kprobe(p);
-       if (p->pre_handler && p->pre_handler(p, regs))
-               /* handler has already set things up, so skip ss setup */
-               return 1;
-
-ss_probe:
-       prepare_singlestep(p, regs);
-       kprobe_status = KPROBE_HIT_SS;
-       return 1;
-
-no_kprobe:
-       preempt_enable_no_resched();
-       return ret;
-}
-
-static int __kprobes post_kprobe_handler(struct pt_regs *regs)
-{
-       struct kprobe *cur = kprobe_running();
-
-       pr_debug("post_kprobe_handler, cur=%p\n", cur);
-
-       if (!cur)
-               return 0;
-
-       if (cur->post_handler) {
-               kprobe_status = KPROBE_HIT_SSDONE;
-               cur->post_handler(cur, regs, 0);
-       }
-
-       resume_execution(cur, regs);
-       reset_current_kprobe();
-       preempt_enable_no_resched();
-
-       return 1;
-}
-
-int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
-{
-       struct kprobe *cur = kprobe_running();
-
-       pr_debug("kprobe_fault_handler: trapnr=%d\n", trapnr);
-
-       if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
-               return 1;
-
-       if (kprobe_status & KPROBE_HIT_SS) {
-               resume_execution(cur, regs);
-               preempt_enable_no_resched();
-       }
-       return 0;
-}
-
-/*
- * Wrapper routine to for handling exceptions.
- */
-int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
-                                      unsigned long val, void *data)
-{
-       struct die_args *args = (struct die_args *)data;
-       int ret = NOTIFY_DONE;
-
-       pr_debug("kprobe_exceptions_notify: val=%lu, data=%p\n",
-                val, data);
-
-       switch (val) {
-       case DIE_BREAKPOINT:
-               if (kprobe_handler(args->regs))
-                       ret = NOTIFY_STOP;
-               break;
-       case DIE_SSTEP:
-               if (post_kprobe_handler(args->regs))
-                       ret = NOTIFY_STOP;
-               break;
-       default:
-               break;
-       }
-
-       return ret;
-}
-
-int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
-{
-       struct jprobe *jp = container_of(p, struct jprobe, kp);
-
-       memcpy(&jprobe_saved_regs, regs, sizeof(struct pt_regs));
-
-       /*
-        * TODO: We should probably save some of the stack here as
-        * well, since gcc may pass arguments on the stack for certain
-        * functions (lots of arguments, large aggregates, varargs)
-        */
-
-       /* setup return addr to the jprobe handler routine */
-       regs->pc = (unsigned long)jp->entry;
-       return 1;
-}
-
-void __kprobes jprobe_return(void)
-{
-       asm volatile("breakpoint" ::: "memory");
-}
-
-int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
-{
-       /*
-        * FIXME - we should ideally be validating that we got here 'cos
-        * of the "trap" in jprobe_return() above, before restoring the
-        * saved regs...
-        */
-       memcpy(regs, &jprobe_saved_regs, sizeof(struct pt_regs));
-       return 1;
-}
-
-int __init arch_init_kprobes(void)
-{
-       /* TODO: Register kretprobe trampoline */
-       return 0;
-}
diff --git a/arch/avr32/kernel/module.c b/arch/avr32/kernel/module.c
deleted file mode 100644 (file)
index 2b4c54c..0000000
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * AVR32-specific kernel module loader
- *
- * Copyright (C) 2005-2006 Atmel Corporation
- *
- * GOT initialization parts are based on the s390 version
- *   Copyright (C) 2002, 2003 IBM Deutschland Entwicklung GmbH,
- *                            IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/bug.h>
-#include <linux/elf.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleloader.h>
-#include <linux/vmalloc.h>
-
-void module_arch_freeing_init(struct module *mod)
-{
-       vfree(mod->arch.syminfo);
-       mod->arch.syminfo = NULL;
-}
-
-static inline int check_rela(Elf32_Rela *rela, struct module *module,
-                            char *strings, Elf32_Sym *symbols)
-{
-       struct mod_arch_syminfo *info;
-
-       info = module->arch.syminfo + ELF32_R_SYM(rela->r_info);
-       switch (ELF32_R_TYPE(rela->r_info)) {
-       case R_AVR32_GOT32:
-       case R_AVR32_GOT16:
-       case R_AVR32_GOT8:
-       case R_AVR32_GOT21S:
-       case R_AVR32_GOT18SW:   /* mcall */
-       case R_AVR32_GOT16S:    /* ld.w */
-               if (rela->r_addend != 0) {
-                       printk(KERN_ERR
-                              "GOT relocation against %s at offset %u with addend\n",
-                              strings + symbols[ELF32_R_SYM(rela->r_info)].st_name,
-                              rela->r_offset);
-                       return -ENOEXEC;
-               }
-               if (info->got_offset == -1UL) {
-                       info->got_offset = module->arch.got_size;
-                       module->arch.got_size += sizeof(void *);
-               }
-               pr_debug("GOT[%3lu] %s\n", info->got_offset,
-                        strings + symbols[ELF32_R_SYM(rela->r_info)].st_name);
-               break;
-       }
-
-       return 0;
-}
-
-int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
-                             char *secstrings, struct module *module)
-{
-       Elf32_Shdr *symtab;
-       Elf32_Sym *symbols;
-       Elf32_Rela *rela;
-       char *strings;
-       int nrela, i, j;
-       int ret;
-
-       /* Find the symbol table */
-       symtab = NULL;
-       for (i = 0; i < hdr->e_shnum; i++)
-               switch (sechdrs[i].sh_type) {
-               case SHT_SYMTAB:
-                       symtab = &sechdrs[i];
-                       break;
-               }
-       if (!symtab) {
-               printk(KERN_ERR "module %s: no symbol table\n", module->name);
-               return -ENOEXEC;
-       }
-
-       /* Allocate room for one syminfo structure per symbol. */
-       module->arch.nsyms = symtab->sh_size / sizeof(Elf_Sym);
-       module->arch.syminfo = vmalloc(module->arch.nsyms
-                                  * sizeof(struct mod_arch_syminfo));
-       if (!module->arch.syminfo)
-               return -ENOMEM;
-
-       symbols = (void *)hdr + symtab->sh_offset;
-       strings = (void *)hdr + sechdrs[symtab->sh_link].sh_offset;
-       for (i = 0; i < module->arch.nsyms; i++) {
-               if (symbols[i].st_shndx == SHN_UNDEF &&
-                   strcmp(strings + symbols[i].st_name,
-                          "_GLOBAL_OFFSET_TABLE_") == 0)
-                       /* "Define" it as absolute. */
-                       symbols[i].st_shndx = SHN_ABS;
-               module->arch.syminfo[i].got_offset = -1UL;
-               module->arch.syminfo[i].got_initialized = 0;
-       }
-
-       /* Allocate GOT entries for symbols that need it. */
-       module->arch.got_size = 0;
-       for (i = 0; i < hdr->e_shnum; i++) {
-               if (sechdrs[i].sh_type != SHT_RELA)
-                       continue;
-               nrela = sechdrs[i].sh_size / sizeof(Elf32_Rela);
-               rela = (void *)hdr + sechdrs[i].sh_offset;
-               for (j = 0; j < nrela; j++) {
-                       ret = check_rela(rela + j, module,
-                                        strings, symbols);
-                       if (ret)
-                               goto out_free_syminfo;
-               }
-       }
-
-       /*
-        * Increase core size to make room for GOT and set start
-        * offset for GOT.
-        */
-       module->core_layout.size = ALIGN(module->core_layout.size, 4);
-       module->arch.got_offset = module->core_layout.size;
-       module->core_layout.size += module->arch.got_size;
-
-       return 0;
-
-out_free_syminfo:
-       vfree(module->arch.syminfo);
-       module->arch.syminfo = NULL;
-
-       return ret;
-}
-
-static inline int reloc_overflow(struct module *module, const char *reloc_name,
-                                Elf32_Addr relocation)
-{
-       printk(KERN_ERR "module %s: Value %lx does not fit relocation %s\n",
-              module->name, (unsigned long)relocation, reloc_name);
-       return -ENOEXEC;
-}
-
-#define get_u16(loc)           (*((uint16_t *)loc))
-#define put_u16(loc, val)      (*((uint16_t *)loc) = (val))
-
-int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab,
-                      unsigned int symindex, unsigned int relindex,
-                      struct module *module)
-{
-       Elf32_Shdr *symsec = sechdrs + symindex;
-       Elf32_Shdr *relsec = sechdrs + relindex;
-       Elf32_Shdr *dstsec = sechdrs + relsec->sh_info;
-       Elf32_Rela *rel = (void *)relsec->sh_addr;
-       unsigned int i;
-       int ret = 0;
-
-       for (i = 0; i < relsec->sh_size / sizeof(Elf32_Rela); i++, rel++) {
-               struct mod_arch_syminfo *info;
-               Elf32_Sym *sym;
-               Elf32_Addr relocation;
-               uint32_t *location;
-               uint32_t value;
-
-               location = (void *)dstsec->sh_addr + rel->r_offset;
-               sym = (Elf32_Sym *)symsec->sh_addr + ELF32_R_SYM(rel->r_info);
-               relocation = sym->st_value + rel->r_addend;
-
-               info = module->arch.syminfo + ELF32_R_SYM(rel->r_info);
-
-               /* Initialize GOT entry if necessary */
-               switch (ELF32_R_TYPE(rel->r_info)) {
-               case R_AVR32_GOT32:
-               case R_AVR32_GOT16:
-               case R_AVR32_GOT8:
-               case R_AVR32_GOT21S:
-               case R_AVR32_GOT18SW:
-               case R_AVR32_GOT16S:
-                       if (!info->got_initialized) {
-                               Elf32_Addr *gotent;
-
-                               gotent = (module->core_layout.base
-                                         + module->arch.got_offset
-                                         + info->got_offset);
-                               *gotent = relocation;
-                               info->got_initialized = 1;
-                       }
-
-                       relocation = info->got_offset;
-                       break;
-               }
-
-               switch (ELF32_R_TYPE(rel->r_info)) {
-               case R_AVR32_32:
-               case R_AVR32_32_CPENT:
-                       *location = relocation;
-                       break;
-               case R_AVR32_22H_PCREL:
-                       relocation -= (Elf32_Addr)location;
-                       if ((relocation & 0xffe00001) != 0
-                           && (relocation & 0xffc00001) != 0xffc00000)
-                               return reloc_overflow(module,
-                                                     "R_AVR32_22H_PCREL",
-                                                     relocation);
-                       relocation >>= 1;
-
-                       value = *location;
-                       value = ((value & 0xe1ef0000)
-                                | (relocation & 0xffff)
-                                | ((relocation & 0x10000) << 4)
-                                | ((relocation & 0x1e0000) << 8));
-                       *location = value;
-                       break;
-               case R_AVR32_11H_PCREL:
-                       relocation -= (Elf32_Addr)location;
-                       if ((relocation & 0xfffffc01) != 0
-                           && (relocation & 0xfffff801) != 0xfffff800)
-                               return reloc_overflow(module,
-                                                     "R_AVR32_11H_PCREL",
-                                                     relocation);
-                       value = get_u16(location);
-                       value = ((value & 0xf00c)
-                                | ((relocation & 0x1fe) << 3)
-                                | ((relocation & 0x600) >> 9));
-                       put_u16(location, value);
-                       break;
-               case R_AVR32_9H_PCREL:
-                       relocation -= (Elf32_Addr)location;
-                       if ((relocation & 0xffffff01) != 0
-                           && (relocation & 0xfffffe01) != 0xfffffe00)
-                               return reloc_overflow(module,
-                                                     "R_AVR32_9H_PCREL",
-                                                     relocation);
-                       value = get_u16(location);
-                       value = ((value & 0xf00f)
-                                | ((relocation & 0x1fe) << 3));
-                       put_u16(location, value);
-                       break;
-               case R_AVR32_9UW_PCREL:
-                       relocation -= ((Elf32_Addr)location) & 0xfffffffc;
-                       if ((relocation & 0xfffffc03) != 0)
-                               return reloc_overflow(module,
-                                                     "R_AVR32_9UW_PCREL",
-                                                     relocation);
-                       value = get_u16(location);
-                       value = ((value & 0xf80f)
-                                | ((relocation & 0x1fc) << 2));
-                       put_u16(location, value);
-                       break;
-               case R_AVR32_GOTPC:
-                       /*
-                        * R6 = PC - (PC - GOT)
-                        *
-                        * At this point, relocation contains the
-                        * value of PC.  Just subtract the value of
-                        * GOT, and we're done.
-                        */
-                       pr_debug("GOTPC: PC=0x%x, got_offset=0x%lx, core=0x%p\n",
-                                relocation, module->arch.got_offset,
-                                module->core_layout.base);
-                       relocation -= ((unsigned long)module->core_layout.base
-                                      + module->arch.got_offset);
-                       *location = relocation;
-                       break;
-               case R_AVR32_GOT18SW:
-                       if ((relocation & 0xfffe0003) != 0
-                           && (relocation & 0xfffc0000) != 0xfffc0000)
-                               return reloc_overflow(module, "R_AVR32_GOT18SW",
-                                                    relocation);
-                       relocation >>= 2;
-                       /* fall through */
-               case R_AVR32_GOT16S:
-                       if ((relocation & 0xffff8000) != 0
-                           && (relocation & 0xffff0000) != 0xffff0000)
-                               return reloc_overflow(module, "R_AVR32_GOT16S",
-                                                     relocation);
-                       pr_debug("GOT reloc @ 0x%x -> %u\n",
-                                rel->r_offset, relocation);
-                       value = *location;
-                       value = ((value & 0xffff0000)
-                                | (relocation & 0xffff));
-                       *location = value;
-                       break;
-
-               default:
-                       printk(KERN_ERR "module %s: Unknown relocation: %u\n",
-                              module->name, ELF32_R_TYPE(rel->r_info));
-                       return -ENOEXEC;
-               }
-       }
-
-       return ret;
-}
diff --git a/arch/avr32/kernel/nmi_debug.c b/arch/avr32/kernel/nmi_debug.c
deleted file mode 100644 (file)
index 2582304..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (C) 2007 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/delay.h>
-#include <linux/kdebug.h>
-#include <linux/notifier.h>
-#include <linux/sched.h>
-#include <linux/sched/debug.h>
-
-#include <asm/irq.h>
-
-enum nmi_action {
-       NMI_SHOW_STATE  = 1 << 0,
-       NMI_SHOW_REGS   = 1 << 1,
-       NMI_DIE         = 1 << 2,
-       NMI_DEBOUNCE    = 1 << 3,
-};
-
-static unsigned long nmi_actions;
-
-static int nmi_debug_notify(struct notifier_block *self,
-               unsigned long val, void *data)
-{
-       struct die_args *args = data;
-
-       if (likely(val != DIE_NMI))
-               return NOTIFY_DONE;
-
-       if (nmi_actions & NMI_SHOW_STATE)
-               show_state();
-       if (nmi_actions & NMI_SHOW_REGS)
-               show_regs(args->regs);
-       if (nmi_actions & NMI_DEBOUNCE)
-               mdelay(10);
-       if (nmi_actions & NMI_DIE)
-               return NOTIFY_BAD;
-
-       return NOTIFY_OK;
-}
-
-static struct notifier_block nmi_debug_nb = {
-       .notifier_call = nmi_debug_notify,
-};
-
-static int __init nmi_debug_setup(char *str)
-{
-       char *p, *sep;
-
-       register_die_notifier(&nmi_debug_nb);
-       if (nmi_enable()) {
-               printk(KERN_WARNING "Unable to enable NMI.\n");
-               return 0;
-       }
-
-       if (*str != '=')
-               return 0;
-
-       for (p = str + 1; *p; p = sep + 1) {
-               sep = strchr(p, ',');
-               if (sep)
-                       *sep = 0;
-               if (strcmp(p, "state") == 0)
-                       nmi_actions |= NMI_SHOW_STATE;
-               else if (strcmp(p, "regs") == 0)
-                       nmi_actions |= NMI_SHOW_REGS;
-               else if (strcmp(p, "debounce") == 0)
-                       nmi_actions |= NMI_DEBOUNCE;
-               else if (strcmp(p, "die") == 0)
-                       nmi_actions |= NMI_DIE;
-               else
-                       printk(KERN_WARNING "NMI: Unrecognized action `%s'\n",
-                               p);
-               if (!sep)
-                       break;
-       }
-
-       return 0;
-}
-__setup("nmi_debug", nmi_debug_setup);
diff --git a/arch/avr32/kernel/ocd.c b/arch/avr32/kernel/ocd.c
deleted file mode 100644 (file)
index 1b0245d..0000000
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Copyright (C) 2007 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-
-#include <asm/ocd.h>
-
-static long ocd_count;
-static spinlock_t ocd_lock;
-
-/**
- * ocd_enable - enable on-chip debugging
- * @child: task to be debugged
- *
- * If @child is non-NULL, ocd_enable() first checks if debugging has
- * already been enabled for @child, and if it has, does nothing.
- *
- * If @child is NULL (e.g. when debugging the kernel), or debugging
- * has not already been enabled for it, ocd_enable() increments the
- * reference count and enables the debugging hardware.
- */
-void ocd_enable(struct task_struct *child)
-{
-       u32 dc;
-
-       if (child)
-               pr_debug("ocd_enable: child=%s [%u]\n",
-                               child->comm, child->pid);
-       else
-               pr_debug("ocd_enable (no child)\n");
-
-       if (!child || !test_and_set_tsk_thread_flag(child, TIF_DEBUG)) {
-               spin_lock(&ocd_lock);
-               ocd_count++;
-               dc = ocd_read(DC);
-               dc |= (1 << OCD_DC_MM_BIT) | (1 << OCD_DC_DBE_BIT);
-               ocd_write(DC, dc);
-               spin_unlock(&ocd_lock);
-       }
-}
-
-/**
- * ocd_disable - disable on-chip debugging
- * @child: task that was being debugged, but isn't anymore
- *
- * If @child is non-NULL, ocd_disable() checks if debugging is enabled
- * for @child, and if it isn't, does nothing.
- *
- * If @child is NULL (e.g. when debugging the kernel), or debugging is
- * enabled, ocd_disable() decrements the reference count, and if it
- * reaches zero, disables the debugging hardware.
- */
-void ocd_disable(struct task_struct *child)
-{
-       u32 dc;
-
-       if (!child)
-               pr_debug("ocd_disable (no child)\n");
-       else if (test_tsk_thread_flag(child, TIF_DEBUG))
-               pr_debug("ocd_disable: child=%s [%u]\n",
-                               child->comm, child->pid);
-
-       if (!child || test_and_clear_tsk_thread_flag(child, TIF_DEBUG)) {
-               spin_lock(&ocd_lock);
-               ocd_count--;
-
-               WARN_ON(ocd_count < 0);
-
-               if (ocd_count <= 0) {
-                       dc = ocd_read(DC);
-                       dc &= ~((1 << OCD_DC_MM_BIT) | (1 << OCD_DC_DBE_BIT));
-                       ocd_write(DC, dc);
-               }
-               spin_unlock(&ocd_lock);
-       }
-}
-
-#ifdef CONFIG_DEBUG_FS
-#include <linux/debugfs.h>
-#include <linux/module.h>
-
-static struct dentry *ocd_debugfs_root;
-static struct dentry *ocd_debugfs_DC;
-static struct dentry *ocd_debugfs_DS;
-static struct dentry *ocd_debugfs_count;
-
-static int ocd_DC_get(void *data, u64 *val)
-{
-       *val = ocd_read(DC);
-       return 0;
-}
-static int ocd_DC_set(void *data, u64 val)
-{
-       ocd_write(DC, val);
-       return 0;
-}
-DEFINE_SIMPLE_ATTRIBUTE(fops_DC, ocd_DC_get, ocd_DC_set, "0x%08llx\n");
-
-static int ocd_DS_get(void *data, u64 *val)
-{
-       *val = ocd_read(DS);
-       return 0;
-}
-DEFINE_SIMPLE_ATTRIBUTE(fops_DS, ocd_DS_get, NULL, "0x%08llx\n");
-
-static int ocd_count_get(void *data, u64 *val)
-{
-       *val = ocd_count;
-       return 0;
-}
-DEFINE_SIMPLE_ATTRIBUTE(fops_count, ocd_count_get, NULL, "%lld\n");
-
-static void ocd_debugfs_init(void)
-{
-       struct dentry *root;
-
-       root = debugfs_create_dir("ocd", NULL);
-       if (IS_ERR(root) || !root)
-               goto err_root;
-       ocd_debugfs_root = root;
-
-       ocd_debugfs_DC = debugfs_create_file("DC", S_IRUSR | S_IWUSR,
-                               root, NULL, &fops_DC);
-       if (!ocd_debugfs_DC)
-               goto err_DC;
-
-       ocd_debugfs_DS = debugfs_create_file("DS", S_IRUSR, root,
-                               NULL, &fops_DS);
-       if (!ocd_debugfs_DS)
-               goto err_DS;
-
-       ocd_debugfs_count = debugfs_create_file("count", S_IRUSR, root,
-                               NULL, &fops_count);
-       if (!ocd_debugfs_count)
-               goto err_count;
-
-       return;
-
-err_count:
-       debugfs_remove(ocd_debugfs_DS);
-err_DS:
-       debugfs_remove(ocd_debugfs_DC);
-err_DC:
-       debugfs_remove(ocd_debugfs_root);
-err_root:
-       printk(KERN_WARNING "OCD: Failed to create debugfs entries\n");
-}
-#else
-static inline void ocd_debugfs_init(void)
-{
-
-}
-#endif
-
-static int __init ocd_init(void)
-{
-       spin_lock_init(&ocd_lock);
-       ocd_debugfs_init();
-       return 0;
-}
-arch_initcall(ocd_init);
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
deleted file mode 100644 (file)
index ad0dfcc..0000000
+++ /dev/null
@@ -1,358 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/sched.h>
-#include <linux/sched/debug.h>
-#include <linux/sched/task.h>
-#include <linux/sched/task_stack.h>
-#include <linux/module.h>
-#include <linux/kallsyms.h>
-#include <linux/fs.h>
-#include <linux/pm.h>
-#include <linux/ptrace.h>
-#include <linux/slab.h>
-#include <linux/reboot.h>
-#include <linux/tick.h>
-#include <linux/uaccess.h>
-#include <linux/unistd.h>
-
-#include <asm/sysreg.h>
-#include <asm/ocd.h>
-#include <asm/syscalls.h>
-
-#include <mach/pm.h>
-
-void (*pm_power_off)(void);
-EXPORT_SYMBOL(pm_power_off);
-
-/*
- * This file handles the architecture-dependent parts of process handling..
- */
-
-void arch_cpu_idle(void)
-{
-       cpu_enter_idle();
-}
-
-void machine_halt(void)
-{
-       /*
-        * Enter Stop mode. The 32 kHz oscillator will keep running so
-        * the RTC will keep the time properly and the system will
-        * boot quickly.
-        */
-       asm volatile("sleep 3\n\t"
-                    "sub pc, -2");
-}
-
-void machine_power_off(void)
-{
-       if (pm_power_off)
-               pm_power_off();
-}
-
-void machine_restart(char *cmd)
-{
-       ocd_write(DC, (1 << OCD_DC_DBE_BIT));
-       ocd_write(DC, (1 << OCD_DC_RES_BIT));
-       while (1) ;
-}
-
-/*
- * Free current thread data structures etc
- */
-void exit_thread(struct task_struct *tsk)
-{
-       ocd_disable(tsk);
-}
-
-void flush_thread(void)
-{
-       /* nothing to do */
-}
-
-void release_thread(struct task_struct *dead_task)
-{
-       /* do nothing */
-}
-
-static void dump_mem(const char *str, const char *log_lvl,
-                    unsigned long bottom, unsigned long top)
-{
-       unsigned long p;
-       int i;
-
-       printk("%s%s(0x%08lx to 0x%08lx)\n", log_lvl, str, bottom, top);
-
-       for (p = bottom & ~31; p < top; ) {
-               printk("%s%04lx: ", log_lvl, p & 0xffff);
-
-               for (i = 0; i < 8; i++, p += 4) {
-                       unsigned int val;
-
-                       if (p < bottom || p >= top)
-                               printk("         ");
-                       else {
-                               if (__get_user(val, (unsigned int __user *)p)) {
-                                       printk("\n");
-                                       goto out;
-                               }
-                               printk("%08x ", val);
-                       }
-               }
-               printk("\n");
-       }
-
-out:
-       return;
-}
-
-static inline int valid_stack_ptr(struct thread_info *tinfo, unsigned long p)
-{
-       return (p > (unsigned long)tinfo)
-               && (p < (unsigned long)tinfo + THREAD_SIZE - 3);
-}
-
-#ifdef CONFIG_FRAME_POINTER
-static void show_trace_log_lvl(struct task_struct *tsk, unsigned long *sp,
-                              struct pt_regs *regs, const char *log_lvl)
-{
-       unsigned long lr, fp;
-       struct thread_info *tinfo;
-
-       if (regs)
-               fp = regs->r7;
-       else if (tsk == current)
-               asm("mov %0, r7" : "=r"(fp));
-       else
-               fp = tsk->thread.cpu_context.r7;
-
-       /*
-        * Walk the stack as long as the frame pointer (a) is within
-        * the kernel stack of the task, and (b) it doesn't move
-        * downwards.
-        */
-       tinfo = task_thread_info(tsk);
-       printk("%sCall trace:\n", log_lvl);
-       while (valid_stack_ptr(tinfo, fp)) {
-               unsigned long new_fp;
-
-               lr = *(unsigned long *)fp;
-#ifdef CONFIG_KALLSYMS
-               printk("%s [<%08lx>] ", log_lvl, lr);
-#else
-               printk(" [<%08lx>] ", lr);
-#endif
-               print_symbol("%s\n", lr);
-
-               new_fp = *(unsigned long *)(fp + 4);
-               if (new_fp <= fp)
-                       break;
-               fp = new_fp;
-       }
-       printk("\n");
-}
-#else
-static void show_trace_log_lvl(struct task_struct *tsk, unsigned long *sp,
-                              struct pt_regs *regs, const char *log_lvl)
-{
-       unsigned long addr;
-
-       printk("%sCall trace:\n", log_lvl);
-
-       while (!kstack_end(sp)) {
-               addr = *sp++;
-               if (kernel_text_address(addr)) {
-#ifdef CONFIG_KALLSYMS
-                       printk("%s [<%08lx>] ", log_lvl, addr);
-#else
-                       printk(" [<%08lx>] ", addr);
-#endif
-                       print_symbol("%s\n", addr);
-               }
-       }
-       printk("\n");
-}
-#endif
-
-void show_stack_log_lvl(struct task_struct *tsk, unsigned long sp,
-                       struct pt_regs *regs, const char *log_lvl)
-{
-       struct thread_info *tinfo;
-
-       if (sp == 0) {
-               if (tsk)
-                       sp = tsk->thread.cpu_context.ksp;
-               else
-                       sp = (unsigned long)&tinfo;
-       }
-       if (!tsk)
-               tsk = current;
-
-       tinfo = task_thread_info(tsk);
-
-       if (valid_stack_ptr(tinfo, sp)) {
-               dump_mem("Stack: ", log_lvl, sp,
-                        THREAD_SIZE + (unsigned long)tinfo);
-               show_trace_log_lvl(tsk, (unsigned long *)sp, regs, log_lvl);
-       }
-}
-
-void show_stack(struct task_struct *tsk, unsigned long *stack)
-{
-       show_stack_log_lvl(tsk, (unsigned long)stack, NULL, "");
-}
-
-static const char *cpu_modes[] = {
-       "Application", "Supervisor", "Interrupt level 0", "Interrupt level 1",
-       "Interrupt level 2", "Interrupt level 3", "Exception", "NMI"
-};
-
-void show_regs_log_lvl(struct pt_regs *regs, const char *log_lvl)
-{
-       unsigned long sp = regs->sp;
-       unsigned long lr = regs->lr;
-       unsigned long mode = (regs->sr & MODE_MASK) >> MODE_SHIFT;
-
-       show_regs_print_info(log_lvl);
-
-       if (!user_mode(regs)) {
-               sp = (unsigned long)regs + FRAME_SIZE_FULL;
-
-               printk("%s", log_lvl);
-               print_symbol("PC is at %s\n", instruction_pointer(regs));
-               printk("%s", log_lvl);
-               print_symbol("LR is at %s\n", lr);
-       }
-
-       printk("%spc : [<%08lx>]    lr : [<%08lx>]    %s\n"
-              "%ssp : %08lx  r12: %08lx  r11: %08lx\n",
-              log_lvl, instruction_pointer(regs), lr, print_tainted(),
-              log_lvl, sp, regs->r12, regs->r11);
-       printk("%sr10: %08lx  r9 : %08lx  r8 : %08lx\n",
-              log_lvl, regs->r10, regs->r9, regs->r8);
-       printk("%sr7 : %08lx  r6 : %08lx  r5 : %08lx  r4 : %08lx\n",
-              log_lvl, regs->r7, regs->r6, regs->r5, regs->r4);
-       printk("%sr3 : %08lx  r2 : %08lx  r1 : %08lx  r0 : %08lx\n",
-              log_lvl, regs->r3, regs->r2, regs->r1, regs->r0);
-       printk("%sFlags: %c%c%c%c%c\n", log_lvl,
-              regs->sr & SR_Q ? 'Q' : 'q',
-              regs->sr & SR_V ? 'V' : 'v',
-              regs->sr & SR_N ? 'N' : 'n',
-              regs->sr & SR_Z ? 'Z' : 'z',
-              regs->sr & SR_C ? 'C' : 'c');
-       printk("%sMode bits: %c%c%c%c%c%c%c%c%c%c\n", log_lvl,
-              regs->sr & SR_H ? 'H' : 'h',
-              regs->sr & SR_J ? 'J' : 'j',
-              regs->sr & SR_DM ? 'M' : 'm',
-              regs->sr & SR_D ? 'D' : 'd',
-              regs->sr & SR_EM ? 'E' : 'e',
-              regs->sr & SR_I3M ? '3' : '.',
-              regs->sr & SR_I2M ? '2' : '.',
-              regs->sr & SR_I1M ? '1' : '.',
-              regs->sr & SR_I0M ? '0' : '.',
-              regs->sr & SR_GM ? 'G' : 'g');
-       printk("%sCPU Mode: %s\n", log_lvl, cpu_modes[mode]);
-}
-
-void show_regs(struct pt_regs *regs)
-{
-       unsigned long sp = regs->sp;
-
-       if (!user_mode(regs))
-               sp = (unsigned long)regs + FRAME_SIZE_FULL;
-
-       show_regs_log_lvl(regs, "");
-       show_trace_log_lvl(current, (unsigned long *)sp, regs, "");
-}
-EXPORT_SYMBOL(show_regs);
-
-/* Fill in the fpu structure for a core dump. This is easy -- we don't have any */
-int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu)
-{
-       /* Not valid */
-       return 0;
-}
-
-asmlinkage void ret_from_fork(void);
-asmlinkage void ret_from_kernel_thread(void);
-asmlinkage void syscall_return(void);
-
-int copy_thread(unsigned long clone_flags, unsigned long usp,
-               unsigned long arg,
-               struct task_struct *p)
-{
-       struct pt_regs *childregs = task_pt_regs(p);
-
-       if (unlikely(p->flags & PF_KTHREAD)) {
-               memset(childregs, 0, sizeof(struct pt_regs));
-               p->thread.cpu_context.r0 = arg;
-               p->thread.cpu_context.r1 = usp; /* fn */
-               p->thread.cpu_context.r2 = (unsigned long)syscall_return;
-               p->thread.cpu_context.pc = (unsigned long)ret_from_kernel_thread;
-               childregs->sr = MODE_SUPERVISOR;
-       } else {
-               *childregs = *current_pt_regs();
-               if (usp)
-                       childregs->sp = usp;
-               childregs->r12 = 0; /* Set return value for child */
-               p->thread.cpu_context.pc = (unsigned long)ret_from_fork;
-       }
-
-       p->thread.cpu_context.sr = MODE_SUPERVISOR | SR_GM;
-       p->thread.cpu_context.ksp = (unsigned long)childregs;
-
-       clear_tsk_thread_flag(p, TIF_DEBUG);
-       if ((clone_flags & CLONE_PTRACE) && test_thread_flag(TIF_DEBUG))
-               ocd_enable(p);
-
-       return 0;
-}
-
-/*
- * This function is supposed to answer the question "who called
- * schedule()?"
- */
-unsigned long get_wchan(struct task_struct *p)
-{
-       unsigned long pc;
-       unsigned long stack_page;
-
-       if (!p || p == current || p->state == TASK_RUNNING)
-               return 0;
-
-       stack_page = (unsigned long)task_stack_page(p);
-       BUG_ON(!stack_page);
-
-       /*
-        * The stored value of PC is either the address right after
-        * the call to __switch_to() or ret_from_fork.
-        */
-       pc = thread_saved_pc(p);
-       if (in_sched_functions(pc)) {
-#ifdef CONFIG_FRAME_POINTER
-               unsigned long fp = p->thread.cpu_context.r7;
-               BUG_ON(fp < stack_page || fp > (THREAD_SIZE + stack_page));
-               pc = *(unsigned long *)fp;
-#else
-               /*
-                * We depend on the frame size of schedule here, which
-                * is actually quite ugly. It might be possible to
-                * determine the frame size automatically at build
-                * time by doing this:
-                *   - compile sched/core.c
-                *   - disassemble the resulting sched.o
-                *   - look for 'sub sp,??' shortly after '<schedule>:'
-                */
-               unsigned long sp = p->thread.cpu_context.ksp + 16;
-               BUG_ON(sp < stack_page || sp > (THREAD_SIZE + stack_page));
-               pc = *(unsigned long *)sp;
-#endif
-       }
-
-       return pc;
-}
diff --git a/arch/avr32/kernel/ptrace.c b/arch/avr32/kernel/ptrace.c
deleted file mode 100644 (file)
index 41a14e9..0000000
+++ /dev/null
@@ -1,357 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#undef DEBUG
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/sched/task_stack.h>
-#include <linux/mm.h>
-#include <linux/ptrace.h>
-#include <linux/errno.h>
-#include <linux/user.h>
-#include <linux/security.h>
-#include <linux/unistd.h>
-#include <linux/notifier.h>
-
-#include <asm/traps.h>
-#include <linux/uaccess.h>
-#include <asm/ocd.h>
-#include <asm/mmu_context.h>
-#include <linux/kdebug.h>
-
-static struct pt_regs *get_user_regs(struct task_struct *tsk)
-{
-       return (struct pt_regs *)((unsigned long)task_stack_page(tsk) +
-                                 THREAD_SIZE - sizeof(struct pt_regs));
-}
-
-void user_enable_single_step(struct task_struct *tsk)
-{
-       pr_debug("user_enable_single_step: pid=%u, PC=0x%08lx, SR=0x%08lx\n",
-                tsk->pid, task_pt_regs(tsk)->pc, task_pt_regs(tsk)->sr);
-
-       /*
-        * We can't schedule in Debug mode, so when TIF_BREAKPOINT is
-        * set, the system call or exception handler will do a
-        * breakpoint to enter monitor mode before returning to
-        * userspace.
-        *
-        * The monitor code will then notice that TIF_SINGLE_STEP is
-        * set and return to userspace with single stepping enabled.
-        * The CPU will then enter monitor mode again after exactly
-        * one instruction has been executed, and the monitor code
-        * will then send a SIGTRAP to the process.
-        */
-       set_tsk_thread_flag(tsk, TIF_BREAKPOINT);
-       set_tsk_thread_flag(tsk, TIF_SINGLE_STEP);
-}
-
-void user_disable_single_step(struct task_struct *child)
-{
-       /* XXX(hch): a no-op here seems wrong.. */
-}
-
-/*
- * Called by kernel/ptrace.c when detaching
- *
- * Make sure any single step bits, etc. are not set
- */
-void ptrace_disable(struct task_struct *child)
-{
-       clear_tsk_thread_flag(child, TIF_SINGLE_STEP);
-       clear_tsk_thread_flag(child, TIF_BREAKPOINT);
-       ocd_disable(child);
-}
-
-/*
- * Read the word at offset "offset" into the task's "struct user". We
- * actually access the pt_regs struct stored on the kernel stack.
- */
-static int ptrace_read_user(struct task_struct *tsk, unsigned long offset,
-                           unsigned long __user *data)
-{
-       unsigned long *regs;
-       unsigned long value;
-
-       if (offset & 3 || offset >= sizeof(struct user)) {
-               printk("ptrace_read_user: invalid offset 0x%08lx\n", offset);
-               return -EIO;
-       }
-
-       regs = (unsigned long *)get_user_regs(tsk);
-
-       value = 0;
-       if (offset < sizeof(struct pt_regs))
-               value = regs[offset / sizeof(regs[0])];
-
-       pr_debug("ptrace_read_user(%s[%u], %#lx, %p) -> %#lx\n",
-                tsk->comm, tsk->pid, offset, data, value);
-
-       return put_user(value, data);
-}
-
-/*
- * Write the word "value" to offset "offset" into the task's "struct
- * user". We actually access the pt_regs struct stored on the kernel
- * stack.
- */
-static int ptrace_write_user(struct task_struct *tsk, unsigned long offset,
-                            unsigned long value)
-{
-       unsigned long *regs;
-
-       pr_debug("ptrace_write_user(%s[%u], %#lx, %#lx)\n",
-                       tsk->comm, tsk->pid, offset, value);
-
-       if (offset & 3 || offset >= sizeof(struct user)) {
-               pr_debug("  invalid offset 0x%08lx\n", offset);
-               return -EIO;
-       }
-
-       if (offset >= sizeof(struct pt_regs))
-               return 0;
-
-       regs = (unsigned long *)get_user_regs(tsk);
-       regs[offset / sizeof(regs[0])] = value;
-
-       return 0;
-}
-
-static int ptrace_getregs(struct task_struct *tsk, void __user *uregs)
-{
-       struct pt_regs *regs = get_user_regs(tsk);
-
-       return copy_to_user(uregs, regs, sizeof(*regs)) ? -EFAULT : 0;
-}
-
-static int ptrace_setregs(struct task_struct *tsk, const void __user *uregs)
-{
-       struct pt_regs newregs;
-       int ret;
-
-       ret = -EFAULT;
-       if (copy_from_user(&newregs, uregs, sizeof(newregs)) == 0) {
-               struct pt_regs *regs = get_user_regs(tsk);
-
-               ret = -EINVAL;
-               if (valid_user_regs(&newregs)) {
-                       *regs = newregs;
-                       ret = 0;
-               }
-       }
-
-       return ret;
-}
-
-long arch_ptrace(struct task_struct *child, long request,
-                unsigned long addr, unsigned long data)
-{
-       int ret;
-       void __user *datap = (void __user *) data;
-
-       switch (request) {
-       /* Read the word at location addr in the child process */
-       case PTRACE_PEEKTEXT:
-       case PTRACE_PEEKDATA:
-               ret = generic_ptrace_peekdata(child, addr, data);
-               break;
-
-       case PTRACE_PEEKUSR:
-               ret = ptrace_read_user(child, addr, datap);
-               break;
-
-       /* Write the word in data at location addr */
-       case PTRACE_POKETEXT:
-       case PTRACE_POKEDATA:
-               ret = generic_ptrace_pokedata(child, addr, data);
-               break;
-
-       case PTRACE_POKEUSR:
-               ret = ptrace_write_user(child, addr, data);
-               break;
-
-       case PTRACE_GETREGS:
-               ret = ptrace_getregs(child, datap);
-               break;
-
-       case PTRACE_SETREGS:
-               ret = ptrace_setregs(child, datap);
-               break;
-
-       default:
-               ret = ptrace_request(child, request, addr, data);
-               break;
-       }
-
-       return ret;
-}
-
-asmlinkage void syscall_trace(void)
-{
-       if (!test_thread_flag(TIF_SYSCALL_TRACE))
-               return;
-       if (!(current->ptrace & PT_PTRACED))
-               return;
-
-       /* The 0x80 provides a way for the tracing parent to
-        * distinguish between a syscall stop and SIGTRAP delivery */
-       ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
-                                ? 0x80 : 0));
-
-       /*
-        * this isn't the same as continuing with a signal, but it
-        * will do for normal use.  strace only continues with a
-        * signal if the stopping signal is not SIGTRAP.  -brl
-        */
-       if (current->exit_code) {
-               pr_debug("syscall_trace: sending signal %d to PID %u\n",
-                        current->exit_code, current->pid);
-               send_sig(current->exit_code, current, 1);
-               current->exit_code = 0;
-       }
-}
-
-/*
- * debug_trampoline() is an assembly stub which will store all user
- * registers on the stack and execute a breakpoint instruction.
- *
- * If we single-step into an exception handler which runs with
- * interrupts disabled the whole time so it doesn't have to check for
- * pending work, its return address will be modified so that it ends
- * up returning to debug_trampoline.
- *
- * If the exception handler decides to store the user context and
- * enable interrupts after all, it will restore the original return
- * address and status register value. Before it returns, it will
- * notice that TIF_BREAKPOINT is set and execute a breakpoint
- * instruction.
- */
-extern void debug_trampoline(void);
-
-asmlinkage struct pt_regs *do_debug(struct pt_regs *regs)
-{
-       struct thread_info      *ti;
-       unsigned long           trampoline_addr;
-       u32                     status;
-       u32                     ctrl;
-       int                     code;
-
-       status = ocd_read(DS);
-       ti = current_thread_info();
-       code = TRAP_BRKPT;
-
-       pr_debug("do_debug: status=0x%08x PC=0x%08lx SR=0x%08lx tif=0x%08lx\n",
-                       status, regs->pc, regs->sr, ti->flags);
-
-       if (!user_mode(regs)) {
-               unsigned long   die_val = DIE_BREAKPOINT;
-
-               if (status & (1 << OCD_DS_SSS_BIT))
-                       die_val = DIE_SSTEP;
-
-               if (notify_die(die_val, "ptrace", regs, 0, 0, SIGTRAP)
-                               == NOTIFY_STOP)
-                       return regs;
-
-               if ((status & (1 << OCD_DS_SWB_BIT))
-                               && test_and_clear_ti_thread_flag(
-                                       ti, TIF_BREAKPOINT)) {
-                       /*
-                        * Explicit breakpoint from trampoline or
-                        * exception/syscall/interrupt handler.
-                        *
-                        * The real saved regs are on the stack right
-                        * after the ones we saved on entry.
-                        */
-                       regs++;
-                       pr_debug("  -> TIF_BREAKPOINT done, adjusted regs:"
-                                       "PC=0x%08lx SR=0x%08lx\n",
-                                       regs->pc, regs->sr);
-                       BUG_ON(!user_mode(regs));
-
-                       if (test_thread_flag(TIF_SINGLE_STEP)) {
-                               pr_debug("Going to do single step...\n");
-                               return regs;
-                       }
-
-                       /*
-                        * No TIF_SINGLE_STEP means we're done
-                        * stepping over a syscall. Do the trap now.
-                        */
-                       code = TRAP_TRACE;
-               } else if ((status & (1 << OCD_DS_SSS_BIT))
-                               && test_ti_thread_flag(ti, TIF_SINGLE_STEP)) {
-
-                       pr_debug("Stepped into something, "
-                                       "setting TIF_BREAKPOINT...\n");
-                       set_ti_thread_flag(ti, TIF_BREAKPOINT);
-
-                       /*
-                        * We stepped into an exception, interrupt or
-                        * syscall handler. Some exception handlers
-                        * don't check for pending work, so we need to
-                        * set up a trampoline just in case.
-                        *
-                        * The exception entry code will undo the
-                        * trampoline stuff if it does a full context
-                        * save (which also means that it'll check for
-                        * pending work later.)
-                        */
-                       if ((regs->sr & MODE_MASK) == MODE_EXCEPTION) {
-                               trampoline_addr
-                                       = (unsigned long)&debug_trampoline;
-
-                               pr_debug("Setting up trampoline...\n");
-                               ti->rar_saved = sysreg_read(RAR_EX);
-                               ti->rsr_saved = sysreg_read(RSR_EX);
-                               sysreg_write(RAR_EX, trampoline_addr);
-                               sysreg_write(RSR_EX, (MODE_EXCEPTION
-                                                       | SR_EM | SR_GM));
-                               BUG_ON(ti->rsr_saved & MODE_MASK);
-                       }
-
-                       /*
-                        * If we stepped into a system call, we
-                        * shouldn't do a single step after we return
-                        * since the return address is right after the
-                        * "scall" instruction we were told to step
-                        * over.
-                        */
-                       if ((regs->sr & MODE_MASK) == MODE_SUPERVISOR) {
-                               pr_debug("Supervisor; no single step\n");
-                               clear_ti_thread_flag(ti, TIF_SINGLE_STEP);
-                       }
-
-                       ctrl = ocd_read(DC);
-                       ctrl &= ~(1 << OCD_DC_SS_BIT);
-                       ocd_write(DC, ctrl);
-
-                       return regs;
-               } else {
-                       printk(KERN_ERR "Unexpected OCD_DS value: 0x%08x\n",
-                                       status);
-                       printk(KERN_ERR "Thread flags: 0x%08lx\n", ti->flags);
-                       die("Unhandled debug trap in kernel mode",
-                                       regs, SIGTRAP);
-               }
-       } else if (status & (1 << OCD_DS_SSS_BIT)) {
-               /* Single step in user mode */
-               code = TRAP_TRACE;
-
-               ctrl = ocd_read(DC);
-               ctrl &= ~(1 << OCD_DC_SS_BIT);
-               ocd_write(DC, ctrl);
-       }
-
-       pr_debug("Sending SIGTRAP: code=%d PC=0x%08lx SR=0x%08lx\n",
-                       code, regs->pc, regs->sr);
-
-       clear_thread_flag(TIF_SINGLE_STEP);
-       _exception(SIGTRAP, regs, code, instruction_pointer(regs));
-
-       return regs;
-}
diff --git a/arch/avr32/kernel/setup.c b/arch/avr32/kernel/setup.c
deleted file mode 100644 (file)
index e692889..0000000
+++ /dev/null
@@ -1,609 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/clk.h>
-#include <linux/init.h>
-#include <linux/initrd.h>
-#include <linux/sched.h>
-#include <linux/console.h>
-#include <linux/ioport.h>
-#include <linux/bootmem.h>
-#include <linux/fs.h>
-#include <linux/module.h>
-#include <linux/pfn.h>
-#include <linux/root_dev.h>
-#include <linux/cpu.h>
-#include <linux/kernel.h>
-
-#include <asm/sections.h>
-#include <asm/processor.h>
-#include <asm/pgtable.h>
-#include <asm/setup.h>
-#include <asm/sysreg.h>
-
-#include <mach/board.h>
-#include <mach/init.h>
-
-extern int root_mountflags;
-
-/*
- * Initialize loops_per_jiffy as 5000000 (500MIPS).
- * Better make it too large than too small...
- */
-struct avr32_cpuinfo boot_cpu_data = {
-       .loops_per_jiffy = 5000000
-};
-EXPORT_SYMBOL(boot_cpu_data);
-
-static char __initdata command_line[COMMAND_LINE_SIZE];
-
-/*
- * Standard memory resources
- */
-static struct resource __initdata kernel_data = {
-       .name   = "Kernel data",
-       .start  = 0,
-       .end    = 0,
-       .flags  = IORESOURCE_SYSTEM_RAM,
-};
-static struct resource __initdata kernel_code = {
-       .name   = "Kernel code",
-       .start  = 0,
-       .end    = 0,
-       .flags  = IORESOURCE_SYSTEM_RAM,
-       .sibling = &kernel_data,
-};
-
-/*
- * Available system RAM and reserved regions as singly linked
- * lists. These lists are traversed using the sibling pointer in
- * struct resource and are kept sorted at all times.
- */
-static struct resource *__initdata system_ram;
-static struct resource *__initdata reserved = &kernel_code;
-
-/*
- * We need to allocate these before the bootmem allocator is up and
- * running, so we need this "cache". 32 entries are probably enough
- * for all but the most insanely complex systems.
- */
-static struct resource __initdata res_cache[32];
-static unsigned int __initdata res_cache_next_free;
-
-static void __init resource_init(void)
-{
-       struct resource *mem, *res;
-       struct resource *new;
-
-       kernel_code.start = __pa(init_mm.start_code);
-
-       for (mem = system_ram; mem; mem = mem->sibling) {
-               new = alloc_bootmem_low(sizeof(struct resource));
-               memcpy(new, mem, sizeof(struct resource));
-
-               new->sibling = NULL;
-               if (request_resource(&iomem_resource, new))
-                       printk(KERN_WARNING "Bad RAM resource %08x-%08x\n",
-                              mem->start, mem->end);
-       }
-
-       for (res = reserved; res; res = res->sibling) {
-               new = alloc_bootmem_low(sizeof(struct resource));
-               memcpy(new, res, sizeof(struct resource));
-
-               new->sibling = NULL;
-               if (insert_resource(&iomem_resource, new))
-                       printk(KERN_WARNING
-                              "Bad reserved resource %s (%08x-%08x)\n",
-                              res->name, res->start, res->end);
-       }
-}
-
-static void __init
-add_physical_memory(resource_size_t start, resource_size_t end)
-{
-       struct resource *new, *next, **pprev;
-
-       for (pprev = &system_ram, next = system_ram; next;
-            pprev = &next->sibling, next = next->sibling) {
-               if (end < next->start)
-                       break;
-               if (start <= next->end) {
-                       printk(KERN_WARNING
-                              "Warning: Physical memory map is broken\n");
-                       printk(KERN_WARNING
-                              "Warning: %08x-%08x overlaps %08x-%08x\n",
-                              start, end, next->start, next->end);
-                       return;
-               }
-       }
-
-       if (res_cache_next_free >= ARRAY_SIZE(res_cache)) {
-               printk(KERN_WARNING
-                      "Warning: Failed to add physical memory %08x-%08x\n",
-                      start, end);
-               return;
-       }
-
-       new = &res_cache[res_cache_next_free++];
-       new->start = start;
-       new->end = end;
-       new->name = "System RAM";
-       new->flags = IORESOURCE_SYSTEM_RAM;
-
-       *pprev = new;
-}
-
-static int __init
-add_reserved_region(resource_size_t start, resource_size_t end,
-                   const char *name)
-{
-       struct resource *new, *next, **pprev;
-
-       if (end < start)
-               return -EINVAL;
-
-       if (res_cache_next_free >= ARRAY_SIZE(res_cache))
-               return -ENOMEM;
-
-       for (pprev = &reserved, next = reserved; next;
-            pprev = &next->sibling, next = next->sibling) {
-               if (end < next->start)
-                       break;
-               if (start <= next->end)
-                       return -EBUSY;
-       }
-
-       new = &res_cache[res_cache_next_free++];
-       new->start = start;
-       new->end = end;
-       new->name = name;
-       new->sibling = next;
-       new->flags = IORESOURCE_MEM;
-
-       *pprev = new;
-
-       return 0;
-}
-
-static unsigned long __init
-find_free_region(const struct resource *mem, resource_size_t size,
-                resource_size_t align)
-{
-       struct resource *res;
-       unsigned long target;
-
-       target = ALIGN(mem->start, align);
-       for (res = reserved; res; res = res->sibling) {
-               if ((target + size) <= res->start)
-                       break;
-               if (target <= res->end)
-                       target = ALIGN(res->end + 1, align);
-       }
-
-       if ((target + size) > (mem->end + 1))
-               return mem->end + 1;
-
-       return target;
-}
-
-static int __init
-alloc_reserved_region(resource_size_t *start, resource_size_t size,
-                     resource_size_t align, const char *name)
-{
-       struct resource *mem;
-       resource_size_t target;
-       int ret;
-
-       for (mem = system_ram; mem; mem = mem->sibling) {
-               target = find_free_region(mem, size, align);
-               if (target <= mem->end) {
-                       ret = add_reserved_region(target, target + size - 1,
-                                                 name);
-                       if (!ret)
-                               *start = target;
-                       return ret;
-               }
-       }
-
-       return -ENOMEM;
-}
-
-/*
- * Early framebuffer allocation. Works as follows:
- *   - If fbmem_size is zero, nothing will be allocated or reserved.
- *   - If fbmem_start is zero when setup_bootmem() is called,
- *     a block of fbmem_size bytes will be reserved before bootmem
- *     initialization. It will be aligned to the largest page size
- *     that fbmem_size is a multiple of.
- *   - If fbmem_start is nonzero, an area of size fbmem_size will be
- *     reserved at the physical address fbmem_start if possible. If
- *     it collides with other reserved memory, a different block of
- *     same size will be allocated, just as if fbmem_start was zero.
- *
- * Board-specific code may use these variables to set up platform data
- * for the framebuffer driver if fbmem_size is nonzero.
- */
-resource_size_t __initdata fbmem_start;
-resource_size_t __initdata fbmem_size;
-
-/*
- * "fbmem=xxx[kKmM]" allocates the specified amount of boot memory for
- * use as framebuffer.
- *
- * "fbmem=xxx[kKmM]@yyy[kKmM]" defines a memory region of size xxx and
- * starting at yyy to be reserved for use as framebuffer.
- *
- * The kernel won't verify that the memory region starting at yyy
- * actually contains usable RAM.
- */
-static int __init early_parse_fbmem(char *p)
-{
-       int ret;
-       unsigned long align;
-
-       fbmem_size = memparse(p, &p);
-       if (*p == '@') {
-               fbmem_start = memparse(p + 1, &p);
-               ret = add_reserved_region(fbmem_start,
-                                         fbmem_start + fbmem_size - 1,
-                                         "Framebuffer");
-               if (ret) {
-                       printk(KERN_WARNING
-                              "Failed to reserve framebuffer memory\n");
-                       fbmem_start = 0;
-               }
-       }
-
-       if (!fbmem_start) {
-               if ((fbmem_size & 0x000fffffUL) == 0)
-                       align = 0x100000;       /* 1 MiB */
-               else if ((fbmem_size & 0x0000ffffUL) == 0)
-                       align = 0x10000;        /* 64 KiB */
-               else
-                       align = 0x1000;         /* 4 KiB */
-
-               ret = alloc_reserved_region(&fbmem_start, fbmem_size,
-                                           align, "Framebuffer");
-               if (ret) {
-                       printk(KERN_WARNING
-                              "Failed to allocate framebuffer memory\n");
-                       fbmem_size = 0;
-               } else {
-                       memset(__va(fbmem_start), 0, fbmem_size);
-               }
-       }
-
-       return 0;
-}
-early_param("fbmem", early_parse_fbmem);
-
-/*
- * Pick out the memory size.  We look for mem=size@start,
- * where start and size are "size[KkMmGg]"
- */
-static int __init early_mem(char *p)
-{
-       resource_size_t size, start;
-
-       start = system_ram->start;
-       size  = memparse(p, &p);
-       if (*p == '@')
-               start = memparse(p + 1, &p);
-
-       system_ram->start = start;
-       system_ram->end = system_ram->start + size - 1;
-       return 0;
-}
-early_param("mem", early_mem);
-
-static int __init parse_tag_core(struct tag *tag)
-{
-       if (tag->hdr.size > 2) {
-               if ((tag->u.core.flags & 1) == 0)
-                       root_mountflags &= ~MS_RDONLY;
-               ROOT_DEV = new_decode_dev(tag->u.core.rootdev);
-       }
-       return 0;
-}
-__tagtable(ATAG_CORE, parse_tag_core);
-
-static int __init parse_tag_mem(struct tag *tag)
-{
-       unsigned long start, end;
-
-       /*
-        * Ignore zero-sized entries. If we're running standalone, the
-        * SDRAM code may emit such entries if something goes
-        * wrong...
-        */
-       if (tag->u.mem_range.size == 0)
-               return 0;
-
-       start = tag->u.mem_range.addr;
-       end = tag->u.mem_range.addr + tag->u.mem_range.size - 1;
-
-       add_physical_memory(start, end);
-       return 0;
-}
-__tagtable(ATAG_MEM, parse_tag_mem);
-
-static int __init parse_tag_rdimg(struct tag *tag)
-{
-#ifdef CONFIG_BLK_DEV_INITRD
-       struct tag_mem_range *mem = &tag->u.mem_range;
-       int ret;
-
-       if (initrd_start) {
-               printk(KERN_WARNING
-                      "Warning: Only the first initrd image will be used\n");
-               return 0;
-       }
-
-       ret = add_reserved_region(mem->addr, mem->addr + mem->size - 1,
-                                 "initrd");
-       if (ret) {
-               printk(KERN_WARNING
-                      "Warning: Failed to reserve initrd memory\n");
-               return ret;
-       }
-
-       initrd_start = (unsigned long)__va(mem->addr);
-       initrd_end = initrd_start + mem->size;
-#else
-       printk(KERN_WARNING "RAM disk image present, but "
-              "no initrd support in kernel, ignoring\n");
-#endif
-
-       return 0;
-}
-__tagtable(ATAG_RDIMG, parse_tag_rdimg);
-
-static int __init parse_tag_rsvd_mem(struct tag *tag)
-{
-       struct tag_mem_range *mem = &tag->u.mem_range;
-
-       return add_reserved_region(mem->addr, mem->addr + mem->size - 1,
-                                  "Reserved");
-}
-__tagtable(ATAG_RSVD_MEM, parse_tag_rsvd_mem);
-
-static int __init parse_tag_cmdline(struct tag *tag)
-{
-       strlcpy(boot_command_line, tag->u.cmdline.cmdline, COMMAND_LINE_SIZE);
-       return 0;
-}
-__tagtable(ATAG_CMDLINE, parse_tag_cmdline);
-
-static int __init parse_tag_clock(struct tag *tag)
-{
-       /*
-        * We'll figure out the clocks by peeking at the system
-        * manager regs directly.
-        */
-       return 0;
-}
-__tagtable(ATAG_CLOCK, parse_tag_clock);
-
-/*
- * The board_number correspond to the bd->bi_board_number in U-Boot. This
- * parameter is only available during initialisation and can be used in some
- * kind of board identification.
- */
-u32 __initdata board_number;
-
-static int __init parse_tag_boardinfo(struct tag *tag)
-{
-       board_number = tag->u.boardinfo.board_number;
-
-       return 0;
-}
-__tagtable(ATAG_BOARDINFO, parse_tag_boardinfo);
-
-/*
- * Scan the tag table for this tag, and call its parse function. The
- * tag table is built by the linker from all the __tagtable
- * declarations.
- */
-static int __init parse_tag(struct tag *tag)
-{
-       extern struct tagtable __tagtable_begin, __tagtable_end;
-       struct tagtable *t;
-
-       for (t = &__tagtable_begin; t < &__tagtable_end; t++)
-               if (tag->hdr.tag == t->tag) {
-                       t->parse(tag);
-                       break;
-               }
-
-       return t < &__tagtable_end;
-}
-
-/*
- * Parse all tags in the list we got from the boot loader
- */
-static void __init parse_tags(struct tag *t)
-{
-       for (; t->hdr.tag != ATAG_NONE; t = tag_next(t))
-               if (!parse_tag(t))
-                       printk(KERN_WARNING
-                              "Ignoring unrecognised tag 0x%08x\n",
-                              t->hdr.tag);
-}
-
-/*
- * Find a free memory region large enough for storing the
- * bootmem bitmap.
- */
-static unsigned long __init
-find_bootmap_pfn(const struct resource *mem)
-{
-       unsigned long bootmap_pages, bootmap_len;
-       unsigned long node_pages = PFN_UP(resource_size(mem));
-       unsigned long bootmap_start;
-
-       bootmap_pages = bootmem_bootmap_pages(node_pages);
-       bootmap_len = bootmap_pages << PAGE_SHIFT;
-
-       /*
-        * Find a large enough region without reserved pages for
-        * storing the bootmem bitmap. We can take advantage of the
-        * fact that all lists have been sorted.
-        *
-        * We have to check that we don't collide with any reserved
-        * regions, which includes the kernel image and any RAMDISK
-        * images.
-        */
-       bootmap_start = find_free_region(mem, bootmap_len, PAGE_SIZE);
-
-       return bootmap_start >> PAGE_SHIFT;
-}
-
-#define MAX_LOWMEM     HIGHMEM_START
-#define MAX_LOWMEM_PFN PFN_DOWN(MAX_LOWMEM)
-
-static void __init setup_bootmem(void)
-{
-       unsigned bootmap_size;
-       unsigned long first_pfn, bootmap_pfn, pages;
-       unsigned long max_pfn, max_low_pfn;
-       unsigned node = 0;
-       struct resource *res;
-
-       printk(KERN_INFO "Physical memory:\n");
-       for (res = system_ram; res; res = res->sibling)
-               printk("  %08x-%08x\n", res->start, res->end);
-       printk(KERN_INFO "Reserved memory:\n");
-       for (res = reserved; res; res = res->sibling)
-               printk("  %08x-%08x: %s\n",
-                      res->start, res->end, res->name);
-
-       nodes_clear(node_online_map);
-
-       if (system_ram->sibling)
-               printk(KERN_WARNING "Only using first memory bank\n");
-
-       for (res = system_ram; res; res = NULL) {
-               first_pfn = PFN_UP(res->start);
-               max_low_pfn = max_pfn = PFN_DOWN(res->end + 1);
-               bootmap_pfn = find_bootmap_pfn(res);
-               if (bootmap_pfn > max_pfn)
-                       panic("No space for bootmem bitmap!\n");
-
-               if (max_low_pfn > MAX_LOWMEM_PFN) {
-                       max_low_pfn = MAX_LOWMEM_PFN;
-#ifndef CONFIG_HIGHMEM
-                       /*
-                        * Lowmem is memory that can be addressed
-                        * directly through P1/P2
-                        */
-                       printk(KERN_WARNING
-                              "Node %u: Only %ld MiB of memory will be used.\n",
-                              node, MAX_LOWMEM >> 20);
-                       printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
-#else
-#error HIGHMEM is not supported by AVR32 yet
-#endif
-               }
-
-               /* Initialize the boot-time allocator with low memory only. */
-               bootmap_size = init_bootmem_node(NODE_DATA(node), bootmap_pfn,
-                                                first_pfn, max_low_pfn);
-
-               /*
-                * Register fully available RAM pages with the bootmem
-                * allocator.
-                */
-               pages = max_low_pfn - first_pfn;
-               free_bootmem_node (NODE_DATA(node), PFN_PHYS(first_pfn),
-                                  PFN_PHYS(pages));
-
-               /* Reserve space for the bootmem bitmap... */
-               reserve_bootmem_node(NODE_DATA(node),
-                                    PFN_PHYS(bootmap_pfn),
-                                    bootmap_size,
-                                    BOOTMEM_DEFAULT);
-
-               /* ...and any other reserved regions. */
-               for (res = reserved; res; res = res->sibling) {
-                       if (res->start > PFN_PHYS(max_pfn))
-                               break;
-
-                       /*
-                        * resource_init will complain about partial
-                        * overlaps, so we'll just ignore such
-                        * resources for now.
-                        */
-                       if (res->start >= PFN_PHYS(first_pfn)
-                           && res->end < PFN_PHYS(max_pfn))
-                               reserve_bootmem_node(NODE_DATA(node),
-                                                    res->start,
-                                                    resource_size(res),
-                                                    BOOTMEM_DEFAULT);
-               }
-
-               node_set_online(node);
-       }
-}
-
-void __init setup_arch (char **cmdline_p)
-{
-       struct clk *cpu_clk;
-
-       init_mm.start_code = (unsigned long)_stext;
-       init_mm.end_code = (unsigned long)_etext;
-       init_mm.end_data = (unsigned long)_edata;
-       init_mm.brk = (unsigned long)_end;
-
-       /*
-        * Include .init section to make allocations easier. It will
-        * be removed before the resource is actually requested.
-        */
-       kernel_code.start = __pa(__init_begin);
-       kernel_code.end = __pa(init_mm.end_code - 1);
-       kernel_data.start = __pa(init_mm.end_code);
-       kernel_data.end = __pa(init_mm.brk - 1);
-
-       parse_tags(bootloader_tags);
-
-       setup_processor();
-       setup_platform();
-       setup_board();
-
-       cpu_clk = clk_get(NULL, "cpu");
-       if (IS_ERR(cpu_clk)) {
-               printk(KERN_WARNING "Warning: Unable to get CPU clock\n");
-       } else {
-               unsigned long cpu_hz = clk_get_rate(cpu_clk);
-
-               /*
-                * Well, duh, but it's probably a good idea to
-                * increment the use count.
-                */
-               clk_enable(cpu_clk);
-
-               boot_cpu_data.clk = cpu_clk;
-               boot_cpu_data.loops_per_jiffy = cpu_hz * 4;
-               printk("CPU: Running at %lu.%03lu MHz\n",
-                      ((cpu_hz + 500) / 1000) / 1000,
-                      ((cpu_hz + 500) / 1000) % 1000);
-       }
-
-       strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
-       *cmdline_p = command_line;
-       parse_early_param();
-
-       setup_bootmem();
-
-#ifdef CONFIG_VT
-       conswitchp = &dummy_con;
-#endif
-
-       paging_init();
-       resource_init();
-}
diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c
deleted file mode 100644 (file)
index b5fcc49..0000000
+++ /dev/null
@@ -1,288 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * Based on linux/arch/sh/kernel/signal.c
- *  Copyright (C) 1999, 2000  Niibe Yutaka & Kaz Kojima
- *  Copyright (C) 1991, 1992  Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/errno.h>
-#include <linux/ptrace.h>
-#include <linux/unistd.h>
-#include <linux/tracehook.h>
-
-#include <linux/uaccess.h>
-#include <asm/ucontext.h>
-#include <asm/syscalls.h>
-
-struct rt_sigframe
-{
-       struct siginfo info;
-       struct ucontext uc;
-       unsigned long retcode;
-};
-
-static int
-restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
-{
-       int err = 0;
-
-#define COPY(x)                err |= __get_user(regs->x, &sc->x)
-       COPY(sr);
-       COPY(pc);
-       COPY(lr);
-       COPY(sp);
-       COPY(r12);
-       COPY(r11);
-       COPY(r10);
-       COPY(r9);
-       COPY(r8);
-       COPY(r7);
-       COPY(r6);
-       COPY(r5);
-       COPY(r4);
-       COPY(r3);
-       COPY(r2);
-       COPY(r1);
-       COPY(r0);
-#undef COPY
-
-       /*
-        * Don't allow anyone to pretend they're running in supervisor
-        * mode or something...
-        */
-       err |= !valid_user_regs(regs);
-
-       return err;
-}
-
-
-asmlinkage int sys_rt_sigreturn(struct pt_regs *regs)
-{
-       struct rt_sigframe __user *frame;
-       sigset_t set;
-
-       /* Always make any pending restarted system calls return -EINTR */
-       current->restart_block.fn = do_no_restart_syscall;
-
-       frame = (struct rt_sigframe __user *)regs->sp;
-       pr_debug("SIG return: frame = %p\n", frame);
-
-       if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
-               goto badframe;
-
-       if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
-               goto badframe;
-
-       set_current_blocked(&set);
-
-       if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
-               goto badframe;
-
-       if (restore_altstack(&frame->uc.uc_stack))
-               goto badframe;
-
-       pr_debug("Context restored: pc = %08lx, lr = %08lx, sp = %08lx\n",
-                regs->pc, regs->lr, regs->sp);
-
-       return regs->r12;
-
-badframe:
-       force_sig(SIGSEGV, current);
-       return 0;
-}
-
-static int
-setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs)
-{
-       int err = 0;
-
-#define COPY(x)                err |= __put_user(regs->x, &sc->x)
-       COPY(sr);
-       COPY(pc);
-       COPY(lr);
-       COPY(sp);
-       COPY(r12);
-       COPY(r11);
-       COPY(r10);
-       COPY(r9);
-       COPY(r8);
-       COPY(r7);
-       COPY(r6);
-       COPY(r5);
-       COPY(r4);
-       COPY(r3);
-       COPY(r2);
-       COPY(r1);
-       COPY(r0);
-#undef COPY
-
-       return err;
-}
-
-static inline void __user *
-get_sigframe(struct ksignal *ksig, struct pt_regs *regs, int framesize)
-{
-       unsigned long sp = sigsp(regs->sp, ksig);
-
-       return (void __user *)((sp - framesize) & ~3);
-}
-
-static int
-setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs)
-{
-       struct rt_sigframe __user *frame;
-       int err = 0;
-
-       frame = get_sigframe(ksig, regs, sizeof(*frame));
-       err = -EFAULT;
-       if (!access_ok(VERIFY_WRITE, frame, sizeof (*frame)))
-               goto out;
-
-       /*
-        * Set up the return code:
-        *
-        *      mov     r8, __NR_rt_sigreturn
-        *      scall
-        *
-        * Note: This will blow up since we're using a non-executable
-        * stack. Better use SA_RESTORER.
-        */
-#if __NR_rt_sigreturn > 127
-# error __NR_rt_sigreturn must be < 127 to fit in a short mov
-#endif
-       err = __put_user(0x3008d733 | (__NR_rt_sigreturn << 20),
-                        &frame->retcode);
-
-       err |= copy_siginfo_to_user(&frame->info, &ksig->info);
-
-       /* Set up the ucontext */
-       err |= __put_user(0, &frame->uc.uc_flags);
-       err |= __put_user(NULL, &frame->uc.uc_link);
-       err |= __save_altstack(&frame->uc.uc_stack, regs->sp);
-       err |= setup_sigcontext(&frame->uc.uc_mcontext, regs);
-       err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
-
-       if (err)
-               goto out;
-
-       regs->r12 = ksig->sig;
-       regs->r11 = (unsigned long) &frame->info;
-       regs->r10 = (unsigned long) &frame->uc;
-       regs->sp = (unsigned long) frame;
-       if (ksig->ka.sa.sa_flags & SA_RESTORER)
-               regs->lr = (unsigned long)ksig->ka.sa.sa_restorer;
-       else {
-               printk(KERN_NOTICE "[%s:%d] did not set SA_RESTORER\n",
-                      current->comm, current->pid);
-               regs->lr = (unsigned long) &frame->retcode;
-       }
-
-       pr_debug("SIG deliver [%s:%d]: sig=%d sp=0x%lx pc=0x%lx->0x%p lr=0x%lx\n",
-                current->comm, current->pid, ksig->sig, regs->sp,
-                regs->pc, ksig->ka.sa.sa_handler, regs->lr);
-
-       regs->pc = (unsigned long)ksig->ka.sa.sa_handler;
-
-out:
-       return err;
-}
-
-static inline void setup_syscall_restart(struct pt_regs *regs)
-{
-       if (regs->r12 == -ERESTART_RESTARTBLOCK)
-               regs->r8 = __NR_restart_syscall;
-       else
-               regs->r12 = regs->r12_orig;
-       regs->pc -= 2;
-}
-
-static inline void
-handle_signal(struct ksignal *ksig, struct pt_regs *regs, int syscall)
-{
-       int ret;
-
-       /*
-        * Set up the stack frame
-        */
-       ret = setup_rt_frame(ksig, sigmask_to_save(), regs);
-
-       /*
-        * Check that the resulting registers are sane
-        */
-       ret |= !valid_user_regs(regs);
-
-       /*
-        * Block the signal if we were successful.
-        */
-       signal_setup_done(ret, ksig, 0);
-}
-
-/*
- * Note that 'init' is a special process: it doesn't get signals it
- * doesn't want to handle. Thus you cannot kill init even with a
- * SIGKILL even by mistake.
- */
-static void do_signal(struct pt_regs *regs, int syscall)
-{
-       struct ksignal ksig;
-
-       /*
-        * We want the common case to go fast, which is why we may in
-        * certain cases get here from kernel mode. Just return
-        * without doing anything if so.
-        */
-       if (!user_mode(regs))
-               return;
-
-       get_signal(&ksig);
-       if (syscall) {
-               switch (regs->r12) {
-               case -ERESTART_RESTARTBLOCK:
-               case -ERESTARTNOHAND:
-                       if (ksig.sig > 0) {
-                               regs->r12 = -EINTR;
-                               break;
-                       }
-                       /* fall through */
-               case -ERESTARTSYS:
-                       if (ksig.sig > 0 && !(ksig.ka.sa.sa_flags & SA_RESTART)) {
-                               regs->r12 = -EINTR;
-                               break;
-                       }
-                       /* fall through */
-               case -ERESTARTNOINTR:
-                       setup_syscall_restart(regs);
-               }
-       }
-
-       if (!ksig.sig) {
-               /* No signal to deliver -- put the saved sigmask back */
-               restore_saved_sigmask();
-               return;
-       }
-
-       handle_signal(&ksig, regs, syscall);
-}
-
-asmlinkage void do_notify_resume(struct pt_regs *regs, struct thread_info *ti)
-{
-       int syscall = 0;
-
-       if ((sysreg_read(SR) & MODE_MASK) == MODE_SUPERVISOR)
-               syscall = 1;
-
-       if (ti->flags & _TIF_SIGPENDING)
-               do_signal(regs, syscall);
-
-       if (ti->flags & _TIF_NOTIFY_RESUME) {
-               clear_thread_flag(TIF_NOTIFY_RESUME);
-               tracehook_notify_resume(regs);
-       }
-}
diff --git a/arch/avr32/kernel/stacktrace.c b/arch/avr32/kernel/stacktrace.c
deleted file mode 100644 (file)
index f8cc995..0000000
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Stack trace management functions
- *
- * Copyright (C) 2007 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/sched.h>
-#include <linux/sched/task_stack.h>
-#include <linux/stacktrace.h>
-#include <linux/thread_info.h>
-#include <linux/module.h>
-
-register unsigned long current_frame_pointer asm("r7");
-
-struct stackframe {
-       unsigned long lr;
-       unsigned long fp;
-};
-
-/*
- * Save stack-backtrace addresses into a stack_trace buffer.
- */
-void save_stack_trace(struct stack_trace *trace)
-{
-       unsigned long low, high;
-       unsigned long fp;
-       struct stackframe *frame;
-       int skip = trace->skip;
-
-       low = (unsigned long)task_stack_page(current);
-       high = low + THREAD_SIZE;
-       fp = current_frame_pointer;
-
-       while (fp >= low && fp <= (high - 8)) {
-               frame = (struct stackframe *)fp;
-
-               if (skip) {
-                       skip--;
-               } else {
-                       trace->entries[trace->nr_entries++] = frame->lr;
-                       if (trace->nr_entries >= trace->max_entries)
-                               break;
-               }
-
-               /*
-                * The next frame must be at a higher address than the
-                * current frame.
-                */
-               low = fp + 8;
-               fp = frame->fp;
-       }
-}
-EXPORT_SYMBOL_GPL(save_stack_trace);
diff --git a/arch/avr32/kernel/switch_to.S b/arch/avr32/kernel/switch_to.S
deleted file mode 100644 (file)
index a48d046..0000000
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <asm/sysreg.h>
-
-       .text
-       .global __switch_to
-       .type   __switch_to, @function
-
-       /* Switch thread context from "prev" to "next", returning "last"
-        *   r12 :      prev
-        *   r11 :      &prev->thread + 1
-        *   r10 :      &next->thread
-        */
-__switch_to:
-       stm     --r11, r0,r1,r2,r3,r4,r5,r6,r7,sp,lr
-       mfsr    r9, SYSREG_SR
-       st.w    --r11, r9
-       ld.w    r8, r10++
-       /*
-        * schedule() may have been called from a mode with a different
-        * set of registers. Make sure we don't lose anything here.
-        */
-       pushm   r10,r12
-       mtsr    SYSREG_SR, r8
-       frs                     /* flush the return stack */
-       sub     pc, -2          /* flush the pipeline */
-       popm    r10,r12
-       ldm     r10++, r0,r1,r2,r3,r4,r5,r6,r7,sp,pc
-       .size   __switch_to, . - __switch_to
diff --git a/arch/avr32/kernel/syscall-stubs.S b/arch/avr32/kernel/syscall-stubs.S
deleted file mode 100644 (file)
index cb25653..0000000
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (C) 2005-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-/*
- * Stubs for syscalls that require access to pt_regs or that take more
- * than five parameters.
- */
-
-#define ARG6   r3
-
-       .text
-       .global __sys_rt_sigsuspend
-       .type   __sys_rt_sigsuspend,@function
-__sys_rt_sigsuspend:
-       mov     r10, sp
-       rjmp    sys_rt_sigsuspend
-
-       .global __sys_rt_sigreturn
-       .type   __sys_rt_sigreturn,@function
-__sys_rt_sigreturn:
-       mov     r12, sp
-       rjmp    sys_rt_sigreturn
-
-       .global __sys_mmap2
-       .type   __sys_mmap2,@function
-__sys_mmap2:
-       pushm   lr
-       st.w    --sp, ARG6
-       call    sys_mmap_pgoff
-       sub     sp, -4
-       popm    pc
-
-       .global __sys_sendto
-       .type   __sys_sendto,@function
-__sys_sendto:
-       pushm   lr
-       st.w    --sp, ARG6
-       call    sys_sendto
-       sub     sp, -4
-       popm    pc
-
-       .global __sys_recvfrom
-       .type   __sys_recvfrom,@function
-__sys_recvfrom:
-       pushm   lr
-       st.w    --sp, ARG6
-       call    sys_recvfrom
-       sub     sp, -4
-       popm    pc
-
-       .global __sys_pselect6
-       .type   __sys_pselect6,@function
-__sys_pselect6:
-       pushm   lr
-       st.w    --sp, ARG6
-       call    sys_pselect6
-       sub     sp, -4
-       popm    pc
-
-       .global __sys_splice
-       .type   __sys_splice,@function
-__sys_splice:
-       pushm   lr
-       st.w    --sp, ARG6
-       call    sys_splice
-       sub     sp, -4
-       popm    pc
-
-       .global __sys_epoll_pwait
-       .type   __sys_epoll_pwait,@function
-__sys_epoll_pwait:
-       pushm   lr
-       st.w    --sp, ARG6
-       call    sys_epoll_pwait
-       sub     sp, -4
-       popm    pc
-
-       .global __sys_sync_file_range
-       .type   __sys_sync_file_range,@function
-__sys_sync_file_range:
-       pushm   lr
-       st.w    --sp, ARG6
-       call    sys_sync_file_range
-       sub     sp, -4
-       popm    pc
-
-       .global __sys_fallocate
-       .type   __sys_fallocate,@function
-__sys_fallocate:
-       pushm   lr
-       st.w    --sp, ARG6
-       call    sys_fallocate
-       sub     sp, -4
-       popm    pc
-
-       .global __sys_fanotify_mark
-       .type   __sys_fanotify_mark,@function
-__sys_fanotify_mark:
-       pushm   lr
-       st.w    --sp, ARG6
-       call    sys_fanotify_mark
-       sub     sp, -4
-       popm    pc
-
-       .global __sys_process_vm_readv
-       .type   __sys_process_vm_readv,@function
-__sys_process_vm_readv:
-       pushm   lr
-       st.w    --sp, ARG6
-       call    sys_process_vm_readv
-       sub     sp, -4
-       popm    pc
-
-       .global __sys_process_vm_writev
-       .type   __sys_process_vm_writev,@function
-__sys_process_vm_writev:
-       pushm   lr
-       st.w    --sp, ARG6
-       call    sys_process_vm_writev
-       sub     sp, -4
-       popm    pc
-
-       .global __sys_copy_file_range
-       .type   __sys_copy_file_range,@function
-__sys_copy_file_range:
-       pushm   lr
-       st.w    --sp, ARG6
-       call    sys_copy_file_range
-       sub     sp, -4
-       popm    pc
-
-       .global __sys_preadv2
-       .type   __sys_preadv2,@function
-__sys_preadv2:
-       pushm   lr
-       st.w    --sp, ARG6
-       call    sys_preadv2
-       sub     sp, -4
-       popm    pc
-
-       .global __sys_pwritev2
-       .type   __sys_pwritev2,@function
-__sys_pwritev2:
-       pushm   lr
-       st.w    --sp, ARG6
-       call    sys_pwritev2
-       sub     sp, -4
-       popm    pc
diff --git a/arch/avr32/kernel/syscall_table.S b/arch/avr32/kernel/syscall_table.S
deleted file mode 100644 (file)
index 774ce57..0000000
+++ /dev/null
@@ -1,347 +0,0 @@
-/*
- * AVR32 system call table
- *
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-       .section .rodata,"a",@progbits
-       .type sys_call_table,@object
-       .global sys_call_table
-       .align 2
-sys_call_table:
-       .long sys_restart_syscall
-       .long sys_exit
-       .long sys_fork
-       .long sys_read
-       .long sys_write
-       .long sys_open
-       .long sys_close
-       .long sys_umask
-       .long sys_creat
-       .long sys_link
-       .long sys_unlink /* 10 */
-       .long sys_execve
-       .long sys_chdir
-       .long sys_time
-       .long sys_mknod
-       .long sys_chmod
-       .long sys_chown
-       .long sys_lchown
-       .long sys_lseek
-       .long sys_llseek
-       .long sys_getpid /* 20 */
-       .long sys_mount
-       .long sys_umount
-       .long sys_setuid
-       .long sys_getuid
-       .long sys_stime
-       .long sys_ptrace
-       .long sys_alarm
-       .long sys_pause
-       .long sys_utime
-       .long sys_newstat /* 30 */
-       .long sys_newfstat
-       .long sys_newlstat
-       .long sys_access
-       .long sys_chroot
-       .long sys_sync
-       .long sys_fsync
-       .long sys_kill
-       .long sys_rename
-       .long sys_mkdir
-       .long sys_rmdir /* 40 */
-       .long sys_dup
-       .long sys_pipe
-       .long sys_times
-       .long sys_clone
-       .long sys_brk
-       .long sys_setgid
-       .long sys_getgid
-       .long sys_getcwd
-       .long sys_geteuid
-       .long sys_getegid /* 50 */
-       .long sys_acct
-       .long sys_setfsuid
-       .long sys_setfsgid
-       .long sys_ioctl
-       .long sys_fcntl
-       .long sys_setpgid
-       .long sys_mremap
-       .long sys_setresuid
-       .long sys_getresuid
-       .long sys_setreuid /* 60 */
-       .long sys_setregid
-       .long sys_ustat
-       .long sys_dup2
-       .long sys_getppid
-       .long sys_getpgrp
-       .long sys_setsid
-       .long sys_rt_sigaction
-       .long __sys_rt_sigreturn
-       .long sys_rt_sigprocmask
-       .long sys_rt_sigpending /* 70 */
-       .long sys_rt_sigtimedwait
-       .long sys_rt_sigqueueinfo
-       .long __sys_rt_sigsuspend
-       .long sys_sethostname
-       .long sys_setrlimit
-       .long sys_getrlimit
-       .long sys_getrusage
-       .long sys_gettimeofday
-       .long sys_settimeofday
-       .long sys_getgroups /* 80 */
-       .long sys_setgroups
-       .long sys_select
-       .long sys_symlink
-       .long sys_fchdir
-       .long sys_readlink
-       .long sys_pread64
-       .long sys_pwrite64
-       .long sys_swapon
-       .long sys_reboot
-       .long __sys_mmap2 /* 90 */
-       .long sys_munmap
-       .long sys_truncate
-       .long sys_ftruncate
-       .long sys_fchmod
-       .long sys_fchown
-       .long sys_getpriority
-       .long sys_setpriority
-       .long sys_wait4
-       .long sys_statfs
-       .long sys_fstatfs /* 100 */
-       .long sys_vhangup
-       .long sys_sigaltstack
-       .long sys_syslog
-       .long sys_setitimer
-       .long sys_getitimer
-       .long sys_swapoff
-       .long sys_sysinfo
-       .long sys_ni_syscall /* was sys_ipc briefly */
-       .long sys_sendfile
-       .long sys_setdomainname /* 110 */
-       .long sys_newuname
-       .long sys_adjtimex
-       .long sys_mprotect
-       .long sys_vfork
-       .long sys_init_module
-       .long sys_delete_module
-       .long sys_quotactl
-       .long sys_getpgid
-       .long sys_bdflush
-       .long sys_sysfs /* 120 */
-       .long sys_personality
-       .long sys_ni_syscall /* reserved for afs_syscall */
-       .long sys_getdents
-       .long sys_flock
-       .long sys_msync
-       .long sys_readv
-       .long sys_writev
-       .long sys_getsid
-       .long sys_fdatasync
-       .long sys_sysctl /* 130 */
-       .long sys_mlock
-       .long sys_munlock
-       .long sys_mlockall
-       .long sys_munlockall
-       .long sys_sched_setparam
-       .long sys_sched_getparam
-       .long sys_sched_setscheduler
-       .long sys_sched_getscheduler
-       .long sys_sched_yield
-       .long sys_sched_get_priority_max  /* 140 */
-       .long sys_sched_get_priority_min
-       .long sys_sched_rr_get_interval
-       .long sys_nanosleep
-       .long sys_poll
-       .long sys_ni_syscall /* 145 was nfsservctl */
-       .long sys_setresgid
-       .long sys_getresgid
-       .long sys_prctl
-       .long sys_socket
-       .long sys_bind /* 150 */
-       .long sys_connect
-       .long sys_listen
-       .long sys_accept
-       .long sys_getsockname
-       .long sys_getpeername
-       .long sys_socketpair
-       .long sys_send
-       .long sys_recv
-       .long __sys_sendto
-       .long __sys_recvfrom /* 160 */
-       .long sys_shutdown
-       .long sys_setsockopt
-       .long sys_getsockopt
-       .long sys_sendmsg
-       .long sys_recvmsg
-       .long sys_truncate64
-       .long sys_ftruncate64
-       .long sys_stat64
-       .long sys_lstat64
-       .long sys_fstat64 /* 170 */
-       .long sys_pivot_root
-       .long sys_mincore
-       .long sys_madvise
-       .long sys_getdents64
-       .long sys_fcntl64
-       .long sys_gettid
-       .long sys_readahead
-       .long sys_setxattr
-       .long sys_lsetxattr
-       .long sys_fsetxattr /* 180 */
-       .long sys_getxattr
-       .long sys_lgetxattr
-       .long sys_fgetxattr
-       .long sys_listxattr
-       .long sys_llistxattr
-       .long sys_flistxattr
-       .long sys_removexattr
-       .long sys_lremovexattr
-       .long sys_fremovexattr
-       .long sys_tkill /* 190 */
-       .long sys_sendfile64
-       .long sys_futex
-       .long sys_sched_setaffinity
-       .long sys_sched_getaffinity
-       .long sys_capget
-       .long sys_capset
-       .long sys_io_setup
-       .long sys_io_destroy
-       .long sys_io_getevents
-       .long sys_io_submit /* 200 */
-       .long sys_io_cancel
-       .long sys_fadvise64
-       .long sys_exit_group
-       .long sys_lookup_dcookie
-       .long sys_epoll_create
-       .long sys_epoll_ctl
-       .long sys_epoll_wait
-       .long sys_remap_file_pages
-       .long sys_set_tid_address
-       .long sys_timer_create /* 210 */
-       .long sys_timer_settime
-       .long sys_timer_gettime
-       .long sys_timer_getoverrun
-       .long sys_timer_delete
-       .long sys_clock_settime
-       .long sys_clock_gettime
-       .long sys_clock_getres
-       .long sys_clock_nanosleep
-       .long sys_statfs64
-       .long sys_fstatfs64 /* 220 */
-       .long sys_tgkill
-       .long sys_ni_syscall /* reserved for TUX */
-       .long sys_utimes
-       .long sys_fadvise64_64
-       .long sys_cacheflush
-       .long sys_ni_syscall /* sys_vserver */
-       .long sys_mq_open
-       .long sys_mq_unlink
-       .long sys_mq_timedsend
-       .long sys_mq_timedreceive /* 230 */
-       .long sys_mq_notify
-       .long sys_mq_getsetattr
-       .long sys_kexec_load
-       .long sys_waitid
-       .long sys_add_key
-       .long sys_request_key
-       .long sys_keyctl
-       .long sys_ioprio_set
-       .long sys_ioprio_get
-       .long sys_inotify_init /* 240 */
-       .long sys_inotify_add_watch
-       .long sys_inotify_rm_watch
-       .long sys_openat
-       .long sys_mkdirat
-       .long sys_mknodat
-       .long sys_fchownat
-       .long sys_futimesat
-       .long sys_fstatat64
-       .long sys_unlinkat
-       .long sys_renameat /* 250 */
-       .long sys_linkat
-       .long sys_symlinkat
-       .long sys_readlinkat
-       .long sys_fchmodat
-       .long sys_faccessat
-       .long __sys_pselect6
-       .long sys_ppoll
-       .long sys_unshare
-       .long sys_set_robust_list
-       .long sys_get_robust_list /* 260 */
-       .long __sys_splice
-       .long __sys_sync_file_range
-       .long sys_tee
-       .long sys_vmsplice
-       .long __sys_epoll_pwait
-       .long sys_msgget
-       .long sys_msgsnd
-       .long sys_msgrcv
-       .long sys_msgctl
-       .long sys_semget /* 270 */
-       .long sys_semop
-       .long sys_semctl
-       .long sys_semtimedop
-       .long sys_shmat
-       .long sys_shmget
-       .long sys_shmdt
-       .long sys_shmctl
-       .long sys_utimensat
-       .long sys_signalfd
-       .long sys_ni_syscall /* 280, was sys_timerfd */
-       .long sys_eventfd
-       .long sys_ni_syscall /* 282, was half-implemented recvmmsg */
-       .long sys_setns
-       .long sys_pread64
-       .long sys_pwrite64
-       .long sys_timerfd_create
-       .long __sys_fallocate
-       .long sys_timerfd_settime
-       .long sys_timerfd_gettime
-       .long sys_signalfd4 /* 290 */
-       .long sys_eventfd2
-       .long sys_epoll_create1
-       .long sys_dup3
-       .long sys_pipe2
-       .long sys_inotify_init1
-       .long sys_preadv
-       .long sys_pwritev
-       .long sys_rt_tgsigqueueinfo
-       .long sys_perf_event_open
-       .long sys_recvmmsg /* 300 */
-       .long sys_fanotify_init
-       .long __sys_fanotify_mark
-       .long sys_prlimit64
-       .long sys_name_to_handle_at
-       .long sys_open_by_handle_at
-       .long sys_clock_adjtime
-       .long sys_syncfs
-       .long sys_sendmmsg
-       .long __sys_process_vm_readv
-       .long __sys_process_vm_writev /* 310 */
-       .long sys_kcmp
-       .long sys_finit_module
-       .long sys_sched_setattr
-       .long sys_sched_getattr
-       .long sys_renameat2
-       .long sys_seccomp
-       .long sys_getrandom
-       .long sys_memfd_create
-       .long sys_bpf
-       .long sys_execveat /* 320 */
-       .long sys_accept4
-       .long sys_userfaultfd
-       .long sys_membarrier
-       .long sys_mlock2
-       .long __sys_copy_file_range
-       .long __sys_preadv2
-       .long __sys_pwritev2
-       .long sys_pkey_mprotect
-       .long sys_pkey_alloc
-       .long sys_pkey_free /* 330 */
-       .long sys_ni_syscall /* r8 is saturated at nr_syscalls */
diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c
deleted file mode 100644 (file)
index 4d9b696..0000000
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * Copyright (C) 2004-2007 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/clk.h>
-#include <linux/clockchips.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/kernel.h>
-#include <linux/time.h>
-#include <linux/cpu.h>
-
-#include <asm/sysreg.h>
-
-#include <mach/pm.h>
-
-static bool disable_cpu_idle_poll;
-
-static u64 read_cycle_count(struct clocksource *cs)
-{
-       return (u64)sysreg_read(COUNT);
-}
-
-/*
- * The architectural cycle count registers are a fine clocksource unless
- * the system idle loop use sleep states like "idle":  the CPU cycles
- * measured by COUNT (and COMPARE) don't happen during sleep states.
- * Their duration also changes if cpufreq changes the CPU clock rate.
- * So we rate the clocksource using COUNT as very low quality.
- */
-static struct clocksource counter = {
-       .name           = "avr32_counter",
-       .rating         = 50,
-       .read           = read_cycle_count,
-       .mask           = CLOCKSOURCE_MASK(32),
-       .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-static irqreturn_t timer_interrupt(int irq, void *dev_id)
-{
-       struct clock_event_device *evdev = dev_id;
-
-       if (unlikely(!(intc_get_pending(0) & 1)))
-               return IRQ_NONE;
-
-       /*
-        * Disable the interrupt until the clockevent subsystem
-        * reprograms it.
-        */
-       sysreg_write(COMPARE, 0);
-
-       evdev->event_handler(evdev);
-       return IRQ_HANDLED;
-}
-
-static struct irqaction timer_irqaction = {
-       .handler        = timer_interrupt,
-       /* Oprofile uses the same irq as the timer, so allow it to be shared */
-       .flags          = IRQF_TIMER | IRQF_SHARED,
-       .name           = "avr32_comparator",
-};
-
-static int comparator_next_event(unsigned long delta,
-               struct clock_event_device *evdev)
-{
-       unsigned long   flags;
-
-       raw_local_irq_save(flags);
-
-       /* The time to read COUNT then update COMPARE must be less
-        * than the min_delta_ns value for this clockevent source.
-        */
-       sysreg_write(COMPARE, (sysreg_read(COUNT) + delta) ? : 1);
-
-       raw_local_irq_restore(flags);
-
-       return 0;
-}
-
-static int comparator_shutdown(struct clock_event_device *evdev)
-{
-       pr_debug("%s: %s\n", __func__, evdev->name);
-       sysreg_write(COMPARE, 0);
-
-       if (disable_cpu_idle_poll) {
-               disable_cpu_idle_poll = false;
-               /*
-                * Only disable idle poll if we have forced that
-                * in a previous call.
-                */
-               cpu_idle_poll_ctrl(false);
-       }
-       return 0;
-}
-
-static int comparator_set_oneshot(struct clock_event_device *evdev)
-{
-       pr_debug("%s: %s\n", __func__, evdev->name);
-
-       disable_cpu_idle_poll = true;
-       /*
-        * If we're using the COUNT and COMPARE registers we
-        * need to force idle poll.
-        */
-       cpu_idle_poll_ctrl(true);
-
-       return 0;
-}
-
-static struct clock_event_device comparator = {
-       .name                   = "avr32_comparator",
-       .features               = CLOCK_EVT_FEAT_ONESHOT,
-       .shift                  = 16,
-       .rating                 = 50,
-       .set_next_event         = comparator_next_event,
-       .set_state_shutdown     = comparator_shutdown,
-       .set_state_oneshot      = comparator_set_oneshot,
-       .tick_resume            = comparator_set_oneshot,
-};
-
-void read_persistent_clock(struct timespec *ts)
-{
-       ts->tv_sec = mktime(2007, 1, 1, 0, 0, 0);
-       ts->tv_nsec = 0;
-}
-
-void __init time_init(void)
-{
-       unsigned long counter_hz;
-       int ret;
-
-       /* figure rate for counter */
-       counter_hz = clk_get_rate(boot_cpu_data.clk);
-       ret = clocksource_register_hz(&counter, counter_hz);
-       if (ret)
-               pr_debug("timer: could not register clocksource: %d\n", ret);
-
-       /* setup COMPARE clockevent */
-       comparator.mult = div_sc(counter_hz, NSEC_PER_SEC, comparator.shift);
-       comparator.max_delta_ns = clockevent_delta2ns((u32)~0, &comparator);
-       comparator.min_delta_ns = clockevent_delta2ns(50, &comparator) + 1;
-       comparator.cpumask = cpumask_of(0);
-
-       sysreg_write(COMPARE, 0);
-       timer_irqaction.dev_id = &comparator;
-
-       ret = setup_irq(0, &timer_irqaction);
-       if (ret)
-               pr_debug("timer: could not request IRQ 0: %d\n", ret);
-       else {
-               clockevents_register_device(&comparator);
-
-               pr_info("%s: irq 0, %lu.%03lu MHz\n", comparator.name,
-                               ((counter_hz + 500) / 1000) / 1000,
-                               ((counter_hz + 500) / 1000) % 1000);
-       }
-}
diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c
deleted file mode 100644 (file)
index 50b5413..0000000
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/bug.h>
-#include <linux/hardirq.h>
-#include <linux/init.h>
-#include <linux/kallsyms.h>
-#include <linux/kdebug.h>
-#include <linux/extable.h>
-#include <linux/module.h>      /* print_modules */
-#include <linux/notifier.h>
-#include <linux/sched/signal.h>
-#include <linux/uaccess.h>
-
-#include <asm/addrspace.h>
-#include <asm/mmu_context.h>
-#include <asm/ocd.h>
-#include <asm/sysreg.h>
-#include <asm/traps.h>
-
-static DEFINE_SPINLOCK(die_lock);
-
-void die(const char *str, struct pt_regs *regs, long err)
-{
-       static int die_counter;
-
-       console_verbose();
-       spin_lock_irq(&die_lock);
-       bust_spinlocks(1);
-
-       printk(KERN_ALERT "Oops: %s, sig: %ld [#%d]\n",
-              str, err, ++die_counter);
-
-       printk(KERN_EMERG);
-
-#ifdef CONFIG_PREEMPT
-       printk(KERN_CONT "PREEMPT ");
-#endif
-#ifdef CONFIG_FRAME_POINTER
-       printk(KERN_CONT "FRAME_POINTER ");
-#endif
-       if (current_cpu_data.features & AVR32_FEATURE_OCD) {
-               unsigned long did = ocd_read(DID);
-               printk(KERN_CONT "chip: 0x%03lx:0x%04lx rev %lu\n",
-                      (did >> 1) & 0x7ff,
-                      (did >> 12) & 0x7fff,
-                      (did >> 28) & 0xf);
-       } else {
-               printk(KERN_CONT "cpu: arch %u r%u / core %u r%u\n",
-                      current_cpu_data.arch_type,
-                      current_cpu_data.arch_revision,
-                      current_cpu_data.cpu_type,
-                      current_cpu_data.cpu_revision);
-       }
-
-       print_modules();
-       show_regs_log_lvl(regs, KERN_EMERG);
-       show_stack_log_lvl(current, regs->sp, regs, KERN_EMERG);
-       bust_spinlocks(0);
-       add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
-       spin_unlock_irq(&die_lock);
-
-       if (in_interrupt())
-               panic("Fatal exception in interrupt");
-
-       if (panic_on_oops)
-               panic("Fatal exception");
-
-       do_exit(err);
-}
-
-void _exception(long signr, struct pt_regs *regs, int code,
-               unsigned long addr)
-{
-       siginfo_t info;
-
-       if (!user_mode(regs)) {
-               const struct exception_table_entry *fixup;
-
-               /* Are we prepared to handle this kernel fault? */
-               fixup = search_exception_tables(regs->pc);
-               if (fixup) {
-                       regs->pc = fixup->fixup;
-                       return;
-               }
-               die("Unhandled exception in kernel mode", regs, signr);
-       }
-
-       memset(&info, 0, sizeof(info));
-       info.si_signo = signr;
-       info.si_code = code;
-       info.si_addr = (void __user *)addr;
-       force_sig_info(signr, &info, current);
-}
-
-asmlinkage void do_nmi(unsigned long ecr, struct pt_regs *regs)
-{
-       int ret;
-
-       nmi_enter();
-
-       ret = notify_die(DIE_NMI, "NMI", regs, 0, ecr, SIGINT);
-       switch (ret) {
-       case NOTIFY_OK:
-       case NOTIFY_STOP:
-               break;
-       case NOTIFY_BAD:
-               die("Fatal Non-Maskable Interrupt", regs, SIGINT);
-       default:
-               printk(KERN_ALERT "Got NMI, but nobody cared. Disabling...\n");
-               nmi_disable();
-               break;
-       }
-       nmi_exit();
-}
-
-asmlinkage void do_critical_exception(unsigned long ecr, struct pt_regs *regs)
-{
-       die("Critical exception", regs, SIGKILL);
-}
-
-asmlinkage void do_address_exception(unsigned long ecr, struct pt_regs *regs)
-{
-       _exception(SIGBUS, regs, BUS_ADRALN, regs->pc);
-}
-
-/* This way of handling undefined instructions is stolen from ARM */
-static LIST_HEAD(undef_hook);
-static DEFINE_SPINLOCK(undef_lock);
-
-void register_undef_hook(struct undef_hook *hook)
-{
-       spin_lock_irq(&undef_lock);
-       list_add(&hook->node, &undef_hook);
-       spin_unlock_irq(&undef_lock);
-}
-
-void unregister_undef_hook(struct undef_hook *hook)
-{
-       spin_lock_irq(&undef_lock);
-       list_del(&hook->node);
-       spin_unlock_irq(&undef_lock);
-}
-
-static int do_cop_absent(u32 insn)
-{
-       int cop_nr;
-       u32 cpucr;
-
-       if ((insn & 0xfdf00000) == 0xf1900000)
-               /* LDC0 */
-               cop_nr = 0;
-       else
-               cop_nr = (insn >> 13) & 0x7;
-
-       /* Try enabling the coprocessor */
-       cpucr = sysreg_read(CPUCR);
-       cpucr |= (1 << (24 + cop_nr));
-       sysreg_write(CPUCR, cpucr);
-
-       cpucr = sysreg_read(CPUCR);
-       if (!(cpucr & (1 << (24 + cop_nr))))
-               return -ENODEV;
-
-       return 0;
-}
-
-#ifdef CONFIG_BUG
-int is_valid_bugaddr(unsigned long pc)
-{
-       unsigned short opcode;
-
-       if (pc < PAGE_OFFSET)
-               return 0;
-       if (probe_kernel_address((u16 *)pc, opcode))
-               return 0;
-
-       return opcode == AVR32_BUG_OPCODE;
-}
-#endif
-
-asmlinkage void do_illegal_opcode(unsigned long ecr, struct pt_regs *regs)
-{
-       u32 insn;
-       struct undef_hook *hook;
-       void __user *pc;
-       long code;
-
-#ifdef CONFIG_BUG
-       if (!user_mode(regs) && (ecr == ECR_ILLEGAL_OPCODE)) {
-               enum bug_trap_type type;
-
-               type = report_bug(regs->pc, regs);
-               switch (type) {
-               case BUG_TRAP_TYPE_NONE:
-                       break;
-               case BUG_TRAP_TYPE_WARN:
-                       regs->pc += 2;
-                       return;
-               case BUG_TRAP_TYPE_BUG:
-                       die("Kernel BUG", regs, SIGKILL);
-               }
-       }
-#endif
-
-       local_irq_enable();
-
-       if (user_mode(regs)) {
-               pc = (void __user *)instruction_pointer(regs);
-               if (get_user(insn, (u32 __user *)pc))
-                       goto invalid_area;
-
-               if (ecr == ECR_COPROC_ABSENT && !do_cop_absent(insn))
-                       return;
-
-               spin_lock_irq(&undef_lock);
-               list_for_each_entry(hook, &undef_hook, node) {
-                       if ((insn & hook->insn_mask) == hook->insn_val) {
-                               if (hook->fn(regs, insn) == 0) {
-                                       spin_unlock_irq(&undef_lock);
-                                       return;
-                               }
-                       }
-               }
-               spin_unlock_irq(&undef_lock);
-       }
-
-       switch (ecr) {
-       case ECR_PRIVILEGE_VIOLATION:
-               code = ILL_PRVOPC;
-               break;
-       case ECR_COPROC_ABSENT:
-               code = ILL_COPROC;
-               break;
-       default:
-               code = ILL_ILLOPC;
-               break;
-       }
-
-       _exception(SIGILL, regs, code, regs->pc);
-       return;
-
-invalid_area:
-       _exception(SIGSEGV, regs, SEGV_MAPERR, regs->pc);
-}
-
-asmlinkage void do_fpe(unsigned long ecr, struct pt_regs *regs)
-{
-       /* We have no FPU yet */
-       _exception(SIGILL, regs, ILL_COPROC, regs->pc);
-}
-
-
-void __init trap_init(void)
-{
-
-}
diff --git a/arch/avr32/kernel/vmlinux.lds.S b/arch/avr32/kernel/vmlinux.lds.S
deleted file mode 100644 (file)
index 17f2730..0000000
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * AVR32 linker script for the Linux kernel
- *
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#define LOAD_OFFSET 0x00000000
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/cache.h>
-#include <asm/thread_info.h>
-
-OUTPUT_FORMAT("elf32-avr32", "elf32-avr32", "elf32-avr32")
-OUTPUT_ARCH(avr32)
-ENTRY(_start)
-
-/* Big endian */
-jiffies = jiffies_64 + 4;
-
-SECTIONS
-{
-       . = CONFIG_ENTRY_ADDRESS;
-       .init           : AT(ADDR(.init) - LOAD_OFFSET) {
-               _text = .;
-               __init_begin = .;
-                       _sinittext = .;
-                       *(.text.reset)
-                       INIT_TEXT
-                       /*
-                        * .exit.text is discarded at runtime, not
-                        * link time, to deal with references from
-                        * __bug_table
-                        */
-                       EXIT_TEXT
-                       _einittext = .;
-               . = ALIGN(4);
-               __tagtable_begin = .;
-                       *(.taglist.init)
-               __tagtable_end = .;
-       }
-       INIT_DATA_SECTION(16)
-       . = ALIGN(PAGE_SIZE);
-       __init_end = .;
-
-       .text           : AT(ADDR(.text) - LOAD_OFFSET) {
-               _evba = .;
-               _stext = .;
-               *(.ex.text)
-               *(.irq.text)
-               KPROBES_TEXT
-               TEXT_TEXT
-               SCHED_TEXT
-               CPUIDLE_TEXT
-               LOCK_TEXT
-               *(.fixup)
-               *(.gnu.warning)
-               _etext = .;
-       } = 0xd703d703
-
-       EXCEPTION_TABLE(4)
-       RODATA
-
-       .data           : AT(ADDR(.data) - LOAD_OFFSET) {
-               _data = .;
-               _sdata = .;
-
-               INIT_TASK_DATA(THREAD_SIZE)
-               PAGE_ALIGNED_DATA(PAGE_SIZE);
-               CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
-               *(.data.rel*)
-               DATA_DATA
-               CONSTRUCTORS
-
-               _edata = .;
-       }
-
-       BSS_SECTION(0, 8, 8)
-       _end = .;
-
-       DWARF_DEBUG
-
-       /* When something in the kernel is NOT compiled as a module, the module
-        * cleanup code and data are put into these segments. Both can then be
-        * thrown away, as cleanup code is never called unless it's a module.
-        */
-       DISCARDS
-}
diff --git a/arch/avr32/lib/Makefile b/arch/avr32/lib/Makefile
deleted file mode 100644 (file)
index 084d95b..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-#
-# Makefile for AVR32-specific library files
-#
-
-lib-y  := copy_user.o clear_user.o
-lib-y  += strncpy_from_user.o strnlen_user.o
-lib-y  += delay.o memset.o memcpy.o findbit.o
-lib-y  += csum_partial.o csum_partial_copy_generic.o
-lib-y  += io-readsw.o io-readsl.o io-writesw.o io-writesl.o
-lib-y  += io-readsb.o io-writesb.o
-lib-y  += __avr32_lsl64.o __avr32_lsr64.o __avr32_asr64.o
diff --git a/arch/avr32/lib/__avr32_asr64.S b/arch/avr32/lib/__avr32_asr64.S
deleted file mode 100644 (file)
index 368b6bc..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (C) 2005-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-       /*
-        * DWtype __avr32_asr64(DWtype u, word_type b)
-        */
-       .text
-       .global __avr32_asr64
-       .type   __avr32_asr64,@function
-__avr32_asr64:
-       cp.w    r12, 0
-       reteq   r12
-
-       rsub    r9, r12, 32
-       brle    1f
-
-       lsl     r8, r11, r9
-       lsr     r10, r10, r12
-       asr     r11, r11, r12
-       or      r10, r8
-       retal   r12
-
-1:     neg     r9
-       asr     r10, r11, r9
-       asr     r11, 31
-       retal   r12
diff --git a/arch/avr32/lib/__avr32_lsl64.S b/arch/avr32/lib/__avr32_lsl64.S
deleted file mode 100644 (file)
index f1dbc2b..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (C) 2005-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-       /*
-        * DWtype __avr32_lsl64(DWtype u, word_type b)
-        */
-       .text
-       .global __avr32_lsl64
-       .type   __avr32_lsl64,@function
-__avr32_lsl64:
-       cp.w    r12, 0
-       reteq   r12
-
-       rsub    r9, r12, 32
-       brle    1f
-
-       lsr     r8, r10, r9
-       lsl     r10, r10, r12
-       lsl     r11, r11, r12
-       or      r11, r8
-       retal   r12
-
-1:     neg     r9
-       lsl     r11, r10, r9
-       mov     r10, 0
-       retal   r12
diff --git a/arch/avr32/lib/__avr32_lsr64.S b/arch/avr32/lib/__avr32_lsr64.S
deleted file mode 100644 (file)
index e65bb7f..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (C) 2005-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-       /*
-        * DWtype __avr32_lsr64(DWtype u, word_type b)
-        */
-       .text
-       .global __avr32_lsr64
-       .type   __avr32_lsr64,@function
-__avr32_lsr64:
-       cp.w    r12, 0
-       reteq   r12
-
-       rsub    r9, r12, 32
-       brle    1f
-
-       lsl     r8, r11, r9
-       lsr     r11, r11, r12
-       lsr     r10, r10, r12
-       or      r10, r8
-       retal   r12
-
-1:     neg     r9
-       lsr     r10, r11, r9
-       mov     r11, 0
-       retal   r12
diff --git a/arch/avr32/lib/clear_user.S b/arch/avr32/lib/clear_user.S
deleted file mode 100644 (file)
index d8991b6..0000000
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <asm/page.h>
-#include <asm/thread_info.h>
-#include <asm/asm.h>
-
-       .text
-       .align  1
-       .global clear_user
-       .type   clear_user, "function"
-clear_user:
-       branch_if_kernel r8, __clear_user
-       ret_if_privileged r8, r12, r11, r11
-
-       .global __clear_user
-       .type   __clear_user, "function"
-__clear_user:
-       mov     r9, r12
-       mov     r8, 0
-       andl    r9, 3, COH
-       brne    5f
-
-1:     sub     r11, 4
-       brlt    2f
-
-10:    st.w    r12++, r8
-       sub     r11, 4
-       brge    10b
-
-2:     sub     r11, -4
-       reteq   0
-
-       /* Unaligned count or address */
-       bld     r11, 1
-       brcc    12f
-11:    st.h    r12++, r8
-       sub     r11, 2
-       reteq   0
-12:    st.b    r12++, r8
-       retal   0
-
-       /* Unaligned address */
-5:     cp.w    r11, 4
-       brlt    2b
-
-       lsl     r9, 2
-       add     pc, pc, r9
-13:    st.b    r12++, r8
-       sub     r11, 1
-14:    st.b    r12++, r8
-       sub     r11, 1
-15:    st.b    r12++, r8
-       sub     r11, 1
-       rjmp    1b
-
-       .size   clear_user, . - clear_user
-       .size   __clear_user, . - __clear_user
-
-       .section .fixup, "ax"
-       .align  1
-18:    sub     r11, -4
-19:    retal   r11
-
-       .section __ex_table, "a"
-       .align  2
-       .long   10b, 18b
-       .long   11b, 19b
-       .long   12b, 19b
-       .long   13b, 19b
-       .long   14b, 19b
-       .long   15b, 19b
diff --git a/arch/avr32/lib/copy_user.S b/arch/avr32/lib/copy_user.S
deleted file mode 100644 (file)
index 0753734..0000000
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copy to/from userspace with optional address space checking.
- *
- * Copyright 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <asm/page.h>
-#include <asm/thread_info.h>
-#include <asm/asm.h>
-
-       /*
-        * __kernel_size_t
-        * __copy_user(void *to, const void *from, __kernel_size_t n)
-        *
-        * Returns the number of bytes not copied. Might be off by
-        * max 3 bytes if we get a fault in the main loop.
-        *
-        * The address-space checking functions simply fall through to
-        * the non-checking version.
-        */
-       .text
-       .align  1
-       .global ___copy_from_user
-       .type   ___copy_from_user, @function
-___copy_from_user:
-       branch_if_kernel r8, __copy_user
-       ret_if_privileged r8, r11, r10, r10
-       rjmp    __copy_user
-       .size   ___copy_from_user, . - ___copy_from_user
-
-       .global copy_to_user
-       .type   copy_to_user, @function
-copy_to_user:
-       branch_if_kernel r8, __copy_user
-       ret_if_privileged r8, r12, r10, r10
-       .size   copy_to_user, . - copy_to_user
-
-       .global __copy_user
-       .type   __copy_user, @function
-__copy_user:
-       mov     r9, r11
-       andl    r9, 3, COH
-       brne    6f
-
-       /* At this point, from is word-aligned */
-1:     sub     r10, 4
-       brlt    3f
-
-2:
-10:    ld.w    r8, r11++
-11:    st.w    r12++, r8
-       sub     r10, 4
-       brge    2b
-
-3:     sub     r10, -4
-       reteq   0
-
-       /*
-        * Handle unaligned count. Need to be careful with r10 here so
-        * that we return the correct value even if we get a fault
-        */
-4:
-20:    ld.ub   r8, r11++
-21:    st.b    r12++, r8
-       sub     r10, 1
-       reteq   0
-22:    ld.ub   r8, r11++
-23:    st.b    r12++, r8
-       sub     r10, 1
-       reteq   0
-24:    ld.ub   r8, r11++
-25:    st.b    r12++, r8
-       retal   0
-
-       /* Handle unaligned from-pointer */
-6:     cp.w    r10, 4
-       brlt    4b
-       rsub    r9, r9, 4
-
-30:    ld.ub   r8, r11++
-31:    st.b    r12++, r8
-       sub     r10, 1
-       sub     r9, 1
-       breq    1b
-32:    ld.ub   r8, r11++
-33:    st.b    r12++, r8
-       sub     r10, 1
-       sub     r9, 1
-       breq    1b
-34:    ld.ub   r8, r11++
-35:    st.b    r12++, r8
-       sub     r10, 1
-       rjmp    1b
-       .size   __copy_user, . - __copy_user
-
-       .section .fixup,"ax"
-       .align  1
-19:    sub     r10, -4
-29:    retal   r10
-
-       .section __ex_table,"a"
-       .align  2
-       .long   10b, 19b
-       .long   11b, 19b
-       .long   20b, 29b
-       .long   21b, 29b
-       .long   22b, 29b
-       .long   23b, 29b
-       .long   24b, 29b
-       .long   25b, 29b
-       .long   30b, 29b
-       .long   31b, 29b
-       .long   32b, 29b
-       .long   33b, 29b
-       .long   34b, 29b
-       .long   35b, 29b
diff --git a/arch/avr32/lib/csum_partial.S b/arch/avr32/lib/csum_partial.S
deleted file mode 100644 (file)
index 6a262b5..0000000
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-       /*
-        * unsigned int csum_partial(const unsigned char *buff,
-        *                           int len, unsigned int sum)
-        */
-       .text
-       .global csum_partial
-       .type   csum_partial,"function"
-       .align  1
-csum_partial:
-       /* checksum complete words, aligned or not */
-3:     sub     r11, 4
-       brlt    5f
-4:     ld.w    r9, r12++
-       add     r10, r9
-       acr     r10
-       sub     r11, 4
-       brge    4b
-
-       /* return if we had a whole number of words */
-5:     sub     r11, -4
-       reteq   r10
-
-       /* checksum any remaining bytes at the end */
-       mov     r9, 0
-       mov     r8, 0
-       cp      r11, 2
-       brlt    6f
-       ld.uh   r9, r12++
-       sub     r11, 2
-       breq    7f
-       lsl     r9, 16
-6:     ld.ub   r8, r12++
-       lsl     r8, 8
-7:     or      r9, r8
-       add     r10, r9
-       acr     r10
-
-       retal   r10
-       .size   csum_partial, . - csum_partial
diff --git a/arch/avr32/lib/csum_partial_copy_generic.S b/arch/avr32/lib/csum_partial_copy_generic.S
deleted file mode 100644 (file)
index a3a0f9b..0000000
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <asm/errno.h>
-#include <asm/asm.h>
-
-       /*
-        * unsigned int csum_partial_copy_generic(const char *src, char *dst, int len
-        *                                        int sum, int *src_err_ptr,
-        *                                        int *dst_err_ptr)
-        *
-        * Copy src to dst while checksumming, otherwise like csum_partial.
-        */
-
-       .macro ld_src size, reg, ptr
-9999:  ld.\size \reg, \ptr
-       .section __ex_table, "a"
-       .long   9999b, fixup_ld_src
-       .previous
-       .endm
-
-       .macro st_dst size, ptr, reg
-9999:  st.\size \ptr, \reg
-       .section __ex_table, "a"
-       .long   9999b, fixup_st_dst
-       .previous
-       .endm
-
-       .text
-       .global csum_partial_copy_generic
-       .type   csum_partial_copy_generic,"function"
-       .align  1
-csum_partial_copy_generic:
-       pushm   r4-r7,lr
-
-       /* The inner loop */
-1:     sub     r10, 4
-       brlt    5f
-2:     ld_src  w, r5, r12++
-       st_dst  w, r11++, r5
-       add     r9, r5
-       acr     r9
-       sub     r10, 4
-       brge    2b
-
-       /* return if we had a whole number of words */
-5:     sub     r10, -4
-       brne    7f
-
-6:     mov     r12, r9
-       popm    r4-r7,pc
-
-       /* handle additional bytes at the tail */
-7:     mov     r5, 0
-       mov     r4, 32
-8:     ld_src  ub, r6, r12++
-       st_dst  b, r11++, r6
-       lsl     r5, 8
-       sub     r4, 8
-       bfins   r5, r6, 0, 8
-       sub     r10, 1
-       brne    8b
-
-       lsl     r5, r5, r4
-       add     r9, r5
-       acr     r9
-       rjmp    6b
-
-       /* Exception handler */
-       .section .fixup,"ax"
-       .align  1
-fixup_ld_src:
-       mov     r9, -EFAULT
-       cp.w    r8, 0
-       breq    1f
-       st.w    r8[0], r9
-
-1:     /*
-        * TODO: zero the complete destination - computing the rest
-        * is too much work
-        */
-
-       mov     r9, 0
-       rjmp    6b
-
-fixup_st_dst:
-       mov     r9, -EFAULT
-       lddsp   r8, sp[20]
-       cp.w    r8, 0
-       breq    1f
-       st.w    r8[0], r9
-1:     mov     r9, 0
-       rjmp    6b
-
-       .previous
diff --git a/arch/avr32/lib/delay.c b/arch/avr32/lib/delay.c
deleted file mode 100644 (file)
index c2f4a07..0000000
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- *      Precise Delay Loops for avr32
- *
- *      Copyright (C) 1993 Linus Torvalds
- *      Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
- *     Copyright (C) 2005-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/delay.h>
-#include <linux/module.h>
-#include <linux/timex.h>
-#include <linux/param.h>
-#include <linux/types.h>
-#include <linux/init.h>
-
-#include <asm/processor.h>
-#include <asm/sysreg.h>
-
-int read_current_timer(unsigned long *timer_value)
-{
-       *timer_value = sysreg_read(COUNT);
-       return 0;
-}
-
-void __delay(unsigned long loops)
-{
-       unsigned bclock, now;
-
-       bclock = sysreg_read(COUNT);
-       do {
-               now = sysreg_read(COUNT);
-       } while ((now - bclock) < loops);
-}
-
-inline void __const_udelay(unsigned long xloops)
-{
-       unsigned long long loops;
-
-       asm("mulu.d %0, %1, %2"
-           : "=r"(loops)
-           : "r"(current_cpu_data.loops_per_jiffy * HZ), "r"(xloops));
-       __delay(loops >> 32);
-}
-
-void __udelay(unsigned long usecs)
-{
-       __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
-}
-
-void __ndelay(unsigned long nsecs)
-{
-       __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
-}
diff --git a/arch/avr32/lib/findbit.S b/arch/avr32/lib/findbit.S
deleted file mode 100644 (file)
index b935864..0000000
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (C) 2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/linkage.h>
-
-       .text
-       /*
-        * unsigned long find_first_zero_bit(const unsigned long *addr,
-        *                                   unsigned long size)
-        */
-ENTRY(find_first_zero_bit)
-       cp.w    r11, 0
-       reteq   r11
-       mov     r9, r11
-1:     ld.w    r8, r12[0]
-       com     r8
-       brne    .L_found
-       sub     r12, -4
-       sub     r9, 32
-       brgt    1b
-       retal   r11
-
-       /*
-        * unsigned long find_next_zero_bit(const unsigned long *addr,
-        *                                  unsigned long size,
-        *                                  unsigned long offset)
-        */
-ENTRY(find_next_zero_bit)
-       lsr     r8, r10, 5
-       sub     r9, r11, r10
-       retle   r11
-
-       lsl     r8, 2
-       add     r12, r8
-       andl    r10, 31, COH
-       breq    1f
-
-       /* offset is not word-aligned. Handle the first (32 - r10) bits */
-       ld.w    r8, r12[0]
-       com     r8
-       sub     r12, -4
-       lsr     r8, r8, r10
-       brne    .L_found
-
-       /* r9 = r9 - (32 - r10) = r9 + r10 - 32 */
-       add     r9, r10
-       sub     r9, 32
-       retle   r11
-
-       /* Main loop. offset must be word-aligned */
-1:     ld.w    r8, r12[0]
-       com     r8
-       brne    .L_found
-       sub     r12, -4
-       sub     r9, 32
-       brgt    1b
-       retal   r11
-
-       /* Common return path for when a bit is actually found. */
-.L_found:
-       brev    r8
-       clz     r10, r8
-       rsub    r9, r11
-       add     r10, r9
-
-       /* XXX: If we don't have to return exactly "size" when the bit
-          is not found, we may drop this "min" thing */
-       min     r12, r11, r10
-       retal   r12
-
-       /*
-        * unsigned long find_first_bit(const unsigned long *addr,
-        *                              unsigned long size)
-        */
-ENTRY(find_first_bit)
-       cp.w    r11, 0
-       reteq   r11
-       mov     r9, r11
-1:     ld.w    r8, r12[0]
-       cp.w    r8, 0
-       brne    .L_found
-       sub     r12, -4
-       sub     r9, 32
-       brgt    1b
-       retal   r11
-
-       /*
-        * unsigned long find_next_bit(const unsigned long *addr,
-        *                             unsigned long size,
-        *                             unsigned long offset)
-        */
-ENTRY(find_next_bit)
-       lsr     r8, r10, 5
-       sub     r9, r11, r10
-       retle   r11
-
-       lsl     r8, 2
-       add     r12, r8
-       andl    r10, 31, COH
-       breq    1f
-
-       /* offset is not word-aligned. Handle the first (32 - r10) bits */
-       ld.w    r8, r12[0]
-       sub     r12, -4
-       lsr     r8, r8, r10
-       brne    .L_found
-
-       /* r9 = r9 - (32 - r10) = r9 + r10 - 32 */
-       add     r9, r10
-       sub     r9, 32
-       retle   r11
-
-       /* Main loop. offset must be word-aligned */
-1:     ld.w    r8, r12[0]
-       cp.w    r8, 0
-       brne    .L_found
-       sub     r12, -4
-       sub     r9, 32
-       brgt    1b
-       retal   r11
-
-ENTRY(find_next_bit_le)
-       lsr     r8, r10, 5
-       sub     r9, r11, r10
-       retle   r11
-
-       lsl     r8, 2
-       add     r12, r8
-       andl    r10, 31, COH
-       breq    1f
-
-       /* offset is not word-aligned. Handle the first (32 - r10) bits */
-       ldswp.w r8, r12[0]
-       sub     r12, -4
-       lsr     r8, r8, r10
-       brne    .L_found
-
-       /* r9 = r9 - (32 - r10) = r9 + r10 - 32 */
-       add     r9, r10
-       sub     r9, 32
-       retle   r11
-
-       /* Main loop. offset must be word-aligned */
-1:     ldswp.w r8, r12[0]
-       cp.w    r8, 0
-       brne    .L_found
-       sub     r12, -4
-       sub     r9, 32
-       brgt    1b
-       retal   r11
-
-ENTRY(find_next_zero_bit_le)
-       lsr     r8, r10, 5
-       sub     r9, r11, r10
-       retle   r11
-
-       lsl     r8, 2
-       add     r12, r8
-       andl    r10, 31, COH
-       breq    1f
-
-       /* offset is not word-aligned. Handle the first (32 - r10) bits */
-       ldswp.w r8, r12[0]
-       sub     r12, -4
-       com     r8
-       lsr     r8, r8, r10
-       brne    .L_found
-
-       /* r9 = r9 - (32 - r10) = r9 + r10 - 32 */
-       add     r9, r10
-       sub     r9, 32
-       retle   r11
-
-       /* Main loop. offset must be word-aligned */
-1:     ldswp.w r8, r12[0]
-       com     r8
-       brne    .L_found
-       sub     r12, -4
-       sub     r9, 32
-       brgt    1b
-       retal   r11
diff --git a/arch/avr32/lib/io-readsb.S b/arch/avr32/lib/io-readsb.S
deleted file mode 100644 (file)
index cb2d869..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-       .text
-.Lnot_word_aligned:
-1:     ld.ub   r8, r12[0]
-       sub     r10, 1
-       st.b    r11++, r8
-       reteq   r12
-       tst     r11, r9
-       brne    1b
-
-       /* fall through */
-
-       .global __raw_readsb
-       .type   __raw_readsb,@function
-__raw_readsb:
-       cp.w    r10, 0
-       mov     r9, 3
-       reteq   r12
-
-       tst     r11, r9
-       brne    .Lnot_word_aligned
-
-       sub     r10, 4
-       brlt    2f
-
-1:     ldins.b r8:t, r12[0]
-       ldins.b r8:u, r12[0]
-       ldins.b r8:l, r12[0]
-       ldins.b r8:b, r12[0]
-       st.w    r11++, r8
-       sub     r10, 4
-       brge    1b
-
-2:     sub     r10, -4
-       reteq   r12
-
-3:     ld.ub   r8, r12[0]
-       sub     r10, 1
-       st.b    r11++, r8
-       brne    3b
-
-       retal   r12
diff --git a/arch/avr32/lib/io-readsl.S b/arch/avr32/lib/io-readsl.S
deleted file mode 100644 (file)
index b103511..0000000
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-       .global __raw_readsl
-       .type   __raw_readsl,@function
-__raw_readsl:
-       cp.w    r10, 0
-       reteq   r12
-
-       /*
-        * If r11 isn't properly aligned, we might get an exception on
-        * some implementations. But there's not much we can do about it.
-        */
-1:     ld.w    r8, r12[0]
-       sub     r10, 1
-       st.w    r11++, r8
-       brne    1b
-
-       retal   r12
diff --git a/arch/avr32/lib/io-readsw.S b/arch/avr32/lib/io-readsw.S
deleted file mode 100644 (file)
index 456be99..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-.Lnot_word_aligned:
-       /*
-        * Bad alignment will cause a hardware exception, which is as
-        * good as anything. No need for us to check for proper alignment.
-        */
-       ld.uh   r8, r12[0]
-       sub     r10, 1
-       st.h    r11++, r8
-
-       /* fall through */
-
-       .global __raw_readsw
-       .type   __raw_readsw,@function
-__raw_readsw:
-       cp.w    r10, 0
-       reteq   r12
-       mov     r9, 3
-       tst     r11, r9
-       brne    .Lnot_word_aligned
-
-       sub     r10, 2
-       brlt    2f
-
-1:     ldins.h r8:t, r12[0]
-       ldins.h r8:b, r12[0]
-       st.w    r11++, r8
-       sub     r10, 2
-       brge    1b
-
-2:     sub     r10, -2
-       reteq   r12
-
-       ld.uh   r8, r12[0]
-       st.h    r11++, r8
-       retal   r12
diff --git a/arch/avr32/lib/io-writesb.S b/arch/avr32/lib/io-writesb.S
deleted file mode 100644 (file)
index b4ebaac..0000000
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-       .text
-.Lnot_word_aligned:
-1:     ld.ub   r8, r11++
-       sub     r10, 1
-       st.b    r12[0], r8
-       reteq   r12
-       tst     r11, r9
-       brne    1b
-
-       /* fall through */
-
-       .global __raw_writesb
-       .type   __raw_writesb,@function
-__raw_writesb:
-       cp.w    r10, 0
-       mov     r9, 3
-       reteq   r12
-
-       tst     r11, r9
-       brne    .Lnot_word_aligned
-
-       sub     r10, 4
-       brlt    2f
-
-1:     ld.w    r8, r11++
-       bfextu  r9, r8, 24, 8
-       st.b    r12[0], r9
-       bfextu  r9, r8, 16, 8
-       st.b    r12[0], r9
-       bfextu  r9, r8, 8, 8
-       st.b    r12[0], r9
-       st.b    r12[0], r8
-       sub     r10, 4
-       brge    1b
-
-2:     sub     r10, -4
-       reteq   r12
-
-3:     ld.ub   r8, r11++
-       sub     r10, 1
-       st.b    r12[0], r8
-       brne    3b
-
-       retal   r12
diff --git a/arch/avr32/lib/io-writesl.S b/arch/avr32/lib/io-writesl.S
deleted file mode 100644 (file)
index 22138b3..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-       .global __raw_writesl
-       .type   __raw_writesl,@function
-__raw_writesl:
-       cp.w    r10, 0
-       reteq   r12
-
-1:     ld.w    r8, r11++
-       sub     r10, 1
-       st.w    r12[0], r8
-       brne    1b
-
-       retal   r12
diff --git a/arch/avr32/lib/io-writesw.S b/arch/avr32/lib/io-writesw.S
deleted file mode 100644 (file)
index 8c4a53f..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-.Lnot_word_aligned:
-       ld.uh   r8, r11++
-       sub     r10, 1
-       st.h    r12[0], r8
-
-       .global __raw_writesw
-       .type   __raw_writesw,@function
-__raw_writesw:
-       cp.w    r10, 0
-       mov     r9, 3
-       reteq   r12
-       tst     r11, r9
-       brne    .Lnot_word_aligned
-
-       sub     r10, 2
-       brlt    2f
-
-1:     ld.w    r8, r11++
-       bfextu  r9, r8, 16, 16
-       st.h    r12[0], r9
-       st.h    r12[0], r8
-       sub     r10, 2
-       brge    1b
-
-2:     sub     r10, -2
-       reteq   r12
-
-       ld.uh   r8, r11++
-       st.h    r12[0], r8
-       retal   r12
diff --git a/arch/avr32/lib/memcpy.S b/arch/avr32/lib/memcpy.S
deleted file mode 100644 (file)
index c2ca49d..0000000
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-       /*
-        * void *memcpy(void *to, const void *from, unsigned long n)
-        *
-        * This implementation does word-aligned loads in the main loop,
-        * possibly sacrificing alignment of stores.
-        *
-        * Hopefully, in most cases, both "to" and "from" will be
-        * word-aligned to begin with.
-        */
-       .text
-       .global memcpy
-       .type   memcpy, @function
-memcpy:
-       mov     r9, r11
-       andl    r9, 3, COH
-       brne    1f
-
-       /* At this point, "from" is word-aligned */
-2:     mov     r9, r12
-5:     sub     r10, 4
-       brlt    4f
-
-3:     ld.w    r8, r11++
-       sub     r10, 4
-       st.w    r12++, r8
-       brge    3b
-
-4:     neg     r10
-       reteq   r9
-
-       /* Handle unaligned count */
-       lsl     r10, 2
-       add     pc, pc, r10
-       ld.ub   r8, r11++
-       st.b    r12++, r8
-       ld.ub   r8, r11++
-       st.b    r12++, r8
-       ld.ub   r8, r11++
-       st.b    r12++, r8
-       retal   r9
-
-       /* Handle unaligned "from" pointer */
-1:     sub     r10, 4
-       movlt   r9, r12
-       brlt    4b
-       add     r10, r9
-       lsl     r9, 2
-       add     pc, pc, r9
-       ld.ub   r8, r11++
-       st.b    r12++, r8
-       ld.ub   r8, r11++
-       st.b    r12++, r8
-       ld.ub   r8, r11++
-       st.b    r12++, r8
-       mov     r8, r12
-       add     pc, pc, r9
-       sub     r8, 1
-       nop
-       sub     r8, 1
-       nop
-       sub     r8, 1
-       nop
-       mov     r9, r8
-       rjmp    5b
diff --git a/arch/avr32/lib/memset.S b/arch/avr32/lib/memset.S
deleted file mode 100644 (file)
index 40da32c..0000000
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * Based on linux/arch/arm/lib/memset.S
- *   Copyright (C) 1995-2000 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * ASM optimised string functions
- */
-#include <asm/asm.h>
-
-       /*
-        * r12: void *b
-        * r11: int c
-        * r10: size_t len
-        *
-        * Returns b in r12
-        */
-       .text
-       .global memset
-       .type   memset, @function
-       .align  5
-memset:
-       mov     r9, r12
-       mov     r8, r12
-       or      r11, r11, r11 << 8
-       andl    r9, 3, COH
-       brne    1f
-
-2:     or      r11, r11, r11 << 16
-       sub     r10, 4
-       brlt    5f
-
-       /* Let's do some real work */
-4:     st.w    r8++, r11
-       sub     r10, 4
-       brge    4b
-
-       /*
-        * When we get here, we've got less than 4 bytes to set. r10
-        * might be negative.
-        */
-5:     sub     r10, -4
-       reteq   r12
-
-       /* Fastpath ends here, exactly 32 bytes from memset */
-
-       /* Handle unaligned count or pointer */
-       bld     r10, 1
-       brcc    6f
-       st.b    r8++, r11
-       st.b    r8++, r11
-       bld     r10, 0
-       retcc   r12
-6:     st.b    r8++, r11
-       retal   r12
-
-       /* Handle unaligned pointer */
-1:     sub     r10, 4
-       brlt    5b
-       add     r10, r9
-       lsl     r9, 1
-       add     pc, r9
-       st.b    r8++, r11
-       st.b    r8++, r11
-       st.b    r8++, r11
-       rjmp    2b
-
-       .size   memset, . - memset
diff --git a/arch/avr32/lib/strncpy_from_user.S b/arch/avr32/lib/strncpy_from_user.S
deleted file mode 100644 (file)
index 72bd505..0000000
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copy to/from userspace with optional address space checking.
- *
- * Copyright 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/errno.h>
-
-#include <asm/page.h>
-#include <asm/thread_info.h>
-#include <asm/asm.h>
-
-       /*
-        * long strncpy_from_user(char *dst, const char *src, long count)
-        *
-        * On success, returns the length of the string, not including
-        * the terminating NUL.
-        *
-        * If the string is longer than count, returns count
-        *
-        * If userspace access fails, returns -EFAULT
-        */
-       .text
-       .align  1
-       .global strncpy_from_user
-       .type   strncpy_from_user, "function"
-strncpy_from_user:
-       mov     r9, -EFAULT
-       branch_if_kernel r8, __strncpy_from_user
-       ret_if_privileged r8, r11, r10, r9
-
-       .global __strncpy_from_user
-       .type   __strncpy_from_user, "function"
-__strncpy_from_user:
-       cp.w    r10, 0
-       reteq   0
-
-       mov     r9, r10
-
-1:     ld.ub   r8, r11++
-       st.b    r12++, r8
-       cp.w    r8, 0
-       breq    2f
-       sub     r9, 1
-       brne    1b
-
-2:     sub     r10, r9
-       retal   r10
-
-       .section .fixup, "ax"
-       .align  1
-3:     mov     r12, -EFAULT
-       retal   r12
-
-       .section __ex_table, "a"
-       .align  2
-       .long   1b, 3b
diff --git a/arch/avr32/lib/strnlen_user.S b/arch/avr32/lib/strnlen_user.S
deleted file mode 100644 (file)
index e46f472..0000000
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copy to/from userspace with optional address space checking.
- *
- * Copyright 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <asm/page.h>
-#include <asm/thread_info.h>
-#include <asm/processor.h>
-#include <asm/asm.h>
-
-       .text
-       .align  1
-       .global strnlen_user
-       .type   strnlen_user, "function"
-strnlen_user:
-       branch_if_kernel r8, __strnlen_user
-       sub     r8, r11, 1
-       add     r8, r12
-       retcs   0
-       brmi    adjust_length   /* do a closer inspection */
-
-       .global __strnlen_user
-       .type   __strnlen_user, "function"
-__strnlen_user:
-       mov     r10, r12
-
-10:    ld.ub   r8, r12++
-       cp.w    r8, 0
-       breq    2f
-       sub     r11, 1
-       brne    10b
-
-       sub     r12, -1
-2:     sub     r12, r10
-       retal   r12
-
-
-       .type   adjust_length, "function"
-adjust_length:
-       cp.w    r12, 0          /* addr must always be < TASK_SIZE */
-       retmi   0
-
-       pushm   lr
-       lddpc   lr, _task_size
-       sub     r11, lr, r12
-       mov     r9, r11
-       call    __strnlen_user
-       cp.w    r12, r9
-       brgt    1f
-       popm    pc
-1:     popm    pc, r12=0
-
-       .align  2
-_task_size:
-       .long   TASK_SIZE
-
-       .section .fixup, "ax"
-       .align  1
-19:    retal   0
-
-       .section __ex_table, "a"
-       .align  2
-       .long   10b, 19b
diff --git a/arch/avr32/mach-at32ap/Kconfig b/arch/avr32/mach-at32ap/Kconfig
deleted file mode 100644 (file)
index a7bbcc8..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-if PLATFORM_AT32AP
-
-menu "Atmel AVR32 AP options"
-
-choice
-       prompt "AT32AP700x static memory bus width"
-       depends on CPU_AT32AP700X
-       default AP700X_16_BIT_SMC
-       help
-         Define the width of the AP7000 external static memory interface.
-         This is used to determine how to mangle the address and/or data
-         when doing little-endian port access.
-
-         The current code can only support a single external memory bus
-         width for all chip selects, excluding the flash (which is using
-         raw access and is thus not affected by any of this.)
-
-config AP700X_32_BIT_SMC
-       bool "32 bit"
-
-config AP700X_16_BIT_SMC
-       bool "16 bit"
-
-config AP700X_8_BIT_SMC
-       bool "8 bit"
-
-endchoice
-
-endmenu
-
-endif # PLATFORM_AT32AP
diff --git a/arch/avr32/mach-at32ap/Makefile b/arch/avr32/mach-at32ap/Makefile
deleted file mode 100644 (file)
index fc09ec4..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-obj-y                          += pdc.o clock.o intc.o extint.o pio.o hsmc.o
-obj-y                          += hmatrix.o
-obj-$(CONFIG_CPU_AT32AP700X)   += at32ap700x.o pm-at32ap700x.o
-obj-$(CONFIG_PM)               += pm.o
-
-ifeq ($(CONFIG_PM_DEBUG),y)
-CFLAGS_pm.o    += -DDEBUG
-endif
diff --git a/arch/avr32/mach-at32ap/at32ap700x.c b/arch/avr32/mach-at32ap/at32ap700x.c
deleted file mode 100644 (file)
index 00d6dcc..0000000
+++ /dev/null
@@ -1,2368 +0,0 @@
-/*
- * Copyright (C) 2005-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/clk.h>
-#include <linux/delay.h>
-#include <linux/platform_data/dma-dw.h>
-#include <linux/fb.h>
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/dma-mapping.h>
-#include <linux/slab.h>
-#include <linux/gpio.h>
-#include <linux/spi/spi.h>
-#include <linux/usb/atmel_usba_udc.h>
-
-#include <linux/atmel-mci.h>
-
-#include <asm/io.h>
-#include <asm/irq.h>
-
-#include <mach/at32ap700x.h>
-#include <mach/board.h>
-#include <mach/hmatrix.h>
-#include <mach/portmux.h>
-#include <mach/sram.h>
-
-#include <sound/atmel-abdac.h>
-#include <sound/atmel-ac97c.h>
-
-#include <video/atmel_lcdc.h>
-
-#include "clock.h"
-#include "pio.h"
-#include "pm.h"
-
-
-#define PBMEM(base)                                    \
-       {                                               \
-               .start          = base,                 \
-               .end            = base + 0x3ff,         \
-               .flags          = IORESOURCE_MEM,       \
-       }
-#define IRQ(num)                                       \
-       {                                               \
-               .start          = num,                  \
-               .end            = num,                  \
-               .flags          = IORESOURCE_IRQ,       \
-       }
-#define NAMED_IRQ(num, _name)                          \
-       {                                               \
-               .start          = num,                  \
-               .end            = num,                  \
-               .name           = _name,                \
-               .flags          = IORESOURCE_IRQ,       \
-       }
-
-/* REVISIT these assume *every* device supports DMA, but several
- * don't ... tc, smc, pio, rtc, watchdog, pwm, ps2, and more.
- */
-#define DEFINE_DEV(_name, _id)                                 \
-static u64 _name##_id##_dma_mask = DMA_BIT_MASK(32);           \
-static struct platform_device _name##_id##_device = {          \
-       .name           = #_name,                               \
-       .id             = _id,                                  \
-       .dev            = {                                     \
-               .dma_mask = &_name##_id##_dma_mask,             \
-               .coherent_dma_mask = DMA_BIT_MASK(32),          \
-       },                                                      \
-       .resource       = _name##_id##_resource,                \
-       .num_resources  = ARRAY_SIZE(_name##_id##_resource),    \
-}
-#define DEFINE_DEV_DATA(_name, _id)                            \
-static u64 _name##_id##_dma_mask = DMA_BIT_MASK(32);           \
-static struct platform_device _name##_id##_device = {          \
-       .name           = #_name,                               \
-       .id             = _id,                                  \
-       .dev            = {                                     \
-               .dma_mask = &_name##_id##_dma_mask,             \
-               .platform_data  = &_name##_id##_data,           \
-               .coherent_dma_mask = DMA_BIT_MASK(32),          \
-       },                                                      \
-       .resource       = _name##_id##_resource,                \
-       .num_resources  = ARRAY_SIZE(_name##_id##_resource),    \
-}
-
-#define select_peripheral(port, pin_mask, periph, flags)       \
-       at32_select_periph(GPIO_##port##_BASE, pin_mask,        \
-                          GPIO_##periph, flags)
-
-#define DEV_CLK(_name, devname, bus, _index)                   \
-static struct clk devname##_##_name = {                                \
-       .name           = #_name,                               \
-       .dev            = &devname##_device.dev,                \
-       .parent         = &bus##_clk,                           \
-       .mode           = bus##_clk_mode,                       \
-       .get_rate       = bus##_clk_get_rate,                   \
-       .index          = _index,                               \
-}
-
-static DEFINE_SPINLOCK(pm_lock);
-
-static struct clk osc0;
-static struct clk osc1;
-
-static unsigned long osc_get_rate(struct clk *clk)
-{
-       return at32_board_osc_rates[clk->index];
-}
-
-static unsigned long pll_get_rate(struct clk *clk, unsigned long control)
-{
-       unsigned long div, mul, rate;
-
-       div = PM_BFEXT(PLLDIV, control) + 1;
-       mul = PM_BFEXT(PLLMUL, control) + 1;
-
-       rate = clk->parent->get_rate(clk->parent);
-       rate = (rate + div / 2) / div;
-       rate *= mul;
-
-       return rate;
-}
-
-static long pll_set_rate(struct clk *clk, unsigned long rate,
-                        u32 *pll_ctrl)
-{
-       unsigned long mul;
-       unsigned long mul_best_fit = 0;
-       unsigned long div;
-       unsigned long div_min;
-       unsigned long div_max;
-       unsigned long div_best_fit = 0;
-       unsigned long base;
-       unsigned long pll_in;
-       unsigned long actual = 0;
-       unsigned long rate_error;
-       unsigned long rate_error_prev = ~0UL;
-       u32 ctrl;
-
-       /* Rate must be between 80 MHz and 200 Mhz. */
-       if (rate < 80000000UL || rate > 200000000UL)
-               return -EINVAL;
-
-       ctrl = PM_BF(PLLOPT, 4);
-       base = clk->parent->get_rate(clk->parent);
-
-       /* PLL input frequency must be between 6 MHz and 32 MHz. */
-       div_min = DIV_ROUND_UP(base, 32000000UL);
-       div_max = base / 6000000UL;
-
-       if (div_max < div_min)
-               return -EINVAL;
-
-       for (div = div_min; div <= div_max; div++) {
-               pll_in = (base + div / 2) / div;
-               mul = (rate + pll_in / 2) / pll_in;
-
-               if (mul == 0)
-                       continue;
-
-               actual = pll_in * mul;
-               rate_error = abs(actual - rate);
-
-               if (rate_error < rate_error_prev) {
-                       mul_best_fit = mul;
-                       div_best_fit = div;
-                       rate_error_prev = rate_error;
-               }
-
-               if (rate_error == 0)
-                       break;
-       }
-
-       if (div_best_fit == 0)
-               return -EINVAL;
-
-       ctrl |= PM_BF(PLLMUL, mul_best_fit - 1);
-       ctrl |= PM_BF(PLLDIV, div_best_fit - 1);
-       ctrl |= PM_BF(PLLCOUNT, 16);
-
-       if (clk->parent == &osc1)
-               ctrl |= PM_BIT(PLLOSC);
-
-       *pll_ctrl = ctrl;
-
-       return actual;
-}
-
-static unsigned long pll0_get_rate(struct clk *clk)
-{
-       u32 control;
-
-       control = pm_readl(PLL0);
-
-       return pll_get_rate(clk, control);
-}
-
-static void pll1_mode(struct clk *clk, int enabled)
-{
-       unsigned long timeout;
-       u32 status;
-       u32 ctrl;
-
-       ctrl = pm_readl(PLL1);
-
-       if (enabled) {
-               if (!PM_BFEXT(PLLMUL, ctrl) && !PM_BFEXT(PLLDIV, ctrl)) {
-                       pr_debug("clk %s: failed to enable, rate not set\n",
-                                       clk->name);
-                       return;
-               }
-
-               ctrl |= PM_BIT(PLLEN);
-               pm_writel(PLL1, ctrl);
-
-               /* Wait for PLL lock. */
-               for (timeout = 10000; timeout; timeout--) {
-                       status = pm_readl(ISR);
-                       if (status & PM_BIT(LOCK1))
-                               break;
-                       udelay(10);
-               }
-
-               if (!(status & PM_BIT(LOCK1)))
-                       printk(KERN_ERR "clk %s: timeout waiting for lock\n",
-                                       clk->name);
-       } else {
-               ctrl &= ~PM_BIT(PLLEN);
-               pm_writel(PLL1, ctrl);
-       }
-}
-
-static unsigned long pll1_get_rate(struct clk *clk)
-{
-       u32 control;
-
-       control = pm_readl(PLL1);
-
-       return pll_get_rate(clk, control);
-}
-
-static long pll1_set_rate(struct clk *clk, unsigned long rate, int apply)
-{
-       u32 ctrl = 0;
-       unsigned long actual_rate;
-
-       actual_rate = pll_set_rate(clk, rate, &ctrl);
-
-       if (apply) {
-               if (actual_rate != rate)
-                       return -EINVAL;
-               if (clk->users > 0)
-                       return -EBUSY;
-               pr_debug(KERN_INFO "clk %s: new rate %lu (actual rate %lu)\n",
-                               clk->name, rate, actual_rate);
-               pm_writel(PLL1, ctrl);
-       }
-
-       return actual_rate;
-}
-
-static int pll1_set_parent(struct clk *clk, struct clk *parent)
-{
-       u32 ctrl;
-
-       if (clk->users > 0)
-               return -EBUSY;
-
-       ctrl = pm_readl(PLL1);
-       WARN_ON(ctrl & PM_BIT(PLLEN));
-
-       if (parent == &osc0)
-               ctrl &= ~PM_BIT(PLLOSC);
-       else if (parent == &osc1)
-               ctrl |= PM_BIT(PLLOSC);
-       else
-               return -EINVAL;
-
-       pm_writel(PLL1, ctrl);
-       clk->parent = parent;
-
-       return 0;
-}
-
-/*
- * The AT32AP7000 has five primary clock sources: One 32kHz
- * oscillator, two crystal oscillators and two PLLs.
- */
-static struct clk osc32k = {
-       .name           = "osc32k",
-       .get_rate       = osc_get_rate,
-       .users          = 1,
-       .index          = 0,
-};
-static struct clk osc0 = {
-       .name           = "osc0",
-       .get_rate       = osc_get_rate,
-       .users          = 1,
-       .index          = 1,
-};
-static struct clk osc1 = {
-       .name           = "osc1",
-       .get_rate       = osc_get_rate,
-       .index          = 2,
-};
-static struct clk pll0 = {
-       .name           = "pll0",
-       .get_rate       = pll0_get_rate,
-       .parent         = &osc0,
-};
-static struct clk pll1 = {
-       .name           = "pll1",
-       .mode           = pll1_mode,
-       .get_rate       = pll1_get_rate,
-       .set_rate       = pll1_set_rate,
-       .set_parent     = pll1_set_parent,
-       .parent         = &osc0,
-};
-
-/*
- * The main clock can be either osc0 or pll0.  The boot loader may
- * have chosen one for us, so we don't really know which one until we
- * have a look at the SM.
- */
-static struct clk *main_clock;
-
-/*
- * Synchronous clocks are generated from the main clock. The clocks
- * must satisfy the constraint
- *   fCPU >= fHSB >= fPB
- * i.e. each clock must not be faster than its parent.
- */
-static unsigned long bus_clk_get_rate(struct clk *clk, unsigned int shift)
-{
-       return main_clock->get_rate(main_clock) >> shift;
-};
-
-static void cpu_clk_mode(struct clk *clk, int enabled)
-{
-       unsigned long flags;
-       u32 mask;
-
-       spin_lock_irqsave(&pm_lock, flags);
-       mask = pm_readl(CPU_MASK);
-       if (enabled)
-               mask |= 1 << clk->index;
-       else
-               mask &= ~(1 << clk->index);
-       pm_writel(CPU_MASK, mask);
-       spin_unlock_irqrestore(&pm_lock, flags);
-}
-
-static unsigned long cpu_clk_get_rate(struct clk *clk)
-{
-       unsigned long cksel, shift = 0;
-
-       cksel = pm_readl(CKSEL);
-       if (cksel & PM_BIT(CPUDIV))
-               shift = PM_BFEXT(CPUSEL, cksel) + 1;
-
-       return bus_clk_get_rate(clk, shift);
-}
-
-static long cpu_clk_set_rate(struct clk *clk, unsigned long rate, int apply)
-{
-       u32 control;
-       unsigned long parent_rate, child_div, actual_rate, div;
-
-       parent_rate = clk->parent->get_rate(clk->parent);
-       control = pm_readl(CKSEL);
-
-       if (control & PM_BIT(HSBDIV))
-               child_div = 1 << (PM_BFEXT(HSBSEL, control) + 1);
-       else
-               child_div = 1;
-
-       if (rate > 3 * (parent_rate / 4) || child_div == 1) {
-               actual_rate = parent_rate;
-               control &= ~PM_BIT(CPUDIV);
-       } else {
-               unsigned int cpusel;
-               div = (parent_rate + rate / 2) / rate;
-               if (div > child_div)
-                       div = child_div;
-               cpusel = (div > 1) ? (fls(div) - 2) : 0;
-               control = PM_BIT(CPUDIV) | PM_BFINS(CPUSEL, cpusel, control);
-               actual_rate = parent_rate / (1 << (cpusel + 1));
-       }
-
-       pr_debug("clk %s: new rate %lu (actual rate %lu)\n",
-                       clk->name, rate, actual_rate);
-
-       if (apply)
-               pm_writel(CKSEL, control);
-
-       return actual_rate;
-}
-
-static void hsb_clk_mode(struct clk *clk, int enabled)
-{
-       unsigned long flags;
-       u32 mask;
-
-       spin_lock_irqsave(&pm_lock, flags);
-       mask = pm_readl(HSB_MASK);
-       if (enabled)
-               mask |= 1 << clk->index;
-       else
-               mask &= ~(1 << clk->index);
-       pm_writel(HSB_MASK, mask);
-       spin_unlock_irqrestore(&pm_lock, flags);
-}
-
-static unsigned long hsb_clk_get_rate(struct clk *clk)
-{
-       unsigned long cksel, shift = 0;
-
-       cksel = pm_readl(CKSEL);
-       if (cksel & PM_BIT(HSBDIV))
-               shift = PM_BFEXT(HSBSEL, cksel) + 1;
-
-       return bus_clk_get_rate(clk, shift);
-}
-
-void pba_clk_mode(struct clk *clk, int enabled)
-{
-       unsigned long flags;
-       u32 mask;
-
-       spin_lock_irqsave(&pm_lock, flags);
-       mask = pm_readl(PBA_MASK);
-       if (enabled)
-               mask |= 1 << clk->index;
-       else
-               mask &= ~(1 << clk->index);
-       pm_writel(PBA_MASK, mask);
-       spin_unlock_irqrestore(&pm_lock, flags);
-}
-
-unsigned long pba_clk_get_rate(struct clk *clk)
-{
-       unsigned long cksel, shift = 0;
-
-       cksel = pm_readl(CKSEL);
-       if (cksel & PM_BIT(PBADIV))
-               shift = PM_BFEXT(PBASEL, cksel) + 1;
-
-       return bus_clk_get_rate(clk, shift);
-}
-
-static void pbb_clk_mode(struct clk *clk, int enabled)
-{
-       unsigned long flags;
-       u32 mask;
-
-       spin_lock_irqsave(&pm_lock, flags);
-       mask = pm_readl(PBB_MASK);
-       if (enabled)
-               mask |= 1 << clk->index;
-       else
-               mask &= ~(1 << clk->index);
-       pm_writel(PBB_MASK, mask);
-       spin_unlock_irqrestore(&pm_lock, flags);
-}
-
-static unsigned long pbb_clk_get_rate(struct clk *clk)
-{
-       unsigned long cksel, shift = 0;
-
-       cksel = pm_readl(CKSEL);
-       if (cksel & PM_BIT(PBBDIV))
-               shift = PM_BFEXT(PBBSEL, cksel) + 1;
-
-       return bus_clk_get_rate(clk, shift);
-}
-
-static struct clk cpu_clk = {
-       .name           = "cpu",
-       .get_rate       = cpu_clk_get_rate,
-       .set_rate       = cpu_clk_set_rate,
-       .users          = 1,
-};
-static struct clk hsb_clk = {
-       .name           = "hsb",
-       .parent         = &cpu_clk,
-       .get_rate       = hsb_clk_get_rate,
-};
-static struct clk pba_clk = {
-       .name           = "pba",
-       .parent         = &hsb_clk,
-       .mode           = hsb_clk_mode,
-       .get_rate       = pba_clk_get_rate,
-       .index          = 1,
-};
-static struct clk pbb_clk = {
-       .name           = "pbb",
-       .parent         = &hsb_clk,
-       .mode           = hsb_clk_mode,
-       .get_rate       = pbb_clk_get_rate,
-       .users          = 1,
-       .index          = 2,
-};
-
-/* --------------------------------------------------------------------
- *  Generic Clock operations
- * -------------------------------------------------------------------- */
-
-static void genclk_mode(struct clk *clk, int enabled)
-{
-       u32 control;
-
-       control = pm_readl(GCCTRL(clk->index));
-       if (enabled)
-               control |= PM_BIT(CEN);
-       else
-               control &= ~PM_BIT(CEN);
-       pm_writel(GCCTRL(clk->index), control);
-}
-
-static unsigned long genclk_get_rate(struct clk *clk)
-{
-       u32 control;
-       unsigned long div = 1;
-
-       control = pm_readl(GCCTRL(clk->index));
-       if (control & PM_BIT(DIVEN))
-               div = 2 * (PM_BFEXT(DIV, control) + 1);
-
-       return clk->parent->get_rate(clk->parent) / div;
-}
-
-static long genclk_set_rate(struct clk *clk, unsigned long rate, int apply)
-{
-       u32 control;
-       unsigned long parent_rate, actual_rate, div;
-
-       parent_rate = clk->parent->get_rate(clk->parent);
-       control = pm_readl(GCCTRL(clk->index));
-
-       if (rate > 3 * parent_rate / 4) {
-               actual_rate = parent_rate;
-               control &= ~PM_BIT(DIVEN);
-       } else {
-               div = (parent_rate + rate) / (2 * rate) - 1;
-               control = PM_BFINS(DIV, div, control) | PM_BIT(DIVEN);
-               actual_rate = parent_rate / (2 * (div + 1));
-       }
-
-       dev_dbg(clk->dev, "clk %s: new rate %lu (actual rate %lu)\n",
-               clk->name, rate, actual_rate);
-
-       if (apply)
-               pm_writel(GCCTRL(clk->index), control);
-
-       return actual_rate;
-}
-
-int genclk_set_parent(struct clk *clk, struct clk *parent)
-{
-       u32 control;
-
-       dev_dbg(clk->dev, "clk %s: new parent %s (was %s)\n",
-               clk->name, parent->name, clk->parent->name);
-
-       control = pm_readl(GCCTRL(clk->index));
-
-       if (parent == &osc1 || parent == &pll1)
-               control |= PM_BIT(OSCSEL);
-       else if (parent == &osc0 || parent == &pll0)
-               control &= ~PM_BIT(OSCSEL);
-       else
-               return -EINVAL;
-
-       if (parent == &pll0 || parent == &pll1)
-               control |= PM_BIT(PLLSEL);
-       else
-               control &= ~PM_BIT(PLLSEL);
-
-       pm_writel(GCCTRL(clk->index), control);
-       clk->parent = parent;
-
-       return 0;
-}
-
-static void __init genclk_init_parent(struct clk *clk)
-{
-       u32 control;
-       struct clk *parent;
-
-       BUG_ON(clk->index > 7);
-
-       control = pm_readl(GCCTRL(clk->index));
-       if (control & PM_BIT(OSCSEL))
-               parent = (control & PM_BIT(PLLSEL)) ? &pll1 : &osc1;
-       else
-               parent = (control & PM_BIT(PLLSEL)) ? &pll0 : &osc0;
-
-       clk->parent = parent;
-}
-
-static struct resource dw_dmac0_resource[] = {
-       PBMEM(0xff200000),
-       IRQ(2),
-};
-DEFINE_DEV(dw_dmac, 0);
-DEV_CLK(hclk, dw_dmac0, hsb, 10);
-
-/* --------------------------------------------------------------------
- *  System peripherals
- * -------------------------------------------------------------------- */
-static struct resource at32_pm0_resource[] = {
-       {
-               .start  = 0xfff00000,
-               .end    = 0xfff0007f,
-               .flags  = IORESOURCE_MEM,
-       },
-       IRQ(20),
-};
-
-static struct resource at32ap700x_rtc0_resource[] = {
-       {
-               .start  = 0xfff00080,
-               .end    = 0xfff000af,
-               .flags  = IORESOURCE_MEM,
-       },
-       IRQ(21),
-};
-
-static struct resource at32_wdt0_resource[] = {
-       {
-               .start  = 0xfff000b0,
-               .end    = 0xfff000cf,
-               .flags  = IORESOURCE_MEM,
-       },
-};
-
-static struct resource at32_eic0_resource[] = {
-       {
-               .start  = 0xfff00100,
-               .end    = 0xfff0013f,
-               .flags  = IORESOURCE_MEM,
-       },
-       IRQ(19),
-};
-
-DEFINE_DEV(at32_pm, 0);
-DEFINE_DEV(at32ap700x_rtc, 0);
-DEFINE_DEV(at32_wdt, 0);
-DEFINE_DEV(at32_eic, 0);
-
-/*
- * Peripheral clock for PM, RTC, WDT and EIC. PM will ensure that this
- * is always running.
- */
-static struct clk at32_pm_pclk = {
-       .name           = "pclk",
-       .dev            = &at32_pm0_device.dev,
-       .parent         = &pbb_clk,
-       .mode           = pbb_clk_mode,
-       .get_rate       = pbb_clk_get_rate,
-       .users          = 1,
-       .index          = 0,
-};
-
-static struct resource intc0_resource[] = {
-       PBMEM(0xfff00400),
-};
-struct platform_device at32_intc0_device = {
-       .name           = "intc",
-       .id             = 0,
-       .resource       = intc0_resource,
-       .num_resources  = ARRAY_SIZE(intc0_resource),
-};
-DEV_CLK(pclk, at32_intc0, pbb, 1);
-
-static struct clk ebi_clk = {
-       .name           = "ebi",
-       .parent         = &hsb_clk,
-       .mode           = hsb_clk_mode,
-       .get_rate       = hsb_clk_get_rate,
-       .users          = 1,
-};
-static struct clk hramc_clk = {
-       .name           = "hramc",
-       .parent         = &hsb_clk,
-       .mode           = hsb_clk_mode,
-       .get_rate       = hsb_clk_get_rate,
-       .users          = 1,
-       .index          = 3,
-};
-static struct clk sdramc_clk = {
-       .name           = "sdramc_clk",
-       .parent         = &pbb_clk,
-       .mode           = pbb_clk_mode,
-       .get_rate       = pbb_clk_get_rate,
-       .users          = 1,
-       .index          = 14,
-};
-
-static struct resource smc0_resource[] = {
-       PBMEM(0xfff03400),
-};
-DEFINE_DEV(smc, 0);
-DEV_CLK(pclk, smc0, pbb, 13);
-DEV_CLK(mck, smc0, hsb, 0);
-
-static struct platform_device pdc_device = {
-       .name           = "pdc",
-       .id             = 0,
-};
-DEV_CLK(hclk, pdc, hsb, 4);
-DEV_CLK(pclk, pdc, pba, 16);
-
-static struct clk pico_clk = {
-       .name           = "pico",
-       .parent         = &cpu_clk,
-       .mode           = cpu_clk_mode,
-       .get_rate       = cpu_clk_get_rate,
-       .users          = 1,
-};
-
-/* --------------------------------------------------------------------
- * HMATRIX
- * -------------------------------------------------------------------- */
-
-struct clk at32_hmatrix_clk = {
-       .name           = "hmatrix_clk",
-       .parent         = &pbb_clk,
-       .mode           = pbb_clk_mode,
-       .get_rate       = pbb_clk_get_rate,
-       .index          = 2,
-       .users          = 1,
-};
-
-/*
- * Set bits in the HMATRIX Special Function Register (SFR) used by the
- * External Bus Interface (EBI). This can be used to enable special
- * features like CompactFlash support, NAND Flash support, etc. on
- * certain chipselects.
- */
-static inline void set_ebi_sfr_bits(u32 mask)
-{
-       hmatrix_sfr_set_bits(HMATRIX_SLAVE_EBI, mask);
-}
-
-/* --------------------------------------------------------------------
- *  Timer/Counter (TC)
- * -------------------------------------------------------------------- */
-
-static struct resource at32_tcb0_resource[] = {
-       PBMEM(0xfff00c00),
-       IRQ(22),
-};
-static struct platform_device at32_tcb0_device = {
-       .name           = "atmel_tcb",
-       .id             = 0,
-       .resource       = at32_tcb0_resource,
-       .num_resources  = ARRAY_SIZE(at32_tcb0_resource),
-};
-DEV_CLK(t0_clk, at32_tcb0, pbb, 3);
-
-static struct resource at32_tcb1_resource[] = {
-       PBMEM(0xfff01000),
-       IRQ(23),
-};
-static struct platform_device at32_tcb1_device = {
-       .name           = "atmel_tcb",
-       .id             = 1,
-       .resource       = at32_tcb1_resource,
-       .num_resources  = ARRAY_SIZE(at32_tcb1_resource),
-};
-DEV_CLK(t0_clk, at32_tcb1, pbb, 4);
-
-/* --------------------------------------------------------------------
- *  PIO
- * -------------------------------------------------------------------- */
-
-static struct resource pio0_resource[] = {
-       PBMEM(0xffe02800),
-       IRQ(13),
-};
-DEFINE_DEV(pio, 0);
-DEV_CLK(mck, pio0, pba, 10);
-
-static struct resource pio1_resource[] = {
-       PBMEM(0xffe02c00),
-       IRQ(14),
-};
-DEFINE_DEV(pio, 1);
-DEV_CLK(mck, pio1, pba, 11);
-
-static struct resource pio2_resource[] = {
-       PBMEM(0xffe03000),
-       IRQ(15),
-};
-DEFINE_DEV(pio, 2);
-DEV_CLK(mck, pio2, pba, 12);
-
-static struct resource pio3_resource[] = {
-       PBMEM(0xffe03400),
-       IRQ(16),
-};
-DEFINE_DEV(pio, 3);
-DEV_CLK(mck, pio3, pba, 13);
-
-static struct resource pio4_resource[] = {
-       PBMEM(0xffe03800),
-       IRQ(17),
-};
-DEFINE_DEV(pio, 4);
-DEV_CLK(mck, pio4, pba, 14);
-
-static int __init system_device_init(void)
-{
-       platform_device_register(&at32_pm0_device);
-       platform_device_register(&at32_intc0_device);
-       platform_device_register(&at32ap700x_rtc0_device);
-       platform_device_register(&at32_wdt0_device);
-       platform_device_register(&at32_eic0_device);
-       platform_device_register(&smc0_device);
-       platform_device_register(&pdc_device);
-       platform_device_register(&dw_dmac0_device);
-
-       platform_device_register(&at32_tcb0_device);
-       platform_device_register(&at32_tcb1_device);
-
-       platform_device_register(&pio0_device);
-       platform_device_register(&pio1_device);
-       platform_device_register(&pio2_device);
-       platform_device_register(&pio3_device);
-       platform_device_register(&pio4_device);
-
-       return 0;
-}
-core_initcall(system_device_init);
-
-/* --------------------------------------------------------------------
- *  PSIF
- * -------------------------------------------------------------------- */
-static struct resource atmel_psif0_resource[] __initdata = {
-       {
-               .start  = 0xffe03c00,
-               .end    = 0xffe03cff,
-               .flags  = IORESOURCE_MEM,
-       },
-       IRQ(18),
-};
-static struct clk atmel_psif0_pclk = {
-       .name           = "pclk",
-       .parent         = &pba_clk,
-       .mode           = pba_clk_mode,
-       .get_rate       = pba_clk_get_rate,
-       .index          = 15,
-};
-
-static struct resource atmel_psif1_resource[] __initdata = {
-       {
-               .start  = 0xffe03d00,
-               .end    = 0xffe03dff,
-               .flags  = IORESOURCE_MEM,
-       },
-       IRQ(18),
-};
-static struct clk atmel_psif1_pclk = {
-       .name           = "pclk",
-       .parent         = &pba_clk,
-       .mode           = pba_clk_mode,
-       .get_rate       = pba_clk_get_rate,
-       .index          = 15,
-};
-
-struct platform_device *__init at32_add_device_psif(unsigned int id)
-{
-       struct platform_device *pdev;
-       u32 pin_mask;
-
-       if (!(id == 0 || id == 1))
-               return NULL;
-
-       pdev = platform_device_alloc("atmel_psif", id);
-       if (!pdev)
-               return NULL;
-
-       switch (id) {
-       case 0:
-               pin_mask  = (1 << 8) | (1 << 9); /* CLOCK & DATA */
-
-               if (platform_device_add_resources(pdev, atmel_psif0_resource,
-                                       ARRAY_SIZE(atmel_psif0_resource)))
-                       goto err_add_resources;
-               atmel_psif0_pclk.dev = &pdev->dev;
-               select_peripheral(PIOA, pin_mask, PERIPH_A, 0);
-               break;
-       case 1:
-               pin_mask  = (1 << 11) | (1 << 12); /* CLOCK & DATA */
-
-               if (platform_device_add_resources(pdev, atmel_psif1_resource,
-                                       ARRAY_SIZE(atmel_psif1_resource)))
-                       goto err_add_resources;
-               atmel_psif1_pclk.dev = &pdev->dev;
-               select_peripheral(PIOB, pin_mask, PERIPH_A, 0);
-               break;
-       default:
-               return NULL;
-       }
-
-       platform_device_add(pdev);
-       return pdev;
-
-err_add_resources:
-       platform_device_put(pdev);
-       return NULL;
-}
-
-/* --------------------------------------------------------------------
- *  USART
- * -------------------------------------------------------------------- */
-
-static struct atmel_uart_data atmel_usart0_data = {
-       .use_dma_tx     = 1,
-       .use_dma_rx     = 1,
-};
-static struct resource atmel_usart0_resource[] = {
-       PBMEM(0xffe00c00),
-       IRQ(6),
-};
-DEFINE_DEV_DATA(atmel_usart, 0);
-DEV_CLK(usart, atmel_usart0, pba, 3);
-
-static struct atmel_uart_data atmel_usart1_data = {
-       .use_dma_tx     = 1,
-       .use_dma_rx     = 1,
-};
-static struct resource atmel_usart1_resource[] = {
-       PBMEM(0xffe01000),
-       IRQ(7),
-};
-DEFINE_DEV_DATA(atmel_usart, 1);
-DEV_CLK(usart, atmel_usart1, pba, 4);
-
-static struct atmel_uart_data atmel_usart2_data = {
-       .use_dma_tx     = 1,
-       .use_dma_rx     = 1,
-};
-static struct resource atmel_usart2_resource[] = {
-       PBMEM(0xffe01400),
-       IRQ(8),
-};
-DEFINE_DEV_DATA(atmel_usart, 2);
-DEV_CLK(usart, atmel_usart2, pba, 5);
-
-static struct atmel_uart_data atmel_usart3_data = {
-       .use_dma_tx     = 1,
-       .use_dma_rx     = 1,
-};
-static struct resource atmel_usart3_resource[] = {
-       PBMEM(0xffe01800),
-       IRQ(9),
-};
-DEFINE_DEV_DATA(atmel_usart, 3);
-DEV_CLK(usart, atmel_usart3, pba, 6);
-
-static inline void configure_usart0_pins(int flags)
-{
-       u32 pin_mask = (1 << 8) | (1 << 9); /* RXD & TXD */
-       if (flags & ATMEL_USART_RTS)    pin_mask |= (1 << 6);
-       if (flags & ATMEL_USART_CTS)    pin_mask |= (1 << 7);
-       if (flags & ATMEL_USART_CLK)    pin_mask |= (1 << 10);
-
-       select_peripheral(PIOA, pin_mask, PERIPH_B, AT32_GPIOF_PULLUP);
-}
-
-static inline void configure_usart1_pins(int flags)
-{
-       u32 pin_mask = (1 << 17) | (1 << 18); /* RXD & TXD */
-       if (flags & ATMEL_USART_RTS)    pin_mask |= (1 << 19);
-       if (flags & ATMEL_USART_CTS)    pin_mask |= (1 << 20);
-       if (flags & ATMEL_USART_CLK)    pin_mask |= (1 << 16);
-
-       select_peripheral(PIOA, pin_mask, PERIPH_A, AT32_GPIOF_PULLUP);
-}
-
-static inline void configure_usart2_pins(int flags)
-{
-       u32 pin_mask = (1 << 26) | (1 << 27); /* RXD & TXD */
-       if (flags & ATMEL_USART_RTS)    pin_mask |= (1 << 30);
-       if (flags & ATMEL_USART_CTS)    pin_mask |= (1 << 29);
-       if (flags & ATMEL_USART_CLK)    pin_mask |= (1 << 28);
-
-       select_peripheral(PIOB, pin_mask, PERIPH_B, AT32_GPIOF_PULLUP);
-}
-
-static inline void configure_usart3_pins(int flags)
-{
-       u32 pin_mask = (1 << 18) | (1 << 17); /* RXD & TXD */
-       if (flags & ATMEL_USART_RTS)    pin_mask |= (1 << 16);
-       if (flags & ATMEL_USART_CTS)    pin_mask |= (1 << 15);
-       if (flags & ATMEL_USART_CLK)    pin_mask |= (1 << 19);
-
-       select_peripheral(PIOB, pin_mask, PERIPH_B, AT32_GPIOF_PULLUP);
-}
-
-static struct platform_device *__initdata at32_usarts[4];
-
-void __init at32_map_usart(unsigned int hw_id, unsigned int line, int flags)
-{
-       struct platform_device *pdev;
-       struct atmel_uart_data *pdata;
-
-       switch (hw_id) {
-       case 0:
-               pdev = &atmel_usart0_device;
-               configure_usart0_pins(flags);
-               break;
-       case 1:
-               pdev = &atmel_usart1_device;
-               configure_usart1_pins(flags);
-               break;
-       case 2:
-               pdev = &atmel_usart2_device;
-               configure_usart2_pins(flags);
-               break;
-       case 3:
-               pdev = &atmel_usart3_device;
-               configure_usart3_pins(flags);
-               break;
-       default:
-               return;
-       }
-
-       if (PXSEG(pdev->resource[0].start) == P4SEG) {
-               /* Addresses in the P4 segment are permanently mapped 1:1 */
-               struct atmel_uart_data *data = pdev->dev.platform_data;
-               data->regs = (void __iomem *)pdev->resource[0].start;
-       }
-
-       pdev->id = line;
-       pdata = pdev->dev.platform_data;
-       pdata->num = line;
-       at32_usarts[line] = pdev;
-}
-
-struct platform_device *__init at32_add_device_usart(unsigned int id)
-{
-       platform_device_register(at32_usarts[id]);
-       return at32_usarts[id];
-}
-
-void __init at32_setup_serial_console(unsigned int usart_id)
-{
-#ifdef CONFIG_SERIAL_ATMEL
-       atmel_default_console_device = at32_usarts[usart_id];
-#endif
-}
-
-/* --------------------------------------------------------------------
- *  Ethernet
- * -------------------------------------------------------------------- */
-
-#ifdef CONFIG_CPU_AT32AP7000
-static struct macb_platform_data macb0_data;
-static struct resource macb0_resource[] = {
-       PBMEM(0xfff01800),
-       IRQ(25),
-};
-DEFINE_DEV_DATA(macb, 0);
-DEV_CLK(hclk, macb0, hsb, 8);
-DEV_CLK(pclk, macb0, pbb, 6);
-
-static struct macb_platform_data macb1_data;
-static struct resource macb1_resource[] = {
-       PBMEM(0xfff01c00),
-       IRQ(26),
-};
-DEFINE_DEV_DATA(macb, 1);
-DEV_CLK(hclk, macb1, hsb, 9);
-DEV_CLK(pclk, macb1, pbb, 7);
-
-struct platform_device *__init
-at32_add_device_eth(unsigned int id, struct macb_platform_data *data)
-{
-       struct platform_device *pdev;
-       u32 pin_mask;
-
-       switch (id) {
-       case 0:
-               pdev = &macb0_device;
-
-               pin_mask  = (1 << 3);   /* TXD0 */
-               pin_mask |= (1 << 4);   /* TXD1 */
-               pin_mask |= (1 << 7);   /* TXEN */
-               pin_mask |= (1 << 8);   /* TXCK */
-               pin_mask |= (1 << 9);   /* RXD0 */
-               pin_mask |= (1 << 10);  /* RXD1 */
-               pin_mask |= (1 << 13);  /* RXER */
-               pin_mask |= (1 << 15);  /* RXDV */
-               pin_mask |= (1 << 16);  /* MDC  */
-               pin_mask |= (1 << 17);  /* MDIO */
-
-               if (!data->is_rmii) {
-                       pin_mask |= (1 << 0);   /* COL  */
-                       pin_mask |= (1 << 1);   /* CRS  */
-                       pin_mask |= (1 << 2);   /* TXER */
-                       pin_mask |= (1 << 5);   /* TXD2 */
-                       pin_mask |= (1 << 6);   /* TXD3 */
-                       pin_mask |= (1 << 11);  /* RXD2 */
-                       pin_mask |= (1 << 12);  /* RXD3 */
-                       pin_mask |= (1 << 14);  /* RXCK */
-#ifndef CONFIG_BOARD_MIMC200
-                       pin_mask |= (1 << 18);  /* SPD  */
-#endif
-               }
-
-               select_peripheral(PIOC, pin_mask, PERIPH_A, 0);
-
-               break;
-
-       case 1:
-               pdev = &macb1_device;
-
-               pin_mask  = (1 << 13);  /* TXD0 */
-               pin_mask |= (1 << 14);  /* TXD1 */
-               pin_mask |= (1 << 11);  /* TXEN */
-               pin_mask |= (1 << 12);  /* TXCK */
-               pin_mask |= (1 << 10);  /* RXD0 */
-               pin_mask |= (1 << 6);   /* RXD1 */
-               pin_mask |= (1 << 5);   /* RXER */
-               pin_mask |= (1 << 4);   /* RXDV */
-               pin_mask |= (1 << 3);   /* MDC  */
-               pin_mask |= (1 << 2);   /* MDIO */
-
-#ifndef CONFIG_BOARD_MIMC200
-               if (!data->is_rmii)
-                       pin_mask |= (1 << 15);  /* SPD  */
-#endif
-
-               select_peripheral(PIOD, pin_mask, PERIPH_B, 0);
-
-               if (!data->is_rmii) {
-                       pin_mask  = (1 << 19);  /* COL  */
-                       pin_mask |= (1 << 23);  /* CRS  */
-                       pin_mask |= (1 << 26);  /* TXER */
-                       pin_mask |= (1 << 27);  /* TXD2 */
-                       pin_mask |= (1 << 28);  /* TXD3 */
-                       pin_mask |= (1 << 29);  /* RXD2 */
-                       pin_mask |= (1 << 30);  /* RXD3 */
-                       pin_mask |= (1 << 24);  /* RXCK */
-
-                       select_peripheral(PIOC, pin_mask, PERIPH_B, 0);
-               }
-               break;
-
-       default:
-               return NULL;
-       }
-
-       memcpy(pdev->dev.platform_data, data, sizeof(struct macb_platform_data));
-       platform_device_register(pdev);
-
-       return pdev;
-}
-#endif
-
-/* --------------------------------------------------------------------
- *  SPI
- * -------------------------------------------------------------------- */
-static struct resource atmel_spi0_resource[] = {
-       PBMEM(0xffe00000),
-       IRQ(3),
-};
-DEFINE_DEV(atmel_spi, 0);
-DEV_CLK(spi_clk, atmel_spi0, pba, 0);
-
-static struct resource atmel_spi1_resource[] = {
-       PBMEM(0xffe00400),
-       IRQ(4),
-};
-DEFINE_DEV(atmel_spi, 1);
-DEV_CLK(spi_clk, atmel_spi1, pba, 1);
-
-void __init
-at32_spi_setup_slaves(unsigned int bus_num, struct spi_board_info *b, unsigned int n)
-{
-       /*
-        * Manage the chipselects as GPIOs, normally using the same pins
-        * the SPI controller expects; but boards can use other pins.
-        */
-       static u8 __initdata spi_pins[][4] = {
-               { GPIO_PIN_PA(3), GPIO_PIN_PA(4),
-                 GPIO_PIN_PA(5), GPIO_PIN_PA(20) },
-               { GPIO_PIN_PB(2), GPIO_PIN_PB(3),
-                 GPIO_PIN_PB(4), GPIO_PIN_PA(27) },
-       };
-       unsigned int pin, mode;
-
-       /* There are only 2 SPI controllers */
-       if (bus_num > 1)
-               return;
-
-       for (; n; n--, b++) {
-               b->bus_num = bus_num;
-               if (b->chip_select >= 4)
-                       continue;
-               pin = (unsigned)b->controller_data;
-               if (!pin) {
-                       pin = spi_pins[bus_num][b->chip_select];
-                       b->controller_data = (void *)pin;
-               }
-               mode = AT32_GPIOF_OUTPUT;
-               if (!(b->mode & SPI_CS_HIGH))
-                       mode |= AT32_GPIOF_HIGH;
-               at32_select_gpio(pin, mode);
-       }
-}
-
-struct platform_device *__init
-at32_add_device_spi(unsigned int id, struct spi_board_info *b, unsigned int n)
-{
-       struct platform_device *pdev;
-       u32 pin_mask;
-
-       switch (id) {
-       case 0:
-               pdev = &atmel_spi0_device;
-               pin_mask  = (1 << 1) | (1 << 2);        /* MOSI & SCK */
-
-               /* pullup MISO so a level is always defined */
-               select_peripheral(PIOA, (1 << 0), PERIPH_A, AT32_GPIOF_PULLUP);
-               select_peripheral(PIOA, pin_mask, PERIPH_A, 0);
-
-               at32_spi_setup_slaves(0, b, n);
-               break;
-
-       case 1:
-               pdev = &atmel_spi1_device;
-               pin_mask  = (1 << 1) | (1 << 5);        /* MOSI */
-
-               /* pullup MISO so a level is always defined */
-               select_peripheral(PIOB, (1 << 0), PERIPH_B, AT32_GPIOF_PULLUP);
-               select_peripheral(PIOB, pin_mask, PERIPH_B, 0);
-
-               at32_spi_setup_slaves(1, b, n);
-               break;
-
-       default:
-               return NULL;
-       }
-
-       spi_register_board_info(b, n);
-       platform_device_register(pdev);
-       return pdev;
-}
-
-/* --------------------------------------------------------------------
- *  TWI
- * -------------------------------------------------------------------- */
-static struct resource atmel_twi0_resource[] __initdata = {
-       PBMEM(0xffe00800),
-       IRQ(5),
-};
-static struct clk atmel_twi0_pclk = {
-       .name           = "twi_pclk",
-       .parent         = &pba_clk,
-       .mode           = pba_clk_mode,
-       .get_rate       = pba_clk_get_rate,
-       .index          = 2,
-};
-
-struct platform_device *__init at32_add_device_twi(unsigned int id,
-                                                   struct i2c_board_info *b,
-                                                   unsigned int n)
-{
-       struct platform_device *pdev;
-       u32 pin_mask;
-
-       if (id != 0)
-               return NULL;
-
-       pdev = platform_device_alloc("atmel_twi", id);
-       if (!pdev)
-               return NULL;
-
-       if (platform_device_add_resources(pdev, atmel_twi0_resource,
-                               ARRAY_SIZE(atmel_twi0_resource)))
-               goto err_add_resources;
-
-       pin_mask  = (1 << 6) | (1 << 7);        /* SDA & SDL */
-
-       select_peripheral(PIOA, pin_mask, PERIPH_A, 0);
-
-       atmel_twi0_pclk.dev = &pdev->dev;
-
-       if (b)
-               i2c_register_board_info(id, b, n);
-
-       platform_device_add(pdev);
-       return pdev;
-
-err_add_resources:
-       platform_device_put(pdev);
-       return NULL;
-}
-
-/* --------------------------------------------------------------------
- * MMC
- * -------------------------------------------------------------------- */
-static struct resource atmel_mci0_resource[] __initdata = {
-       PBMEM(0xfff02400),
-       IRQ(28),
-};
-static struct clk atmel_mci0_pclk = {
-       .name           = "mci_clk",
-       .parent         = &pbb_clk,
-       .mode           = pbb_clk_mode,
-       .get_rate       = pbb_clk_get_rate,
-       .index          = 9,
-};
-
-static bool at32_mci_dma_filter(struct dma_chan *chan, void *pdata)
-{
-       struct dw_dma_slave *sl = pdata;
-
-       if (!sl)
-               return false;
-
-       if (sl->dma_dev == chan->device->dev) {
-               chan->private = sl;
-               return true;
-       }
-
-       return false;
-}
-
-struct platform_device *__init
-at32_add_device_mci(unsigned int id, struct mci_platform_data *data)
-{
-       struct platform_device          *pdev;
-       struct dw_dma_slave             *slave;
-       u32                             pioa_mask;
-       u32                             piob_mask;
-
-       if (id != 0 || !data)
-               return NULL;
-
-       /* Must have at least one usable slot */
-       if (!data->slot[0].bus_width && !data->slot[1].bus_width)
-               return NULL;
-
-       pdev = platform_device_alloc("atmel_mci", id);
-       if (!pdev)
-               goto fail;
-
-       if (platform_device_add_resources(pdev, atmel_mci0_resource,
-                               ARRAY_SIZE(atmel_mci0_resource)))
-               goto fail;
-
-       slave = kzalloc(sizeof(*slave), GFP_KERNEL);
-       if (!slave)
-               goto fail;
-
-       slave->dma_dev = &dw_dmac0_device.dev;
-       slave->src_id = 0;
-       slave->dst_id = 1;
-       slave->m_master = 1;
-       slave->p_master = 0;
-
-       data->dma_slave = slave;
-       data->dma_filter = at32_mci_dma_filter;
-
-       if (platform_device_add_data(pdev, data,
-                               sizeof(struct mci_platform_data)))
-               goto fail_free;
-
-       /* CLK line is common to both slots */
-       pioa_mask = 1 << 10;
-
-       switch (data->slot[0].bus_width) {
-       case 4:
-               pioa_mask |= 1 << 13;           /* DATA1 */
-               pioa_mask |= 1 << 14;           /* DATA2 */
-               pioa_mask |= 1 << 15;           /* DATA3 */
-               /* fall through */
-       case 1:
-               pioa_mask |= 1 << 11;           /* CMD   */
-               pioa_mask |= 1 << 12;           /* DATA0 */
-
-               if (gpio_is_valid(data->slot[0].detect_pin))
-                       at32_select_gpio(data->slot[0].detect_pin, 0);
-               if (gpio_is_valid(data->slot[0].wp_pin))
-                       at32_select_gpio(data->slot[0].wp_pin, 0);
-               break;
-       case 0:
-               /* Slot is unused */
-               break;
-       default:
-               goto fail_free;
-       }
-
-       select_peripheral(PIOA, pioa_mask, PERIPH_A, 0);
-       piob_mask = 0;
-
-       switch (data->slot[1].bus_width) {
-       case 4:
-               piob_mask |= 1 <<  8;           /* DATA1 */
-               piob_mask |= 1 <<  9;           /* DATA2 */
-               piob_mask |= 1 << 10;           /* DATA3 */
-               /* fall through */
-       case 1:
-               piob_mask |= 1 <<  6;           /* CMD   */
-               piob_mask |= 1 <<  7;           /* DATA0 */
-               select_peripheral(PIOB, piob_mask, PERIPH_B, 0);
-
-               if (gpio_is_valid(data->slot[1].detect_pin))
-                       at32_select_gpio(data->slot[1].detect_pin, 0);
-               if (gpio_is_valid(data->slot[1].wp_pin))
-                       at32_select_gpio(data->slot[1].wp_pin, 0);
-               break;
-       case 0:
-               /* Slot is unused */
-               break;
-       default:
-               if (!data->slot[0].bus_width)
-                       goto fail_free;
-
-               data->slot[1].bus_width = 0;
-               break;
-       }
-
-       atmel_mci0_pclk.dev = &pdev->dev;
-
-       platform_device_add(pdev);
-       return pdev;
-
-fail_free:
-       kfree(slave);
-fail:
-       data->dma_slave = NULL;
-       platform_device_put(pdev);
-       return NULL;
-}
-
-/* --------------------------------------------------------------------
- *  LCDC
- * -------------------------------------------------------------------- */
-#if defined(CONFIG_CPU_AT32AP7000) || defined(CONFIG_CPU_AT32AP7002)
-static struct atmel_lcdfb_pdata atmel_lcdfb0_data;
-static struct resource atmel_lcdfb0_resource[] = {
-       {
-               .start          = 0xff000000,
-               .end            = 0xff000fff,
-               .flags          = IORESOURCE_MEM,
-       },
-       IRQ(1),
-       {
-               /* Placeholder for pre-allocated fb memory */
-               .start          = 0x00000000,
-               .end            = 0x00000000,
-               .flags          = 0,
-       },
-};
-DEFINE_DEV_DATA(atmel_lcdfb, 0);
-DEV_CLK(hclk, atmel_lcdfb0, hsb, 7);
-static struct clk atmel_lcdfb0_pixclk = {
-       .name           = "lcdc_clk",
-       .dev            = &atmel_lcdfb0_device.dev,
-       .mode           = genclk_mode,
-       .get_rate       = genclk_get_rate,
-       .set_rate       = genclk_set_rate,
-       .set_parent     = genclk_set_parent,
-       .index          = 7,
-};
-
-struct platform_device *__init
-at32_add_device_lcdc(unsigned int id, struct atmel_lcdfb_pdata *data,
-                    unsigned long fbmem_start, unsigned long fbmem_len,
-                    u64 pin_mask)
-{
-       struct platform_device *pdev;
-       struct atmel_lcdfb_pdata *info;
-       struct fb_monspecs *monspecs;
-       struct fb_videomode *modedb;
-       unsigned int modedb_size;
-       u32 portc_mask, portd_mask, porte_mask;
-
-       /*
-        * Do a deep copy of the fb data, monspecs and modedb. Make
-        * sure all allocations are done before setting up the
-        * portmux.
-        */
-       monspecs = kmemdup(data->default_monspecs,
-                          sizeof(struct fb_monspecs), GFP_KERNEL);
-       if (!monspecs)
-               return NULL;
-
-       modedb_size = sizeof(struct fb_videomode) * monspecs->modedb_len;
-       modedb = kmemdup(monspecs->modedb, modedb_size, GFP_KERNEL);
-       if (!modedb)
-               goto err_dup_modedb;
-       monspecs->modedb = modedb;
-
-       switch (id) {
-       case 0:
-               pdev = &atmel_lcdfb0_device;
-
-               if (pin_mask == 0ULL)
-                       /* Default to "full" lcdc control signals and 24bit */
-                       pin_mask = ATMEL_LCDC_PRI_24BIT | ATMEL_LCDC_PRI_CONTROL;
-
-               /* LCDC on port C */
-               portc_mask = pin_mask & 0xfff80000;
-               select_peripheral(PIOC, portc_mask, PERIPH_A, 0);
-
-               /* LCDC on port D */
-               portd_mask = pin_mask & 0x0003ffff;
-               select_peripheral(PIOD, portd_mask, PERIPH_A, 0);
-
-               /* LCDC on port E */
-               porte_mask = (pin_mask >> 32) & 0x0007ffff;
-               select_peripheral(PIOE, porte_mask, PERIPH_B, 0);
-
-               clk_set_parent(&atmel_lcdfb0_pixclk, &pll0);
-               clk_set_rate(&atmel_lcdfb0_pixclk, clk_get_rate(&pll0));
-               break;
-
-       default:
-               goto err_invalid_id;
-       }
-
-       if (fbmem_len) {
-               pdev->resource[2].start = fbmem_start;
-               pdev->resource[2].end = fbmem_start + fbmem_len - 1;
-               pdev->resource[2].flags = IORESOURCE_MEM;
-       }
-
-       info = pdev->dev.platform_data;
-       memcpy(info, data, sizeof(struct atmel_lcdfb_pdata));
-       info->default_monspecs = monspecs;
-
-       pdev->name = "at32ap-lcdfb";
-
-       platform_device_register(pdev);
-       return pdev;
-
-err_invalid_id:
-       kfree(modedb);
-err_dup_modedb:
-       kfree(monspecs);
-       return NULL;
-}
-#endif
-
-/* --------------------------------------------------------------------
- *  PWM
- * -------------------------------------------------------------------- */
-static struct resource atmel_pwm0_resource[] __initdata = {
-       PBMEM(0xfff01400),
-       IRQ(24),
-};
-static struct clk atmel_pwm0_mck = {
-       .name           = "at91sam9rl-pwm",
-       .parent         = &pbb_clk,
-       .mode           = pbb_clk_mode,
-       .get_rate       = pbb_clk_get_rate,
-       .index          = 5,
-};
-
-struct platform_device *__init at32_add_device_pwm(u32 mask)
-{
-       struct platform_device *pdev;
-       u32 pin_mask;
-
-       if (!mask)
-               return NULL;
-
-       pdev = platform_device_alloc("at91sam9rl-pwm", 0);
-       if (!pdev)
-               return NULL;
-
-       if (platform_device_add_resources(pdev, atmel_pwm0_resource,
-                               ARRAY_SIZE(atmel_pwm0_resource)))
-               goto out_free_pdev;
-
-       pin_mask = 0;
-       if (mask & (1 << 0))
-               pin_mask |= (1 << 28);
-       if (mask & (1 << 1))
-               pin_mask |= (1 << 29);
-       if (pin_mask > 0)
-               select_peripheral(PIOA, pin_mask, PERIPH_A, 0);
-
-       pin_mask = 0;
-       if (mask & (1 << 2))
-               pin_mask |= (1 << 21);
-       if (mask & (1 << 3))
-               pin_mask |= (1 << 22);
-       if (pin_mask > 0)
-               select_peripheral(PIOA, pin_mask, PERIPH_B, 0);
-
-       atmel_pwm0_mck.dev = &pdev->dev;
-
-       platform_device_add(pdev);
-
-       return pdev;
-
-out_free_pdev:
-       platform_device_put(pdev);
-       return NULL;
-}
-
-/* --------------------------------------------------------------------
- *  SSC
- * -------------------------------------------------------------------- */
-static struct resource ssc0_resource[] = {
-       PBMEM(0xffe01c00),
-       IRQ(10),
-};
-DEFINE_DEV(ssc, 0);
-DEV_CLK(pclk, ssc0, pba, 7);
-
-static struct resource ssc1_resource[] = {
-       PBMEM(0xffe02000),
-       IRQ(11),
-};
-DEFINE_DEV(ssc, 1);
-DEV_CLK(pclk, ssc1, pba, 8);
-
-static struct resource ssc2_resource[] = {
-       PBMEM(0xffe02400),
-       IRQ(12),
-};
-DEFINE_DEV(ssc, 2);
-DEV_CLK(pclk, ssc2, pba, 9);
-
-struct platform_device *__init
-at32_add_device_ssc(unsigned int id, unsigned int flags)
-{
-       struct platform_device *pdev;
-       u32 pin_mask = 0;
-
-       switch (id) {
-       case 0:
-               pdev = &ssc0_device;
-               if (flags & ATMEL_SSC_RF)
-                       pin_mask |= (1 << 21);  /* RF */
-               if (flags & ATMEL_SSC_RK)
-                       pin_mask |= (1 << 22);  /* RK */
-               if (flags & ATMEL_SSC_TK)
-                       pin_mask |= (1 << 23);  /* TK */
-               if (flags & ATMEL_SSC_TF)
-                       pin_mask |= (1 << 24);  /* TF */
-               if (flags & ATMEL_SSC_TD)
-                       pin_mask |= (1 << 25);  /* TD */
-               if (flags & ATMEL_SSC_RD)
-                       pin_mask |= (1 << 26);  /* RD */
-
-               if (pin_mask > 0)
-                       select_peripheral(PIOA, pin_mask, PERIPH_A, 0);
-
-               break;
-       case 1:
-               pdev = &ssc1_device;
-               if (flags & ATMEL_SSC_RF)
-                       pin_mask |= (1 << 0);   /* RF */
-               if (flags & ATMEL_SSC_RK)
-                       pin_mask |= (1 << 1);   /* RK */
-               if (flags & ATMEL_SSC_TK)
-                       pin_mask |= (1 << 2);   /* TK */
-               if (flags & ATMEL_SSC_TF)
-                       pin_mask |= (1 << 3);   /* TF */
-               if (flags & ATMEL_SSC_TD)
-                       pin_mask |= (1 << 4);   /* TD */
-               if (flags & ATMEL_SSC_RD)
-                       pin_mask |= (1 << 5);   /* RD */
-
-               if (pin_mask > 0)
-                       select_peripheral(PIOA, pin_mask, PERIPH_B, 0);
-
-               break;
-       case 2:
-               pdev = &ssc2_device;
-               if (flags & ATMEL_SSC_TD)
-                       pin_mask |= (1 << 13);  /* TD */
-               if (flags & ATMEL_SSC_RD)
-                       pin_mask |= (1 << 14);  /* RD */
-               if (flags & ATMEL_SSC_TK)
-                       pin_mask |= (1 << 15);  /* TK */
-               if (flags & ATMEL_SSC_TF)
-                       pin_mask |= (1 << 16);  /* TF */
-               if (flags & ATMEL_SSC_RF)
-                       pin_mask |= (1 << 17);  /* RF */
-               if (flags & ATMEL_SSC_RK)
-                       pin_mask |= (1 << 18);  /* RK */
-
-               if (pin_mask > 0)
-                       select_peripheral(PIOB, pin_mask, PERIPH_A, 0);
-
-               break;
-       default:
-               return NULL;
-       }
-
-       platform_device_register(pdev);
-       return pdev;
-}
-
-/* --------------------------------------------------------------------
- *  USB Device Controller
- * -------------------------------------------------------------------- */
-static struct resource usba0_resource[] __initdata = {
-       {
-               .start          = 0xff300000,
-               .end            = 0xff3fffff,
-               .flags          = IORESOURCE_MEM,
-       }, {
-               .start          = 0xfff03000,
-               .end            = 0xfff033ff,
-               .flags          = IORESOURCE_MEM,
-       },
-       IRQ(31),
-};
-static struct clk usba0_pclk = {
-       .name           = "pclk",
-       .parent         = &pbb_clk,
-       .mode           = pbb_clk_mode,
-       .get_rate       = pbb_clk_get_rate,
-       .index          = 12,
-};
-static struct clk usba0_hclk = {
-       .name           = "hclk",
-       .parent         = &hsb_clk,
-       .mode           = hsb_clk_mode,
-       .get_rate       = hsb_clk_get_rate,
-       .index          = 6,
-};
-
-#define EP(nam, idx, maxpkt, maxbk, dma, isoc)                 \
-       [idx] = {                                               \
-               .name           = nam,                          \
-               .index          = idx,                          \
-               .fifo_size      = maxpkt,                       \
-               .nr_banks       = maxbk,                        \
-               .can_dma        = dma,                          \
-               .can_isoc       = isoc,                         \
-       }
-
-static struct usba_ep_data at32_usba_ep[] __initdata = {
-       EP("ep0",     0,   64, 1, 0, 0),
-       EP("ep1",     1,  512, 2, 1, 1),
-       EP("ep2",     2,  512, 2, 1, 1),
-       EP("ep3-int", 3,   64, 3, 1, 0),
-       EP("ep4-int", 4,   64, 3, 1, 0),
-       EP("ep5",     5, 1024, 3, 1, 1),
-       EP("ep6",     6, 1024, 3, 1, 1),
-};
-
-#undef EP
-
-struct platform_device *__init
-at32_add_device_usba(unsigned int id, struct usba_platform_data *data)
-{
-       /*
-        * pdata doesn't have room for any endpoints, so we need to
-        * append room for the ones we need right after it.
-        */
-       struct {
-               struct usba_platform_data pdata;
-               struct usba_ep_data ep[7];
-       } usba_data;
-       struct platform_device *pdev;
-
-       if (id != 0)
-               return NULL;
-
-       pdev = platform_device_alloc("atmel_usba_udc", 0);
-       if (!pdev)
-               return NULL;
-
-       if (platform_device_add_resources(pdev, usba0_resource,
-                                         ARRAY_SIZE(usba0_resource)))
-               goto out_free_pdev;
-
-       if (data) {
-               usba_data.pdata.vbus_pin = data->vbus_pin;
-               usba_data.pdata.vbus_pin_inverted = data->vbus_pin_inverted;
-       } else {
-               usba_data.pdata.vbus_pin = -EINVAL;
-               usba_data.pdata.vbus_pin_inverted = -EINVAL;
-       }
-
-       data = &usba_data.pdata;
-       data->num_ep = ARRAY_SIZE(at32_usba_ep);
-       memcpy(data->ep, at32_usba_ep, sizeof(at32_usba_ep));
-
-       if (platform_device_add_data(pdev, data, sizeof(usba_data)))
-               goto out_free_pdev;
-
-       if (gpio_is_valid(data->vbus_pin))
-               at32_select_gpio(data->vbus_pin, 0);
-
-       usba0_pclk.dev = &pdev->dev;
-       usba0_hclk.dev = &pdev->dev;
-
-       platform_device_add(pdev);
-
-       return pdev;
-
-out_free_pdev:
-       platform_device_put(pdev);
-       return NULL;
-}
-
-/* --------------------------------------------------------------------
- * IDE / CompactFlash
- * -------------------------------------------------------------------- */
-#if defined(CONFIG_CPU_AT32AP7000) || defined(CONFIG_CPU_AT32AP7001)
-static struct resource at32_smc_cs4_resource[] __initdata = {
-       {
-               .start  = 0x04000000,
-               .end    = 0x07ffffff,
-               .flags  = IORESOURCE_MEM,
-       },
-       IRQ(~0UL), /* Magic IRQ will be overridden */
-};
-static struct resource at32_smc_cs5_resource[] __initdata = {
-       {
-               .start  = 0x20000000,
-               .end    = 0x23ffffff,
-               .flags  = IORESOURCE_MEM,
-       },
-       IRQ(~0UL), /* Magic IRQ will be overridden */
-};
-
-static int __init at32_init_ide_or_cf(struct platform_device *pdev,
-               unsigned int cs, unsigned int extint)
-{
-       static unsigned int extint_pin_map[4] __initdata = {
-               (1 << 25),
-               (1 << 26),
-               (1 << 27),
-               (1 << 28),
-       };
-       static bool common_pins_initialized __initdata = false;
-       unsigned int extint_pin;
-       int ret;
-       u32 pin_mask;
-
-       if (extint >= ARRAY_SIZE(extint_pin_map))
-               return -EINVAL;
-       extint_pin = extint_pin_map[extint];
-
-       switch (cs) {
-       case 4:
-               ret = platform_device_add_resources(pdev,
-                               at32_smc_cs4_resource,
-                               ARRAY_SIZE(at32_smc_cs4_resource));
-               if (ret)
-                       return ret;
-
-               /* NCS4   -> OE_N  */
-               select_peripheral(PIOE, (1 << 21), PERIPH_A, 0);
-               hmatrix_sfr_set_bits(HMATRIX_SLAVE_EBI, HMATRIX_EBI_CF0_ENABLE);
-               break;
-       case 5:
-               ret = platform_device_add_resources(pdev,
-                               at32_smc_cs5_resource,
-                               ARRAY_SIZE(at32_smc_cs5_resource));
-               if (ret)
-                       return ret;
-
-               /* NCS5   -> OE_N  */
-               select_peripheral(PIOE, (1 << 22), PERIPH_A, 0);
-               hmatrix_sfr_set_bits(HMATRIX_SLAVE_EBI, HMATRIX_EBI_CF1_ENABLE);
-               break;
-       default:
-               return -EINVAL;
-       }
-
-       if (!common_pins_initialized) {
-               pin_mask  = (1 << 19);  /* CFCE1  -> CS0_N */
-               pin_mask |= (1 << 20);  /* CFCE2  -> CS1_N */
-               pin_mask |= (1 << 23);  /* CFRNW  -> DIR   */
-               pin_mask |= (1 << 24);  /* NWAIT  <- IORDY */
-
-               select_peripheral(PIOE, pin_mask, PERIPH_A, 0);
-
-               common_pins_initialized = true;
-       }
-
-       select_peripheral(PIOB, extint_pin, PERIPH_A, AT32_GPIOF_DEGLITCH);
-
-       pdev->resource[1].start = EIM_IRQ_BASE + extint;
-       pdev->resource[1].end = pdev->resource[1].start;
-
-       return 0;
-}
-
-struct platform_device *__init
-at32_add_device_ide(unsigned int id, unsigned int extint,
-                   struct ide_platform_data *data)
-{
-       struct platform_device *pdev;
-
-       pdev = platform_device_alloc("at32_ide", id);
-       if (!pdev)
-               goto fail;
-
-       if (platform_device_add_data(pdev, data,
-                               sizeof(struct ide_platform_data)))
-               goto fail;
-
-       if (at32_init_ide_or_cf(pdev, data->cs, extint))
-               goto fail;
-
-       platform_device_add(pdev);
-       return pdev;
-
-fail:
-       platform_device_put(pdev);
-       return NULL;
-}
-
-struct platform_device *__init
-at32_add_device_cf(unsigned int id, unsigned int extint,
-                   struct cf_platform_data *data)
-{
-       struct platform_device *pdev;
-
-       pdev = platform_device_alloc("at32_cf", id);
-       if (!pdev)
-               goto fail;
-
-       if (platform_device_add_data(pdev, data,
-                               sizeof(struct cf_platform_data)))
-               goto fail;
-
-       if (at32_init_ide_or_cf(pdev, data->cs, extint))
-               goto fail;
-
-       if (gpio_is_valid(data->detect_pin))
-               at32_select_gpio(data->detect_pin, AT32_GPIOF_DEGLITCH);
-       if (gpio_is_valid(data->reset_pin))
-               at32_select_gpio(data->reset_pin, 0);
-       if (gpio_is_valid(data->vcc_pin))
-               at32_select_gpio(data->vcc_pin, 0);
-       /* READY is used as extint, so we can't select it as gpio */
-
-       platform_device_add(pdev);
-       return pdev;
-
-fail:
-       platform_device_put(pdev);
-       return NULL;
-}
-#endif
-
-/* --------------------------------------------------------------------
- * NAND Flash / SmartMedia
- * -------------------------------------------------------------------- */
-static struct resource smc_cs3_resource[] __initdata = {
-       {
-               .start  = 0x0c000000,
-               .end    = 0x0fffffff,
-               .flags  = IORESOURCE_MEM,
-       }, {
-               .start  = 0xfff03c00,
-               .end    = 0xfff03fff,
-               .flags  = IORESOURCE_MEM,
-       },
-};
-
-struct platform_device *__init
-at32_add_device_nand(unsigned int id, struct atmel_nand_data *data)
-{
-       struct platform_device *pdev;
-
-       if (id != 0 || !data)
-               return NULL;
-
-       pdev = platform_device_alloc("atmel_nand", id);
-       if (!pdev)
-               goto fail;
-
-       if (platform_device_add_resources(pdev, smc_cs3_resource,
-                               ARRAY_SIZE(smc_cs3_resource)))
-               goto fail;
-
-       /* For at32ap7000, we use the reset workaround for nand driver */
-       data->need_reset_workaround = true;
-
-       if (platform_device_add_data(pdev, data,
-                               sizeof(struct atmel_nand_data)))
-               goto fail;
-
-       hmatrix_sfr_set_bits(HMATRIX_SLAVE_EBI, HMATRIX_EBI_NAND_ENABLE);
-       if (data->enable_pin)
-               at32_select_gpio(data->enable_pin,
-                               AT32_GPIOF_OUTPUT | AT32_GPIOF_HIGH);
-       if (data->rdy_pin)
-               at32_select_gpio(data->rdy_pin, 0);
-       if (data->det_pin)
-               at32_select_gpio(data->det_pin, 0);
-
-       platform_device_add(pdev);
-       return pdev;
-
-fail:
-       platform_device_put(pdev);
-       return NULL;
-}
-
-/* --------------------------------------------------------------------
- * AC97C
- * -------------------------------------------------------------------- */
-static struct resource atmel_ac97c0_resource[] __initdata = {
-       PBMEM(0xfff02800),
-       IRQ(29),
-};
-static struct clk atmel_ac97c0_pclk = {
-       .name           = "pclk",
-       .parent         = &pbb_clk,
-       .mode           = pbb_clk_mode,
-       .get_rate       = pbb_clk_get_rate,
-       .index          = 10,
-};
-
-struct platform_device *__init
-at32_add_device_ac97c(unsigned int id, struct ac97c_platform_data *data,
-                     unsigned int flags)
-{
-       struct platform_device          *pdev;
-       struct dw_dma_slave             *rx_dws;
-       struct dw_dma_slave             *tx_dws;
-       struct ac97c_platform_data      _data;
-       u32                             pin_mask;
-
-       if (id != 0)
-               return NULL;
-
-       pdev = platform_device_alloc("atmel_ac97c", id);
-       if (!pdev)
-               return NULL;
-
-       if (platform_device_add_resources(pdev, atmel_ac97c0_resource,
-                               ARRAY_SIZE(atmel_ac97c0_resource)))
-               goto out_free_resources;
-
-       if (!data) {
-               data = &_data;
-               memset(data, 0, sizeof(struct ac97c_platform_data));
-               data->reset_pin = -ENODEV;
-       }
-
-       rx_dws = &data->rx_dws;
-       tx_dws = &data->tx_dws;
-
-       /* Check if DMA slave interface for capture should be configured. */
-       if (flags & AC97C_CAPTURE) {
-               rx_dws->dma_dev = &dw_dmac0_device.dev;
-               rx_dws->src_id = 3;
-               rx_dws->m_master = 0;
-               rx_dws->p_master = 1;
-       }
-
-       /* Check if DMA slave interface for playback should be configured. */
-       if (flags & AC97C_PLAYBACK) {
-               tx_dws->dma_dev = &dw_dmac0_device.dev;
-               tx_dws->dst_id = 4;
-               tx_dws->m_master = 0;
-               tx_dws->p_master = 1;
-       }
-
-       if (platform_device_add_data(pdev, data,
-                               sizeof(struct ac97c_platform_data)))
-               goto out_free_resources;
-
-       /* SDO | SYNC | SCLK | SDI */
-       pin_mask = (1 << 20) | (1 << 21) | (1 << 22) | (1 << 23);
-
-       select_peripheral(PIOB, pin_mask, PERIPH_B, 0);
-
-       if (gpio_is_valid(data->reset_pin))
-               at32_select_gpio(data->reset_pin, AT32_GPIOF_OUTPUT
-                               | AT32_GPIOF_HIGH);
-
-       atmel_ac97c0_pclk.dev = &pdev->dev;
-
-       platform_device_add(pdev);
-       return pdev;
-
-out_free_resources:
-       platform_device_put(pdev);
-       return NULL;
-}
-
-/* --------------------------------------------------------------------
- * ABDAC
- * -------------------------------------------------------------------- */
-static struct resource abdac0_resource[] __initdata = {
-       PBMEM(0xfff02000),
-       IRQ(27),
-};
-static struct clk abdac0_pclk = {
-       .name           = "pclk",
-       .parent         = &pbb_clk,
-       .mode           = pbb_clk_mode,
-       .get_rate       = pbb_clk_get_rate,
-       .index          = 8,
-};
-static struct clk abdac0_sample_clk = {
-       .name           = "sample_clk",
-       .mode           = genclk_mode,
-       .get_rate       = genclk_get_rate,
-       .set_rate       = genclk_set_rate,
-       .set_parent     = genclk_set_parent,
-       .index          = 6,
-};
-
-struct platform_device *__init
-at32_add_device_abdac(unsigned int id, struct atmel_abdac_pdata *data)
-{
-       struct platform_device  *pdev;
-       struct dw_dma_slave     *dws;
-       u32                     pin_mask;
-
-       if (id != 0 || !data)
-               return NULL;
-
-       pdev = platform_device_alloc("atmel_abdac", id);
-       if (!pdev)
-               return NULL;
-
-       if (platform_device_add_resources(pdev, abdac0_resource,
-                               ARRAY_SIZE(abdac0_resource)))
-               goto out_free_resources;
-
-       dws = &data->dws;
-
-       dws->dma_dev = &dw_dmac0_device.dev;
-       dws->dst_id = 2;
-       dws->m_master = 0;
-       dws->p_master = 1;
-
-       if (platform_device_add_data(pdev, data,
-                               sizeof(struct atmel_abdac_pdata)))
-               goto out_free_resources;
-
-       pin_mask  = (1 << 20) | (1 << 22);      /* DATA1 & DATAN1 */
-       pin_mask |= (1 << 21) | (1 << 23);      /* DATA0 & DATAN0 */
-
-       select_peripheral(PIOB, pin_mask, PERIPH_A, 0);
-
-       abdac0_pclk.dev = &pdev->dev;
-       abdac0_sample_clk.dev = &pdev->dev;
-
-       platform_device_add(pdev);
-       return pdev;
-
-out_free_resources:
-       platform_device_put(pdev);
-       return NULL;
-}
-
-/* --------------------------------------------------------------------
- *  GCLK
- * -------------------------------------------------------------------- */
-static struct clk gclk0 = {
-       .name           = "gclk0",
-       .mode           = genclk_mode,
-       .get_rate       = genclk_get_rate,
-       .set_rate       = genclk_set_rate,
-       .set_parent     = genclk_set_parent,
-       .index          = 0,
-};
-static struct clk gclk1 = {
-       .name           = "gclk1",
-       .mode           = genclk_mode,
-       .get_rate       = genclk_get_rate,
-       .set_rate       = genclk_set_rate,
-       .set_parent     = genclk_set_parent,
-       .index          = 1,
-};
-static struct clk gclk2 = {
-       .name           = "gclk2",
-       .mode           = genclk_mode,
-       .get_rate       = genclk_get_rate,
-       .set_rate       = genclk_set_rate,
-       .set_parent     = genclk_set_parent,
-       .index          = 2,
-};
-static struct clk gclk3 = {
-       .name           = "gclk3",
-       .mode           = genclk_mode,
-       .get_rate       = genclk_get_rate,
-       .set_rate       = genclk_set_rate,
-       .set_parent     = genclk_set_parent,
-       .index          = 3,
-};
-static struct clk gclk4 = {
-       .name           = "gclk4",
-       .mode           = genclk_mode,
-       .get_rate       = genclk_get_rate,
-       .set_rate       = genclk_set_rate,
-       .set_parent     = genclk_set_parent,
-       .index          = 4,
-};
-
-static __initdata struct clk *init_clocks[] = {
-       &osc32k,
-       &osc0,
-       &osc1,
-       &pll0,
-       &pll1,
-       &cpu_clk,
-       &hsb_clk,
-       &pba_clk,
-       &pbb_clk,
-       &at32_pm_pclk,
-       &at32_intc0_pclk,
-       &at32_hmatrix_clk,
-       &ebi_clk,
-       &hramc_clk,
-       &sdramc_clk,
-       &smc0_pclk,
-       &smc0_mck,
-       &pdc_hclk,
-       &pdc_pclk,
-       &dw_dmac0_hclk,
-       &pico_clk,
-       &pio0_mck,
-       &pio1_mck,
-       &pio2_mck,
-       &pio3_mck,
-       &pio4_mck,
-       &at32_tcb0_t0_clk,
-       &at32_tcb1_t0_clk,
-       &atmel_psif0_pclk,
-       &atmel_psif1_pclk,
-       &atmel_usart0_usart,
-       &atmel_usart1_usart,
-       &atmel_usart2_usart,
-       &atmel_usart3_usart,
-       &atmel_pwm0_mck,
-#if defined(CONFIG_CPU_AT32AP7000)
-       &macb0_hclk,
-       &macb0_pclk,
-       &macb1_hclk,
-       &macb1_pclk,
-#endif
-       &atmel_spi0_spi_clk,
-       &atmel_spi1_spi_clk,
-       &atmel_twi0_pclk,
-       &atmel_mci0_pclk,
-#if defined(CONFIG_CPU_AT32AP7000) || defined(CONFIG_CPU_AT32AP7002)
-       &atmel_lcdfb0_hclk,
-       &atmel_lcdfb0_pixclk,
-#endif
-       &ssc0_pclk,
-       &ssc1_pclk,
-       &ssc2_pclk,
-       &usba0_hclk,
-       &usba0_pclk,
-       &atmel_ac97c0_pclk,
-       &abdac0_pclk,
-       &abdac0_sample_clk,
-       &gclk0,
-       &gclk1,
-       &gclk2,
-       &gclk3,
-       &gclk4,
-};
-
-void __init setup_platform(void)
-{
-       u32 cpu_mask = 0, hsb_mask = 0, pba_mask = 0, pbb_mask = 0;
-       int i;
-
-       if (pm_readl(MCCTRL) & PM_BIT(PLLSEL)) {
-               main_clock = &pll0;
-               cpu_clk.parent = &pll0;
-       } else {
-               main_clock = &osc0;
-               cpu_clk.parent = &osc0;
-       }
-
-       if (pm_readl(PLL0) & PM_BIT(PLLOSC))
-               pll0.parent = &osc1;
-       if (pm_readl(PLL1) & PM_BIT(PLLOSC))
-               pll1.parent = &osc1;
-
-       genclk_init_parent(&gclk0);
-       genclk_init_parent(&gclk1);
-       genclk_init_parent(&gclk2);
-       genclk_init_parent(&gclk3);
-       genclk_init_parent(&gclk4);
-#if defined(CONFIG_CPU_AT32AP7000) || defined(CONFIG_CPU_AT32AP7002)
-       genclk_init_parent(&atmel_lcdfb0_pixclk);
-#endif
-       genclk_init_parent(&abdac0_sample_clk);
-
-       /*
-        * Build initial dynamic clock list by registering all clocks
-        * from the array.
-        * At the same time, turn on all clocks that have at least one
-        * user already, and turn off everything else. We only do this
-        * for module clocks, and even though it isn't particularly
-        * pretty to  check the address of the mode function, it should
-        * do the trick...
-        */
-       for (i = 0; i < ARRAY_SIZE(init_clocks); i++) {
-               struct clk *clk = init_clocks[i];
-
-               /* first, register clock */
-               at32_clk_register(clk);
-
-               if (clk->users == 0)
-                       continue;
-
-               if (clk->mode == &cpu_clk_mode)
-                       cpu_mask |= 1 << clk->index;
-               else if (clk->mode == &hsb_clk_mode)
-                       hsb_mask |= 1 << clk->index;
-               else if (clk->mode == &pba_clk_mode)
-                       pba_mask |= 1 << clk->index;
-               else if (clk->mode == &pbb_clk_mode)
-                       pbb_mask |= 1 << clk->index;
-       }
-
-       pm_writel(CPU_MASK, cpu_mask);
-       pm_writel(HSB_MASK, hsb_mask);
-       pm_writel(PBA_MASK, pba_mask);
-       pm_writel(PBB_MASK, pbb_mask);
-
-       /* Initialize the port muxes */
-       at32_init_pio(&pio0_device);
-       at32_init_pio(&pio1_device);
-       at32_init_pio(&pio2_device);
-       at32_init_pio(&pio3_device);
-       at32_init_pio(&pio4_device);
-}
-
-struct gen_pool *sram_pool;
-
-static int __init sram_init(void)
-{
-       struct gen_pool *pool;
-
-       /* 1KiB granularity */
-       pool = gen_pool_create(10, -1);
-       if (!pool)
-               goto fail;
-
-       if (gen_pool_add(pool, 0x24000000, 0x8000, -1))
-               goto err_pool_add;
-
-       sram_pool = pool;
-       return 0;
-
-err_pool_add:
-       gen_pool_destroy(pool);
-fail:
-       pr_err("Failed to create SRAM pool\n");
-       return -ENOMEM;
-}
-core_initcall(sram_init);
diff --git a/arch/avr32/mach-at32ap/clock.c b/arch/avr32/mach-at32ap/clock.c
deleted file mode 100644 (file)
index fdf1cae..0000000
+++ /dev/null
@@ -1,334 +0,0 @@
-/*
- * Clock management for AT32AP CPUs
- *
- * Copyright (C) 2006 Atmel Corporation
- *
- * Based on arch/arm/mach-at91/clock.c
- *   Copyright (C) 2005 David Brownell
- *   Copyright (C) 2005 Ivan Kokshaysky
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/clk.h>
-#include <linux/err.h>
-#include <linux/export.h>
-#include <linux/device.h>
-#include <linux/string.h>
-#include <linux/list.h>
-
-#include <mach/chip.h>
-
-#include "clock.h"
-
-/* at32 clock list */
-static LIST_HEAD(at32_clock_list);
-
-static DEFINE_SPINLOCK(clk_lock);
-static DEFINE_SPINLOCK(clk_list_lock);
-
-void at32_clk_register(struct clk *clk)
-{
-       spin_lock(&clk_list_lock);
-       /* add the new item to the end of the list */
-       list_add_tail(&clk->list, &at32_clock_list);
-       spin_unlock(&clk_list_lock);
-}
-
-static struct clk *__clk_get(struct device *dev, const char *id)
-{
-       struct clk *clk;
-
-       list_for_each_entry(clk, &at32_clock_list, list) {
-               if (clk->dev == dev && strcmp(id, clk->name) == 0) {
-                       return clk;
-               }
-       }
-
-       return ERR_PTR(-ENOENT);
-}
-
-struct clk *clk_get(struct device *dev, const char *id)
-{
-       struct clk *clk;
-
-       spin_lock(&clk_list_lock);
-       clk = __clk_get(dev, id);
-       spin_unlock(&clk_list_lock);
-
-       return clk;
-}
-
-EXPORT_SYMBOL(clk_get);
-
-void clk_put(struct clk *clk)
-{
-       /* clocks are static for now, we can't free them */
-}
-EXPORT_SYMBOL(clk_put);
-
-static void __clk_enable(struct clk *clk)
-{
-       if (clk->parent)
-               __clk_enable(clk->parent);
-       if (clk->users++ == 0 && clk->mode)
-               clk->mode(clk, 1);
-}
-
-int clk_enable(struct clk *clk)
-{
-       unsigned long flags;
-
-       if (!clk)
-               return 0;
-
-       spin_lock_irqsave(&clk_lock, flags);
-       __clk_enable(clk);
-       spin_unlock_irqrestore(&clk_lock, flags);
-
-       return 0;
-}
-EXPORT_SYMBOL(clk_enable);
-
-static void __clk_disable(struct clk *clk)
-{
-       if (clk->users == 0) {
-               printk(KERN_ERR "%s: mismatched disable\n", clk->name);
-               WARN_ON(1);
-               return;
-       }
-
-       if (--clk->users == 0 && clk->mode)
-               clk->mode(clk, 0);
-       if (clk->parent)
-               __clk_disable(clk->parent);
-}
-
-void clk_disable(struct clk *clk)
-{
-       unsigned long flags;
-
-       if (IS_ERR_OR_NULL(clk))
-               return;
-
-       spin_lock_irqsave(&clk_lock, flags);
-       __clk_disable(clk);
-       spin_unlock_irqrestore(&clk_lock, flags);
-}
-EXPORT_SYMBOL(clk_disable);
-
-unsigned long clk_get_rate(struct clk *clk)
-{
-       unsigned long flags;
-       unsigned long rate;
-
-       if (!clk)
-               return 0;
-
-       spin_lock_irqsave(&clk_lock, flags);
-       rate = clk->get_rate(clk);
-       spin_unlock_irqrestore(&clk_lock, flags);
-
-       return rate;
-}
-EXPORT_SYMBOL(clk_get_rate);
-
-long clk_round_rate(struct clk *clk, unsigned long rate)
-{
-       unsigned long flags, actual_rate;
-
-       if (!clk)
-               return 0;
-
-       if (!clk->set_rate)
-               return -ENOSYS;
-
-       spin_lock_irqsave(&clk_lock, flags);
-       actual_rate = clk->set_rate(clk, rate, 0);
-       spin_unlock_irqrestore(&clk_lock, flags);
-
-       return actual_rate;
-}
-EXPORT_SYMBOL(clk_round_rate);
-
-int clk_set_rate(struct clk *clk, unsigned long rate)
-{
-       unsigned long flags;
-       long ret;
-
-       if (!clk)
-               return 0;
-
-       if (!clk->set_rate)
-               return -ENOSYS;
-
-       spin_lock_irqsave(&clk_lock, flags);
-       ret = clk->set_rate(clk, rate, 1);
-       spin_unlock_irqrestore(&clk_lock, flags);
-
-       return (ret < 0) ? ret : 0;
-}
-EXPORT_SYMBOL(clk_set_rate);
-
-int clk_set_parent(struct clk *clk, struct clk *parent)
-{
-       unsigned long flags;
-       int ret;
-
-       if (!clk)
-               return 0;
-
-       if (!clk->set_parent)
-               return -ENOSYS;
-
-       spin_lock_irqsave(&clk_lock, flags);
-       ret = clk->set_parent(clk, parent);
-       spin_unlock_irqrestore(&clk_lock, flags);
-
-       return ret;
-}
-EXPORT_SYMBOL(clk_set_parent);
-
-struct clk *clk_get_parent(struct clk *clk)
-{
-       return !clk ? NULL : clk->parent;
-}
-EXPORT_SYMBOL(clk_get_parent);
-
-
-
-#ifdef CONFIG_DEBUG_FS
-
-/* /sys/kernel/debug/at32ap_clk */
-
-#include <linux/io.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-#include "pm.h"
-
-
-#define        NEST_DELTA      2
-#define        NEST_MAX        6
-
-struct clkinf {
-       struct seq_file *s;
-       unsigned        nest;
-};
-
-static void
-dump_clock(struct clk *parent, struct clkinf *r)
-{
-       unsigned        nest = r->nest;
-       char            buf[16 + NEST_MAX];
-       struct clk      *clk;
-       unsigned        i;
-
-       /* skip clocks coupled to devices that aren't registered */
-       if (parent->dev && !dev_name(parent->dev) && !parent->users)
-               return;
-
-       /* <nest spaces> name <pad to end> */
-       memset(buf, ' ', sizeof(buf) - 1);
-       buf[sizeof(buf) - 1] = 0;
-       i = strlen(parent->name);
-       memcpy(buf + nest, parent->name,
-                       min(i, (unsigned)(sizeof(buf) - 1 - nest)));
-
-       seq_printf(r->s, "%s%c users=%2d %-3s %9ld Hz",
-               buf, parent->set_parent ? '*' : ' ',
-               parent->users,
-               parent->users ? "on" : "off",   /* NOTE: not-paranoid!! */
-               clk_get_rate(parent));
-       if (parent->dev)
-               seq_printf(r->s, ", for %s", dev_name(parent->dev));
-       seq_putc(r->s, '\n');
-
-       /* cost of this scan is small, but not linear... */
-       r->nest = nest + NEST_DELTA;
-
-       list_for_each_entry(clk, &at32_clock_list, list) {
-               if (clk->parent == parent)
-                       dump_clock(clk, r);
-       }
-       r->nest = nest;
-}
-
-static int clk_show(struct seq_file *s, void *unused)
-{
-       struct clkinf   r;
-       int             i;
-       struct clk      *clk;
-
-       /* show all the power manager registers */
-       seq_printf(s,
-                  "MCCTRL  = %8x\n"
-                  "CKSEL   = %8x\n"
-                  "CPUMASK = %8x\n"
-                  "HSBMASK = %8x\n"
-                  "PBAMASK = %8x\n"
-                  "PBBMASK = %8x\n"
-                  "PLL0    = %8x\n"
-                  "PLL1    = %8x\n"
-                  "IMR     = %8x\n",
-                  pm_readl(MCCTRL),
-                  pm_readl(CKSEL),
-                  pm_readl(CPU_MASK),
-                  pm_readl(HSB_MASK),
-                  pm_readl(PBA_MASK),
-                  pm_readl(PBB_MASK),
-                  pm_readl(PLL0),
-                  pm_readl(PLL1),
-                  pm_readl(IMR));
-       for (i = 0; i < 8; i++) {
-               if (i == 5)
-                       continue;
-               seq_printf(s, "GCCTRL%d = %8x\n", i, pm_readl(GCCTRL(i)));
-       }
-
-       seq_putc(s, '\n');
-       r.s = s;
-       r.nest = 0;
-       /* protected from changes on the list while dumping */
-       spin_lock(&clk_list_lock);
-
-       /* show clock tree as derived from the three oscillators */
-       clk = __clk_get(NULL, "osc32k");
-       dump_clock(clk, &r);
-       clk_put(clk);
-
-       clk = __clk_get(NULL, "osc0");
-       dump_clock(clk, &r);
-       clk_put(clk);
-
-       clk = __clk_get(NULL, "osc1");
-       dump_clock(clk, &r);
-       clk_put(clk);
-
-       spin_unlock(&clk_list_lock);
-
-       return 0;
-}
-
-static int clk_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, clk_show, NULL);
-}
-
-static const struct file_operations clk_operations = {
-       .open           = clk_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
-static int __init clk_debugfs_init(void)
-{
-       (void) debugfs_create_file("at32ap_clk", S_IFREG | S_IRUGO,
-                       NULL, NULL, &clk_operations);
-
-       return 0;
-}
-postcore_initcall(clk_debugfs_init);
-
-#endif
diff --git a/arch/avr32/mach-at32ap/clock.h b/arch/avr32/mach-at32ap/clock.h
deleted file mode 100644 (file)
index 4c7ebbd..0000000
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Clock management for AT32AP CPUs
- *
- * Copyright (C) 2006 Atmel Corporation
- *
- * Based on arch/arm/mach-at91/clock.c
- *   Copyright (C) 2005 David Brownell
- *   Copyright (C) 2005 Ivan Kokshaysky
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/clk.h>
-#include <linux/list.h>
-
-
-void at32_clk_register(struct clk *clk);
-
-struct clk {
-       struct list_head list;          /* linking element */
-       const char      *name;          /* Clock name/function */
-       struct device   *dev;           /* Device the clock is used by */
-       struct clk      *parent;        /* Parent clock, if any */
-       void            (*mode)(struct clk *clk, int enabled);
-       unsigned long   (*get_rate)(struct clk *clk);
-       long            (*set_rate)(struct clk *clk, unsigned long rate,
-                                   int apply);
-       int             (*set_parent)(struct clk *clk, struct clk *parent);
-       u16             users;          /* Enabled if non-zero */
-       u16             index;          /* Sibling index */
-};
-
-unsigned long pba_clk_get_rate(struct clk *clk);
-void pba_clk_mode(struct clk *clk, int enabled);
diff --git a/arch/avr32/mach-at32ap/extint.c b/arch/avr32/mach-at32ap/extint.c
deleted file mode 100644 (file)
index 96cabad..0000000
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * External interrupt handling for AT32AP CPUs
- *
- * Copyright (C) 2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/platform_device.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-
-#include <asm/io.h>
-
-/* EIC register offsets */
-#define EIC_IER                                        0x0000
-#define EIC_IDR                                        0x0004
-#define EIC_IMR                                        0x0008
-#define EIC_ISR                                        0x000c
-#define EIC_ICR                                        0x0010
-#define EIC_MODE                               0x0014
-#define EIC_EDGE                               0x0018
-#define EIC_LEVEL                              0x001c
-#define EIC_NMIC                               0x0024
-
-/* Bitfields in NMIC */
-#define EIC_NMIC_ENABLE                                (1 << 0)
-
-/* Bit manipulation macros */
-#define EIC_BIT(name)                                  \
-       (1 << EIC_##name##_OFFSET)
-#define EIC_BF(name,value)                             \
-       (((value) & ((1 << EIC_##name##_SIZE) - 1))     \
-        << EIC_##name##_OFFSET)
-#define EIC_BFEXT(name,value)                          \
-       (((value) >> EIC_##name##_OFFSET)               \
-        & ((1 << EIC_##name##_SIZE) - 1))
-#define EIC_BFINS(name,value,old)                      \
-       (((old) & ~(((1 << EIC_##name##_SIZE) - 1)      \
-                   << EIC_##name##_OFFSET))            \
-        | EIC_BF(name,value))
-
-/* Register access macros */
-#define eic_readl(port,reg)                            \
-       __raw_readl((port)->regs + EIC_##reg)
-#define eic_writel(port,reg,value)                     \
-       __raw_writel((value), (port)->regs + EIC_##reg)
-
-struct eic {
-       void __iomem *regs;
-       struct irq_chip *chip;
-       unsigned int first_irq;
-};
-
-static struct eic *nmi_eic;
-static bool nmi_enabled;
-
-static void eic_ack_irq(struct irq_data *d)
-{
-       struct eic *eic = irq_data_get_irq_chip_data(d);
-       eic_writel(eic, ICR, 1 << (d->irq - eic->first_irq));
-}
-
-static void eic_mask_irq(struct irq_data *d)
-{
-       struct eic *eic = irq_data_get_irq_chip_data(d);
-       eic_writel(eic, IDR, 1 << (d->irq - eic->first_irq));
-}
-
-static void eic_mask_ack_irq(struct irq_data *d)
-{
-       struct eic *eic = irq_data_get_irq_chip_data(d);
-       eic_writel(eic, ICR, 1 << (d->irq - eic->first_irq));
-       eic_writel(eic, IDR, 1 << (d->irq - eic->first_irq));
-}
-
-static void eic_unmask_irq(struct irq_data *d)
-{
-       struct eic *eic = irq_data_get_irq_chip_data(d);
-       eic_writel(eic, IER, 1 << (d->irq - eic->first_irq));
-}
-
-static int eic_set_irq_type(struct irq_data *d, unsigned int flow_type)
-{
-       struct eic *eic = irq_data_get_irq_chip_data(d);
-       unsigned int irq = d->irq;
-       unsigned int i = irq - eic->first_irq;
-       u32 mode, edge, level;
-
-       flow_type &= IRQ_TYPE_SENSE_MASK;
-       if (flow_type == IRQ_TYPE_NONE)
-               flow_type = IRQ_TYPE_LEVEL_LOW;
-
-       mode = eic_readl(eic, MODE);
-       edge = eic_readl(eic, EDGE);
-       level = eic_readl(eic, LEVEL);
-
-       switch (flow_type) {
-       case IRQ_TYPE_LEVEL_LOW:
-               mode |= 1 << i;
-               level &= ~(1 << i);
-               break;
-       case IRQ_TYPE_LEVEL_HIGH:
-               mode |= 1 << i;
-               level |= 1 << i;
-               break;
-       case IRQ_TYPE_EDGE_RISING:
-               mode &= ~(1 << i);
-               edge |= 1 << i;
-               break;
-       case IRQ_TYPE_EDGE_FALLING:
-               mode &= ~(1 << i);
-               edge &= ~(1 << i);
-               break;
-       default:
-               return -EINVAL;
-       }
-
-       eic_writel(eic, MODE, mode);
-       eic_writel(eic, EDGE, edge);
-       eic_writel(eic, LEVEL, level);
-
-       irqd_set_trigger_type(d, flow_type);
-       if (flow_type & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
-               irq_set_handler_locked(d, handle_level_irq);
-       else
-               irq_set_handler_locked(d, handle_edge_irq);
-
-       return IRQ_SET_MASK_OK_NOCOPY;
-}
-
-static struct irq_chip eic_chip = {
-       .name           = "eic",
-       .irq_ack        = eic_ack_irq,
-       .irq_mask       = eic_mask_irq,
-       .irq_mask_ack   = eic_mask_ack_irq,
-       .irq_unmask     = eic_unmask_irq,
-       .irq_set_type   = eic_set_irq_type,
-};
-
-static void demux_eic_irq(struct irq_desc *desc)
-{
-       struct eic *eic = irq_desc_get_handler_data(desc);
-       unsigned long status, pending;
-       unsigned int i;
-
-       status = eic_readl(eic, ISR);
-       pending = status & eic_readl(eic, IMR);
-
-       while (pending) {
-               i = fls(pending) - 1;
-               pending &= ~(1 << i);
-
-               generic_handle_irq(i + eic->first_irq);
-       }
-}
-
-int nmi_enable(void)
-{
-       nmi_enabled = true;
-
-       if (nmi_eic)
-               eic_writel(nmi_eic, NMIC, EIC_NMIC_ENABLE);
-
-       return 0;
-}
-
-void nmi_disable(void)
-{
-       if (nmi_eic)
-               eic_writel(nmi_eic, NMIC, 0);
-
-       nmi_enabled = false;
-}
-
-static int __init eic_probe(struct platform_device *pdev)
-{
-       struct eic *eic;
-       struct resource *regs;
-       unsigned int i;
-       unsigned int nr_of_irqs;
-       unsigned int int_irq;
-       int ret;
-       u32 pattern;
-
-       regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       int_irq = platform_get_irq(pdev, 0);
-       if (!regs || (int)int_irq <= 0) {
-               dev_dbg(&pdev->dev, "missing regs and/or irq resource\n");
-               return -ENXIO;
-       }
-
-       ret = -ENOMEM;
-       eic = kzalloc(sizeof(struct eic), GFP_KERNEL);
-       if (!eic) {
-               dev_dbg(&pdev->dev, "no memory for eic structure\n");
-               goto err_kzalloc;
-       }
-
-       eic->first_irq = EIM_IRQ_BASE + 32 * pdev->id;
-       eic->regs = ioremap(regs->start, resource_size(regs));
-       if (!eic->regs) {
-               dev_dbg(&pdev->dev, "failed to map regs\n");
-               goto err_ioremap;
-       }
-
-       /*
-        * Find out how many interrupt lines that are actually
-        * implemented in hardware.
-        */
-       eic_writel(eic, IDR, ~0UL);
-       eic_writel(eic, MODE, ~0UL);
-       pattern = eic_readl(eic, MODE);
-       nr_of_irqs = fls(pattern);
-
-       /* Trigger on low level unless overridden by driver */
-       eic_writel(eic, EDGE, 0UL);
-       eic_writel(eic, LEVEL, 0UL);
-
-       eic->chip = &eic_chip;
-
-       for (i = 0; i < nr_of_irqs; i++) {
-               irq_set_chip_and_handler(eic->first_irq + i, &eic_chip,
-                                        handle_level_irq);
-               irq_set_chip_data(eic->first_irq + i, eic);
-       }
-
-       irq_set_chained_handler_and_data(int_irq, demux_eic_irq, eic);
-
-       if (pdev->id == 0) {
-               nmi_eic = eic;
-               if (nmi_enabled)
-                       /*
-                        * Someone tried to enable NMI before we were
-                        * ready. Do it now.
-                        */
-                       nmi_enable();
-       }
-
-       dev_info(&pdev->dev,
-                "External Interrupt Controller at 0x%p, IRQ %u\n",
-                eic->regs, int_irq);
-       dev_info(&pdev->dev,
-                "Handling %u external IRQs, starting with IRQ %u\n",
-                nr_of_irqs, eic->first_irq);
-
-       return 0;
-
-err_ioremap:
-       kfree(eic);
-err_kzalloc:
-       return ret;
-}
-
-static struct platform_driver eic_driver = {
-       .driver = {
-               .name = "at32_eic",
-       },
-};
-
-static int __init eic_init(void)
-{
-       return platform_driver_probe(&eic_driver, eic_probe);
-}
-arch_initcall(eic_init);
diff --git a/arch/avr32/mach-at32ap/hmatrix.c b/arch/avr32/mach-at32ap/hmatrix.c
deleted file mode 100644 (file)
index 48f5ede..0000000
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * High-Speed Bus Matrix helper functions
- *
- * Copyright (C) 2008 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/clk.h>
-#include <linux/io.h>
-
-#include <mach/chip.h>
-#include <mach/hmatrix.h>
-
-static inline void __hmatrix_write_reg(unsigned long offset, u32 value)
-{
-       __raw_writel(value, (void __iomem __force *)(HMATRIX_BASE + offset));
-}
-
-static inline u32 __hmatrix_read_reg(unsigned long offset)
-{
-       return __raw_readl((void __iomem __force *)(HMATRIX_BASE + offset));
-}
-
-/**
- * hmatrix_write_reg - write HMATRIX configuration register
- * @offset: register offset
- * @value: value to be written to the register at @offset
- */
-void hmatrix_write_reg(unsigned long offset, u32 value)
-{
-       clk_enable(&at32_hmatrix_clk);
-       __hmatrix_write_reg(offset, value);
-       __hmatrix_read_reg(offset);
-       clk_disable(&at32_hmatrix_clk);
-}
-
-/**
- * hmatrix_read_reg - read HMATRIX configuration register
- * @offset: register offset
- *
- * Returns the value of the register at @offset.
- */
-u32 hmatrix_read_reg(unsigned long offset)
-{
-       u32 value;
-
-       clk_enable(&at32_hmatrix_clk);
-       value = __hmatrix_read_reg(offset);
-       clk_disable(&at32_hmatrix_clk);
-
-       return value;
-}
-
-/**
- * hmatrix_sfr_set_bits - set bits in a slave's Special Function Register
- * @slave_id: operate on the SFR belonging to this slave
- * @mask: mask of bits to be set in the SFR
- */
-void hmatrix_sfr_set_bits(unsigned int slave_id, u32 mask)
-{
-       u32 value;
-
-       clk_enable(&at32_hmatrix_clk);
-       value = __hmatrix_read_reg(HMATRIX_SFR(slave_id));
-       value |= mask;
-       __hmatrix_write_reg(HMATRIX_SFR(slave_id), value);
-       __hmatrix_read_reg(HMATRIX_SFR(slave_id));
-       clk_disable(&at32_hmatrix_clk);
-}
-
-/**
- * hmatrix_sfr_set_bits - clear bits in a slave's Special Function Register
- * @slave_id: operate on the SFR belonging to this slave
- * @mask: mask of bits to be cleared in the SFR
- */
-void hmatrix_sfr_clear_bits(unsigned int slave_id, u32 mask)
-{
-       u32 value;
-
-       clk_enable(&at32_hmatrix_clk);
-       value = __hmatrix_read_reg(HMATRIX_SFR(slave_id));
-       value &= ~mask;
-       __hmatrix_write_reg(HMATRIX_SFR(slave_id), value);
-       __hmatrix_read_reg(HMATRIX_SFR(slave_id));
-       clk_disable(&at32_hmatrix_clk);
-}
diff --git a/arch/avr32/mach-at32ap/hsmc.c b/arch/avr32/mach-at32ap/hsmc.c
deleted file mode 100644 (file)
index f66245e..0000000
+++ /dev/null
@@ -1,282 +0,0 @@
-/*
- * Static Memory Controller for AT32 chips
- *
- * Copyright (C) 2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/clk.h>
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
-
-#include <asm/io.h>
-#include <mach/smc.h>
-
-#include "hsmc.h"
-
-#define NR_CHIP_SELECTS 6
-
-struct hsmc {
-       void __iomem *regs;
-       struct clk *pclk;
-       struct clk *mck;
-};
-
-static struct hsmc *hsmc;
-
-void smc_set_timing(struct smc_config *config,
-                   const struct smc_timing *timing)
-{
-       int recover;
-       int cycle;
-
-       unsigned long mul;
-
-       /* Reset all SMC timings */
-       config->ncs_read_setup  = 0;
-       config->nrd_setup       = 0;
-       config->ncs_write_setup = 0;
-       config->nwe_setup       = 0;
-       config->ncs_read_pulse  = 0;
-       config->nrd_pulse       = 0;
-       config->ncs_write_pulse = 0;
-       config->nwe_pulse       = 0;
-       config->read_cycle      = 0;
-       config->write_cycle     = 0;
-
-       /*
-        * cycles = x / T = x * f
-        *   = ((x * 1000000000) * ((f * 65536) / 1000000000)) / 65536
-        *   = ((x * 1000000000) * (((f / 10000) * 65536) / 100000)) / 65536
-        */
-       mul = (clk_get_rate(hsmc->mck) / 10000) << 16;
-       mul /= 100000;
-
-#define ns2cyc(x) ((((x) * mul) + 65535) >> 16)
-
-       if (timing->ncs_read_setup > 0)
-               config->ncs_read_setup = ns2cyc(timing->ncs_read_setup);
-
-       if (timing->nrd_setup > 0)
-               config->nrd_setup = ns2cyc(timing->nrd_setup);
-
-       if (timing->ncs_write_setup > 0)
-               config->ncs_write_setup = ns2cyc(timing->ncs_write_setup);
-
-       if (timing->nwe_setup > 0)
-               config->nwe_setup = ns2cyc(timing->nwe_setup);
-
-       if (timing->ncs_read_pulse > 0)
-               config->ncs_read_pulse = ns2cyc(timing->ncs_read_pulse);
-
-       if (timing->nrd_pulse > 0)
-               config->nrd_pulse = ns2cyc(timing->nrd_pulse);
-
-       if (timing->ncs_write_pulse > 0)
-               config->ncs_write_pulse = ns2cyc(timing->ncs_write_pulse);
-
-       if (timing->nwe_pulse > 0)
-               config->nwe_pulse = ns2cyc(timing->nwe_pulse);
-
-       if (timing->read_cycle > 0)
-               config->read_cycle = ns2cyc(timing->read_cycle);
-
-       if (timing->write_cycle > 0)
-               config->write_cycle = ns2cyc(timing->write_cycle);
-
-       /* Extend read cycle in needed */
-       if (timing->ncs_read_recover > 0)
-               recover = ns2cyc(timing->ncs_read_recover);
-       else
-               recover = 1;
-
-       cycle = config->ncs_read_setup + config->ncs_read_pulse + recover;
-
-       if (config->read_cycle < cycle)
-               config->read_cycle = cycle;
-
-       /* Extend read cycle in needed */
-       if (timing->nrd_recover > 0)
-               recover = ns2cyc(timing->nrd_recover);
-       else
-               recover = 1;
-
-       cycle = config->nrd_setup + config->nrd_pulse + recover;
-
-       if (config->read_cycle < cycle)
-               config->read_cycle = cycle;
-
-       /* Extend write cycle in needed */
-       if (timing->ncs_write_recover > 0)
-               recover = ns2cyc(timing->ncs_write_recover);
-       else
-               recover = 1;
-
-       cycle = config->ncs_write_setup + config->ncs_write_pulse + recover;
-
-       if (config->write_cycle < cycle)
-               config->write_cycle = cycle;
-
-       /* Extend write cycle in needed */
-       if (timing->nwe_recover > 0)
-               recover = ns2cyc(timing->nwe_recover);
-       else
-               recover = 1;
-
-       cycle = config->nwe_setup + config->nwe_pulse + recover;
-
-       if (config->write_cycle < cycle)
-               config->write_cycle = cycle;
-}
-EXPORT_SYMBOL(smc_set_timing);
-
-int smc_set_configuration(int cs, const struct smc_config *config)
-{
-       unsigned long offset;
-       u32 setup, pulse, cycle, mode;
-
-       if (!hsmc)
-               return -ENODEV;
-       if (cs >= NR_CHIP_SELECTS)
-               return -EINVAL;
-
-       setup = (HSMC_BF(NWE_SETUP, config->nwe_setup)
-                | HSMC_BF(NCS_WR_SETUP, config->ncs_write_setup)
-                | HSMC_BF(NRD_SETUP, config->nrd_setup)
-                | HSMC_BF(NCS_RD_SETUP, config->ncs_read_setup));
-       pulse = (HSMC_BF(NWE_PULSE, config->nwe_pulse)
-                | HSMC_BF(NCS_WR_PULSE, config->ncs_write_pulse)
-                | HSMC_BF(NRD_PULSE, config->nrd_pulse)
-                | HSMC_BF(NCS_RD_PULSE, config->ncs_read_pulse));
-       cycle = (HSMC_BF(NWE_CYCLE, config->write_cycle)
-                | HSMC_BF(NRD_CYCLE, config->read_cycle));
-
-       switch (config->bus_width) {
-       case 1:
-               mode = HSMC_BF(DBW, HSMC_DBW_8_BITS);
-               break;
-       case 2:
-               mode = HSMC_BF(DBW, HSMC_DBW_16_BITS);
-               break;
-       case 4:
-               mode = HSMC_BF(DBW, HSMC_DBW_32_BITS);
-               break;
-       default:
-               return -EINVAL;
-       }
-
-       switch (config->nwait_mode) {
-       case 0:
-               mode |= HSMC_BF(EXNW_MODE, HSMC_EXNW_MODE_DISABLED);
-               break;
-       case 1:
-               mode |= HSMC_BF(EXNW_MODE, HSMC_EXNW_MODE_RESERVED);
-               break;
-       case 2:
-               mode |= HSMC_BF(EXNW_MODE, HSMC_EXNW_MODE_FROZEN);
-               break;
-       case 3:
-               mode |= HSMC_BF(EXNW_MODE, HSMC_EXNW_MODE_READY);
-               break;
-       default:
-               return -EINVAL;
-       }
-
-       if (config->tdf_cycles) {
-               mode |= HSMC_BF(TDF_CYCLES, config->tdf_cycles);
-       }
-
-       if (config->nrd_controlled)
-               mode |= HSMC_BIT(READ_MODE);
-       if (config->nwe_controlled)
-               mode |= HSMC_BIT(WRITE_MODE);
-       if (config->byte_write)
-               mode |= HSMC_BIT(BAT);
-       if (config->tdf_mode)
-               mode |= HSMC_BIT(TDF_MODE);
-
-       pr_debug("smc cs%d: setup/%08x pulse/%08x cycle/%08x mode/%08x\n",
-                cs, setup, pulse, cycle, mode);
-
-       offset = cs * 0x10;
-       hsmc_writel(hsmc, SETUP0 + offset, setup);
-       hsmc_writel(hsmc, PULSE0 + offset, pulse);
-       hsmc_writel(hsmc, CYCLE0 + offset, cycle);
-       hsmc_writel(hsmc, MODE0 + offset, mode);
-       hsmc_readl(hsmc, MODE0); /* I/O barrier */
-
-       return 0;
-}
-EXPORT_SYMBOL(smc_set_configuration);
-
-static int hsmc_probe(struct platform_device *pdev)
-{
-       struct resource *regs;
-       struct clk *pclk, *mck;
-       int ret;
-
-       if (hsmc)
-               return -EBUSY;
-
-       regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       if (!regs)
-               return -ENXIO;
-       pclk = clk_get(&pdev->dev, "pclk");
-       if (IS_ERR(pclk))
-               return PTR_ERR(pclk);
-       mck = clk_get(&pdev->dev, "mck");
-       if (IS_ERR(mck)) {
-               ret = PTR_ERR(mck);
-               goto out_put_pclk;
-       }
-
-       ret = -ENOMEM;
-       hsmc = kzalloc(sizeof(struct hsmc), GFP_KERNEL);
-       if (!hsmc)
-               goto out_put_clocks;
-
-       clk_enable(pclk);
-       clk_enable(mck);
-
-       hsmc->pclk = pclk;
-       hsmc->mck = mck;
-       hsmc->regs = ioremap(regs->start, resource_size(regs));
-       if (!hsmc->regs)
-               goto out_disable_clocks;
-
-       dev_info(&pdev->dev, "Atmel Static Memory Controller at 0x%08lx\n",
-                (unsigned long)regs->start);
-
-       platform_set_drvdata(pdev, hsmc);
-
-       return 0;
-
-out_disable_clocks:
-       clk_disable(mck);
-       clk_disable(pclk);
-       kfree(hsmc);
-out_put_clocks:
-       clk_put(mck);
-out_put_pclk:
-       clk_put(pclk);
-       hsmc = NULL;
-       return ret;
-}
-
-static struct platform_driver hsmc_driver = {
-       .probe          = hsmc_probe,
-       .driver         = {
-               .name   = "smc",
-       },
-};
-
-static int __init hsmc_init(void)
-{
-       return platform_driver_register(&hsmc_driver);
-}
-core_initcall(hsmc_init);
diff --git a/arch/avr32/mach-at32ap/hsmc.h b/arch/avr32/mach-at32ap/hsmc.h
deleted file mode 100644 (file)
index d1d48e2..0000000
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Register definitions for Atmel Static Memory Controller (SMC)
- *
- * Copyright (C) 2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_HSMC_H__
-#define __ASM_AVR32_HSMC_H__
-
-/* HSMC register offsets */
-#define HSMC_SETUP0                            0x0000
-#define HSMC_PULSE0                            0x0004
-#define HSMC_CYCLE0                            0x0008
-#define HSMC_MODE0                             0x000c
-#define HSMC_SETUP1                            0x0010
-#define HSMC_PULSE1                            0x0014
-#define HSMC_CYCLE1                            0x0018
-#define HSMC_MODE1                             0x001c
-#define HSMC_SETUP2                            0x0020
-#define HSMC_PULSE2                            0x0024
-#define HSMC_CYCLE2                            0x0028
-#define HSMC_MODE2                             0x002c
-#define HSMC_SETUP3                            0x0030
-#define HSMC_PULSE3                            0x0034
-#define HSMC_CYCLE3                            0x0038
-#define HSMC_MODE3                             0x003c
-#define HSMC_SETUP4                            0x0040
-#define HSMC_PULSE4                            0x0044
-#define HSMC_CYCLE4                            0x0048
-#define HSMC_MODE4                             0x004c
-#define HSMC_SETUP5                            0x0050
-#define HSMC_PULSE5                            0x0054
-#define HSMC_CYCLE5                            0x0058
-#define HSMC_MODE5                             0x005c
-
-/* Bitfields in SETUP0 */
-#define HSMC_NWE_SETUP_OFFSET                  0
-#define HSMC_NWE_SETUP_SIZE                    6
-#define HSMC_NCS_WR_SETUP_OFFSET               8
-#define HSMC_NCS_WR_SETUP_SIZE                 6
-#define HSMC_NRD_SETUP_OFFSET                  16
-#define HSMC_NRD_SETUP_SIZE                    6
-#define HSMC_NCS_RD_SETUP_OFFSET               24
-#define HSMC_NCS_RD_SETUP_SIZE                 6
-
-/* Bitfields in PULSE0 */
-#define HSMC_NWE_PULSE_OFFSET                  0
-#define HSMC_NWE_PULSE_SIZE                    7
-#define HSMC_NCS_WR_PULSE_OFFSET               8
-#define HSMC_NCS_WR_PULSE_SIZE                 7
-#define HSMC_NRD_PULSE_OFFSET                  16
-#define HSMC_NRD_PULSE_SIZE                    7
-#define HSMC_NCS_RD_PULSE_OFFSET               24
-#define HSMC_NCS_RD_PULSE_SIZE                 7
-
-/* Bitfields in CYCLE0 */
-#define HSMC_NWE_CYCLE_OFFSET                  0
-#define HSMC_NWE_CYCLE_SIZE                    9
-#define HSMC_NRD_CYCLE_OFFSET                  16
-#define HSMC_NRD_CYCLE_SIZE                    9
-
-/* Bitfields in MODE0 */
-#define HSMC_READ_MODE_OFFSET                  0
-#define HSMC_READ_MODE_SIZE                    1
-#define HSMC_WRITE_MODE_OFFSET                 1
-#define HSMC_WRITE_MODE_SIZE                   1
-#define HSMC_EXNW_MODE_OFFSET                  4
-#define HSMC_EXNW_MODE_SIZE                    2
-#define HSMC_BAT_OFFSET                                8
-#define HSMC_BAT_SIZE                          1
-#define HSMC_DBW_OFFSET                                12
-#define HSMC_DBW_SIZE                          2
-#define HSMC_TDF_CYCLES_OFFSET                 16
-#define HSMC_TDF_CYCLES_SIZE                   4
-#define HSMC_TDF_MODE_OFFSET                   20
-#define HSMC_TDF_MODE_SIZE                     1
-#define HSMC_PMEN_OFFSET                       24
-#define HSMC_PMEN_SIZE                         1
-#define HSMC_PS_OFFSET                         28
-#define HSMC_PS_SIZE                           2
-
-/* Constants for READ_MODE */
-#define HSMC_READ_MODE_NCS_CONTROLLED          0
-#define HSMC_READ_MODE_NRD_CONTROLLED          1
-
-/* Constants for WRITE_MODE */
-#define HSMC_WRITE_MODE_NCS_CONTROLLED         0
-#define HSMC_WRITE_MODE_NWE_CONTROLLED         1
-
-/* Constants for EXNW_MODE */
-#define HSMC_EXNW_MODE_DISABLED                        0
-#define HSMC_EXNW_MODE_RESERVED                        1
-#define HSMC_EXNW_MODE_FROZEN                  2
-#define HSMC_EXNW_MODE_READY                   3
-
-/* Constants for BAT */
-#define HSMC_BAT_BYTE_SELECT                   0
-#define HSMC_BAT_BYTE_WRITE                    1
-
-/* Constants for DBW */
-#define HSMC_DBW_8_BITS                                0
-#define HSMC_DBW_16_BITS                       1
-#define HSMC_DBW_32_BITS                       2
-
-/* Bit manipulation macros */
-#define HSMC_BIT(name)                                                 \
-       (1 << HSMC_##name##_OFFSET)
-#define HSMC_BF(name,value)                                            \
-       (((value) & ((1 << HSMC_##name##_SIZE) - 1))                    \
-        << HSMC_##name##_OFFSET)
-#define HSMC_BFEXT(name,value)                                         \
-       (((value) >> HSMC_##name##_OFFSET)                              \
-        & ((1 << HSMC_##name##_SIZE) - 1))
-#define HSMC_BFINS(name,value,old)                                     \
-       (((old) & ~(((1 << HSMC_##name##_SIZE) - 1)                     \
-                   << HSMC_##name##_OFFSET)) | HSMC_BF(name,value))
-
-/* Register access macros */
-#define hsmc_readl(port,reg)                                           \
-       __raw_readl((port)->regs + HSMC_##reg)
-#define hsmc_writel(port,reg,value)                                    \
-       __raw_writel((value), (port)->regs + HSMC_##reg)
-
-#endif /* __ASM_AVR32_HSMC_H__ */
diff --git a/arch/avr32/mach-at32ap/include/mach/at32ap700x.h b/arch/avr32/mach-at32ap/include/mach/at32ap700x.h
deleted file mode 100644 (file)
index b9222bf..0000000
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Pin definitions for AT32AP7000.
- *
- * Copyright (C) 2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_ARCH_AT32AP700X_H__
-#define __ASM_ARCH_AT32AP700X_H__
-
-#define GPIO_PERIPH_A  0
-#define GPIO_PERIPH_B  1
-
-/*
- * Pin numbers identifying specific GPIO pins on the chip. They can
- * also be converted to IRQ numbers by passing them through
- * gpio_to_irq().
- */
-#define GPIO_PIOA_BASE (0)
-#define GPIO_PIOB_BASE (GPIO_PIOA_BASE + 32)
-#define GPIO_PIOC_BASE (GPIO_PIOB_BASE + 32)
-#define GPIO_PIOD_BASE (GPIO_PIOC_BASE + 32)
-#define GPIO_PIOE_BASE (GPIO_PIOD_BASE + 32)
-
-#define GPIO_PIN_PA(N) (GPIO_PIOA_BASE + (N))
-#define GPIO_PIN_PB(N) (GPIO_PIOB_BASE + (N))
-#define GPIO_PIN_PC(N) (GPIO_PIOC_BASE + (N))
-#define GPIO_PIN_PD(N) (GPIO_PIOD_BASE + (N))
-#define GPIO_PIN_PE(N) (GPIO_PIOE_BASE + (N))
-
-
-/*
- * DMAC peripheral hardware handshaking interfaces, used with dw_dmac
- */
-#define DMAC_MCI_RX            0
-#define DMAC_MCI_TX            1
-#define DMAC_DAC_TX            2
-#define DMAC_AC97_A_RX         3
-#define DMAC_AC97_A_TX         4
-#define DMAC_AC97_B_RX         5
-#define DMAC_AC97_B_TX         6
-#define DMAC_DMAREQ_0          7
-#define DMAC_DMAREQ_1          8
-#define DMAC_DMAREQ_2          9
-#define DMAC_DMAREQ_3          10
-
-/* HSB master IDs */
-#define HMATRIX_MASTER_CPU_DCACHE              0
-#define HMATRIX_MASTER_CPU_ICACHE              1
-#define HMATRIX_MASTER_PDC                     2
-#define HMATRIX_MASTER_ISI                     3
-#define HMATRIX_MASTER_USBA                    4
-#define HMATRIX_MASTER_LCDC                    5
-#define HMATRIX_MASTER_MACB0                   6
-#define HMATRIX_MASTER_MACB1                   7
-#define HMATRIX_MASTER_DMACA_M0                        8
-#define HMATRIX_MASTER_DMACA_M1                        9
-
-/* HSB slave IDs */
-#define HMATRIX_SLAVE_SRAM0                    0
-#define HMATRIX_SLAVE_SRAM1                    1
-#define HMATRIX_SLAVE_PBA                      2
-#define HMATRIX_SLAVE_PBB                      3
-#define HMATRIX_SLAVE_EBI                      4
-#define HMATRIX_SLAVE_USBA                     5
-#define HMATRIX_SLAVE_LCDC                     6
-#define HMATRIX_SLAVE_DMACA                    7
-
-/* Bits in HMATRIX SFR4 (EBI) */
-#define HMATRIX_EBI_SDRAM_ENABLE               (1 << 1)
-#define HMATRIX_EBI_NAND_ENABLE                        (1 << 3)
-#define HMATRIX_EBI_CF0_ENABLE                 (1 << 4)
-#define HMATRIX_EBI_CF1_ENABLE                 (1 << 5)
-#define HMATRIX_EBI_PULLUP_DISABLE             (1 << 8)
-
-/*
- * Base addresses of controllers that may be accessed early by
- * platform code.
- */
-#define PM_BASE                0xfff00000
-#define HMATRIX_BASE   0xfff00800
-#define SDRAMC_BASE    0xfff03800
-
-/* LCDC on port C */
-#define ATMEL_LCDC_PC_CC       (1ULL << 19)
-#define ATMEL_LCDC_PC_HSYNC    (1ULL << 20)
-#define ATMEL_LCDC_PC_PCLK     (1ULL << 21)
-#define ATMEL_LCDC_PC_VSYNC    (1ULL << 22)
-#define ATMEL_LCDC_PC_DVAL     (1ULL << 23)
-#define ATMEL_LCDC_PC_MODE     (1ULL << 24)
-#define ATMEL_LCDC_PC_PWR      (1ULL << 25)
-#define ATMEL_LCDC_PC_DATA0    (1ULL << 26)
-#define ATMEL_LCDC_PC_DATA1    (1ULL << 27)
-#define ATMEL_LCDC_PC_DATA2    (1ULL << 28)
-#define ATMEL_LCDC_PC_DATA3    (1ULL << 29)
-#define ATMEL_LCDC_PC_DATA4    (1ULL << 30)
-#define ATMEL_LCDC_PC_DATA5    (1ULL << 31)
-
-/* LCDC on port D */
-#define ATMEL_LCDC_PD_DATA6    (1ULL << 0)
-#define ATMEL_LCDC_PD_DATA7    (1ULL << 1)
-#define ATMEL_LCDC_PD_DATA8    (1ULL << 2)
-#define ATMEL_LCDC_PD_DATA9    (1ULL << 3)
-#define ATMEL_LCDC_PD_DATA10   (1ULL << 4)
-#define ATMEL_LCDC_PD_DATA11   (1ULL << 5)
-#define ATMEL_LCDC_PD_DATA12   (1ULL << 6)
-#define ATMEL_LCDC_PD_DATA13   (1ULL << 7)
-#define ATMEL_LCDC_PD_DATA14   (1ULL << 8)
-#define ATMEL_LCDC_PD_DATA15   (1ULL << 9)
-#define ATMEL_LCDC_PD_DATA16   (1ULL << 10)
-#define ATMEL_LCDC_PD_DATA17   (1ULL << 11)
-#define ATMEL_LCDC_PD_DATA18   (1ULL << 12)
-#define ATMEL_LCDC_PD_DATA19   (1ULL << 13)
-#define ATMEL_LCDC_PD_DATA20   (1ULL << 14)
-#define ATMEL_LCDC_PD_DATA21   (1ULL << 15)
-#define ATMEL_LCDC_PD_DATA22   (1ULL << 16)
-#define ATMEL_LCDC_PD_DATA23   (1ULL << 17)
-
-/* LCDC on port E */
-#define ATMEL_LCDC_PE_CC       (1ULL << (32 + 0))
-#define ATMEL_LCDC_PE_DVAL     (1ULL << (32 + 1))
-#define ATMEL_LCDC_PE_MODE     (1ULL << (32 + 2))
-#define ATMEL_LCDC_PE_DATA0    (1ULL << (32 + 3))
-#define ATMEL_LCDC_PE_DATA1    (1ULL << (32 + 4))
-#define ATMEL_LCDC_PE_DATA2    (1ULL << (32 + 5))
-#define ATMEL_LCDC_PE_DATA3    (1ULL << (32 + 6))
-#define ATMEL_LCDC_PE_DATA4    (1ULL << (32 + 7))
-#define ATMEL_LCDC_PE_DATA8    (1ULL << (32 + 8))
-#define ATMEL_LCDC_PE_DATA9    (1ULL << (32 + 9))
-#define ATMEL_LCDC_PE_DATA10   (1ULL << (32 + 10))
-#define ATMEL_LCDC_PE_DATA11   (1ULL << (32 + 11))
-#define ATMEL_LCDC_PE_DATA12   (1ULL << (32 + 12))
-#define ATMEL_LCDC_PE_DATA16   (1ULL << (32 + 13))
-#define ATMEL_LCDC_PE_DATA17   (1ULL << (32 + 14))
-#define ATMEL_LCDC_PE_DATA18   (1ULL << (32 + 15))
-#define ATMEL_LCDC_PE_DATA19   (1ULL << (32 + 16))
-#define ATMEL_LCDC_PE_DATA20   (1ULL << (32 + 17))
-#define ATMEL_LCDC_PE_DATA21   (1ULL << (32 + 18))
-
-
-#define ATMEL_LCDC(PORT, PIN)  (ATMEL_LCDC_##PORT##_##PIN)
-
-
-#define ATMEL_LCDC_PRI_24B_DATA        (                                       \
-               ATMEL_LCDC(PC, DATA0)  | ATMEL_LCDC(PC, DATA1)  |       \
-               ATMEL_LCDC(PC, DATA2)  | ATMEL_LCDC(PC, DATA3)  |       \
-               ATMEL_LCDC(PC, DATA4)  | ATMEL_LCDC(PC, DATA5)  |       \
-               ATMEL_LCDC(PD, DATA6)  | ATMEL_LCDC(PD, DATA7)  |       \
-               ATMEL_LCDC(PD, DATA8)  | ATMEL_LCDC(PD, DATA9)  |       \
-               ATMEL_LCDC(PD, DATA10) | ATMEL_LCDC(PD, DATA11) |       \
-               ATMEL_LCDC(PD, DATA12) | ATMEL_LCDC(PD, DATA13) |       \
-               ATMEL_LCDC(PD, DATA14) | ATMEL_LCDC(PD, DATA15) |       \
-               ATMEL_LCDC(PD, DATA16) | ATMEL_LCDC(PD, DATA17) |       \
-               ATMEL_LCDC(PD, DATA18) | ATMEL_LCDC(PD, DATA19) |       \
-               ATMEL_LCDC(PD, DATA20) | ATMEL_LCDC(PD, DATA21) |       \
-               ATMEL_LCDC(PD, DATA22) | ATMEL_LCDC(PD, DATA23))
-
-#define ATMEL_LCDC_ALT_24B_DATA (                                      \
-               ATMEL_LCDC(PE, DATA0)  | ATMEL_LCDC(PE, DATA1)  |       \
-               ATMEL_LCDC(PE, DATA2)  | ATMEL_LCDC(PE, DATA3)  |       \
-               ATMEL_LCDC(PE, DATA4)  | ATMEL_LCDC(PC, DATA5)  |       \
-               ATMEL_LCDC(PD, DATA6)  | ATMEL_LCDC(PD, DATA7)  |       \
-               ATMEL_LCDC(PE, DATA8)  | ATMEL_LCDC(PE, DATA9)  |       \
-               ATMEL_LCDC(PE, DATA10) | ATMEL_LCDC(PE, DATA11) |       \
-               ATMEL_LCDC(PE, DATA12) | ATMEL_LCDC(PD, DATA13) |       \
-               ATMEL_LCDC(PD, DATA14) | ATMEL_LCDC(PD, DATA15) |       \
-               ATMEL_LCDC(PE, DATA16) | ATMEL_LCDC(PE, DATA17) |       \
-               ATMEL_LCDC(PE, DATA18) | ATMEL_LCDC(PE, DATA19) |       \
-               ATMEL_LCDC(PE, DATA20) | ATMEL_LCDC(PE, DATA21) |       \
-               ATMEL_LCDC(PD, DATA22) | ATMEL_LCDC(PD, DATA23))
-
-#define ATMEL_LCDC_PRI_18B_DATA (                                      \
-               ATMEL_LCDC(PC, DATA2)  | ATMEL_LCDC(PC, DATA3)  |       \
-               ATMEL_LCDC(PC, DATA4)  | ATMEL_LCDC(PC, DATA5)  |       \
-               ATMEL_LCDC(PD, DATA6)  | ATMEL_LCDC(PD, DATA7)  |       \
-               ATMEL_LCDC(PD, DATA10) | ATMEL_LCDC(PD, DATA11) |       \
-               ATMEL_LCDC(PD, DATA12) | ATMEL_LCDC(PD, DATA13) |       \
-               ATMEL_LCDC(PD, DATA14) | ATMEL_LCDC(PD, DATA15) |       \
-               ATMEL_LCDC(PD, DATA18) | ATMEL_LCDC(PD, DATA19) |       \
-               ATMEL_LCDC(PD, DATA20) | ATMEL_LCDC(PD, DATA21) |       \
-               ATMEL_LCDC(PD, DATA22) | ATMEL_LCDC(PD, DATA23))
-
-#define ATMEL_LCDC_ALT_18B_DATA        (                                       \
-               ATMEL_LCDC(PE, DATA2)  | ATMEL_LCDC(PE, DATA3)  |       \
-               ATMEL_LCDC(PE, DATA4)  | ATMEL_LCDC(PC, DATA5)  |       \
-               ATMEL_LCDC(PD, DATA6)  | ATMEL_LCDC(PD, DATA7)  |       \
-               ATMEL_LCDC(PE, DATA10) | ATMEL_LCDC(PE, DATA11) |       \
-               ATMEL_LCDC(PE, DATA12) | ATMEL_LCDC(PD, DATA13) |       \
-               ATMEL_LCDC(PD, DATA14) | ATMEL_LCDC(PD, DATA15) |       \
-               ATMEL_LCDC(PE, DATA18) | ATMEL_LCDC(PE, DATA19) |       \
-               ATMEL_LCDC(PE, DATA20) | ATMEL_LCDC(PE, DATA21) |       \
-               ATMEL_LCDC(PD, DATA22) | ATMEL_LCDC(PD, DATA23))
-
-#define ATMEL_LCDC_PRI_15B_DATA (                                      \
-               ATMEL_LCDC(PC, DATA3)  | ATMEL_LCDC(PC, DATA4)  |       \
-               ATMEL_LCDC(PC, DATA5)  | ATMEL_LCDC(PD, DATA6)  |       \
-               ATMEL_LCDC(PD, DATA7)  |                                \
-               ATMEL_LCDC(PD, DATA11) | ATMEL_LCDC(PD, DATA12) |       \
-               ATMEL_LCDC(PD, DATA13) | ATMEL_LCDC(PD, DATA14) |       \
-               ATMEL_LCDC(PD, DATA15) |                                \
-               ATMEL_LCDC(PD, DATA19) | ATMEL_LCDC(PD, DATA20) |       \
-               ATMEL_LCDC(PD, DATA21) | ATMEL_LCDC(PD, DATA22) |       \
-               ATMEL_LCDC(PD, DATA23))
-
-#define ATMEL_LCDC_ALT_15B_DATA        (                                       \
-               ATMEL_LCDC(PE, DATA3)  | ATMEL_LCDC(PE, DATA4)  |       \
-               ATMEL_LCDC(PC, DATA5)  | ATMEL_LCDC(PD, DATA6)  |       \
-               ATMEL_LCDC(PD, DATA7)  |                                \
-               ATMEL_LCDC(PE, DATA11) | ATMEL_LCDC(PE, DATA12) |       \
-               ATMEL_LCDC(PD, DATA13) | ATMEL_LCDC(PD, DATA14) |       \
-               ATMEL_LCDC(PD, DATA15) |                                \
-               ATMEL_LCDC(PE, DATA19) | ATMEL_LCDC(PE, DATA20) |       \
-               ATMEL_LCDC(PE, DATA21) | ATMEL_LCDC(PD, DATA22) |       \
-               ATMEL_LCDC(PD, DATA23))
-
-#define ATMEL_LCDC_PRI_CONTROL (                                       \
-               ATMEL_LCDC(PC, CC)   | ATMEL_LCDC(PC, DVAL) |           \
-               ATMEL_LCDC(PC, MODE) | ATMEL_LCDC(PC, PWR))
-
-#define ATMEL_LCDC_ALT_CONTROL (                                       \
-               ATMEL_LCDC(PE, CC)   | ATMEL_LCDC(PE, DVAL) |           \
-               ATMEL_LCDC(PE, MODE) | ATMEL_LCDC(PC, PWR))
-
-#define ATMEL_LCDC_CONTROL (                                           \
-               ATMEL_LCDC(PC, HSYNC) | ATMEL_LCDC(PC, VSYNC) |         \
-               ATMEL_LCDC(PC, PCLK))
-
-#define ATMEL_LCDC_PRI_24BIT   (ATMEL_LCDC_CONTROL | ATMEL_LCDC_PRI_24B_DATA)
-
-#define ATMEL_LCDC_ALT_24BIT   (ATMEL_LCDC_CONTROL | ATMEL_LCDC_ALT_24B_DATA)
-
-#define ATMEL_LCDC_PRI_18BIT   (ATMEL_LCDC_CONTROL | ATMEL_LCDC_PRI_18B_DATA)
-
-#define ATMEL_LCDC_ALT_18BIT   (ATMEL_LCDC_CONTROL | ATMEL_LCDC_ALT_18B_DATA)
-
-#define ATMEL_LCDC_PRI_15BIT   (ATMEL_LCDC_CONTROL | ATMEL_LCDC_PRI_15B_DATA)
-
-#define ATMEL_LCDC_ALT_15BIT   (ATMEL_LCDC_CONTROL | ATMEL_LCDC_ALT_15B_DATA)
-
-/* Bitmask for all EBI data (D16..D31) pins on port E */
-#define ATMEL_EBI_PE_DATA_ALL  (0x0000FFFF)
-
-#endif /* __ASM_ARCH_AT32AP700X_H__ */
diff --git a/arch/avr32/mach-at32ap/include/mach/board.h b/arch/avr32/mach-at32ap/include/mach/board.h
deleted file mode 100644 (file)
index f1a316d..0000000
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Platform data definitions.
- */
-#ifndef __ASM_ARCH_BOARD_H
-#define __ASM_ARCH_BOARD_H
-
-#include <linux/types.h>
-#include <linux/serial.h>
-#include <linux/platform_data/macb.h>
-#include <linux/platform_data/atmel.h>
-
-#define GPIO_PIN_NONE  (-1)
-
-/*
- * Clock rates for various on-board oscillators. The number of entries
- * in this array is chip-dependent.
- */
-extern unsigned long at32_board_osc_rates[];
-
-/*
- * This used to add essential system devices, but this is now done
- * automatically. Please don't use it in new board code.
- */
-static inline void __deprecated at32_add_system_devices(void)
-{
-
-}
-
-extern struct platform_device *atmel_default_console_device;
-
-/* Flags for selecting USART extra pins */
-#define        ATMEL_USART_RTS         0x01
-#define        ATMEL_USART_CTS         0x02
-#define        ATMEL_USART_CLK         0x04
-
-void at32_map_usart(unsigned int hw_id, unsigned int line, int flags);
-struct platform_device *at32_add_device_usart(unsigned int id);
-
-struct platform_device *
-at32_add_device_eth(unsigned int id, struct macb_platform_data *data);
-
-struct spi_board_info;
-struct platform_device *
-at32_add_device_spi(unsigned int id, struct spi_board_info *b, unsigned int n);
-void at32_spi_setup_slaves(unsigned int bus_num, struct spi_board_info *b, unsigned int n);
-
-struct atmel_lcdfb_pdata;
-struct platform_device *
-at32_add_device_lcdc(unsigned int id, struct atmel_lcdfb_pdata *data,
-                    unsigned long fbmem_start, unsigned long fbmem_len,
-                    u64 pin_mask);
-
-struct usba_platform_data;
-struct platform_device *
-at32_add_device_usba(unsigned int id, struct usba_platform_data *data);
-
-struct ide_platform_data {
-       u8      cs;
-};
-struct platform_device *
-at32_add_device_ide(unsigned int id, unsigned int extint,
-                   struct ide_platform_data *data);
-
-/* mask says which PWM channels to mux */
-struct platform_device *at32_add_device_pwm(u32 mask);
-
-/* depending on what's hooked up, not all SSC pins will be used */
-#define        ATMEL_SSC_TK            0x01
-#define        ATMEL_SSC_TF            0x02
-#define        ATMEL_SSC_TD            0x04
-#define        ATMEL_SSC_TX            (ATMEL_SSC_TK | ATMEL_SSC_TF | ATMEL_SSC_TD)
-
-#define        ATMEL_SSC_RK            0x10
-#define        ATMEL_SSC_RF            0x20
-#define        ATMEL_SSC_RD            0x40
-#define        ATMEL_SSC_RX            (ATMEL_SSC_RK | ATMEL_SSC_RF | ATMEL_SSC_RD)
-
-struct platform_device *
-at32_add_device_ssc(unsigned int id, unsigned int flags);
-
-struct i2c_board_info;
-struct platform_device *at32_add_device_twi(unsigned int id,
-                                           struct i2c_board_info *b,
-                                           unsigned int n);
-
-struct mci_platform_data;
-struct platform_device *
-at32_add_device_mci(unsigned int id, struct mci_platform_data *data);
-
-struct ac97c_platform_data;
-struct platform_device *
-at32_add_device_ac97c(unsigned int id, struct ac97c_platform_data *data,
-                     unsigned int flags);
-
-struct atmel_abdac_pdata;
-struct platform_device *
-at32_add_device_abdac(unsigned int id, struct atmel_abdac_pdata *data);
-
-struct platform_device *at32_add_device_psif(unsigned int id);
-
-struct cf_platform_data {
-       int     detect_pin;
-       int     reset_pin;
-       int     vcc_pin;
-       int     ready_pin;
-       u8      cs;
-};
-struct platform_device *
-at32_add_device_cf(unsigned int id, unsigned int extint,
-               struct cf_platform_data *data);
-
-struct platform_device *
-at32_add_device_nand(unsigned int id, struct atmel_nand_data *data);
-
-#endif /* __ASM_ARCH_BOARD_H */
diff --git a/arch/avr32/mach-at32ap/include/mach/chip.h b/arch/avr32/mach-at32ap/include/mach/chip.h
deleted file mode 100644 (file)
index 5efca6d..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * AVR32 chip-specific definitions
- *
- * Copyright (C) 2008 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_ARCH_CHIP_H__
-#define __ASM_AVR32_ARCH_CHIP_H__
-
-#if defined(CONFIG_CPU_AT32AP700X)
-# include <mach/at32ap700x.h>
-#else
-# error Unknown chip type selected
-#endif
-
-#endif /* __ASM_AVR32_ARCH_CHIP_H__ */
diff --git a/arch/avr32/mach-at32ap/include/mach/cpu.h b/arch/avr32/mach-at32ap/include/mach/cpu.h
deleted file mode 100644 (file)
index 4181086..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * AVR32 CPU identification
- *
- * Copyright (C) 2007 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_ARCH_CPU_H
-#define __ASM_ARCH_CPU_H
-
-/*
- * Only AT32AP7000 is defined for now. We can identify the specific
- * chip at runtime, but I'm not sure if it's really worth it.
- */
-#ifdef CONFIG_CPU_AT32AP700X
-# define cpu_is_at32ap7000()   (1)
-#else
-# define cpu_is_at32ap7000()   (0)
-#endif
-
-#endif /* __ASM_ARCH_CPU_H */
diff --git a/arch/avr32/mach-at32ap/include/mach/gpio.h b/arch/avr32/mach-at32ap/include/mach/gpio.h
deleted file mode 100644 (file)
index 0180f58..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifndef __ASM_AVR32_ARCH_GPIO_H
-#define __ASM_AVR32_ARCH_GPIO_H
-
-#include <linux/compiler.h>
-#include <asm/irq.h>
-
-
-/* Some GPIO chips can manage IRQs; some can't.  The exact numbers can
- * be changed if needed, but for the moment they're not configurable.
- */
-#define ARCH_NR_GPIOS  (NR_GPIO_IRQS + 2 * 32)
-
-
-/* Arch-neutral GPIO API, supporting both "native" and external GPIOs. */
-#include <asm-generic/gpio.h>
-
-static inline int gpio_get_value(unsigned int gpio)
-{
-       return __gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned int gpio, int value)
-{
-       __gpio_set_value(gpio, value);
-}
-
-static inline int gpio_cansleep(unsigned int gpio)
-{
-       return __gpio_cansleep(gpio);
-}
-
-
-static inline int gpio_to_irq(unsigned int gpio)
-{
-       if (gpio < NR_GPIO_IRQS)
-               return gpio + GPIO_IRQ_BASE;
-       return -EINVAL;
-}
-
-static inline int irq_to_gpio(unsigned int irq)
-{
-       return irq - GPIO_IRQ_BASE;
-}
-
-#endif /* __ASM_AVR32_ARCH_GPIO_H */
diff --git a/arch/avr32/mach-at32ap/include/mach/hmatrix.h b/arch/avr32/mach-at32ap/include/mach/hmatrix.h
deleted file mode 100644 (file)
index 7a368f2..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * High-Speed Bus Matrix configuration registers
- *
- * Copyright (C) 2008 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __HMATRIX_H
-#define __HMATRIX_H
-
-extern struct clk at32_hmatrix_clk;
-
-void hmatrix_write_reg(unsigned long offset, u32 value);
-u32 hmatrix_read_reg(unsigned long offset);
-
-void hmatrix_sfr_set_bits(unsigned int slave_id, u32 mask);
-void hmatrix_sfr_clear_bits(unsigned int slave_id, u32 mask);
-
-/* Master Configuration register */
-#define HMATRIX_MCFG(m)                        (0x0000 + 4 * (m))
-/* Undefined length burst limit */
-# define HMATRIX_MCFG_ULBT_INFINITE    0       /* Infinite length */
-# define HMATRIX_MCFG_ULBT_SINGLE      1       /* Single Access */
-# define HMATRIX_MCFG_ULBT_FOUR_BEAT   2       /* Four beat */
-# define HMATRIX_MCFG_ULBT_EIGHT_BEAT  3       /* Eight beat */
-# define HMATRIX_MCFG_ULBT_SIXTEEN_BEAT        4       /* Sixteen beat */
-
-/* Slave Configuration register */
-#define HMATRIX_SCFG(s)                        (0x0040 + 4 * (s))
-# define HMATRIX_SCFG_SLOT_CYCLE(x)    ((x) <<  0)     /* Max burst cycles */
-# define HMATRIX_SCFG_DEFMSTR_NONE     (  0 << 16)     /* No default master */
-# define HMATRIX_SCFG_DEFMSTR_LAST     (  1 << 16)     /* Last def master */
-# define HMATRIX_SCFG_DEFMSTR_FIXED    (  2 << 16)     /* Fixed def master */
-# define HMATRIX_SCFG_FIXED_DEFMSTR(m) ((m) << 18)     /* Fixed master ID */
-# define HMATRIX_SCFG_ARBT_ROUND_ROBIN (  0 << 24)     /* RR arbitration */
-# define HMATRIX_SCFG_ARBT_FIXED_PRIO  (  1 << 24)     /* Fixed priority */
-
-/* Slave Priority register A (master 0..7) */
-#define HMATRIX_PRAS(s)                        (0x0080 + 8 * (s))
-# define HMATRIX_PRAS_PRIO(m, p)       ((p) << ((m) * 4))
-
-/* Slave Priority register A (master 8..15) */
-#define HMATRIX_PRBS(s)                        (0x0084 + 8 * (s))
-# define HMATRIX_PRBS_PRIO(m, p)       ((p) << (((m) - 8) * 4))
-
-/* Master Remap Control Register */
-#define HMATRIX_MRCR                           0x0100
-# define HMATRIX_MRCR_REMAP(m)         (  1 << (m))    /* Remap master m */
-
-/* Special Function Register. Bit definitions are chip-specific */
-#define HMATRIX_SFR(s)                 (0x0110 + 4 * (s))
-
-#endif /* __HMATRIX_H */
diff --git a/arch/avr32/mach-at32ap/include/mach/init.h b/arch/avr32/mach-at32ap/include/mach/init.h
deleted file mode 100644 (file)
index bc40e3d..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * AT32AP platform initialization calls.
- *
- * Copyright (C) 2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_AT32AP_INIT_H__
-#define __ASM_AVR32_AT32AP_INIT_H__
-
-void setup_platform(void);
-void setup_board(void);
-
-void at32_setup_serial_console(unsigned int usart_id);
-
-#endif /* __ASM_AVR32_AT32AP_INIT_H__ */
diff --git a/arch/avr32/mach-at32ap/include/mach/io.h b/arch/avr32/mach-at32ap/include/mach/io.h
deleted file mode 100644 (file)
index 22ea79b..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef __ASM_AVR32_ARCH_AT32AP_IO_H
-#define __ASM_AVR32_ARCH_AT32AP_IO_H
-
-#include <linux/swab.h>
-
-#if defined(CONFIG_AP700X_32_BIT_SMC)
-# define __swizzle_addr_b(addr)        (addr ^ 3UL)
-# define __swizzle_addr_w(addr)        (addr ^ 2UL)
-# define __swizzle_addr_l(addr)        (addr)
-# define ioswabb(a, x)         (x)
-# define ioswabw(a, x)         (x)
-# define ioswabl(a, x)         (x)
-# define __mem_ioswabb(a, x)   (x)
-# define __mem_ioswabw(a, x)   swab16(x)
-# define __mem_ioswabl(a, x)   swab32(x)
-#elif defined(CONFIG_AP700X_16_BIT_SMC)
-# define __swizzle_addr_b(addr)        (addr ^ 1UL)
-# define __swizzle_addr_w(addr)        (addr)
-# define __swizzle_addr_l(addr)        (addr)
-# define ioswabb(a, x)         (x)
-# define ioswabw(a, x)         (x)
-# define ioswabl(a, x)         swahw32(x)
-# define __mem_ioswabb(a, x)   (x)
-# define __mem_ioswabw(a, x)   swab16(x)
-# define __mem_ioswabl(a, x)   swahb32(x)
-#else
-# define __swizzle_addr_b(addr)        (addr)
-# define __swizzle_addr_w(addr)        (addr)
-# define __swizzle_addr_l(addr)        (addr)
-# define ioswabb(a, x)         (x)
-# define ioswabw(a, x)         swab16(x)
-# define ioswabl(a, x)         swab32(x)
-# define __mem_ioswabb(a, x)   (x)
-# define __mem_ioswabw(a, x)   (x)
-# define __mem_ioswabl(a, x)   (x)
-#endif
-
-#endif /* __ASM_AVR32_ARCH_AT32AP_IO_H */
diff --git a/arch/avr32/mach-at32ap/include/mach/irq.h b/arch/avr32/mach-at32ap/include/mach/irq.h
deleted file mode 100644 (file)
index 608e350..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef __ASM_AVR32_ARCH_IRQ_H
-#define __ASM_AVR32_ARCH_IRQ_H
-
-#define EIM_IRQ_BASE   NR_INTERNAL_IRQS
-#define NR_EIM_IRQS    32
-#define AT32_EXTINT(n) (EIM_IRQ_BASE + (n))
-
-#define GPIO_IRQ_BASE  (EIM_IRQ_BASE + NR_EIM_IRQS)
-#define NR_GPIO_CTLR   (5 /*internal*/ + 1 /*external*/)
-#define NR_GPIO_IRQS   (NR_GPIO_CTLR * 32)
-
-#define NR_IRQS                (GPIO_IRQ_BASE + NR_GPIO_IRQS)
-
-#endif /* __ASM_AVR32_ARCH_IRQ_H */
diff --git a/arch/avr32/mach-at32ap/include/mach/pm.h b/arch/avr32/mach-at32ap/include/mach/pm.h
deleted file mode 100644 (file)
index f29ff2c..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * AVR32 AP Power Management.
- *
- * Copyright (C) 2008 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_ARCH_PM_H
-#define __ASM_AVR32_ARCH_PM_H
-
-/* Possible arguments to the "sleep" instruction */
-#define CPU_SLEEP_IDLE         0
-#define CPU_SLEEP_FROZEN       1
-#define CPU_SLEEP_STANDBY      2
-#define CPU_SLEEP_STOP         3
-#define CPU_SLEEP_STATIC       5
-
-#ifndef __ASSEMBLY__
-extern void cpu_enter_idle(void);
-extern void cpu_enter_standby(unsigned long sdramc_base);
-
-void intc_set_suspend_handler(unsigned long offset);
-#endif
-
-#endif /* __ASM_AVR32_ARCH_PM_H */
diff --git a/arch/avr32/mach-at32ap/include/mach/portmux.h b/arch/avr32/mach-at32ap/include/mach/portmux.h
deleted file mode 100644 (file)
index 4873024..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * AT32 portmux interface.
- *
- * Copyright (C) 2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_ARCH_PORTMUX_H__
-#define __ASM_ARCH_PORTMUX_H__
-
-/*
- * Set up pin multiplexing, called from board init only.
- *
- * The following flags determine the initial state of the pin.
- */
-#define AT32_GPIOF_PULLUP      0x00000001      /* (not-OUT) Enable pull-up */
-#define AT32_GPIOF_OUTPUT      0x00000002      /* (OUT) Enable output driver */
-#define AT32_GPIOF_HIGH                0x00000004      /* (OUT) Set output high */
-#define AT32_GPIOF_DEGLITCH    0x00000008      /* (IN) Filter glitches */
-#define AT32_GPIOF_MULTIDRV    0x00000010      /* Enable multidriver option */
-
-void at32_select_periph(unsigned int port, unsigned int pin,
-                       unsigned int periph, unsigned long flags);
-void at32_select_gpio(unsigned int pin, unsigned long flags);
-void at32_deselect_pin(unsigned int pin);
-void at32_reserve_pin(unsigned int port, u32 pin_mask);
-
-#endif /* __ASM_ARCH_PORTMUX_H__ */
diff --git a/arch/avr32/mach-at32ap/include/mach/smc.h b/arch/avr32/mach-at32ap/include/mach/smc.h
deleted file mode 100644 (file)
index c98eea4..0000000
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Static Memory Controller for AT32 chips
- *
- * Copyright (C) 2006 Atmel Corporation
- *
- * Inspired by the OMAP2 General-Purpose Memory Controller interface
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ARCH_AT32AP_SMC_H
-#define __ARCH_AT32AP_SMC_H
-
-/*
- * All timing parameters are in nanoseconds.
- */
-struct smc_timing {
-       /* Delay from address valid to assertion of given strobe */
-       int ncs_read_setup;
-       int nrd_setup;
-       int ncs_write_setup;
-       int nwe_setup;
-
-       /* Pulse length of given strobe */
-       int ncs_read_pulse;
-       int nrd_pulse;
-       int ncs_write_pulse;
-       int nwe_pulse;
-
-       /* Total cycle length of given operation */
-       int read_cycle;
-       int write_cycle;
-
-       /* Minimal recovery times, will extend cycle if needed */
-       int ncs_read_recover;
-       int nrd_recover;
-       int ncs_write_recover;
-       int nwe_recover;
-};
-
-/*
- * All timing parameters are in clock cycles.
- */
-struct smc_config {
-
-       /* Delay from address valid to assertion of given strobe */
-       u8              ncs_read_setup;
-       u8              nrd_setup;
-       u8              ncs_write_setup;
-       u8              nwe_setup;
-
-       /* Pulse length of given strobe */
-       u8              ncs_read_pulse;
-       u8              nrd_pulse;
-       u8              ncs_write_pulse;
-       u8              nwe_pulse;
-
-       /* Total cycle length of given operation */
-       u8              read_cycle;
-       u8              write_cycle;
-
-       /* Bus width in bytes */
-       u8              bus_width;
-
-       /*
-        * 0: Data is sampled on rising edge of NCS
-        * 1: Data is sampled on rising edge of NRD
-        */
-       unsigned int    nrd_controlled:1;
-
-       /*
-        * 0: Data is driven on falling edge of NCS
-        * 1: Data is driven on falling edge of NWR
-        */
-       unsigned int    nwe_controlled:1;
-
-       /*
-        * 0: NWAIT is disabled
-        * 1: Reserved
-        * 2: NWAIT is frozen mode
-        * 3: NWAIT in ready mode
-        */
-       unsigned int    nwait_mode:2;
-
-       /*
-        * 0: Byte select access type
-        * 1: Byte write access type
-        */
-       unsigned int    byte_write:1;
-
-       /*
-        * Number of clock cycles before data is released after
-        * the rising edge of the read controlling signal
-        *
-        * Total cycles from SMC is tdf_cycles + 1
-        */
-       unsigned int    tdf_cycles:4;
-
-       /*
-        * 0: TDF optimization disabled
-        * 1: TDF optimization enabled
-        */
-       unsigned int    tdf_mode:1;
-};
-
-extern void smc_set_timing(struct smc_config *config,
-                          const struct smc_timing *timing);
-
-extern int smc_set_configuration(int cs, const struct smc_config *config);
-extern struct smc_config *smc_get_configuration(int cs);
-
-#endif /* __ARCH_AT32AP_SMC_H */
diff --git a/arch/avr32/mach-at32ap/include/mach/sram.h b/arch/avr32/mach-at32ap/include/mach/sram.h
deleted file mode 100644 (file)
index 4838dae..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Simple SRAM allocator
- *
- * Copyright (C) 2008 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ASM_AVR32_ARCH_SRAM_H
-#define __ASM_AVR32_ARCH_SRAM_H
-
-#include <linux/genalloc.h>
-
-extern struct gen_pool *sram_pool;
-
-static inline unsigned long sram_alloc(size_t len)
-{
-       if (!sram_pool)
-               return 0UL;
-
-       return gen_pool_alloc(sram_pool, len);
-}
-
-static inline void sram_free(unsigned long addr, size_t len)
-{
-       return gen_pool_free(sram_pool, addr, len);
-}
-
-#endif /* __ASM_AVR32_ARCH_SRAM_H */
diff --git a/arch/avr32/mach-at32ap/intc.c b/arch/avr32/mach-at32ap/intc.c
deleted file mode 100644 (file)
index aaff83c..0000000
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
- * Copyright (C) 2006, 2008 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/clk.h>
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/platform_device.h>
-#include <linux/syscore_ops.h>
-#include <linux/export.h>
-
-#include <asm/io.h>
-
-#include "intc.h"
-
-struct intc {
-       void __iomem            *regs;
-       struct irq_chip         chip;
-#ifdef CONFIG_PM
-       unsigned long           suspend_ipr;
-       unsigned long           saved_ipr[64];
-#endif
-};
-
-extern struct platform_device at32_intc0_device;
-
-/*
- * TODO: We may be able to implement mask/unmask by setting IxM flags
- * in the status register.
- */
-static void intc_mask_irq(struct irq_data *d)
-{
-
-}
-
-static void intc_unmask_irq(struct irq_data *d)
-{
-
-}
-
-static struct intc intc0 = {
-       .chip = {
-               .name           = "intc",
-               .irq_mask       = intc_mask_irq,
-               .irq_unmask     = intc_unmask_irq,
-       },
-};
-
-/*
- * All interrupts go via intc at some point.
- */
-asmlinkage void do_IRQ(int level, struct pt_regs *regs)
-{
-       struct pt_regs *old_regs;
-       unsigned int irq;
-       unsigned long status_reg;
-
-       local_irq_disable();
-
-       old_regs = set_irq_regs(regs);
-
-       irq_enter();
-
-       irq = intc_readl(&intc0, INTCAUSE0 - 4 * level);
-       generic_handle_irq(irq);
-
-       /*
-        * Clear all interrupt level masks so that we may handle
-        * interrupts during softirq processing.  If this is a nested
-        * interrupt, interrupts must stay globally disabled until we
-        * return.
-        */
-       status_reg = sysreg_read(SR);
-       status_reg &= ~(SYSREG_BIT(I0M) | SYSREG_BIT(I1M)
-                       | SYSREG_BIT(I2M) | SYSREG_BIT(I3M));
-       sysreg_write(SR, status_reg);
-
-       irq_exit();
-
-       set_irq_regs(old_regs);
-}
-
-void __init init_IRQ(void)
-{
-       extern void _evba(void);
-       extern void irq_level0(void);
-       struct resource *regs;
-       struct clk *pclk;
-       unsigned int i;
-       u32 offset, readback;
-
-       regs = platform_get_resource(&at32_intc0_device, IORESOURCE_MEM, 0);
-       if (!regs) {
-               printk(KERN_EMERG "intc: no mmio resource defined\n");
-               goto fail;
-       }
-       pclk = clk_get(&at32_intc0_device.dev, "pclk");
-       if (IS_ERR(pclk)) {
-               printk(KERN_EMERG "intc: no clock defined\n");
-               goto fail;
-       }
-
-       clk_enable(pclk);
-
-       intc0.regs = ioremap(regs->start, resource_size(regs));
-       if (!intc0.regs) {
-               printk(KERN_EMERG "intc: failed to map registers (0x%08lx)\n",
-                      (unsigned long)regs->start);
-               goto fail;
-       }
-
-       /*
-        * Initialize all interrupts to level 0 (lowest priority). The
-        * priority level may be changed by calling
-        * irq_set_priority().
-        *
-        */
-       offset = (unsigned long)&irq_level0 - (unsigned long)&_evba;
-       for (i = 0; i < NR_INTERNAL_IRQS; i++) {
-               intc_writel(&intc0, INTPR0 + 4 * i, offset);
-               readback = intc_readl(&intc0, INTPR0 + 4 * i);
-               if (readback == offset)
-                       irq_set_chip_and_handler(i, &intc0.chip,
-                                                handle_simple_irq);
-       }
-
-       /* Unmask all interrupt levels */
-       sysreg_write(SR, (sysreg_read(SR)
-                         & ~(SR_I3M | SR_I2M | SR_I1M | SR_I0M)));
-
-       return;
-
-fail:
-       panic("Interrupt controller initialization failed!\n");
-}
-
-#ifdef CONFIG_PM
-void intc_set_suspend_handler(unsigned long offset)
-{
-       intc0.suspend_ipr = offset;
-}
-
-static int intc_suspend(void)
-{
-       int i;
-
-       if (unlikely(!irqs_disabled())) {
-               pr_err("intc_suspend: called with interrupts enabled\n");
-               return -EINVAL;
-       }
-
-       if (unlikely(!intc0.suspend_ipr)) {
-               pr_err("intc_suspend: suspend_ipr not initialized\n");
-               return -EINVAL;
-       }
-
-       for (i = 0; i < 64; i++) {
-               intc0.saved_ipr[i] = intc_readl(&intc0, INTPR0 + 4 * i);
-               intc_writel(&intc0, INTPR0 + 4 * i, intc0.suspend_ipr);
-       }
-
-       return 0;
-}
-
-static void intc_resume(void)
-{
-       int i;
-
-       for (i = 0; i < 64; i++)
-               intc_writel(&intc0, INTPR0 + 4 * i, intc0.saved_ipr[i]);
-}
-#else
-#define intc_suspend   NULL
-#define intc_resume    NULL
-#endif
-
-static struct syscore_ops intc_syscore_ops = {
-       .suspend        = intc_suspend,
-       .resume         = intc_resume,
-};
-
-static int __init intc_init_syscore(void)
-{
-       register_syscore_ops(&intc_syscore_ops);
-
-       return 0;
-}
-device_initcall(intc_init_syscore);
-
-unsigned long intc_get_pending(unsigned int group)
-{
-       return intc_readl(&intc0, INTREQ0 + 4 * group);
-}
-EXPORT_SYMBOL_GPL(intc_get_pending);
diff --git a/arch/avr32/mach-at32ap/intc.h b/arch/avr32/mach-at32ap/intc.h
deleted file mode 100644 (file)
index 4d3664e..0000000
+++ /dev/null
@@ -1,329 +0,0 @@
-/*
- * Automatically generated by gen-header.xsl
- */
-#ifndef __ASM_AVR32_PERIHP_INTC_H__
-#define __ASM_AVR32_PERIHP_INTC_H__
-
-#define INTC_NUM_INT_GRPS            33
-
-#define INTC_INTPR0                  0x0
-# define INTC_INTPR0_INTLEV_OFFSET   30
-# define INTC_INTPR0_INTLEV_SIZE     2
-# define INTC_INTPR0_OFFSET_OFFSET   0
-# define INTC_INTPR0_OFFSET_SIZE     24
-#define INTC_INTREQ0                 0x100
-# define INTC_INTREQ0_IREQUEST0_OFFSET 0
-# define INTC_INTREQ0_IREQUEST0_SIZE 1
-# define INTC_INTREQ0_IREQUEST1_OFFSET 1
-# define INTC_INTREQ0_IREQUEST1_SIZE 1
-#define INTC_INTPR1                  0x4
-# define INTC_INTPR1_INTLEV_OFFSET   30
-# define INTC_INTPR1_INTLEV_SIZE     2
-# define INTC_INTPR1_OFFSET_OFFSET   0
-# define INTC_INTPR1_OFFSET_SIZE     24
-#define INTC_INTREQ1                 0x104
-# define INTC_INTREQ1_IREQUEST32_OFFSET 0
-# define INTC_INTREQ1_IREQUEST32_SIZE 1
-# define INTC_INTREQ1_IREQUEST33_OFFSET 1
-# define INTC_INTREQ1_IREQUEST33_SIZE 1
-# define INTC_INTREQ1_IREQUEST34_OFFSET 2
-# define INTC_INTREQ1_IREQUEST34_SIZE 1
-# define INTC_INTREQ1_IREQUEST35_OFFSET 3
-# define INTC_INTREQ1_IREQUEST35_SIZE 1
-# define INTC_INTREQ1_IREQUEST36_OFFSET 4
-# define INTC_INTREQ1_IREQUEST36_SIZE 1
-# define INTC_INTREQ1_IREQUEST37_OFFSET 5
-# define INTC_INTREQ1_IREQUEST37_SIZE 1
-#define INTC_INTPR2                  0x8
-# define INTC_INTPR2_INTLEV_OFFSET   30
-# define INTC_INTPR2_INTLEV_SIZE     2
-# define INTC_INTPR2_OFFSET_OFFSET   0
-# define INTC_INTPR2_OFFSET_SIZE     24
-#define INTC_INTREQ2                 0x108
-# define INTC_INTREQ2_IREQUEST64_OFFSET 0
-# define INTC_INTREQ2_IREQUEST64_SIZE 1
-# define INTC_INTREQ2_IREQUEST65_OFFSET 1
-# define INTC_INTREQ2_IREQUEST65_SIZE 1
-# define INTC_INTREQ2_IREQUEST66_OFFSET 2
-# define INTC_INTREQ2_IREQUEST66_SIZE 1
-# define INTC_INTREQ2_IREQUEST67_OFFSET 3
-# define INTC_INTREQ2_IREQUEST67_SIZE 1
-# define INTC_INTREQ2_IREQUEST68_OFFSET 4
-# define INTC_INTREQ2_IREQUEST68_SIZE 1
-#define INTC_INTPR3                  0xc
-# define INTC_INTPR3_INTLEV_OFFSET   30
-# define INTC_INTPR3_INTLEV_SIZE     2
-# define INTC_INTPR3_OFFSET_OFFSET   0
-# define INTC_INTPR3_OFFSET_SIZE     24
-#define INTC_INTREQ3                 0x10c
-# define INTC_INTREQ3_IREQUEST96_OFFSET 0
-# define INTC_INTREQ3_IREQUEST96_SIZE 1
-#define INTC_INTPR4                  0x10
-# define INTC_INTPR4_INTLEV_OFFSET   30
-# define INTC_INTPR4_INTLEV_SIZE     2
-# define INTC_INTPR4_OFFSET_OFFSET   0
-# define INTC_INTPR4_OFFSET_SIZE     24
-#define INTC_INTREQ4                 0x110
-# define INTC_INTREQ4_IREQUEST128_OFFSET 0
-# define INTC_INTREQ4_IREQUEST128_SIZE 1
-#define INTC_INTPR5                  0x14
-# define INTC_INTPR5_INTLEV_OFFSET   30
-# define INTC_INTPR5_INTLEV_SIZE     2
-# define INTC_INTPR5_OFFSET_OFFSET   0
-# define INTC_INTPR5_OFFSET_SIZE     24
-#define INTC_INTREQ5                 0x114
-# define INTC_INTREQ5_IREQUEST160_OFFSET 0
-# define INTC_INTREQ5_IREQUEST160_SIZE 1
-#define INTC_INTPR6                  0x18
-# define INTC_INTPR6_INTLEV_OFFSET   30
-# define INTC_INTPR6_INTLEV_SIZE     2
-# define INTC_INTPR6_OFFSET_OFFSET   0
-# define INTC_INTPR6_OFFSET_SIZE     24
-#define INTC_INTREQ6                 0x118
-# define INTC_INTREQ6_IREQUEST192_OFFSET 0
-# define INTC_INTREQ6_IREQUEST192_SIZE 1
-#define INTC_INTPR7                  0x1c
-# define INTC_INTPR7_INTLEV_OFFSET   30
-# define INTC_INTPR7_INTLEV_SIZE     2
-# define INTC_INTPR7_OFFSET_OFFSET   0
-# define INTC_INTPR7_OFFSET_SIZE     24
-#define INTC_INTREQ7                 0x11c
-# define INTC_INTREQ7_IREQUEST224_OFFSET 0
-# define INTC_INTREQ7_IREQUEST224_SIZE 1
-#define INTC_INTPR8                  0x20
-# define INTC_INTPR8_INTLEV_OFFSET   30
-# define INTC_INTPR8_INTLEV_SIZE     2
-# define INTC_INTPR8_OFFSET_OFFSET   0
-# define INTC_INTPR8_OFFSET_SIZE     24
-#define INTC_INTREQ8                 0x120
-# define INTC_INTREQ8_IREQUEST256_OFFSET 0
-# define INTC_INTREQ8_IREQUEST256_SIZE 1
-#define INTC_INTPR9                  0x24
-# define INTC_INTPR9_INTLEV_OFFSET   30
-# define INTC_INTPR9_INTLEV_SIZE     2
-# define INTC_INTPR9_OFFSET_OFFSET   0
-# define INTC_INTPR9_OFFSET_SIZE     24
-#define INTC_INTREQ9                 0x124
-# define INTC_INTREQ9_IREQUEST288_OFFSET 0
-# define INTC_INTREQ9_IREQUEST288_SIZE 1
-#define INTC_INTPR10                 0x28
-# define INTC_INTPR10_INTLEV_OFFSET  30
-# define INTC_INTPR10_INTLEV_SIZE    2
-# define INTC_INTPR10_OFFSET_OFFSET  0
-# define INTC_INTPR10_OFFSET_SIZE    24
-#define INTC_INTREQ10                0x128
-# define INTC_INTREQ10_IREQUEST320_OFFSET 0
-# define INTC_INTREQ10_IREQUEST320_SIZE 1
-#define INTC_INTPR11                 0x2c
-# define INTC_INTPR11_INTLEV_OFFSET  30
-# define INTC_INTPR11_INTLEV_SIZE    2
-# define INTC_INTPR11_OFFSET_OFFSET  0
-# define INTC_INTPR11_OFFSET_SIZE    24
-#define INTC_INTREQ11                0x12c
-# define INTC_INTREQ11_IREQUEST352_OFFSET 0
-# define INTC_INTREQ11_IREQUEST352_SIZE 1
-#define INTC_INTPR12                 0x30
-# define INTC_INTPR12_INTLEV_OFFSET  30
-# define INTC_INTPR12_INTLEV_SIZE    2
-# define INTC_INTPR12_OFFSET_OFFSET  0
-# define INTC_INTPR12_OFFSET_SIZE    24
-#define INTC_INTREQ12                0x130
-# define INTC_INTREQ12_IREQUEST384_OFFSET 0
-# define INTC_INTREQ12_IREQUEST384_SIZE 1
-#define INTC_INTPR13                 0x34
-# define INTC_INTPR13_INTLEV_OFFSET  30
-# define INTC_INTPR13_INTLEV_SIZE    2
-# define INTC_INTPR13_OFFSET_OFFSET  0
-# define INTC_INTPR13_OFFSET_SIZE    24
-#define INTC_INTREQ13                0x134
-# define INTC_INTREQ13_IREQUEST416_OFFSET 0
-# define INTC_INTREQ13_IREQUEST416_SIZE 1
-#define INTC_INTPR14                 0x38
-# define INTC_INTPR14_INTLEV_OFFSET  30
-# define INTC_INTPR14_INTLEV_SIZE    2
-# define INTC_INTPR14_OFFSET_OFFSET  0
-# define INTC_INTPR14_OFFSET_SIZE    24
-#define INTC_INTREQ14                0x138
-# define INTC_INTREQ14_IREQUEST448_OFFSET 0
-# define INTC_INTREQ14_IREQUEST448_SIZE 1
-#define INTC_INTPR15                 0x3c
-# define INTC_INTPR15_INTLEV_OFFSET  30
-# define INTC_INTPR15_INTLEV_SIZE    2
-# define INTC_INTPR15_OFFSET_OFFSET  0
-# define INTC_INTPR15_OFFSET_SIZE    24
-#define INTC_INTREQ15                0x13c
-# define INTC_INTREQ15_IREQUEST480_OFFSET 0
-# define INTC_INTREQ15_IREQUEST480_SIZE 1
-#define INTC_INTPR16                 0x40
-# define INTC_INTPR16_INTLEV_OFFSET  30
-# define INTC_INTPR16_INTLEV_SIZE    2
-# define INTC_INTPR16_OFFSET_OFFSET  0
-# define INTC_INTPR16_OFFSET_SIZE    24
-#define INTC_INTREQ16                0x140
-# define INTC_INTREQ16_IREQUEST512_OFFSET 0
-# define INTC_INTREQ16_IREQUEST512_SIZE 1
-#define INTC_INTPR17                 0x44
-# define INTC_INTPR17_INTLEV_OFFSET  30
-# define INTC_INTPR17_INTLEV_SIZE    2
-# define INTC_INTPR17_OFFSET_OFFSET  0
-# define INTC_INTPR17_OFFSET_SIZE    24
-#define INTC_INTREQ17                0x144
-# define INTC_INTREQ17_IREQUEST544_OFFSET 0
-# define INTC_INTREQ17_IREQUEST544_SIZE 1
-#define INTC_INTPR18                 0x48
-# define INTC_INTPR18_INTLEV_OFFSET  30
-# define INTC_INTPR18_INTLEV_SIZE    2
-# define INTC_INTPR18_OFFSET_OFFSET  0
-# define INTC_INTPR18_OFFSET_SIZE    24
-#define INTC_INTREQ18                0x148
-# define INTC_INTREQ18_IREQUEST576_OFFSET 0
-# define INTC_INTREQ18_IREQUEST576_SIZE 1
-#define INTC_INTPR19                 0x4c
-# define INTC_INTPR19_INTLEV_OFFSET  30
-# define INTC_INTPR19_INTLEV_SIZE    2
-# define INTC_INTPR19_OFFSET_OFFSET  0
-# define INTC_INTPR19_OFFSET_SIZE    24
-#define INTC_INTREQ19                0x14c
-# define INTC_INTREQ19_IREQUEST608_OFFSET 0
-# define INTC_INTREQ19_IREQUEST608_SIZE 1
-# define INTC_INTREQ19_IREQUEST609_OFFSET 1
-# define INTC_INTREQ19_IREQUEST609_SIZE 1
-# define INTC_INTREQ19_IREQUEST610_OFFSET 2
-# define INTC_INTREQ19_IREQUEST610_SIZE 1
-# define INTC_INTREQ19_IREQUEST611_OFFSET 3
-# define INTC_INTREQ19_IREQUEST611_SIZE 1
-#define INTC_INTPR20                 0x50
-# define INTC_INTPR20_INTLEV_OFFSET  30
-# define INTC_INTPR20_INTLEV_SIZE    2
-# define INTC_INTPR20_OFFSET_OFFSET  0
-# define INTC_INTPR20_OFFSET_SIZE    24
-#define INTC_INTREQ20                0x150
-# define INTC_INTREQ20_IREQUEST640_OFFSET 0
-# define INTC_INTREQ20_IREQUEST640_SIZE 1
-#define INTC_INTPR21                 0x54
-# define INTC_INTPR21_INTLEV_OFFSET  30
-# define INTC_INTPR21_INTLEV_SIZE    2
-# define INTC_INTPR21_OFFSET_OFFSET  0
-# define INTC_INTPR21_OFFSET_SIZE    24
-#define INTC_INTREQ21                0x154
-# define INTC_INTREQ21_IREQUEST672_OFFSET 0
-# define INTC_INTREQ21_IREQUEST672_SIZE 1
-#define INTC_INTPR22                 0x58
-# define INTC_INTPR22_INTLEV_OFFSET  30
-# define INTC_INTPR22_INTLEV_SIZE    2
-# define INTC_INTPR22_OFFSET_OFFSET  0
-# define INTC_INTPR22_OFFSET_SIZE    24
-#define INTC_INTREQ22                0x158
-# define INTC_INTREQ22_IREQUEST704_OFFSET 0
-# define INTC_INTREQ22_IREQUEST704_SIZE 1
-# define INTC_INTREQ22_IREQUEST705_OFFSET 1
-# define INTC_INTREQ22_IREQUEST705_SIZE 1
-# define INTC_INTREQ22_IREQUEST706_OFFSET 2
-# define INTC_INTREQ22_IREQUEST706_SIZE 1
-#define INTC_INTPR23                 0x5c
-# define INTC_INTPR23_INTLEV_OFFSET  30
-# define INTC_INTPR23_INTLEV_SIZE    2
-# define INTC_INTPR23_OFFSET_OFFSET  0
-# define INTC_INTPR23_OFFSET_SIZE    24
-#define INTC_INTREQ23                0x15c
-# define INTC_INTREQ23_IREQUEST736_OFFSET 0
-# define INTC_INTREQ23_IREQUEST736_SIZE 1
-# define INTC_INTREQ23_IREQUEST737_OFFSET 1
-# define INTC_INTREQ23_IREQUEST737_SIZE 1
-# define INTC_INTREQ23_IREQUEST738_OFFSET 2
-# define INTC_INTREQ23_IREQUEST738_SIZE 1
-#define INTC_INTPR24                 0x60
-# define INTC_INTPR24_INTLEV_OFFSET  30
-# define INTC_INTPR24_INTLEV_SIZE    2
-# define INTC_INTPR24_OFFSET_OFFSET  0
-# define INTC_INTPR24_OFFSET_SIZE    24
-#define INTC_INTREQ24                0x160
-# define INTC_INTREQ24_IREQUEST768_OFFSET 0
-# define INTC_INTREQ24_IREQUEST768_SIZE 1
-#define INTC_INTPR25                 0x64
-# define INTC_INTPR25_INTLEV_OFFSET  30
-# define INTC_INTPR25_INTLEV_SIZE    2
-# define INTC_INTPR25_OFFSET_OFFSET  0
-# define INTC_INTPR25_OFFSET_SIZE    24
-#define INTC_INTREQ25                0x164
-# define INTC_INTREQ25_IREQUEST800_OFFSET 0
-# define INTC_INTREQ25_IREQUEST800_SIZE 1
-#define INTC_INTPR26                 0x68
-# define INTC_INTPR26_INTLEV_OFFSET  30
-# define INTC_INTPR26_INTLEV_SIZE    2
-# define INTC_INTPR26_OFFSET_OFFSET  0
-# define INTC_INTPR26_OFFSET_SIZE    24
-#define INTC_INTREQ26                0x168
-# define INTC_INTREQ26_IREQUEST832_OFFSET 0
-# define INTC_INTREQ26_IREQUEST832_SIZE 1
-#define INTC_INTPR27                 0x6c
-# define INTC_INTPR27_INTLEV_OFFSET  30
-# define INTC_INTPR27_INTLEV_SIZE    2
-# define INTC_INTPR27_OFFSET_OFFSET  0
-# define INTC_INTPR27_OFFSET_SIZE    24
-#define INTC_INTREQ27                0x16c
-# define INTC_INTREQ27_IREQUEST864_OFFSET 0
-# define INTC_INTREQ27_IREQUEST864_SIZE 1
-#define INTC_INTPR28                 0x70
-# define INTC_INTPR28_INTLEV_OFFSET  30
-# define INTC_INTPR28_INTLEV_SIZE    2
-# define INTC_INTPR28_OFFSET_OFFSET  0
-# define INTC_INTPR28_OFFSET_SIZE    24
-#define INTC_INTREQ28                0x170
-# define INTC_INTREQ28_IREQUEST896_OFFSET 0
-# define INTC_INTREQ28_IREQUEST896_SIZE 1
-#define INTC_INTPR29                 0x74
-# define INTC_INTPR29_INTLEV_OFFSET  30
-# define INTC_INTPR29_INTLEV_SIZE    2
-# define INTC_INTPR29_OFFSET_OFFSET  0
-# define INTC_INTPR29_OFFSET_SIZE    24
-#define INTC_INTREQ29                0x174
-# define INTC_INTREQ29_IREQUEST928_OFFSET 0
-# define INTC_INTREQ29_IREQUEST928_SIZE 1
-#define INTC_INTPR30                 0x78
-# define INTC_INTPR30_INTLEV_OFFSET  30
-# define INTC_INTPR30_INTLEV_SIZE    2
-# define INTC_INTPR30_OFFSET_OFFSET  0
-# define INTC_INTPR30_OFFSET_SIZE    24
-#define INTC_INTREQ30                0x178
-# define INTC_INTREQ30_IREQUEST960_OFFSET 0
-# define INTC_INTREQ30_IREQUEST960_SIZE 1
-#define INTC_INTPR31                 0x7c
-# define INTC_INTPR31_INTLEV_OFFSET  30
-# define INTC_INTPR31_INTLEV_SIZE    2
-# define INTC_INTPR31_OFFSET_OFFSET  0
-# define INTC_INTPR31_OFFSET_SIZE    24
-#define INTC_INTREQ31                0x17c
-# define INTC_INTREQ31_IREQUEST992_OFFSET 0
-# define INTC_INTREQ31_IREQUEST992_SIZE 1
-#define INTC_INTPR32                 0x80
-# define INTC_INTPR32_INTLEV_OFFSET  30
-# define INTC_INTPR32_INTLEV_SIZE    2
-# define INTC_INTPR32_OFFSET_OFFSET  0
-# define INTC_INTPR32_OFFSET_SIZE    24
-#define INTC_INTREQ32                0x180
-# define INTC_INTREQ32_IREQUEST1024_OFFSET 0
-# define INTC_INTREQ32_IREQUEST1024_SIZE 1
-#define INTC_INTCAUSE0               0x20c
-# define INTC_INTCAUSE0_CAUSEGRP_OFFSET 0
-# define INTC_INTCAUSE0_CAUSEGRP_SIZE 6
-#define INTC_INTCAUSE1               0x208
-# define INTC_INTCAUSE1_CAUSEGRP_OFFSET 0
-# define INTC_INTCAUSE1_CAUSEGRP_SIZE 6
-#define INTC_INTCAUSE2               0x204
-# define INTC_INTCAUSE2_CAUSEGRP_OFFSET 0
-# define INTC_INTCAUSE2_CAUSEGRP_SIZE 6
-#define INTC_INTCAUSE3               0x200
-# define INTC_INTCAUSE3_CAUSEGRP_OFFSET 0
-# define INTC_INTCAUSE3_CAUSEGRP_SIZE 6
-
-#define INTC_BIT(name)               (1 << INTC_##name##_OFFSET)
-#define INTC_MKBF(name, value)       (((value) & ((1 << INTC_##name##_SIZE) - 1)) << INTC_##name##_OFFSET)
-#define INTC_GETBF(name, value)      (((value) >> INTC_##name##_OFFSET) & ((1 << INTC_##name##_SIZE) - 1))
-
-#define intc_readl(port,reg)                                   \
-       __raw_readl((port)->regs + INTC_##reg)
-#define intc_writel(port,reg,value)                            \
-       __raw_writel((value), (port)->regs + INTC_##reg)
-
-#endif /* __ASM_AVR32_PERIHP_INTC_H__ */
diff --git a/arch/avr32/mach-at32ap/pdc.c b/arch/avr32/mach-at32ap/pdc.c
deleted file mode 100644 (file)
index 61ab15a..0000000
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (C) 2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/clk.h>
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/platform_device.h>
-
-static int __init pdc_probe(struct platform_device *pdev)
-{
-       struct clk *pclk, *hclk;
-
-       pclk = clk_get(&pdev->dev, "pclk");
-       if (IS_ERR(pclk)) {
-               dev_err(&pdev->dev, "no pclk defined\n");
-               return PTR_ERR(pclk);
-       }
-       hclk = clk_get(&pdev->dev, "hclk");
-       if (IS_ERR(hclk)) {
-               dev_err(&pdev->dev, "no hclk defined\n");
-               clk_put(pclk);
-               return PTR_ERR(hclk);
-       }
-
-       clk_enable(pclk);
-       clk_enable(hclk);
-
-       dev_info(&pdev->dev, "Atmel Peripheral DMA Controller enabled\n");
-       return 0;
-}
-
-static struct platform_driver pdc_driver = {
-       .driver         = {
-               .name   = "pdc",
-       },
-};
-
-static int __init pdc_init(void)
-{
-       return platform_driver_probe(&pdc_driver, pdc_probe);
-}
-arch_initcall(pdc_init);
diff --git a/arch/avr32/mach-at32ap/pio.c b/arch/avr32/mach-at32ap/pio.c
deleted file mode 100644 (file)
index 7fae6ec..0000000
+++ /dev/null
@@ -1,470 +0,0 @@
-/*
- * Atmel PIO2 Port Multiplexer support
- *
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/clk.h>
-#include <linux/debugfs.h>
-#include <linux/export.h>
-#include <linux/fs.h>
-#include <linux/platform_device.h>
-#include <linux/irq.h>
-#include <linux/gpio.h>
-
-#include <asm/io.h>
-
-#include <mach/portmux.h>
-
-#include "pio.h"
-
-#define MAX_NR_PIO_DEVICES             8
-
-struct pio_device {
-       struct gpio_chip chip;
-       void __iomem *regs;
-       const struct platform_device *pdev;
-       struct clk *clk;
-       u32 pinmux_mask;
-       char name[8];
-};
-
-static struct pio_device pio_dev[MAX_NR_PIO_DEVICES];
-
-static struct pio_device *gpio_to_pio(unsigned int gpio)
-{
-       struct pio_device *pio;
-       unsigned int index;
-
-       index = gpio >> 5;
-       if (index >= MAX_NR_PIO_DEVICES)
-               return NULL;
-       pio = &pio_dev[index];
-       if (!pio->regs)
-               return NULL;
-
-       return pio;
-}
-
-/* Pin multiplexing API */
-static DEFINE_SPINLOCK(pio_lock);
-
-void __init at32_select_periph(unsigned int port, u32 pin_mask,
-                              unsigned int periph, unsigned long flags)
-{
-       struct pio_device *pio;
-
-       /* assign and verify pio */
-       pio = gpio_to_pio(port);
-       if (unlikely(!pio)) {
-               printk(KERN_WARNING "pio: invalid port %u\n", port);
-               goto fail;
-       }
-
-       /* Test if any of the requested pins is already muxed */
-       spin_lock(&pio_lock);
-       if (unlikely(pio->pinmux_mask & pin_mask)) {
-               printk(KERN_WARNING "%s: pin(s) busy (requested 0x%x, busy 0x%x)\n",
-                      pio->name, pin_mask, pio->pinmux_mask & pin_mask);
-               spin_unlock(&pio_lock);
-               goto fail;
-       }
-
-       pio->pinmux_mask |= pin_mask;
-
-       /* enable pull ups */
-       pio_writel(pio, PUER, pin_mask);
-
-       /* select either peripheral A or B */
-       if (periph)
-               pio_writel(pio, BSR, pin_mask);
-       else
-               pio_writel(pio, ASR, pin_mask);
-
-       /* enable peripheral control */
-       pio_writel(pio, PDR, pin_mask);
-
-       /* Disable pull ups if not requested. */
-       if (!(flags & AT32_GPIOF_PULLUP))
-               pio_writel(pio, PUDR, pin_mask);
-
-       spin_unlock(&pio_lock);
-
-       return;
-
-fail:
-       dump_stack();
-}
-
-void __init at32_select_gpio(unsigned int pin, unsigned long flags)
-{
-       struct pio_device *pio;
-       unsigned int pin_index = pin & 0x1f;
-       u32 mask = 1 << pin_index;
-
-       pio = gpio_to_pio(pin);
-       if (unlikely(!pio)) {
-               printk("pio: invalid pin %u\n", pin);
-               goto fail;
-       }
-
-       if (unlikely(test_and_set_bit(pin_index, &pio->pinmux_mask))) {
-               printk("%s: pin %u is busy\n", pio->name, pin_index);
-               goto fail;
-       }
-
-       if (flags & AT32_GPIOF_OUTPUT) {
-               if (flags & AT32_GPIOF_HIGH)
-                       pio_writel(pio, SODR, mask);
-               else
-                       pio_writel(pio, CODR, mask);
-               if (flags & AT32_GPIOF_MULTIDRV)
-                       pio_writel(pio, MDER, mask);
-               else
-                       pio_writel(pio, MDDR, mask);
-               pio_writel(pio, PUDR, mask);
-               pio_writel(pio, OER, mask);
-       } else {
-               if (flags & AT32_GPIOF_PULLUP)
-                       pio_writel(pio, PUER, mask);
-               else
-                       pio_writel(pio, PUDR, mask);
-               if (flags & AT32_GPIOF_DEGLITCH)
-                       pio_writel(pio, IFER, mask);
-               else
-                       pio_writel(pio, IFDR, mask);
-               pio_writel(pio, ODR, mask);
-       }
-
-       pio_writel(pio, PER, mask);
-
-       return;
-
-fail:
-       dump_stack();
-}
-
-/*
- * Undo a previous pin reservation. Will not affect the hardware
- * configuration.
- */
-void at32_deselect_pin(unsigned int pin)
-{
-       struct pio_device *pio;
-       unsigned int pin_index = pin & 0x1f;
-
-       pio = gpio_to_pio(pin);
-       if (unlikely(!pio)) {
-               printk("pio: invalid pin %u\n", pin);
-               dump_stack();
-               return;
-       }
-
-       clear_bit(pin_index, &pio->pinmux_mask);
-}
-
-/* Reserve a pin, preventing anyone else from changing its configuration. */
-void __init at32_reserve_pin(unsigned int port, u32 pin_mask)
-{
-       struct pio_device *pio;
-
-       /* assign and verify pio */
-       pio = gpio_to_pio(port);
-       if (unlikely(!pio)) {
-               printk(KERN_WARNING "pio: invalid port %u\n", port);
-               goto fail;
-       }
-
-       /* Test if any of the requested pins is already muxed */
-       spin_lock(&pio_lock);
-       if (unlikely(pio->pinmux_mask & pin_mask)) {
-               printk(KERN_WARNING "%s: pin(s) busy (req. 0x%x, busy 0x%x)\n",
-                      pio->name, pin_mask, pio->pinmux_mask & pin_mask);
-               spin_unlock(&pio_lock);
-               goto fail;
-       }
-
-       /* Reserve pins */
-       pio->pinmux_mask |= pin_mask;
-       spin_unlock(&pio_lock);
-       return;
-
-fail:
-       dump_stack();
-}
-
-/*--------------------------------------------------------------------------*/
-
-/* GPIO API */
-
-static int direction_input(struct gpio_chip *chip, unsigned offset)
-{
-       struct pio_device *pio = gpiochip_get_data(chip);
-       u32 mask = 1 << offset;
-
-       if (!(pio_readl(pio, PSR) & mask))
-               return -EINVAL;
-
-       pio_writel(pio, ODR, mask);
-       return 0;
-}
-
-static int gpio_get(struct gpio_chip *chip, unsigned offset)
-{
-       struct pio_device *pio = gpiochip_get_data(chip);
-
-       return (pio_readl(pio, PDSR) >> offset) & 1;
-}
-
-static void gpio_set(struct gpio_chip *chip, unsigned offset, int value);
-
-static int direction_output(struct gpio_chip *chip, unsigned offset, int value)
-{
-       struct pio_device *pio = gpiochip_get_data(chip);
-       u32 mask = 1 << offset;
-
-       if (!(pio_readl(pio, PSR) & mask))
-               return -EINVAL;
-
-       gpio_set(chip, offset, value);
-       pio_writel(pio, OER, mask);
-       return 0;
-}
-
-static void gpio_set(struct gpio_chip *chip, unsigned offset, int value)
-{
-       struct pio_device *pio = gpiochip_get_data(chip);
-       u32 mask = 1 << offset;
-
-       if (value)
-               pio_writel(pio, SODR, mask);
-       else
-               pio_writel(pio, CODR, mask);
-}
-
-/*--------------------------------------------------------------------------*/
-
-/* GPIO IRQ support */
-
-static void gpio_irq_mask(struct irq_data *d)
-{
-       unsigned                gpio = irq_to_gpio(d->irq);
-       struct pio_device       *pio = &pio_dev[gpio >> 5];
-
-       pio_writel(pio, IDR, 1 << (gpio & 0x1f));
-}
-
-static void gpio_irq_unmask(struct irq_data *d)
-{
-       unsigned                gpio = irq_to_gpio(d->irq);
-       struct pio_device       *pio = &pio_dev[gpio >> 5];
-
-       pio_writel(pio, IER, 1 << (gpio & 0x1f));
-}
-
-static int gpio_irq_type(struct irq_data *d, unsigned type)
-{
-       if (type != IRQ_TYPE_EDGE_BOTH && type != IRQ_TYPE_NONE)
-               return -EINVAL;
-
-       return 0;
-}
-
-static struct irq_chip gpio_irqchip = {
-       .name           = "gpio",
-       .irq_mask       = gpio_irq_mask,
-       .irq_unmask     = gpio_irq_unmask,
-       .irq_set_type   = gpio_irq_type,
-};
-
-static void gpio_irq_handler(struct irq_desc *desc)
-{
-       struct pio_device       *pio = irq_desc_get_chip_data(desc);
-       unsigned                gpio_irq;
-
-       gpio_irq = (unsigned) irq_desc_get_handler_data(desc);
-       for (;;) {
-               u32             isr;
-
-               /* ack pending GPIO interrupts */
-               isr = pio_readl(pio, ISR) & pio_readl(pio, IMR);
-               if (!isr)
-                       break;
-               do {
-                       int i;
-
-                       i = ffs(isr) - 1;
-                       isr &= ~(1 << i);
-
-                       i += gpio_irq;
-                       generic_handle_irq(i);
-               } while (isr);
-       }
-}
-
-static void __init
-gpio_irq_setup(struct pio_device *pio, int irq, int gpio_irq)
-{
-       unsigned        i;
-
-       irq_set_chip_data(irq, pio);
-
-       for (i = 0; i < 32; i++, gpio_irq++) {
-               irq_set_chip_data(gpio_irq, pio);
-               irq_set_chip_and_handler(gpio_irq, &gpio_irqchip,
-                                        handle_simple_irq);
-       }
-
-       irq_set_chained_handler_and_data(irq, gpio_irq_handler,
-                                        (void *)gpio_irq);
-}
-
-/*--------------------------------------------------------------------------*/
-
-#ifdef CONFIG_DEBUG_FS
-
-#include <linux/seq_file.h>
-
-/*
- * This shows more info than the generic gpio dump code:
- * pullups, deglitching, open drain drive.
- */
-static void pio_bank_show(struct seq_file *s, struct gpio_chip *chip)
-{
-       struct pio_device *pio = gpiochip_get_data(chip);
-       u32                     psr, osr, imr, pdsr, pusr, ifsr, mdsr;
-       unsigned                i;
-       u32                     mask;
-       char                    bank;
-
-       psr = pio_readl(pio, PSR);
-       osr = pio_readl(pio, OSR);
-       imr = pio_readl(pio, IMR);
-       pdsr = pio_readl(pio, PDSR);
-       pusr = pio_readl(pio, PUSR);
-       ifsr = pio_readl(pio, IFSR);
-       mdsr = pio_readl(pio, MDSR);
-
-       bank = 'A' + pio->pdev->id;
-
-       for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
-               const char *label;
-
-               label = gpiochip_is_requested(chip, i);
-               if (!label && (imr & mask))
-                       label = "[irq]";
-               if (!label)
-                       continue;
-
-               seq_printf(s, " gpio-%-3d P%c%-2d (%-12s) %s %s %s",
-                       chip->base + i, bank, i,
-                       label,
-                       (osr & mask) ? "out" : "in ",
-                       (mask & pdsr) ? "hi" : "lo",
-                       (mask & pusr) ? "  " : "up");
-               if (ifsr & mask)
-                       seq_puts(s, " deglitch");
-               if ((osr & mdsr) & mask)
-                       seq_puts(s, " open-drain");
-               if (imr & mask)
-                       seq_printf(s, " irq-%d edge-both",
-                               gpio_to_irq(chip->base + i));
-               seq_putc(s, '\n');
-       }
-}
-
-#else
-#define pio_bank_show  NULL
-#endif
-
-
-/*--------------------------------------------------------------------------*/
-
-static int __init pio_probe(struct platform_device *pdev)
-{
-       struct pio_device *pio = NULL;
-       int irq = platform_get_irq(pdev, 0);
-       int gpio_irq_base = GPIO_IRQ_BASE + pdev->id * 32;
-
-       BUG_ON(pdev->id >= MAX_NR_PIO_DEVICES);
-       pio = &pio_dev[pdev->id];
-       BUG_ON(!pio->regs);
-
-       pio->chip.label = pio->name;
-       pio->chip.base = pdev->id * 32;
-       pio->chip.ngpio = 32;
-       pio->chip.parent = &pdev->dev;
-       pio->chip.owner = THIS_MODULE;
-
-       pio->chip.direction_input = direction_input;
-       pio->chip.get = gpio_get;
-       pio->chip.direction_output = direction_output;
-       pio->chip.set = gpio_set;
-       pio->chip.dbg_show = pio_bank_show;
-
-       gpiochip_add_data(&pio->chip, pio);
-
-       gpio_irq_setup(pio, irq, gpio_irq_base);
-
-       platform_set_drvdata(pdev, pio);
-
-       printk(KERN_DEBUG "%s: base 0x%p, irq %d chains %d..%d\n",
-              pio->name, pio->regs, irq, gpio_irq_base, gpio_irq_base + 31);
-
-       return 0;
-}
-
-static struct platform_driver pio_driver = {
-       .driver         = {
-               .name           = "pio",
-       },
-};
-
-static int __init pio_init(void)
-{
-       return platform_driver_probe(&pio_driver, pio_probe);
-}
-postcore_initcall(pio_init);
-
-void __init at32_init_pio(struct platform_device *pdev)
-{
-       struct resource *regs;
-       struct pio_device *pio;
-
-       if (pdev->id >= MAX_NR_PIO_DEVICES) {
-               dev_err(&pdev->dev, "only %d PIO devices supported\n",
-                       MAX_NR_PIO_DEVICES);
-               return;
-       }
-
-       pio = &pio_dev[pdev->id];
-       snprintf(pio->name, sizeof(pio->name), "pio%d", pdev->id);
-
-       regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       if (!regs) {
-               dev_err(&pdev->dev, "no mmio resource defined\n");
-               return;
-       }
-
-       pio->clk = clk_get(&pdev->dev, "mck");
-       if (IS_ERR(pio->clk))
-               /*
-                * This is a fatal error, but if we continue we might
-                * be so lucky that we manage to initialize the
-                * console and display this message...
-                */
-               dev_err(&pdev->dev, "no mck clock defined\n");
-       else
-               clk_enable(pio->clk);
-
-       pio->pdev = pdev;
-       pio->regs = ioremap(regs->start, resource_size(regs));
-
-       /* start with irqs disabled and acked */
-       pio_writel(pio, IDR, ~0UL);
-       (void) pio_readl(pio, ISR);
-}
diff --git a/arch/avr32/mach-at32ap/pio.h b/arch/avr32/mach-at32ap/pio.h
deleted file mode 100644 (file)
index 9484dfc..0000000
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Atmel PIO2 Port Multiplexer support
- *
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __ARCH_AVR32_AT32AP_PIO_H__
-#define __ARCH_AVR32_AT32AP_PIO_H__
-
-/* PIO register offsets */
-#define PIO_PER                                0x0000
-#define PIO_PDR                                0x0004
-#define PIO_PSR                                0x0008
-#define PIO_OER                                0x0010
-#define PIO_ODR                                0x0014
-#define PIO_OSR                                0x0018
-#define PIO_IFER                               0x0020
-#define PIO_IFDR                               0x0024
-#define PIO_IFSR                               0x0028
-#define PIO_SODR                               0x0030
-#define PIO_CODR                               0x0034
-#define PIO_ODSR                               0x0038
-#define PIO_PDSR                               0x003c
-#define PIO_IER                                0x0040
-#define PIO_IDR                                0x0044
-#define PIO_IMR                                0x0048
-#define PIO_ISR                                0x004c
-#define PIO_MDER                               0x0050
-#define PIO_MDDR                               0x0054
-#define PIO_MDSR                               0x0058
-#define PIO_PUDR                               0x0060
-#define PIO_PUER                               0x0064
-#define PIO_PUSR                               0x0068
-#define PIO_ASR                                0x0070
-#define PIO_BSR                                0x0074
-#define PIO_ABSR                               0x0078
-#define PIO_OWER                               0x00a0
-#define PIO_OWDR                               0x00a4
-#define PIO_OWSR                               0x00a8
-
-/* Bitfields in PER */
-
-/* Bitfields in PDR */
-
-/* Bitfields in PSR */
-
-/* Bitfields in OER */
-
-/* Bitfields in ODR */
-
-/* Bitfields in OSR */
-
-/* Bitfields in IFER */
-
-/* Bitfields in IFDR */
-
-/* Bitfields in IFSR */
-
-/* Bitfields in SODR */
-
-/* Bitfields in CODR */
-
-/* Bitfields in ODSR */
-
-/* Bitfields in PDSR */
-
-/* Bitfields in IER */
-
-/* Bitfields in IDR */
-
-/* Bitfields in IMR */
-
-/* Bitfields in ISR */
-
-/* Bitfields in MDER */
-
-/* Bitfields in MDDR */
-
-/* Bitfields in MDSR */
-
-/* Bitfields in PUDR */
-
-/* Bitfields in PUER */
-
-/* Bitfields in PUSR */
-
-/* Bitfields in ASR */
-
-/* Bitfields in BSR */
-
-/* Bitfields in ABSR */
-#define PIO_P0_OFFSET                          0
-#define PIO_P0_SIZE                            1
-#define PIO_P1_OFFSET                          1
-#define PIO_P1_SIZE                            1
-#define PIO_P2_OFFSET                          2
-#define PIO_P2_SIZE                            1
-#define PIO_P3_OFFSET                          3
-#define PIO_P3_SIZE                            1
-#define PIO_P4_OFFSET                          4
-#define PIO_P4_SIZE                            1
-#define PIO_P5_OFFSET                          5
-#define PIO_P5_SIZE                            1
-#define PIO_P6_OFFSET                          6
-#define PIO_P6_SIZE                            1
-#define PIO_P7_OFFSET                          7
-#define PIO_P7_SIZE                            1
-#define PIO_P8_OFFSET                          8
-#define PIO_P8_SIZE                            1
-#define PIO_P9_OFFSET                          9
-#define PIO_P9_SIZE                            1
-#define PIO_P10_OFFSET                         10
-#define PIO_P10_SIZE                           1
-#define PIO_P11_OFFSET                         11
-#define PIO_P11_SIZE                           1
-#define PIO_P12_OFFSET                         12
-#define PIO_P12_SIZE                           1
-#define PIO_P13_OFFSET                         13
-#define PIO_P13_SIZE                           1
-#define PIO_P14_OFFSET                         14
-#define PIO_P14_SIZE                           1
-#define PIO_P15_OFFSET                         15
-#define PIO_P15_SIZE                           1
-#define PIO_P16_OFFSET                         16
-#define PIO_P16_SIZE                           1
-#define PIO_P17_OFFSET                         17
-#define PIO_P17_SIZE                           1
-#define PIO_P18_OFFSET                         18
-#define PIO_P18_SIZE                           1
-#define PIO_P19_OFFSET                         19
-#define PIO_P19_SIZE                           1
-#define PIO_P20_OFFSET                         20
-#define PIO_P20_SIZE                           1
-#define PIO_P21_OFFSET                         21
-#define PIO_P21_SIZE                           1
-#define PIO_P22_OFFSET                         22
-#define PIO_P22_SIZE                           1
-#define PIO_P23_OFFSET                         23
-#define PIO_P23_SIZE                           1
-#define PIO_P24_OFFSET                         24
-#define PIO_P24_SIZE                           1
-#define PIO_P25_OFFSET                         25
-#define PIO_P25_SIZE                           1
-#define PIO_P26_OFFSET                         26
-#define PIO_P26_SIZE                           1
-#define PIO_P27_OFFSET                         27
-#define PIO_P27_SIZE                           1
-#define PIO_P28_OFFSET                         28
-#define PIO_P28_SIZE                           1
-#define PIO_P29_OFFSET                         29
-#define PIO_P29_SIZE                           1
-#define PIO_P30_OFFSET                         30
-#define PIO_P30_SIZE                           1
-#define PIO_P31_OFFSET                         31
-#define PIO_P31_SIZE                           1
-
-/* Bitfields in OWER */
-
-/* Bitfields in OWDR */
-
-/* Bitfields in OWSR */
-
-/* Bit manipulation macros */
-#define PIO_BIT(name)                          (1 << PIO_##name##_OFFSET)
-#define PIO_BF(name,value)                     (((value) & ((1 << PIO_##name##_SIZE) - 1)) << PIO_##name##_OFFSET)
-#define PIO_BFEXT(name,value)                  (((value) >> PIO_##name##_OFFSET) & ((1 << PIO_##name##_SIZE) - 1))
-#define PIO_BFINS(name,value,old)              (((old) & ~(((1 << PIO_##name##_SIZE) - 1) << PIO_##name##_OFFSET)) | PIO_BF(name,value))
-
-/* Register access macros */
-#define pio_readl(port,reg)                                    \
-       __raw_readl((port)->regs + PIO_##reg)
-#define pio_writel(port,reg,value)                             \
-       __raw_writel((value), (port)->regs + PIO_##reg)
-
-void at32_init_pio(struct platform_device *pdev);
-
-#endif /* __ARCH_AVR32_AT32AP_PIO_H__ */
diff --git a/arch/avr32/mach-at32ap/pm-at32ap700x.S b/arch/avr32/mach-at32ap/pm-at32ap700x.S
deleted file mode 100644 (file)
index 1c8e4e6..0000000
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Low-level Power Management code.
- *
- * Copyright (C) 2008 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <asm/asm.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <mach/pm.h>
-
-#include "pm.h"
-#include "sdramc.h"
-
-/* Same as 0xfff00000 but fits in a 21 bit signed immediate */
-#define PM_BASE        -0x100000
-
-       /* Keep this close to the irq handlers */
-       .section .irq.text, "ax", @progbits
-
-       /*
-        * void cpu_enter_idle(void)
-        *
-        * Put the CPU into "idle" mode, in which it will consume
-        * significantly less power.
-        *
-        * If an interrupt comes along in the window between
-        * unmask_interrupts and the sleep instruction below, the
-        * interrupt code will adjust the return address so that we
-        * never execute the sleep instruction. This is required
-        * because the AP7000 doesn't unmask interrupts when entering
-        * sleep modes; later CPUs may not need this workaround.
-        */
-       .global cpu_enter_idle
-       .type   cpu_enter_idle, @function
-cpu_enter_idle:
-       mask_interrupts
-       get_thread_info r8
-       ld.w    r9, r8[TI_flags]
-       bld     r9, TIF_NEED_RESCHED
-       brcs    .Lret_from_sleep
-       sbr     r9, TIF_CPU_GOING_TO_SLEEP
-       st.w    r8[TI_flags], r9
-       unmask_interrupts
-       sleep   CPU_SLEEP_IDLE
-       .size   cpu_enter_idle, . - cpu_enter_idle
-
-       /*
-        * Common return path for PM functions that don't run from
-        * SRAM.
-        */
-       .global cpu_idle_skip_sleep
-       .type   cpu_idle_skip_sleep, @function
-cpu_idle_skip_sleep:
-       mask_interrupts
-       ld.w    r9, r8[TI_flags]
-       cbr     r9, TIF_CPU_GOING_TO_SLEEP
-       st.w    r8[TI_flags], r9
-.Lret_from_sleep:
-       unmask_interrupts
-       retal   r12
-       .size   cpu_idle_skip_sleep, . - cpu_idle_skip_sleep
-
-#ifdef CONFIG_PM
-       .section .init.text, "ax", @progbits
-
-       .global pm_exception
-       .type   pm_exception, @function
-pm_exception:
-       /*
-        * Exceptions are masked when we switch to this handler, so
-        * we'll only get "unrecoverable" exceptions (offset 0.)
-        */
-       sub     r12, pc, . - .Lpanic_msg
-       lddpc   pc, .Lpanic_addr
-
-       .align  2
-.Lpanic_addr:
-       .long   panic
-.Lpanic_msg:
-       .asciz  "Unrecoverable exception during suspend\n"
-       .size   pm_exception, . - pm_exception
-
-       .global pm_irq0
-       .type   pm_irq0, @function
-pm_irq0:
-       /* Disable interrupts and return after the sleep instruction */
-       mfsr    r9, SYSREG_RSR_INT0
-       mtsr    SYSREG_RAR_INT0, r8
-       sbr     r9, SYSREG_GM_OFFSET
-       mtsr    SYSREG_RSR_INT0, r9
-       rete
-
-       /*
-        * void cpu_enter_standby(unsigned long sdramc_base)
-        *
-        * Enter PM_SUSPEND_STANDBY mode. At this point, all drivers
-        * are suspended and interrupts are disabled. Interrupts
-        * marked as 'wakeup' event sources may still come along and
-        * get us out of here.
-        *
-        * The SDRAM will be put into self-refresh mode (which does
-        * not require a clock from the CPU), and the CPU will be put
-        * into "frozen" mode (HSB bus stopped). The SDRAM controller
-        * will automatically bring the SDRAM into normal mode on the
-        * first access, and the power manager will automatically
-        * start the HSB and CPU clocks upon a wakeup event.
-        *
-        * This code uses the same "skip sleep" technique as above.
-        * It is very important that we jump directly to
-        * cpu_after_sleep after the sleep instruction since that's
-        * where we'll end up if the interrupt handler decides that we
-        * need to skip the sleep instruction.
-        */
-       .global pm_standby
-       .type   pm_standby, @function
-pm_standby:
-       /*
-        * interrupts are already masked at this point, and EVBA
-        * points to pm_exception above.
-        */
-       ld.w    r10, r12[SDRAMC_LPR]
-       sub     r8, pc, . - 1f          /* return address for irq handler */
-       mov     r11, SDRAMC_LPR_LPCB_SELF_RFR
-       bfins   r10, r11, 0, 2          /* LPCB <- self Refresh */
-       sync    0                       /* flush write buffer */
-       st.w    r12[SDRAMC_LPR], r10    /* put SDRAM in self-refresh mode */
-       ld.w    r11, r12[SDRAMC_LPR]
-       unmask_interrupts
-       sleep   CPU_SLEEP_FROZEN
-1:     mask_interrupts
-       retal   r12
-       .size   pm_standby, . - pm_standby
-
-       .global pm_suspend_to_ram
-       .type   pm_suspend_to_ram, @function
-pm_suspend_to_ram:
-       /*
-        * interrupts are already masked at this point, and EVBA
-        * points to pm_exception above.
-        */
-       mov     r11, 0
-       cache   r11[2], 8               /* clean all dcache lines */
-       sync    0                       /* flush write buffer */
-       ld.w    r10, r12[SDRAMC_LPR]
-       sub     r8, pc, . - 1f          /* return address for irq handler */
-       mov     r11, SDRAMC_LPR_LPCB_SELF_RFR
-       bfins   r10, r11, 0, 2          /* LPCB <- self refresh */
-       st.w    r12[SDRAMC_LPR], r10    /* put SDRAM in self-refresh mode */
-       ld.w    r11, r12[SDRAMC_LPR]
-
-       unmask_interrupts
-       sleep   CPU_SLEEP_STOP
-1:     mask_interrupts
-
-       retal   r12
-       .size   pm_suspend_to_ram, . - pm_suspend_to_ram
-
-       .global pm_sram_end
-       .type   pm_sram_end, @function
-pm_sram_end:
-       .size   pm_sram_end, 0
-
-#endif /* CONFIG_PM */
diff --git a/arch/avr32/mach-at32ap/pm.c b/arch/avr32/mach-at32ap/pm.c
deleted file mode 100644 (file)
index db19084..0000000
+++ /dev/null
@@ -1,243 +0,0 @@
-/*
- * AVR32 AP Power Management
- *
- * Copyright (C) 2008 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- */
-#include <linux/io.h>
-#include <linux/suspend.h>
-#include <linux/vmalloc.h>
-
-#include <asm/cacheflush.h>
-#include <asm/sysreg.h>
-
-#include <mach/chip.h>
-#include <mach/pm.h>
-#include <mach/sram.h>
-
-#include "sdramc.h"
-
-#define SRAM_PAGE_FLAGS        (SYSREG_BIT(TLBELO_D) | SYSREG_BF(SZ, 1)        \
-                               | SYSREG_BF(AP, 3) | SYSREG_BIT(G))
-
-
-static unsigned long   pm_sram_start;
-static size_t          pm_sram_size;
-static struct vm_struct        *pm_sram_area;
-
-static void (*avr32_pm_enter_standby)(unsigned long sdramc_base);
-static void (*avr32_pm_enter_str)(unsigned long sdramc_base);
-
-/*
- * Must be called with interrupts disabled. Exceptions will be masked
- * on return (i.e. all exceptions will be "unrecoverable".)
- */
-static void *avr32_pm_map_sram(void)
-{
-       unsigned long   vaddr;
-       unsigned long   page_addr;
-       u32             tlbehi;
-       u32             mmucr;
-
-       vaddr = (unsigned long)pm_sram_area->addr;
-       page_addr = pm_sram_start & PAGE_MASK;
-
-       /*
-        * Mask exceptions and grab the first TLB entry. We won't be
-        * needing it while sleeping.
-        */
-       asm volatile("ssrf      %0" : : "i"(SYSREG_EM_OFFSET) : "memory");
-
-       mmucr = sysreg_read(MMUCR);
-       tlbehi = sysreg_read(TLBEHI);
-       sysreg_write(MMUCR, SYSREG_BFINS(DRP, 0, mmucr));
-
-       tlbehi = SYSREG_BF(ASID, SYSREG_BFEXT(ASID, tlbehi));
-       tlbehi |= vaddr & PAGE_MASK;
-       tlbehi |= SYSREG_BIT(TLBEHI_V);
-
-       sysreg_write(TLBELO, page_addr | SRAM_PAGE_FLAGS);
-       sysreg_write(TLBEHI, tlbehi);
-       __builtin_tlbw();
-
-       return (void *)(vaddr + pm_sram_start - page_addr);
-}
-
-/*
- * Must be called with interrupts disabled. Exceptions will be
- * unmasked on return.
- */
-static void avr32_pm_unmap_sram(void)
-{
-       u32     mmucr;
-       u32     tlbehi;
-       u32     tlbarlo;
-
-       /* Going to update TLB entry at index 0 */
-       mmucr = sysreg_read(MMUCR);
-       tlbehi = sysreg_read(TLBEHI);
-       sysreg_write(MMUCR, SYSREG_BFINS(DRP, 0, mmucr));
-
-       /* Clear the "valid" bit */
-       tlbehi = SYSREG_BF(ASID, SYSREG_BFEXT(ASID, tlbehi));
-       sysreg_write(TLBEHI, tlbehi);
-
-       /* Mark it as "not accessed" */
-       tlbarlo = sysreg_read(TLBARLO);
-       sysreg_write(TLBARLO, tlbarlo | 0x80000000U);
-
-       /* Update the TLB */
-       __builtin_tlbw();
-
-       /* Unmask exceptions */
-       asm volatile("csrf      %0" : : "i"(SYSREG_EM_OFFSET) : "memory");
-}
-
-static int avr32_pm_valid_state(suspend_state_t state)
-{
-       switch (state) {
-       case PM_SUSPEND_ON:
-       case PM_SUSPEND_STANDBY:
-       case PM_SUSPEND_MEM:
-               return 1;
-
-       default:
-               return 0;
-       }
-}
-
-static int avr32_pm_enter(suspend_state_t state)
-{
-       u32             lpr_saved;
-       u32             evba_saved;
-       void            *sram;
-
-       switch (state) {
-       case PM_SUSPEND_STANDBY:
-               sram = avr32_pm_map_sram();
-
-               /* Switch to in-sram exception handlers */
-               evba_saved = sysreg_read(EVBA);
-               sysreg_write(EVBA, (unsigned long)sram);
-
-               /*
-                * Save the LPR register so that we can re-enable
-                * SDRAM Low Power mode on resume.
-                */
-               lpr_saved = sdramc_readl(LPR);
-               pr_debug("%s: Entering standby...\n", __func__);
-               avr32_pm_enter_standby(SDRAMC_BASE);
-               sdramc_writel(LPR, lpr_saved);
-
-               /* Switch back to regular exception handlers */
-               sysreg_write(EVBA, evba_saved);
-
-               avr32_pm_unmap_sram();
-               break;
-
-       case PM_SUSPEND_MEM:
-               sram = avr32_pm_map_sram();
-
-               /* Switch to in-sram exception handlers */
-               evba_saved = sysreg_read(EVBA);
-               sysreg_write(EVBA, (unsigned long)sram);
-
-               /*
-                * Save the LPR register so that we can re-enable
-                * SDRAM Low Power mode on resume.
-                */
-               lpr_saved = sdramc_readl(LPR);
-               pr_debug("%s: Entering suspend-to-ram...\n", __func__);
-               avr32_pm_enter_str(SDRAMC_BASE);
-               sdramc_writel(LPR, lpr_saved);
-
-               /* Switch back to regular exception handlers */
-               sysreg_write(EVBA, evba_saved);
-
-               avr32_pm_unmap_sram();
-               break;
-
-       case PM_SUSPEND_ON:
-               pr_debug("%s: Entering idle...\n", __func__);
-               cpu_enter_idle();
-               break;
-
-       default:
-               pr_debug("%s: Invalid suspend state %d\n", __func__, state);
-               goto out;
-       }
-
-       pr_debug("%s: wakeup\n", __func__);
-
-out:
-       return 0;
-}
-
-static const struct platform_suspend_ops avr32_pm_ops = {
-       .valid  = avr32_pm_valid_state,
-       .enter  = avr32_pm_enter,
-};
-
-static unsigned long __init avr32_pm_offset(void *symbol)
-{
-       extern u8 pm_exception[];
-
-       return (unsigned long)symbol - (unsigned long)pm_exception;
-}
-
-static int __init avr32_pm_init(void)
-{
-       extern u8 pm_exception[];
-       extern u8 pm_irq0[];
-       extern u8 pm_standby[];
-       extern u8 pm_suspend_to_ram[];
-       extern u8 pm_sram_end[];
-       void *dst;
-
-       /*
-        * To keep things simple, we depend on not needing more than a
-        * single page.
-        */
-       pm_sram_size = avr32_pm_offset(pm_sram_end);
-       if (pm_sram_size > PAGE_SIZE)
-               goto err;
-
-       pm_sram_start = sram_alloc(pm_sram_size);
-       if (!pm_sram_start)
-               goto err_alloc_sram;
-
-       /* Grab a virtual area we can use later on. */
-       pm_sram_area = get_vm_area(pm_sram_size, VM_IOREMAP);
-       if (!pm_sram_area)
-               goto err_vm_area;
-       pm_sram_area->phys_addr = pm_sram_start;
-
-       local_irq_disable();
-       dst = avr32_pm_map_sram();
-       memcpy(dst, pm_exception, pm_sram_size);
-       flush_dcache_region(dst, pm_sram_size);
-       invalidate_icache_region(dst, pm_sram_size);
-       avr32_pm_unmap_sram();
-       local_irq_enable();
-
-       avr32_pm_enter_standby = dst + avr32_pm_offset(pm_standby);
-       avr32_pm_enter_str = dst + avr32_pm_offset(pm_suspend_to_ram);
-       intc_set_suspend_handler(avr32_pm_offset(pm_irq0));
-
-       suspend_set_ops(&avr32_pm_ops);
-
-       printk("AVR32 AP Power Management enabled\n");
-
-       return 0;
-
-err_vm_area:
-       sram_free(pm_sram_start, pm_sram_size);
-err_alloc_sram:
-err:
-       pr_err("AVR32 Power Management initialization failed\n");
-       return -ENOMEM;
-}
-arch_initcall(avr32_pm_init);
diff --git a/arch/avr32/mach-at32ap/pm.h b/arch/avr32/mach-at32ap/pm.h
deleted file mode 100644 (file)
index 532a373..0000000
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Register definitions for the Power Manager (PM)
- */
-#ifndef __ARCH_AVR32_MACH_AT32AP_PM_H__
-#define __ARCH_AVR32_MACH_AT32AP_PM_H__
-
-/* PM register offsets */
-#define PM_MCCTRL                              0x0000
-#define PM_CKSEL                               0x0004
-#define PM_CPU_MASK                            0x0008
-#define PM_HSB_MASK                            0x000c
-#define PM_PBA_MASK                            0x0010
-#define PM_PBB_MASK                            0x0014
-#define PM_PLL0                                        0x0020
-#define PM_PLL1                                        0x0024
-#define PM_IER                                 0x0040
-#define PM_IDR                                 0x0044
-#define PM_IMR                                 0x0048
-#define PM_ISR                                 0x004c
-#define PM_ICR                                 0x0050
-#define PM_GCCTRL(x)                           (0x0060 + 4 * (x))
-#define PM_RCAUSE                              0x00c0
-
-/* Bitfields in CKSEL */
-#define PM_CPUSEL_OFFSET                       0
-#define PM_CPUSEL_SIZE                         3
-#define PM_CPUDIV_OFFSET                       7
-#define PM_CPUDIV_SIZE                         1
-#define PM_HSBSEL_OFFSET                       8
-#define PM_HSBSEL_SIZE                         3
-#define PM_HSBDIV_OFFSET                       15
-#define PM_HSBDIV_SIZE                         1
-#define PM_PBASEL_OFFSET                       16
-#define PM_PBASEL_SIZE                         3
-#define PM_PBADIV_OFFSET                       23
-#define PM_PBADIV_SIZE                         1
-#define PM_PBBSEL_OFFSET                       24
-#define PM_PBBSEL_SIZE                         3
-#define PM_PBBDIV_OFFSET                       31
-#define PM_PBBDIV_SIZE                         1
-
-/* Bitfields in PLL0 */
-#define PM_PLLEN_OFFSET                                0
-#define PM_PLLEN_SIZE                          1
-#define PM_PLLOSC_OFFSET                       1
-#define PM_PLLOSC_SIZE                         1
-#define PM_PLLOPT_OFFSET                       2
-#define PM_PLLOPT_SIZE                         3
-#define PM_PLLDIV_OFFSET                       8
-#define PM_PLLDIV_SIZE                         8
-#define PM_PLLMUL_OFFSET                       16
-#define PM_PLLMUL_SIZE                         8
-#define PM_PLLCOUNT_OFFSET                     24
-#define PM_PLLCOUNT_SIZE                       6
-#define PM_PLLTEST_OFFSET                      31
-#define PM_PLLTEST_SIZE                                1
-
-/* Bitfields in ICR */
-#define PM_LOCK0_OFFSET                                0
-#define PM_LOCK0_SIZE                          1
-#define PM_LOCK1_OFFSET                                1
-#define PM_LOCK1_SIZE                          1
-#define PM_WAKE_OFFSET                         2
-#define PM_WAKE_SIZE                           1
-#define PM_CKRDY_OFFSET                                5
-#define PM_CKRDY_SIZE                          1
-#define PM_MSKRDY_OFFSET                       6
-#define PM_MSKRDY_SIZE                         1
-
-/* Bitfields in GCCTRL0 */
-#define PM_OSCSEL_OFFSET                       0
-#define PM_OSCSEL_SIZE                         1
-#define PM_PLLSEL_OFFSET                       1
-#define PM_PLLSEL_SIZE                         1
-#define PM_CEN_OFFSET                          2
-#define PM_CEN_SIZE                            1
-#define PM_DIVEN_OFFSET                                4
-#define PM_DIVEN_SIZE                          1
-#define PM_DIV_OFFSET                          8
-#define PM_DIV_SIZE                            8
-
-/* Bitfields in RCAUSE */
-#define PM_POR_OFFSET                          0
-#define PM_POR_SIZE                            1
-#define PM_EXT_OFFSET                          2
-#define PM_EXT_SIZE                            1
-#define PM_WDT_OFFSET                          3
-#define PM_WDT_SIZE                            1
-#define PM_NTAE_OFFSET                         4
-#define PM_NTAE_SIZE                           1
-
-/* Bit manipulation macros */
-#define PM_BIT(name)                                   \
-       (1 << PM_##name##_OFFSET)
-#define PM_BF(name,value)                              \
-       (((value) & ((1 << PM_##name##_SIZE) - 1))      \
-        << PM_##name##_OFFSET)
-#define PM_BFEXT(name,value)                           \
-       (((value) >> PM_##name##_OFFSET)                \
-        & ((1 << PM_##name##_SIZE) - 1))
-#define PM_BFINS(name,value,old)\
-       (((old) & ~(((1 << PM_##name##_SIZE) - 1)       \
-                   << PM_##name##_OFFSET))             \
-        | PM_BF(name,value))
-
-/* Register access macros */
-#define pm_readl(reg)                                                  \
-       __raw_readl((void __iomem __force *)PM_BASE + PM_##reg)
-#define pm_writel(reg,value)                                           \
-       __raw_writel((value), (void __iomem __force *)PM_BASE + PM_##reg)
-
-#endif /* __ARCH_AVR32_MACH_AT32AP_PM_H__ */
diff --git a/arch/avr32/mach-at32ap/sdramc.h b/arch/avr32/mach-at32ap/sdramc.h
deleted file mode 100644 (file)
index 66eeaed..0000000
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Register definitions for the AT32AP SDRAM Controller
- *
- * Copyright (C) 2008 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- */
-
-/* Register offsets */
-#define SDRAMC_MR                      0x0000
-#define SDRAMC_TR                      0x0004
-#define SDRAMC_CR                      0x0008
-#define SDRAMC_HSR                     0x000c
-#define SDRAMC_LPR                     0x0010
-#define SDRAMC_IER                     0x0014
-#define SDRAMC_IDR                     0x0018
-#define SDRAMC_IMR                     0x001c
-#define SDRAMC_ISR                     0x0020
-#define SDRAMC_MDR                     0x0024
-
-/* MR - Mode Register */
-#define SDRAMC_MR_MODE_NORMAL          (  0 <<  0)
-#define SDRAMC_MR_MODE_NOP             (  1 <<  0)
-#define SDRAMC_MR_MODE_BANKS_PRECHARGE (  2 <<  0)
-#define SDRAMC_MR_MODE_LOAD_MODE       (  3 <<  0)
-#define SDRAMC_MR_MODE_AUTO_REFRESH    (  4 <<  0)
-#define SDRAMC_MR_MODE_EXT_LOAD_MODE   (  5 <<  0)
-#define SDRAMC_MR_MODE_POWER_DOWN      (  6 <<  0)
-
-/* CR - Configuration Register */
-#define SDRAMC_CR_NC_8_BITS            (  0 <<  0)
-#define SDRAMC_CR_NC_9_BITS            (  1 <<  0)
-#define SDRAMC_CR_NC_10_BITS           (  2 <<  0)
-#define SDRAMC_CR_NC_11_BITS           (  3 <<  0)
-#define SDRAMC_CR_NR_11_BITS           (  0 <<  2)
-#define SDRAMC_CR_NR_12_BITS           (  1 <<  2)
-#define SDRAMC_CR_NR_13_BITS           (  2 <<  2)
-#define SDRAMC_CR_NB_2_BANKS           (  0 <<  4)
-#define SDRAMC_CR_NB_4_BANKS           (  1 <<  4)
-#define SDRAMC_CR_CAS(x)               ((x) <<  5)
-#define SDRAMC_CR_DBW_32_BITS          (  0 <<  7)
-#define SDRAMC_CR_DBW_16_BITS          (  1 <<  7)
-#define SDRAMC_CR_TWR(x)               ((x) <<  8)
-#define SDRAMC_CR_TRC(x)               ((x) << 12)
-#define SDRAMC_CR_TRP(x)               ((x) << 16)
-#define SDRAMC_CR_TRCD(x)              ((x) << 20)
-#define SDRAMC_CR_TRAS(x)              ((x) << 24)
-#define SDRAMC_CR_TXSR(x)              ((x) << 28)
-
-/* HSR - High Speed Register */
-#define SDRAMC_HSR_DA                  (  1 <<  0)
-
-/* LPR - Low Power Register */
-#define SDRAMC_LPR_LPCB_INHIBIT                (  0 <<  0)
-#define SDRAMC_LPR_LPCB_SELF_RFR       (  1 <<  0)
-#define SDRAMC_LPR_LPCB_PDOWN          (  2 <<  0)
-#define SDRAMC_LPR_LPCB_DEEP_PDOWN     (  3 <<  0)
-#define SDRAMC_LPR_PASR(x)             ((x) <<  4)
-#define SDRAMC_LPR_TCSR(x)             ((x) <<  8)
-#define SDRAMC_LPR_DS(x)               ((x) << 10)
-#define SDRAMC_LPR_TIMEOUT(x)          ((x) << 12)
-
-/* IER/IDR/IMR/ISR - Interrupt Enable/Disable/Mask/Status Register */
-#define SDRAMC_ISR_RES                 (  1 <<  0)
-
-/* MDR - Memory Device Register */
-#define SDRAMC_MDR_MD_SDRAM            (  0 <<  0)
-#define SDRAMC_MDR_MD_LOW_PWR_SDRAM    (  1 <<  0)
-
-/* Register access macros */
-#define sdramc_readl(reg) \
-       __raw_readl((void __iomem __force *)SDRAMC_BASE + SDRAMC_##reg)
-#define sdramc_writel(reg, value) \
-       __raw_writel(value, (void __iomem __force *)SDRAMC_BASE + SDRAMC_##reg)
diff --git a/arch/avr32/mm/Makefile b/arch/avr32/mm/Makefile
deleted file mode 100644 (file)
index 0066491..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#
-# Makefile for the Linux/AVR32 kernel.
-#
-
-obj-y                          += init.o clear_page.o copy_page.o dma-coherent.o
-obj-y                          += ioremap.o cache.o fault.o tlb.o
diff --git a/arch/avr32/mm/cache.c b/arch/avr32/mm/cache.c
deleted file mode 100644 (file)
index d947682..0000000
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/highmem.h>
-#include <linux/unistd.h>
-
-#include <asm/cacheflush.h>
-#include <asm/cachectl.h>
-#include <asm/processor.h>
-#include <linux/uaccess.h>
-#include <asm/syscalls.h>
-
-/*
- * If you attempt to flush anything more than this, you need superuser
- * privileges.  The value is completely arbitrary.
- */
-#define CACHEFLUSH_MAX_LEN     1024
-
-void invalidate_dcache_region(void *start, size_t size)
-{
-       unsigned long v, begin, end, linesz, mask;
-
-       linesz = boot_cpu_data.dcache.linesz;
-       mask = linesz - 1;
-
-       /* when first and/or last cachelines are shared, flush them
-        * instead of invalidating ... never discard valid data!
-        */
-       begin = (unsigned long)start;
-       end = begin + size;
-
-       if (begin & mask) {
-               flush_dcache_line(start);
-               begin += linesz;
-       }
-       if (end & mask) {
-               flush_dcache_line((void *)end);
-               end &= ~mask;
-       }
-
-       /* remaining cachelines only need invalidation */
-       for (v = begin; v < end; v += linesz)
-               invalidate_dcache_line((void *)v);
-       flush_write_buffer();
-}
-
-void clean_dcache_region(void *start, size_t size)
-{
-       unsigned long v, begin, end, linesz;
-
-       linesz = boot_cpu_data.dcache.linesz;
-       begin = (unsigned long)start & ~(linesz - 1);
-       end = ((unsigned long)start + size + linesz - 1) & ~(linesz - 1);
-
-       for (v = begin; v < end; v += linesz)
-               clean_dcache_line((void *)v);
-       flush_write_buffer();
-}
-
-void flush_dcache_region(void *start, size_t size)
-{
-       unsigned long v, begin, end, linesz;
-
-       linesz = boot_cpu_data.dcache.linesz;
-       begin = (unsigned long)start & ~(linesz - 1);
-       end = ((unsigned long)start + size + linesz - 1) & ~(linesz - 1);
-
-       for (v = begin; v < end; v += linesz)
-               flush_dcache_line((void *)v);
-       flush_write_buffer();
-}
-
-void invalidate_icache_region(void *start, size_t size)
-{
-       unsigned long v, begin, end, linesz;
-
-       linesz = boot_cpu_data.icache.linesz;
-       begin = (unsigned long)start & ~(linesz - 1);
-       end = ((unsigned long)start + size + linesz - 1) & ~(linesz - 1);
-
-       for (v = begin; v < end; v += linesz)
-               invalidate_icache_line((void *)v);
-}
-
-static inline void __flush_icache_range(unsigned long start, unsigned long end)
-{
-       unsigned long v, linesz;
-
-       linesz = boot_cpu_data.dcache.linesz;
-       for (v = start; v < end; v += linesz) {
-               clean_dcache_line((void *)v);
-               invalidate_icache_line((void *)v);
-       }
-
-       flush_write_buffer();
-}
-
-/*
- * This one is called after a module has been loaded.
- */
-void flush_icache_range(unsigned long start, unsigned long end)
-{
-       unsigned long linesz;
-
-       linesz = boot_cpu_data.dcache.linesz;
-       __flush_icache_range(start & ~(linesz - 1),
-                            (end + linesz - 1) & ~(linesz - 1));
-}
-EXPORT_SYMBOL(flush_icache_range);
-
-/*
- * This one is called from __do_fault() and do_swap_page().
- */
-void flush_icache_page(struct vm_area_struct *vma, struct page *page)
-{
-       if (vma->vm_flags & VM_EXEC) {
-               void *v = page_address(page);
-               __flush_icache_range((unsigned long)v, (unsigned long)v + PAGE_SIZE);
-       }
-}
-
-asmlinkage int sys_cacheflush(int operation, void __user *addr, size_t len)
-{
-       int ret;
-
-       if (len > CACHEFLUSH_MAX_LEN) {
-               ret = -EPERM;
-               if (!capable(CAP_SYS_ADMIN))
-                       goto out;
-       }
-
-       ret = -EFAULT;
-       if (!access_ok(VERIFY_WRITE, addr, len))
-               goto out;
-
-       switch (operation) {
-       case CACHE_IFLUSH:
-               flush_icache_range((unsigned long)addr,
-                                  (unsigned long)addr + len);
-               ret = 0;
-               break;
-       default:
-               ret = -EINVAL;
-       }
-
-out:
-       return ret;
-}
-
-void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
-               unsigned long vaddr, void *dst, const void *src,
-               unsigned long len)
-{
-       memcpy(dst, src, len);
-       if (vma->vm_flags & VM_EXEC)
-               flush_icache_range((unsigned long)dst,
-                               (unsigned long)dst + len);
-}
diff --git a/arch/avr32/mm/clear_page.S b/arch/avr32/mm/clear_page.S
deleted file mode 100644 (file)
index 5d70dca..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/linkage.h>
-#include <asm/page.h>
-
-/*
- * clear_page
- * r12: P1 address (to)
- */
-       .text
-       .global clear_page
-clear_page:
-       sub     r9, r12, -PAGE_SIZE
-       mov     r10, 0
-       mov     r11, 0
-0:      st.d    r12++, r10
-       cp      r12, r9
-       brne    0b
-       mov     pc, lr
diff --git a/arch/avr32/mm/copy_page.S b/arch/avr32/mm/copy_page.S
deleted file mode 100644 (file)
index c2b3752..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/linkage.h>
-#include <asm/page.h>
-
-/*
- * copy_page
- *
- * r12         to (P1 address)
- * r11         from (P1 address)
- * r8-r10      scratch
- */
-       .text
-       .global copy_page
-copy_page:
-       sub     r10, r11, -(1 << PAGE_SHIFT)
-       /* pref r11[0] */
-1:     /* pref r11[8] */
-       ld.d    r8, r11++
-       st.d    r12++, r8
-       cp      r11, r10
-       brlo    1b
-       mov     pc, lr
diff --git a/arch/avr32/mm/dma-coherent.c b/arch/avr32/mm/dma-coherent.c
deleted file mode 100644 (file)
index 555222d..0000000
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- *  Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/dma-mapping.h>
-#include <linux/gfp.h>
-#include <linux/export.h>
-#include <linux/mm.h>
-#include <linux/device.h>
-#include <linux/scatterlist.h>
-
-#include <asm/processor.h>
-#include <asm/cacheflush.h>
-#include <asm/io.h>
-#include <asm/addrspace.h>
-
-void dma_cache_sync(struct device *dev, void *vaddr, size_t size, int direction)
-{
-       /*
-        * No need to sync an uncached area
-        */
-       if (PXSEG(vaddr) == P2SEG)
-               return;
-
-       switch (direction) {
-       case DMA_FROM_DEVICE:           /* invalidate only */
-               invalidate_dcache_region(vaddr, size);
-               break;
-       case DMA_TO_DEVICE:             /* writeback only */
-               clean_dcache_region(vaddr, size);
-               break;
-       case DMA_BIDIRECTIONAL:         /* writeback and invalidate */
-               flush_dcache_region(vaddr, size);
-               break;
-       default:
-               BUG();
-       }
-}
-EXPORT_SYMBOL(dma_cache_sync);
-
-static struct page *__dma_alloc(struct device *dev, size_t size,
-                               dma_addr_t *handle, gfp_t gfp)
-{
-       struct page *page, *free, *end;
-       int order;
-
-       /* Following is a work-around (a.k.a. hack) to prevent pages
-        * with __GFP_COMP being passed to split_page() which cannot
-        * handle them.  The real problem is that this flag probably
-        * should be 0 on AVR32 as it is not supported on this
-        * platform--see CONFIG_HUGETLB_PAGE. */
-       gfp &= ~(__GFP_COMP);
-
-       size = PAGE_ALIGN(size);
-       order = get_order(size);
-
-       page = alloc_pages(gfp, order);
-       if (!page)
-               return NULL;
-       split_page(page, order);
-
-       /*
-        * When accessing physical memory with valid cache data, we
-        * get a cache hit even if the virtual memory region is marked
-        * as uncached.
-        *
-        * Since the memory is newly allocated, there is no point in
-        * doing a writeback. If the previous owner cares, he should
-        * have flushed the cache before releasing the memory.
-        */
-       invalidate_dcache_region(phys_to_virt(page_to_phys(page)), size);
-
-       *handle = page_to_bus(page);
-       free = page + (size >> PAGE_SHIFT);
-       end = page + (1 << order);
-
-       /*
-        * Free any unused pages
-        */
-       while (free < end) {
-               __free_page(free);
-               free++;
-       }
-
-       return page;
-}
-
-static void __dma_free(struct device *dev, size_t size,
-                      struct page *page, dma_addr_t handle)
-{
-       struct page *end = page + (PAGE_ALIGN(size) >> PAGE_SHIFT);
-
-       while (page < end)
-               __free_page(page++);
-}
-
-static void *avr32_dma_alloc(struct device *dev, size_t size,
-               dma_addr_t *handle, gfp_t gfp, unsigned long attrs)
-{
-       struct page *page;
-       dma_addr_t phys;
-
-       page = __dma_alloc(dev, size, handle, gfp);
-       if (!page)
-               return NULL;
-       phys = page_to_phys(page);
-
-       if (attrs & DMA_ATTR_WRITE_COMBINE) {
-               /* Now, map the page into P3 with write-combining turned on */
-               *handle = phys;
-               return __ioremap(phys, size, _PAGE_BUFFER);
-       } else {
-               return phys_to_uncached(phys);
-       }
-}
-
-static void avr32_dma_free(struct device *dev, size_t size,
-               void *cpu_addr, dma_addr_t handle, unsigned long attrs)
-{
-       struct page *page;
-
-       if (attrs & DMA_ATTR_WRITE_COMBINE) {
-               iounmap(cpu_addr);
-
-               page = phys_to_page(handle);
-       } else {
-               void *addr = phys_to_cached(uncached_to_phys(cpu_addr));
-
-               pr_debug("avr32_dma_free addr %p (phys %08lx) size %u\n",
-                        cpu_addr, (unsigned long)handle, (unsigned)size);
-
-               BUG_ON(!virt_addr_valid(addr));
-               page = virt_to_page(addr);
-       }
-
-       __dma_free(dev, size, page, handle);
-}
-
-static dma_addr_t avr32_dma_map_page(struct device *dev, struct page *page,
-               unsigned long offset, size_t size,
-               enum dma_data_direction direction, unsigned long attrs)
-{
-       void *cpu_addr = page_address(page) + offset;
-
-       if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-               dma_cache_sync(dev, cpu_addr, size, direction);
-       return virt_to_bus(cpu_addr);
-}
-
-static int avr32_dma_map_sg(struct device *dev, struct scatterlist *sglist,
-               int nents, enum dma_data_direction direction,
-               unsigned long attrs)
-{
-       int i;
-       struct scatterlist *sg;
-
-       for_each_sg(sglist, sg, nents, i) {
-               char *virt;
-
-               sg->dma_address = page_to_bus(sg_page(sg)) + sg->offset;
-               virt = sg_virt(sg);
-
-               if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
-                       continue;
-
-               dma_cache_sync(dev, virt, sg->length, direction);
-       }
-
-       return nents;
-}
-
-static void avr32_dma_sync_single_for_device(struct device *dev,
-               dma_addr_t dma_handle, size_t size,
-               enum dma_data_direction direction)
-{
-       dma_cache_sync(dev, bus_to_virt(dma_handle), size, direction);
-}
-
-static void avr32_dma_sync_sg_for_device(struct device *dev,
-               struct scatterlist *sglist, int nents,
-               enum dma_data_direction direction)
-{
-       int i;
-       struct scatterlist *sg;
-
-       for_each_sg(sglist, sg, nents, i)
-               dma_cache_sync(dev, sg_virt(sg), sg->length, direction);
-}
-
-const struct dma_map_ops avr32_dma_ops = {
-       .alloc                  = avr32_dma_alloc,
-       .free                   = avr32_dma_free,
-       .map_page               = avr32_dma_map_page,
-       .map_sg                 = avr32_dma_map_sg,
-       .sync_single_for_device = avr32_dma_sync_single_for_device,
-       .sync_sg_for_device     = avr32_dma_sync_sg_for_device,
-};
-EXPORT_SYMBOL(avr32_dma_ops);
diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c
deleted file mode 100644 (file)
index b3977e9..0000000
+++ /dev/null
@@ -1,268 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * Based on linux/arch/sh/mm/fault.c:
- *   Copyright (C) 1999  Niibe Yutaka
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/mm.h>
-#include <linux/extable.h>
-#include <linux/pagemap.h>
-#include <linux/kdebug.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-
-#include <asm/mmu_context.h>
-#include <asm/sysreg.h>
-#include <asm/tlb.h>
-
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs, int trap)
-{
-       int ret = 0;
-
-       if (!user_mode(regs)) {
-               if (kprobe_running() && kprobe_fault_handler(regs, trap))
-                       ret = 1;
-       }
-
-       return ret;
-}
-#else
-static inline int notify_page_fault(struct pt_regs *regs, int trap)
-{
-       return 0;
-}
-#endif
-
-int exception_trace = 1;
-
-/*
- * This routine handles page faults. It determines the address and the
- * problem, and then passes it off to one of the appropriate routines.
- *
- * ecr is the Exception Cause Register. Possible values are:
- *   6:  Protection fault (instruction access)
- *   15: Protection fault (read access)
- *   16: Protection fault (write access)
- *   20: Page not found (instruction access)
- *   24: Page not found (read access)
- *   28: Page not found (write access)
- */
-asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs)
-{
-       struct task_struct *tsk;
-       struct mm_struct *mm;
-       struct vm_area_struct *vma;
-       const struct exception_table_entry *fixup;
-       unsigned long address;
-       unsigned long page;
-       long signr;
-       int code;
-       int fault;
-       unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
-
-       if (notify_page_fault(regs, ecr))
-               return;
-
-       address = sysreg_read(TLBEAR);
-
-       tsk = current;
-       mm = tsk->mm;
-
-       signr = SIGSEGV;
-       code = SEGV_MAPERR;
-
-       /*
-        * If we're in an interrupt or have no user context, we must
-        * not take the fault...
-        */
-       if (faulthandler_disabled() || !mm || regs->sr & SYSREG_BIT(GM))
-               goto no_context;
-
-       local_irq_enable();
-
-       if (user_mode(regs))
-               flags |= FAULT_FLAG_USER;
-retry:
-       down_read(&mm->mmap_sem);
-
-       vma = find_vma(mm, address);
-       if (!vma)
-               goto bad_area;
-       if (vma->vm_start <= address)
-               goto good_area;
-       if (!(vma->vm_flags & VM_GROWSDOWN))
-               goto bad_area;
-       if (expand_stack(vma, address))
-               goto bad_area;
-
-       /*
-        * Ok, we have a good vm_area for this memory access, so we
-        * can handle it...
-        */
-good_area:
-       code = SEGV_ACCERR;
-
-       switch (ecr) {
-       case ECR_PROTECTION_X:
-       case ECR_TLB_MISS_X:
-               if (!(vma->vm_flags & VM_EXEC))
-                       goto bad_area;
-               break;
-       case ECR_PROTECTION_R:
-       case ECR_TLB_MISS_R:
-               if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)))
-                       goto bad_area;
-               break;
-       case ECR_PROTECTION_W:
-       case ECR_TLB_MISS_W:
-               if (!(vma->vm_flags & VM_WRITE))
-                       goto bad_area;
-               flags |= FAULT_FLAG_WRITE;
-               break;
-       default:
-               panic("Unhandled case %lu in do_page_fault!", ecr);
-       }
-
-       /*
-        * If for any reason at all we couldn't handle the fault, make
-        * sure we exit gracefully rather than endlessly redo the
-        * fault.
-        */
-       fault = handle_mm_fault(vma, address, flags);
-
-       if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
-               return;
-
-       if (unlikely(fault & VM_FAULT_ERROR)) {
-               if (fault & VM_FAULT_OOM)
-                       goto out_of_memory;
-               else if (fault & VM_FAULT_SIGSEGV)
-                       goto bad_area;
-               else if (fault & VM_FAULT_SIGBUS)
-                       goto do_sigbus;
-               BUG();
-       }
-
-       if (flags & FAULT_FLAG_ALLOW_RETRY) {
-               if (fault & VM_FAULT_MAJOR)
-                       tsk->maj_flt++;
-               else
-                       tsk->min_flt++;
-               if (fault & VM_FAULT_RETRY) {
-                       flags &= ~FAULT_FLAG_ALLOW_RETRY;
-                       flags |= FAULT_FLAG_TRIED;
-
-                       /*
-                        * No need to up_read(&mm->mmap_sem) as we would have
-                        * already released it in __lock_page_or_retry() in
-                        * mm/filemap.c.
-                        */
-                       goto retry;
-               }
-       }
-
-       up_read(&mm->mmap_sem);
-       return;
-
-       /*
-        * Something tried to access memory that isn't in our memory
-        * map. Fix it, but check if it's kernel or user first...
-        */
-bad_area:
-       up_read(&mm->mmap_sem);
-
-       if (user_mode(regs)) {
-               if (exception_trace && printk_ratelimit())
-                       printk("%s%s[%d]: segfault at %08lx pc %08lx "
-                              "sp %08lx ecr %lu\n",
-                              is_global_init(tsk) ? KERN_EMERG : KERN_INFO,
-                              tsk->comm, tsk->pid, address, regs->pc,
-                              regs->sp, ecr);
-               _exception(SIGSEGV, regs, code, address);
-               return;
-       }
-
-no_context:
-       /* Are we prepared to handle this kernel fault? */
-       fixup = search_exception_tables(regs->pc);
-       if (fixup) {
-               regs->pc = fixup->fixup;
-               return;
-       }
-
-       /*
-        * Oops. The kernel tried to access some bad page. We'll have
-        * to terminate things with extreme prejudice.
-        */
-       if (address < PAGE_SIZE)
-               printk(KERN_ALERT
-                      "Unable to handle kernel NULL pointer dereference");
-       else
-               printk(KERN_ALERT
-                      "Unable to handle kernel paging request");
-       printk(" at virtual address %08lx\n", address);
-
-       page = sysreg_read(PTBR);
-       printk(KERN_ALERT "ptbr = %08lx", page);
-       if (address >= TASK_SIZE)
-               page = (unsigned long)swapper_pg_dir;
-       if (page) {
-               page = ((unsigned long *)page)[address >> 22];
-               printk(" pgd = %08lx", page);
-               if (page & _PAGE_PRESENT) {
-                       page &= PAGE_MASK;
-                       address &= 0x003ff000;
-                       page = ((unsigned long *)__va(page))[address >> PAGE_SHIFT];
-                       printk(" pte = %08lx", page);
-               }
-       }
-       printk("\n");
-       die("Kernel access of bad area", regs, signr);
-       return;
-
-       /*
-        * We ran out of memory, or some other thing happened to us
-        * that made us unable to handle the page fault gracefully.
-        */
-out_of_memory:
-       up_read(&mm->mmap_sem);
-       if (!user_mode(regs))
-               goto no_context;
-       pagefault_out_of_memory();
-       return;
-
-do_sigbus:
-       up_read(&mm->mmap_sem);
-
-       /* Kernel mode? Handle exceptions or die */
-       signr = SIGBUS;
-       code = BUS_ADRERR;
-       if (!user_mode(regs))
-               goto no_context;
-
-       if (exception_trace)
-               printk("%s%s[%d]: bus error at %08lx pc %08lx "
-                      "sp %08lx ecr %lu\n",
-                      is_global_init(tsk) ? KERN_EMERG : KERN_INFO,
-                      tsk->comm, tsk->pid, address, regs->pc,
-                      regs->sp, ecr);
-
-       _exception(SIGBUS, regs, BUS_ADRERR, address);
-}
-
-asmlinkage void do_bus_error(unsigned long addr, int write_access,
-                            struct pt_regs *regs)
-{
-       printk(KERN_ALERT
-              "Bus error at physical address 0x%08lx (%s access)\n",
-              addr, write_access ? "write" : "read");
-       printk(KERN_INFO "DTLB dump:\n");
-       dump_dtlb();
-       die("Bus Error", regs, SIGKILL);
-}
diff --git a/arch/avr32/mm/init.c b/arch/avr32/mm/init.c
deleted file mode 100644 (file)
index def5391..0000000
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/gfp.h>
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/init.h>
-#include <linux/mmzone.h>
-#include <linux/module.h>
-#include <linux/bootmem.h>
-#include <linux/pagemap.h>
-#include <linux/nodemask.h>
-
-#include <asm/page.h>
-#include <asm/mmu_context.h>
-#include <asm/tlb.h>
-#include <asm/io.h>
-#include <asm/dma.h>
-#include <asm/setup.h>
-#include <asm/sections.h>
-
-pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_data;
-
-struct page *empty_zero_page;
-EXPORT_SYMBOL(empty_zero_page);
-
-/*
- * Cache of MMU context last used.
- */
-unsigned long mmu_context_cache = NO_CONTEXT;
-
-/*
- * paging_init() sets up the page tables
- *
- * This routine also unmaps the page at virtual kernel address 0, so
- * that we can trap those pesky NULL-reference errors in the kernel.
- */
-void __init paging_init(void)
-{
-       extern unsigned long _evba;
-       void *zero_page;
-       int nid;
-
-       /*
-        * Make sure we can handle exceptions before enabling
-        * paging. Not that we should ever _get_ any exceptions this
-        * early, but you never know...
-        */
-       printk("Exception vectors start at %p\n", &_evba);
-       sysreg_write(EVBA, (unsigned long)&_evba);
-
-       /*
-        * Since we are ready to handle exceptions now, we should let
-        * the CPU generate them...
-        */
-       __asm__ __volatile__ ("csrf %0" : : "i"(SR_EM_BIT));
-
-       /*
-        * Allocate the zero page. The allocator will panic if it
-        * can't satisfy the request, so no need to check.
-        */
-       zero_page = alloc_bootmem_low_pages_node(NODE_DATA(0),
-                                                PAGE_SIZE);
-
-       sysreg_write(PTBR, (unsigned long)swapper_pg_dir);
-       enable_mmu();
-       printk ("CPU: Paging enabled\n");
-
-       for_each_online_node(nid) {
-               pg_data_t *pgdat = NODE_DATA(nid);
-               unsigned long zones_size[MAX_NR_ZONES];
-               unsigned long low, start_pfn;
-
-               start_pfn = pgdat->bdata->node_min_pfn;
-               low = pgdat->bdata->node_low_pfn;
-
-               memset(zones_size, 0, sizeof(zones_size));
-               zones_size[ZONE_NORMAL] = low - start_pfn;
-
-               printk("Node %u: start_pfn = 0x%lx, low = 0x%lx\n",
-                      nid, start_pfn, low);
-
-               free_area_init_node(nid, zones_size, start_pfn, NULL);
-
-               printk("Node %u: mem_map starts at %p\n",
-                      pgdat->node_id, pgdat->node_mem_map);
-       }
-
-       mem_map = NODE_DATA(0)->node_mem_map;
-
-       empty_zero_page = virt_to_page(zero_page);
-       flush_dcache_page(empty_zero_page);
-}
-
-void __init mem_init(void)
-{
-       pg_data_t *pgdat;
-
-       high_memory = NULL;
-       for_each_online_pgdat(pgdat)
-               high_memory = max_t(void *, high_memory,
-                                   __va(pgdat_end_pfn(pgdat) << PAGE_SHIFT));
-
-       set_max_mapnr(MAP_NR(high_memory));
-       free_all_bootmem();
-       mem_init_print_info(NULL);
-}
-
-void free_initmem(void)
-{
-       free_initmem_default(-1);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
diff --git a/arch/avr32/mm/ioremap.c b/arch/avr32/mm/ioremap.c
deleted file mode 100644 (file)
index 7def0d8..0000000
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/vmalloc.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/io.h>
-#include <linux/slab.h>
-
-#include <asm/pgtable.h>
-#include <asm/addrspace.h>
-
-/*
- * Re-map an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access physical
- * memory directly.
- */
-void __iomem *__ioremap(unsigned long phys_addr, size_t size,
-                       unsigned long flags)
-{
-       unsigned long addr;
-       struct vm_struct *area;
-       unsigned long offset, last_addr;
-       pgprot_t prot;
-
-       /*
-        * Check if we can simply use the P4 segment. This area is
-        * uncacheable, so if caching/buffering is requested, we can't
-        * use it.
-        */
-       if ((phys_addr >= P4SEG) && (flags == 0))
-               return (void __iomem *)phys_addr;
-
-       /* Don't allow wraparound or zero size */
-       last_addr = phys_addr + size - 1;
-       if (!size || last_addr < phys_addr)
-               return NULL;
-
-       /*
-        * XXX: When mapping regular RAM, we'd better make damn sure
-        * it's never used for anything else.  But this is really the
-        * caller's responsibility...
-        */
-       if (PHYSADDR(P2SEGADDR(phys_addr)) == phys_addr)
-               return (void __iomem *)P2SEGADDR(phys_addr);
-
-       /* Mappings have to be page-aligned */
-       offset = phys_addr & ~PAGE_MASK;
-       phys_addr &= PAGE_MASK;
-       size = PAGE_ALIGN(last_addr + 1) - phys_addr;
-
-       prot = __pgprot(_PAGE_PRESENT | _PAGE_GLOBAL | _PAGE_RW | _PAGE_DIRTY
-                       | _PAGE_ACCESSED | _PAGE_TYPE_SMALL | flags);
-
-       /*
-        * Ok, go for it..
-        */
-       area = get_vm_area(size, VM_IOREMAP);
-       if (!area)
-               return NULL;
-       area->phys_addr = phys_addr;
-       addr = (unsigned long )area->addr;
-       if (ioremap_page_range(addr, addr + size, phys_addr, prot)) {
-               vunmap((void *)addr);
-               return NULL;
-       }
-
-       return (void __iomem *)(offset + (char *)addr);
-}
-EXPORT_SYMBOL(__ioremap);
-
-void __iounmap(void __iomem *addr)
-{
-       struct vm_struct *p;
-
-       if ((unsigned long)addr >= P4SEG)
-               return;
-       if (PXSEG(addr) == P2SEG)
-               return;
-
-       p = remove_vm_area((void *)(PAGE_MASK & (unsigned long __force)addr));
-       if (unlikely(!p)) {
-               printk (KERN_ERR "iounmap: bad address %p\n", addr);
-               return;
-       }
-
-       kfree (p);
-}
-EXPORT_SYMBOL(__iounmap);
diff --git a/arch/avr32/mm/tlb.c b/arch/avr32/mm/tlb.c
deleted file mode 100644 (file)
index 0da2310..0000000
+++ /dev/null
@@ -1,375 +0,0 @@
-/*
- * AVR32 TLB operations
- *
- * Copyright (C) 2004-2006 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/mm.h>
-
-#include <asm/mmu_context.h>
-
-/* TODO: Get the correct number from the CONFIG1 system register */
-#define NR_TLB_ENTRIES 32
-
-static void show_dtlb_entry(unsigned int index)
-{
-       u32 tlbehi, tlbehi_save, tlbelo, mmucr, mmucr_save;
-       unsigned long flags;
-
-       local_irq_save(flags);
-       mmucr_save = sysreg_read(MMUCR);
-       tlbehi_save = sysreg_read(TLBEHI);
-       mmucr = SYSREG_BFINS(DRP, index, mmucr_save);
-       sysreg_write(MMUCR, mmucr);
-
-       __builtin_tlbr();
-       cpu_sync_pipeline();
-
-       tlbehi = sysreg_read(TLBEHI);
-       tlbelo = sysreg_read(TLBELO);
-
-       printk("%2u: %c %c %02x   %05x %05x %o  %o  %c %c %c %c\n",
-              index,
-              SYSREG_BFEXT(TLBEHI_V, tlbehi) ? '1' : '0',
-              SYSREG_BFEXT(G, tlbelo) ? '1' : '0',
-              SYSREG_BFEXT(ASID, tlbehi),
-              SYSREG_BFEXT(VPN, tlbehi) >> 2,
-              SYSREG_BFEXT(PFN, tlbelo) >> 2,
-              SYSREG_BFEXT(AP, tlbelo),
-              SYSREG_BFEXT(SZ, tlbelo),
-              SYSREG_BFEXT(TLBELO_C, tlbelo) ? 'C' : ' ',
-              SYSREG_BFEXT(B, tlbelo) ? 'B' : ' ',
-              SYSREG_BFEXT(W, tlbelo) ? 'W' : ' ',
-              SYSREG_BFEXT(TLBELO_D, tlbelo) ? 'D' : ' ');
-
-       sysreg_write(MMUCR, mmucr_save);
-       sysreg_write(TLBEHI, tlbehi_save);
-       cpu_sync_pipeline();
-       local_irq_restore(flags);
-}
-
-void dump_dtlb(void)
-{
-       unsigned int i;
-
-       printk("ID  V G ASID VPN   PFN   AP SZ C B W D\n");
-       for (i = 0; i < NR_TLB_ENTRIES; i++)
-               show_dtlb_entry(i);
-}
-
-static void update_dtlb(unsigned long address, pte_t pte)
-{
-       u32 tlbehi;
-       u32 mmucr;
-
-       /*
-        * We're not changing the ASID here, so no need to flush the
-        * pipeline.
-        */
-       tlbehi = sysreg_read(TLBEHI);
-       tlbehi = SYSREG_BF(ASID, SYSREG_BFEXT(ASID, tlbehi));
-       tlbehi |= address & MMU_VPN_MASK;
-       tlbehi |= SYSREG_BIT(TLBEHI_V);
-       sysreg_write(TLBEHI, tlbehi);
-
-       /* Does this mapping already exist? */
-       __builtin_tlbs();
-       mmucr = sysreg_read(MMUCR);
-
-       if (mmucr & SYSREG_BIT(MMUCR_N)) {
-               /* Not found -- pick a not-recently-accessed entry */
-               unsigned int rp;
-               u32 tlbar = sysreg_read(TLBARLO);
-
-               rp = 32 - fls(tlbar);
-               if (rp == 32) {
-                       rp = 0;
-                       sysreg_write(TLBARLO, -1L);
-               }
-
-               mmucr = SYSREG_BFINS(DRP, rp, mmucr);
-               sysreg_write(MMUCR, mmucr);
-       }
-
-       sysreg_write(TLBELO, pte_val(pte) & _PAGE_FLAGS_HARDWARE_MASK);
-
-       /* Let's go */
-       __builtin_tlbw();
-}
-
-void update_mmu_cache(struct vm_area_struct *vma,
-                     unsigned long address, pte_t *ptep)
-{
-       unsigned long flags;
-
-       /* ptrace may call this routine */
-       if (vma && current->active_mm != vma->vm_mm)
-               return;
-
-       local_irq_save(flags);
-       update_dtlb(address, *ptep);
-       local_irq_restore(flags);
-}
-
-static void __flush_tlb_page(unsigned long asid, unsigned long page)
-{
-       u32 mmucr, tlbehi;
-
-       /*
-        * Caller is responsible for masking out non-PFN bits in page
-        * and changing the current ASID if necessary. This means that
-        * we don't need to flush the pipeline after writing TLBEHI.
-        */
-       tlbehi = page | asid;
-       sysreg_write(TLBEHI, tlbehi);
-
-       __builtin_tlbs();
-       mmucr = sysreg_read(MMUCR);
-
-       if (!(mmucr & SYSREG_BIT(MMUCR_N))) {
-               unsigned int entry;
-               u32 tlbarlo;
-
-               /* Clear the "valid" bit */
-               sysreg_write(TLBEHI, tlbehi);
-
-               /* mark the entry as "not accessed" */
-               entry = SYSREG_BFEXT(DRP, mmucr);
-               tlbarlo = sysreg_read(TLBARLO);
-               tlbarlo |= (0x80000000UL >> entry);
-               sysreg_write(TLBARLO, tlbarlo);
-
-               /* update the entry with valid bit clear */
-               __builtin_tlbw();
-       }
-}
-
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
-{
-       if (vma->vm_mm && vma->vm_mm->context != NO_CONTEXT) {
-               unsigned long flags, asid;
-               unsigned long saved_asid = MMU_NO_ASID;
-
-               asid = vma->vm_mm->context & MMU_CONTEXT_ASID_MASK;
-               page &= PAGE_MASK;
-
-               local_irq_save(flags);
-               if (vma->vm_mm != current->mm) {
-                       saved_asid = get_asid();
-                       set_asid(asid);
-               }
-
-               __flush_tlb_page(asid, page);
-
-               if (saved_asid != MMU_NO_ASID)
-                       set_asid(saved_asid);
-               local_irq_restore(flags);
-       }
-}
-
-void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
-                    unsigned long end)
-{
-       struct mm_struct *mm = vma->vm_mm;
-
-       if (mm->context != NO_CONTEXT) {
-               unsigned long flags;
-               int size;
-
-               local_irq_save(flags);
-               size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
-
-               if (size > (MMU_DTLB_ENTRIES / 4)) { /* Too many entries to flush */
-                       mm->context = NO_CONTEXT;
-                       if (mm == current->mm)
-                               activate_context(mm);
-               } else {
-                       unsigned long asid;
-                       unsigned long saved_asid;
-
-                       asid = mm->context & MMU_CONTEXT_ASID_MASK;
-                       saved_asid = MMU_NO_ASID;
-
-                       start &= PAGE_MASK;
-                       end += (PAGE_SIZE - 1);
-                       end &= PAGE_MASK;
-
-                       if (mm != current->mm) {
-                               saved_asid = get_asid();
-                               set_asid(asid);
-                       }
-
-                       while (start < end) {
-                               __flush_tlb_page(asid, start);
-                               start += PAGE_SIZE;
-                       }
-                       if (saved_asid != MMU_NO_ASID)
-                               set_asid(saved_asid);
-               }
-               local_irq_restore(flags);
-       }
-}
-
-/*
- * This function depends on the pages to be flushed having the G
- * (global) bit set in their pte. This is true for all
- * PAGE_KERNEL(_RO) pages.
- */
-void flush_tlb_kernel_range(unsigned long start, unsigned long end)
-{
-       unsigned long flags;
-       int size;
-
-       size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
-       if (size > (MMU_DTLB_ENTRIES / 4)) { /* Too many entries to flush */
-               flush_tlb_all();
-       } else {
-               unsigned long asid;
-
-               local_irq_save(flags);
-               asid = get_asid();
-
-               start &= PAGE_MASK;
-               end += (PAGE_SIZE - 1);
-               end &= PAGE_MASK;
-
-               while (start < end) {
-                       __flush_tlb_page(asid, start);
-                       start += PAGE_SIZE;
-               }
-               local_irq_restore(flags);
-       }
-}
-
-void flush_tlb_mm(struct mm_struct *mm)
-{
-       /* Invalidate all TLB entries of this process by getting a new ASID */
-       if (mm->context != NO_CONTEXT) {
-               unsigned long flags;
-
-               local_irq_save(flags);
-               mm->context = NO_CONTEXT;
-               if (mm == current->mm)
-                       activate_context(mm);
-               local_irq_restore(flags);
-       }
-}
-
-void flush_tlb_all(void)
-{
-       unsigned long flags;
-
-       local_irq_save(flags);
-       sysreg_write(MMUCR, sysreg_read(MMUCR) | SYSREG_BIT(MMUCR_I));
-       local_irq_restore(flags);
-}
-
-#ifdef CONFIG_PROC_FS
-
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
-#include <linux/init.h>
-
-static void *tlb_start(struct seq_file *tlb, loff_t *pos)
-{
-       static unsigned long tlb_index;
-
-       if (*pos >= NR_TLB_ENTRIES)
-               return NULL;
-
-       tlb_index = 0;
-       return &tlb_index;
-}
-
-static void *tlb_next(struct seq_file *tlb, void *v, loff_t *pos)
-{
-       unsigned long *index = v;
-
-       if (*index >= NR_TLB_ENTRIES - 1)
-               return NULL;
-
-       ++*pos;
-       ++*index;
-       return index;
-}
-
-static void tlb_stop(struct seq_file *tlb, void *v)
-{
-
-}
-
-static int tlb_show(struct seq_file *tlb, void *v)
-{
-       unsigned int tlbehi, tlbehi_save, tlbelo, mmucr, mmucr_save;
-       unsigned long flags;
-       unsigned long *index = v;
-
-       if (*index == 0)
-               seq_puts(tlb, "ID  V G ASID VPN   PFN   AP SZ C B W D\n");
-
-       BUG_ON(*index >= NR_TLB_ENTRIES);
-
-       local_irq_save(flags);
-       mmucr_save = sysreg_read(MMUCR);
-       tlbehi_save = sysreg_read(TLBEHI);
-       mmucr = SYSREG_BFINS(DRP, *index, mmucr_save);
-       sysreg_write(MMUCR, mmucr);
-
-       /* TLBR might change the ASID */
-       __builtin_tlbr();
-       cpu_sync_pipeline();
-
-       tlbehi = sysreg_read(TLBEHI);
-       tlbelo = sysreg_read(TLBELO);
-
-       sysreg_write(MMUCR, mmucr_save);
-       sysreg_write(TLBEHI, tlbehi_save);
-       cpu_sync_pipeline();
-       local_irq_restore(flags);
-
-       seq_printf(tlb, "%2lu: %c %c %02x   %05x %05x %o  %o  %c %c %c %c\n",
-                  *index,
-                  SYSREG_BFEXT(TLBEHI_V, tlbehi) ? '1' : '0',
-                  SYSREG_BFEXT(G, tlbelo) ? '1' : '0',
-                  SYSREG_BFEXT(ASID, tlbehi),
-                  SYSREG_BFEXT(VPN, tlbehi) >> 2,
-                  SYSREG_BFEXT(PFN, tlbelo) >> 2,
-                  SYSREG_BFEXT(AP, tlbelo),
-                  SYSREG_BFEXT(SZ, tlbelo),
-                  SYSREG_BFEXT(TLBELO_C, tlbelo) ? '1' : '0',
-                  SYSREG_BFEXT(B, tlbelo) ? '1' : '0',
-                  SYSREG_BFEXT(W, tlbelo) ? '1' : '0',
-                  SYSREG_BFEXT(TLBELO_D, tlbelo) ? '1' : '0');
-
-       return 0;
-}
-
-static const struct seq_operations tlb_ops = {
-       .start          = tlb_start,
-       .next           = tlb_next,
-       .stop           = tlb_stop,
-       .show           = tlb_show,
-};
-
-static int tlb_open(struct inode *inode, struct file *file)
-{
-       return seq_open(file, &tlb_ops);
-}
-
-static const struct file_operations proc_tlb_operations = {
-       .open           = tlb_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = seq_release,
-};
-
-static int __init proctlb_init(void)
-{
-       proc_create("tlb", 0, NULL, &proc_tlb_operations);
-       return 0;
-}
-late_initcall(proctlb_init);
-#endif /* CONFIG_PROC_FS */
diff --git a/arch/avr32/oprofile/Makefile b/arch/avr32/oprofile/Makefile
deleted file mode 100644 (file)
index e0eb520..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-obj-$(CONFIG_OPROFILE) += oprofile.o
-
-oprofile-y             := $(addprefix ../../../drivers/oprofile/,      \
-                               oprof.o cpu_buffer.o buffer_sync.o      \
-                               event_buffer.o oprofile_files.o         \
-                               oprofilefs.o oprofile_stats.o           \
-                               timer_int.o)
-oprofile-y             += op_model_avr32.o backtrace.o
diff --git a/arch/avr32/oprofile/backtrace.c b/arch/avr32/oprofile/backtrace.c
deleted file mode 100644 (file)
index 29cf2f1..0000000
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * AVR32 specific backtracing code for oprofile
- *
- * Copyright 2008 Weinmann GmbH
- *
- * Author: Nikolaus Voss <n.voss@weinmann.de>
- *
- * Based on i386 oprofile backtrace code by John Levon and David Smith
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/oprofile.h>
-#include <linux/ptrace.h>
-#include <linux/uaccess.h>
-
-/* The first two words of each frame on the stack look like this if we have
- * frame pointers */
-struct frame_head {
-       unsigned long lr;
-       struct frame_head *fp;
-};
-
-/* copied from arch/avr32/kernel/process.c */
-static inline int valid_stack_ptr(struct thread_info *tinfo, unsigned long p)
-{
-       return (p > (unsigned long)tinfo)
-               && (p < (unsigned long)tinfo + THREAD_SIZE - 3);
-}
-
-/* copied from arch/x86/oprofile/backtrace.c */
-static struct frame_head *dump_user_backtrace(struct frame_head *head)
-{
-       struct frame_head bufhead[2];
-
-       /* Also check accessibility of one struct frame_head beyond */
-       if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
-               return NULL;
-       if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
-               return NULL;
-
-       oprofile_add_trace(bufhead[0].lr);
-
-       /* frame pointers should strictly progress back up the stack
-        * (towards higher addresses) */
-       if (bufhead[0].fp <= head)
-               return NULL;
-
-       return bufhead[0].fp;
-}
-
-void avr32_backtrace(struct pt_regs * const regs, unsigned int depth)
-{
-       /* Get first frame pointer */
-       struct frame_head *head = (struct frame_head *)(regs->r7);
-
-       if (!user_mode(regs)) {
-#ifdef CONFIG_FRAME_POINTER
-               /*
-                * Traverse the kernel stack from frame to frame up to
-                * "depth" steps.
-                */
-               while (depth-- && valid_stack_ptr(task_thread_info(current),
-                                                 (unsigned long)head)) {
-                       oprofile_add_trace(head->lr);
-                       if (head->fp <= head)
-                               break;
-                       head = head->fp;
-               }
-#endif
-       } else {
-               /* Assume we have frame pointers in user mode process */
-               while (depth-- && head)
-                       head = dump_user_backtrace(head);
-       }
-}
-
-
diff --git a/arch/avr32/oprofile/op_model_avr32.c b/arch/avr32/oprofile/op_model_avr32.c
deleted file mode 100644 (file)
index 08308be..0000000
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * AVR32 Performance Counter Driver
- *
- * Copyright (C) 2005-2007 Atmel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Author: Ronny Pedersen
- */
-#include <linux/errno.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/oprofile.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-
-#include <asm/sysreg.h>
-
-#define AVR32_PERFCTR_IRQ_GROUP        0
-#define AVR32_PERFCTR_IRQ_LINE 1
-
-void avr32_backtrace(struct pt_regs * const regs, unsigned int depth);
-
-enum { PCCNT, PCNT0, PCNT1, NR_counter };
-
-struct avr32_perf_counter {
-       unsigned long   enabled;
-       unsigned long   event;
-       unsigned long   count;
-       unsigned long   unit_mask;
-       unsigned long   kernel;
-       unsigned long   user;
-
-       u32             ie_mask;
-       u32             flag_mask;
-};
-
-static struct avr32_perf_counter counter[NR_counter] = {
-       {
-               .ie_mask        = SYSREG_BIT(IEC),
-               .flag_mask      = SYSREG_BIT(FC),
-       }, {
-               .ie_mask        = SYSREG_BIT(IE0),
-               .flag_mask      = SYSREG_BIT(F0),
-       }, {
-               .ie_mask        = SYSREG_BIT(IE1),
-               .flag_mask      = SYSREG_BIT(F1),
-       },
-};
-
-static void avr32_perf_counter_reset(void)
-{
-       /* Reset all counter and disable/clear all interrupts */
-       sysreg_write(PCCR, (SYSREG_BIT(PCCR_R)
-                               | SYSREG_BIT(PCCR_C)
-                               | SYSREG_BIT(FC)
-                               | SYSREG_BIT(F0)
-                               | SYSREG_BIT(F1)));
-}
-
-static irqreturn_t avr32_perf_counter_interrupt(int irq, void *dev_id)
-{
-       struct avr32_perf_counter *ctr = dev_id;
-       struct pt_regs *regs;
-       u32 pccr;
-
-       if (likely(!(intc_get_pending(AVR32_PERFCTR_IRQ_GROUP)
-                                       & (1 << AVR32_PERFCTR_IRQ_LINE))))
-               return IRQ_NONE;
-
-       regs = get_irq_regs();
-       pccr = sysreg_read(PCCR);
-
-       /* Clear the interrupt flags we're about to handle */
-       sysreg_write(PCCR, pccr);
-
-       /* PCCNT */
-       if (ctr->enabled && (pccr & ctr->flag_mask)) {
-               sysreg_write(PCCNT, -ctr->count);
-               oprofile_add_sample(regs, PCCNT);
-       }
-       ctr++;
-       /* PCNT0 */
-       if (ctr->enabled && (pccr & ctr->flag_mask)) {
-               sysreg_write(PCNT0, -ctr->count);
-               oprofile_add_sample(regs, PCNT0);
-       }
-       ctr++;
-       /* PCNT1 */
-       if (ctr->enabled && (pccr & ctr->flag_mask)) {
-               sysreg_write(PCNT1, -ctr->count);
-               oprofile_add_sample(regs, PCNT1);
-       }
-
-       return IRQ_HANDLED;
-}
-
-static int avr32_perf_counter_create_files(struct dentry *root)
-{
-       struct dentry *dir;
-       unsigned int i;
-       char filename[4];
-
-       for (i = 0; i < NR_counter; i++) {
-               snprintf(filename, sizeof(filename), "%u", i);
-               dir = oprofilefs_mkdir(root, filename);
-
-               oprofilefs_create_ulong(dir, "enabled",
-                               &counter[i].enabled);
-               oprofilefs_create_ulong(dir, "event",
-                               &counter[i].event);
-               oprofilefs_create_ulong(dir, "count",
-                               &counter[i].count);
-
-               /* Dummy entries */
-               oprofilefs_create_ulong(dir, "kernel",
-                               &counter[i].kernel);
-               oprofilefs_create_ulong(dir, "user",
-                               &counter[i].user);
-               oprofilefs_create_ulong(dir, "unit_mask",
-                               &counter[i].unit_mask);
-       }
-
-       return 0;
-}
-
-static int avr32_perf_counter_setup(void)
-{
-       struct avr32_perf_counter *ctr;
-       u32 pccr;
-       int ret;
-       int i;
-
-       pr_debug("avr32_perf_counter_setup\n");
-
-       if (sysreg_read(PCCR) & SYSREG_BIT(PCCR_E)) {
-               printk(KERN_ERR
-                       "oprofile: setup: perf counter already enabled\n");
-               return -EBUSY;
-       }
-
-       ret = request_irq(AVR32_PERFCTR_IRQ_GROUP,
-                       avr32_perf_counter_interrupt, IRQF_SHARED,
-                       "oprofile", counter);
-       if (ret)
-               return ret;
-
-       avr32_perf_counter_reset();
-
-       pccr = 0;
-       for (i = PCCNT; i < NR_counter; i++) {
-               ctr = &counter[i];
-               if (!ctr->enabled)
-                       continue;
-
-               pr_debug("enabling counter %d...\n", i);
-
-               pccr |= ctr->ie_mask;
-
-               switch (i) {
-               case PCCNT:
-                       /* PCCNT always counts cycles, so no events */
-                       sysreg_write(PCCNT, -ctr->count);
-                       break;
-               case PCNT0:
-                       pccr |= SYSREG_BF(CONF0, ctr->event);
-                       sysreg_write(PCNT0, -ctr->count);
-                       break;
-               case PCNT1:
-                       pccr |= SYSREG_BF(CONF1, ctr->event);
-                       sysreg_write(PCNT1, -ctr->count);
-                       break;
-               }
-       }
-
-       pr_debug("oprofile: writing 0x%x to PCCR...\n", pccr);
-
-       sysreg_write(PCCR, pccr);
-
-       return 0;
-}
-
-static void avr32_perf_counter_shutdown(void)
-{
-       pr_debug("avr32_perf_counter_shutdown\n");
-
-       avr32_perf_counter_reset();
-       free_irq(AVR32_PERFCTR_IRQ_GROUP, counter);
-}
-
-static int avr32_perf_counter_start(void)
-{
-       pr_debug("avr32_perf_counter_start\n");
-
-       sysreg_write(PCCR, sysreg_read(PCCR) | SYSREG_BIT(PCCR_E));
-
-       return 0;
-}
-
-static void avr32_perf_counter_stop(void)
-{
-       pr_debug("avr32_perf_counter_stop\n");
-
-       sysreg_write(PCCR, sysreg_read(PCCR) & ~SYSREG_BIT(PCCR_E));
-}
-
-static struct oprofile_operations avr32_perf_counter_ops __initdata = {
-       .create_files   = avr32_perf_counter_create_files,
-       .setup          = avr32_perf_counter_setup,
-       .shutdown       = avr32_perf_counter_shutdown,
-       .start          = avr32_perf_counter_start,
-       .stop           = avr32_perf_counter_stop,
-       .cpu_type       = "avr32",
-};
-
-int __init oprofile_arch_init(struct oprofile_operations *ops)
-{
-       if (!(current_cpu_data.features & AVR32_FEATURE_PCTR))
-               return -ENODEV;
-
-       memcpy(ops, &avr32_perf_counter_ops,
-                       sizeof(struct oprofile_operations));
-
-       ops->backtrace = avr32_backtrace;
-
-       printk(KERN_INFO "oprofile: using AVR32 performance monitoring.\n");
-
-       return 0;
-}
-
-void oprofile_arch_exit(void)
-{
-
-}
index 625db8a..dc4ef9a 100644 (file)
@@ -7,6 +7,7 @@ generic-y += device.h
 generic-y += div64.h
 generic-y += emergency-restart.h
 generic-y += errno.h
+generic-y += extable.h
 generic-y += fb.h
 generic-y += futex.h
 generic-y += hw_irq.h
index 0eff88a..f54a34f 100644 (file)
@@ -12,7 +12,6 @@
 /*
  * User space memory access functions
  */
-#include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/string.h>
 
@@ -29,9 +28,6 @@ static inline void set_fs(mm_segment_t fs)
 
 #define segment_eq(a, b) ((a) == (b))
 
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
 #define access_ok(type, addr, size) _access_ok((unsigned long)(addr), (size))
 
 /*
@@ -46,22 +42,7 @@ static inline int _access_ok(unsigned long addr, unsigned long size) { return 1;
 extern int _access_ok(unsigned long addr, unsigned long size);
 #endif
 
-/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-
-struct exception_table_entry {
-       unsigned long insn, fixup;
-};
+#include <asm/extable.h>
 
 /*
  * These are the main single-value transfer routines.  They automatically
@@ -163,41 +144,23 @@ static inline int bad_user_access_length(void)
                : "a" (__ptr(ptr)));            \
 })
 
-#define __copy_to_user_inatomic __copy_to_user
-#define __copy_from_user_inatomic __copy_from_user
-
 static inline unsigned long __must_check
-__copy_from_user(void *to, const void __user *from, unsigned long n)
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
        memcpy(to, (const void __force *)from, n);
        return 0;
 }
 
 static inline unsigned long __must_check
-__copy_to_user(void __user *to, const void *from, unsigned long n)
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
        memcpy((void __force *)to, from, n);
        SSYNC();
        return 0;
 }
 
-static inline unsigned long __must_check
-copy_from_user(void *to, const void __user *from, unsigned long n)
-{
-       if (likely(access_ok(VERIFY_READ, from, n)))
-               return __copy_from_user(to, from, n);
-       memset(to, 0, n);
-       return n;
-}
-
-static inline unsigned long __must_check
-copy_to_user(void __user *to, const void *from, unsigned long n)
-{
-       if (likely(access_ok(VERIFY_WRITE, to, n)))
-               return __copy_to_user(to, from, n);
-       return n;
-}
-
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 /*
  * Copy a null terminated string from userspace.
  */
index 89d5162..8981485 100644 (file)
@@ -370,7 +370,7 @@ int _access_ok(unsigned long addr, unsigned long size)
        /* Check that things do not wrap around */
        if (addr > ULONG_MAX - size)
                return 0;
-       if (segment_eq(get_fs(), KERNEL_DS))
+       if (uaccess_kernel())
                return 1;
 #ifdef CONFIG_MTD_UCLINUX
        if (1)
index 0e9fcf8..0135055 100644 (file)
@@ -230,7 +230,9 @@ static void __init bfin_gptmr0_clockevent_init(struct clock_event_device *evt)
        clock_tick = get_sclk();
        evt->mult = div_sc(clock_tick, NSEC_PER_SEC, evt->shift);
        evt->max_delta_ns = clockevent_delta2ns(-1, evt);
+       evt->max_delta_ticks = (unsigned long)-1;
        evt->min_delta_ns = clockevent_delta2ns(100, evt);
+       evt->min_delta_ticks = 100;
 
        evt->cpumask = cpumask_of(0);
 
@@ -344,7 +346,9 @@ void bfin_coretmr_clockevent_init(void)
        clock_tick = get_cclk() / TIME_SCALE;
        evt->mult = div_sc(clock_tick, NSEC_PER_SEC, evt->shift);
        evt->max_delta_ns = clockevent_delta2ns(-1, evt);
+       evt->max_delta_ticks = (unsigned long)-1;
        evt->min_delta_ns = clockevent_delta2ns(100, evt);
+       evt->min_delta_ticks = 100;
 
        evt->cpumask = cpumask_of(cpu);
 
index 82619c3..f0eaf04 100644 (file)
@@ -12,6 +12,7 @@ generic-y += dma.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
+generic-y += extable.h
 generic-y += fb.h
 generic-y += fcntl.h
 generic-y += futex.h
index 453dd26..ba67568 100644 (file)
 #include <linux/compiler.h>
 #include <linux/string.h>
 
-#ifdef CONFIG_ACCESS_CHECK
-#define __access_ok _access_ok
-#endif
-
 /*
- * __copy_from_user/copy_to_user are based on ones in asm-generic/uaccess.h
- *
  * C6X supports unaligned 32 and 64 bit loads and stores.
  */
-static inline __must_check long __copy_from_user(void *to,
-               const void __user *from, unsigned long n)
+static inline __must_check unsigned long
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
        u32 tmp32;
        u64 tmp64;
@@ -58,8 +52,8 @@ static inline __must_check long __copy_from_user(void *to,
        return 0;
 }
 
-static inline __must_check long __copy_to_user(void __user *to,
-               const void *from, unsigned long n)
+static inline __must_check unsigned long
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
        u32 tmp32;
        u64 tmp64;
@@ -93,9 +87,8 @@ static inline __must_check long __copy_to_user(void __user *to,
        memcpy((void __force *)to, from, n);
        return 0;
 }
-
-#define __copy_to_user   __copy_to_user
-#define __copy_from_user __copy_from_user
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
 extern int _access_ok(unsigned long addr, unsigned long size);
 #ifdef CONFIG_ACCESS_CHECK
index 3e9bdfb..a742ae2 100644 (file)
@@ -23,7 +23,7 @@ int _access_ok(unsigned long addr, unsigned long size)
        if (!addr || addr > (0xffffffffUL - (size - 1)))
                goto _bad_access;
 
-       if (segment_eq(get_fs(), KERNEL_DS))
+       if (uaccess_kernel())
                return 1;
 
        if (memory_start <= addr && (addr + size - 1) < memory_end)
index c19901e..0bd0452 100644 (file)
@@ -234,7 +234,9 @@ void __init timer64_init(void)
        clockevents_calc_mult_shift(cd, c6x_core_freq / TIMER_DIVISOR, 5);
 
        cd->max_delta_ns        = clockevent_delta2ns(0x7fffffff, cd);
+       cd->max_delta_ticks     = 0x7fffffff;
        cd->min_delta_ns        = clockevent_delta2ns(250, cd);
+       cd->min_delta_ticks     = 250;
 
        cd->cpumask             = cpumask_of(smp_processor_id());
 
index 1ba7cc0..48fa37f 100644 (file)
@@ -188,11 +188,10 @@ unsigned long __copy_user(void __user *pdst, const void *psrc, unsigned long pn)
 }
 EXPORT_SYMBOL(__copy_user);
 
-/* Copy from user to kernel, zeroing the bytes that were inaccessible in
-   userland.  The return-value is the number of bytes that were
+/* Copy from user to kernel.  The return-value is the number of bytes that were
    inaccessible.  */
 
-unsigned long __copy_user_zeroing(void *pdst, const void __user *psrc,
+unsigned long __copy_user_in(void *pdst, const void __user *psrc,
                                  unsigned long pn)
 {
   /* We want the parameters put in special registers.
@@ -217,19 +216,17 @@ unsigned long __copy_user_zeroing(void *pdst, const void __user *psrc,
     {
       __asm_copy_from_user_1 (dst, src, retn);
       n--;
+      if (retn)
+         goto exception;
     }
 
     if (((unsigned long) src & 2) && n >= 2)
     {
       __asm_copy_from_user_2 (dst, src, retn);
       n -= 2;
+      if (retn)
+         goto exception;
     }
-
-    /* We only need one check after the unalignment-adjustments, because
-       if both adjustments were done, either both or neither reference
-       had an exception.  */
-    if (retn != 0)
-      goto copy_exception_bytes;
   }
 
   /* Decide which copying method to use. */
@@ -328,7 +325,7 @@ unsigned long __copy_user_zeroing(void *pdst, const void __user *psrc,
     n -= 4;
 
     if (retn)
-      goto copy_exception_bytes;
+      goto exception;
   }
 
   /* If we get here, there were no memory read faults.  */
@@ -356,20 +353,10 @@ unsigned long __copy_user_zeroing(void *pdst, const void __user *psrc,
      bytes.  */
   return retn;
 
-copy_exception_bytes:
-  /* We already have "retn" bytes cleared, and need to clear the
-     remaining "n" bytes.  A non-optimized simple byte-for-byte in-line
-     memset is preferred here, since this isn't speed-critical code and
-     we'd rather have this a leaf-function than calling memset.  */
-  {
-    char *endp;
-    for (endp = dst + n; dst < endp; dst++)
-      *dst = 0;
-  }
-
+exception:
   return retn + n;
 }
-EXPORT_SYMBOL(__copy_user_zeroing);
+EXPORT_SYMBOL(__copy_user_in);
 
 /* Zero userspace.  */
 unsigned long __do_clear_user(void __user *pto, unsigned long pn)
index 05e58da..20b6080 100644 (file)
@@ -156,10 +156,9 @@ unsigned long __copy_user(void __user *pdst, const void *psrc, unsigned long pn)
 }
 EXPORT_SYMBOL(__copy_user);
 
-/* Copy from user to kernel, zeroing the bytes that were inaccessible in
-   userland.  The return-value is the number of bytes that were
+/* Copy from user to kernel.  The return-value is the number of bytes that were
    inaccessible.  */
-unsigned long __copy_user_zeroing(void *pdst, const void __user *psrc,
+unsigned long __copy_user_in(void *pdst, const void __user *psrc,
                                  unsigned long pn)
 {
   /* We want the parameters put in special registers.
@@ -184,19 +183,18 @@ unsigned long __copy_user_zeroing(void *pdst, const void __user *psrc,
     {
       __asm_copy_from_user_1 (dst, src, retn);
       n--;
+      if (retn != 0)
+        goto exception;
     }
 
     if (((unsigned long) src & 2) && n >= 2)
     {
       __asm_copy_from_user_2 (dst, src, retn);
       n -= 2;
+      if (retn != 0)
+        goto exception;
     }
 
-    /* We only need one check after the unalignment-adjustments, because
-       if both adjustments were done, either both or neither reference
-       had an exception.  */
-    if (retn != 0)
-      goto copy_exception_bytes;
   }
 
   /* Movem is dirt cheap.  The overheap is low enough to always use the
@@ -279,7 +277,7 @@ unsigned long __copy_user_zeroing(void *pdst, const void __user *psrc,
     n -= 4;
 
     if (retn)
-      goto copy_exception_bytes;
+      goto exception;
   }
 
   /* If we get here, there were no memory read faults.  */
@@ -307,20 +305,10 @@ unsigned long __copy_user_zeroing(void *pdst, const void __user *psrc,
      bytes.  */
   return retn;
 
-copy_exception_bytes:
-  /* We already have "retn" bytes cleared, and need to clear the
-     remaining "n" bytes.  A non-optimized simple byte-for-byte in-line
-     memset is preferred here, since this isn't speed-critical code and
-     we'd rather have this a leaf-function than calling memset.  */
-  {
-    char *endp;
-    for (endp = dst + n; dst < endp; dst++)
-      *dst = 0;
-  }
-
+exception:
   return retn + n;
 }
-EXPORT_SYMBOL(__copy_user_zeroing);
+EXPORT_SYMBOL(__copy_user_in);
 
 /* Zero userspace.  */
 unsigned long __do_clear_user(void __user *pto, unsigned long pn)
index 65b02d9..5477c98 100644 (file)
@@ -172,16 +172,14 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
        __asm_copy_user_cont(to, from, ret,     \
                "       move.b [%1+],$r9\n"     \
                "2:     move.b $r9,[%0+]\n",    \
-               "3:     addq 1,%2\n"            \
-               "       clear.b [%0+]\n",       \
+               "3:     addq 1,%2\n",           \
                "       .dword 2b,3b\n")
 
 #define __asm_copy_from_user_2x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
        __asm_copy_user_cont(to, from, ret,             \
                "       move.w [%1+],$r9\n"             \
                "2:     move.w $r9,[%0+]\n" COPY,       \
-               "3:     addq 2,%2\n"                    \
-               "       clear.w [%0+]\n" FIXUP,         \
+               "3:     addq 2,%2\n" FIXUP,             \
                "       .dword 2b,3b\n" TENTRY)
 
 #define __asm_copy_from_user_2(to, from, ret) \
@@ -191,16 +189,14 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
        __asm_copy_from_user_2x_cont(to, from, ret,     \
                "       move.b [%1+],$r9\n"             \
                "4:     move.b $r9,[%0+]\n",            \
-               "5:     addq 1,%2\n"                    \
-               "       clear.b [%0+]\n",               \
+               "5:     addq 1,%2\n",                   \
                "       .dword 4b,5b\n")
 
 #define __asm_copy_from_user_4x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
        __asm_copy_user_cont(to, from, ret,             \
                "       move.d [%1+],$r9\n"             \
                "2:     move.d $r9,[%0+]\n" COPY,       \
-               "3:     addq 4,%2\n"                    \
-               "       clear.d [%0+]\n" FIXUP,         \
+               "3:     addq 4,%2\n" FIXUP,             \
                "       .dword 2b,3b\n" TENTRY)
 
 #define __asm_copy_from_user_4(to, from, ret) \
@@ -210,8 +206,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
        __asm_copy_from_user_4x_cont(to, from, ret,     \
                "       move.b [%1+],$r9\n"             \
                "4:     move.b $r9,[%0+]\n",            \
-               "5:     addq 1,%2\n"                    \
-               "       clear.b [%0+]\n",               \
+               "5:     addq 1,%2\n",                   \
                "       .dword 4b,5b\n")
 
 #define __asm_copy_from_user_6x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
@@ -219,7 +214,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
                "       move.w [%1+],$r9\n"             \
                "4:     move.w $r9,[%0+]\n" COPY,       \
                "5:     addq 2,%2\n"                    \
-               "       clear.w [%0+]\n" FIXUP,         \
+                       FIXUP,                          \
                "       .dword 4b,5b\n" TENTRY)
 
 #define __asm_copy_from_user_6(to, from, ret) \
@@ -229,8 +224,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
        __asm_copy_from_user_6x_cont(to, from, ret,     \
                "       move.b [%1+],$r9\n"             \
                "6:     move.b $r9,[%0+]\n",            \
-               "7:     addq 1,%2\n"                    \
-               "       clear.b [%0+]\n",               \
+               "7:     addq 1,%2\n",                   \
                "       .dword 6b,7b\n")
 
 #define __asm_copy_from_user_8x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
@@ -238,7 +232,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
                "       move.d [%1+],$r9\n"             \
                "4:     move.d $r9,[%0+]\n" COPY,       \
                "5:     addq 4,%2\n"                    \
-               "       clear.d [%0+]\n" FIXUP,         \
+                       FIXUP,                          \
                "       .dword 4b,5b\n" TENTRY)
 
 #define __asm_copy_from_user_8(to, from, ret) \
@@ -248,8 +242,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
        __asm_copy_from_user_8x_cont(to, from, ret,     \
                "       move.b [%1+],$r9\n"             \
                "6:     move.b $r9,[%0+]\n",            \
-               "7:     addq 1,%2\n"                    \
-               "       clear.b [%0+]\n",               \
+               "7:     addq 1,%2\n",                   \
                "       .dword 6b,7b\n")
 
 #define __asm_copy_from_user_10x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
@@ -257,7 +250,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
                "       move.w [%1+],$r9\n"             \
                "6:     move.w $r9,[%0+]\n" COPY,       \
                "7:     addq 2,%2\n"                    \
-               "       clear.w [%0+]\n" FIXUP,         \
+                       FIXUP,                          \
                "       .dword 6b,7b\n" TENTRY)
 
 #define __asm_copy_from_user_10(to, from, ret) \
@@ -267,8 +260,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
        __asm_copy_from_user_10x_cont(to, from, ret,    \
                "       move.b [%1+],$r9\n"             \
                "8:     move.b $r9,[%0+]\n",            \
-               "9:     addq 1,%2\n"                    \
-               "       clear.b [%0+]\n",               \
+               "9:     addq 1,%2\n",                   \
                "       .dword 8b,9b\n")
 
 #define __asm_copy_from_user_12x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
@@ -276,7 +268,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
                "       move.d [%1+],$r9\n"             \
                "6:     move.d $r9,[%0+]\n" COPY,       \
                "7:     addq 4,%2\n"                    \
-               "       clear.d [%0+]\n" FIXUP,         \
+                       FIXUP,                          \
                "       .dword 6b,7b\n" TENTRY)
 
 #define __asm_copy_from_user_12(to, from, ret) \
@@ -286,8 +278,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
        __asm_copy_from_user_12x_cont(to, from, ret,    \
                "       move.b [%1+],$r9\n"             \
                "8:     move.b $r9,[%0+]\n",            \
-               "9:     addq 1,%2\n"                    \
-               "       clear.b [%0+]\n",               \
+               "9:     addq 1,%2\n",                   \
                "       .dword 8b,9b\n")
 
 #define __asm_copy_from_user_14x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
@@ -295,7 +286,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
                "       move.w [%1+],$r9\n"             \
                "8:     move.w $r9,[%0+]\n" COPY,       \
                "9:     addq 2,%2\n"                    \
-               "       clear.w [%0+]\n" FIXUP,         \
+                       FIXUP,                          \
                "       .dword 8b,9b\n" TENTRY)
 
 #define __asm_copy_from_user_14(to, from, ret) \
@@ -305,8 +296,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
        __asm_copy_from_user_14x_cont(to, from, ret,    \
                "       move.b [%1+],$r9\n"             \
                "10:    move.b $r9,[%0+]\n",            \
-               "11:    addq 1,%2\n"                    \
-               "       clear.b [%0+]\n",               \
+               "11:    addq 1,%2\n",                   \
                "       .dword 10b,11b\n")
 
 #define __asm_copy_from_user_16x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
@@ -314,7 +304,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
                "       move.d [%1+],$r9\n"             \
                "8:     move.d $r9,[%0+]\n" COPY,       \
                "9:     addq 4,%2\n"                    \
-               "       clear.d [%0+]\n" FIXUP,         \
+                       FIXUP,                          \
                "       .dword 8b,9b\n" TENTRY)
 
 #define __asm_copy_from_user_16(to, from, ret) \
@@ -325,7 +315,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
                "       move.d [%1+],$r9\n"             \
                "10:    move.d $r9,[%0+]\n" COPY,       \
                "11:    addq 4,%2\n"                    \
-               "       clear.d [%0+]\n" FIXUP,         \
+                       FIXUP,                          \
                "       .dword 10b,11b\n" TENTRY)
 
 #define __asm_copy_from_user_20(to, from, ret) \
@@ -336,7 +326,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
                "       move.d [%1+],$r9\n"             \
                "12:    move.d $r9,[%0+]\n" COPY,       \
                "13:    addq 4,%2\n"                    \
-               "       clear.d [%0+]\n" FIXUP,         \
+                       FIXUP,                          \
                "       .dword 12b,13b\n" TENTRY)
 
 #define __asm_copy_from_user_24(to, from, ret) \
index 3196019..dc2ce09 100644 (file)
@@ -178,8 +178,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
                "2:     move.b [%1+],$acr\n"    \
                "       move.b $acr,[%0+]\n",   \
                "3:     addq 1,%2\n"            \
-               "       jump 1b\n"              \
-               "       clear.b [%0+]\n",       \
+               "       jump 1b\n",             \
                "       .dword 2b,3b\n")
 
 #define __asm_copy_from_user_2x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
@@ -189,8 +188,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
                "       move.w $acr,[%0+]\n",           \
                        FIXUP                           \
                "3:     addq 2,%2\n"                    \
-               "       jump 1b\n"                      \
-               "       clear.w [%0+]\n",               \
+               "       jump 1b\n",                     \
                        TENTRY                          \
                "       .dword 2b,3b\n")
 
@@ -201,8 +199,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
        __asm_copy_from_user_2x_cont(to, from, ret,     \
                "4:     move.b [%1+],$acr\n"            \
                "       move.b $acr,[%0+]\n",           \
-               "5:     addq 1,%2\n"                    \
-               "       clear.b [%0+]\n",               \
+               "5:     addq 1,%2\n",                   \
                "       .dword 4b,5b\n")
 
 #define __asm_copy_from_user_4x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
@@ -212,8 +209,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
                "       move.d $acr,[%0+]\n",           \
                        FIXUP                           \
                "3:     addq 4,%2\n"                    \
-               "       jump 1b\n"                      \
-               "       clear.d [%0+]\n",               \
+               "       jump 1b\n",                     \
                        TENTRY                          \
                "       .dword 2b,3b\n")
 
@@ -224,8 +220,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
        __asm_copy_from_user_4x_cont(to, from, ret,     \
                "4:     move.b [%1+],$acr\n"            \
                "       move.b $acr,[%0+]\n",           \
-               "5:     addq 1,%2\n"                    \
-               "       clear.b [%0+]\n",               \
+               "5:     addq 1,%2\n",                   \
                "       .dword 4b,5b\n")
 
 #define __asm_copy_from_user_6x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
@@ -234,8 +229,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
                "4:     move.w [%1+],$acr\n"            \
                "       move.w $acr,[%0+]\n",           \
                        FIXUP                           \
-               "5:     addq 2,%2\n"                    \
-               "       clear.w [%0+]\n",               \
+               "5:     addq 2,%2\n",                   \
                        TENTRY                          \
                "       .dword 4b,5b\n")
 
@@ -246,8 +240,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
        __asm_copy_from_user_6x_cont(to, from, ret,     \
                "6:     move.b [%1+],$acr\n"            \
                "       move.b $acr,[%0+]\n",           \
-               "7:     addq 1,%2\n"                    \
-               "       clear.b [%0+]\n",               \
+               "7:     addq 1,%2\n",                   \
                "       .dword 6b,7b\n")
 
 #define __asm_copy_from_user_8x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
@@ -256,8 +249,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
                "4:     move.d [%1+],$acr\n"            \
                "       move.d $acr,[%0+]\n",           \
                        FIXUP                           \
-               "5:     addq 4,%2\n"                    \
-               "       clear.d [%0+]\n",               \
+               "5:     addq 4,%2\n",                   \
                        TENTRY                          \
                "       .dword 4b,5b\n")
 
@@ -268,8 +260,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
        __asm_copy_from_user_8x_cont(to, from, ret,     \
                "6:     move.b [%1+],$acr\n"            \
                "       move.b $acr,[%0+]\n",           \
-               "7:     addq 1,%2\n"                    \
-               "       clear.b [%0+]\n",               \
+               "7:     addq 1,%2\n",                   \
                "       .dword 6b,7b\n")
 
 #define __asm_copy_from_user_10x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
@@ -278,8 +269,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
                "6:     move.w [%1+],$acr\n"            \
                "       move.w $acr,[%0+]\n",           \
                        FIXUP                           \
-               "7:     addq 2,%2\n"                    \
-               "       clear.w [%0+]\n",               \
+               "7:     addq 2,%2\n",                   \
                        TENTRY                          \
                "       .dword 6b,7b\n")
 
@@ -290,8 +280,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
        __asm_copy_from_user_10x_cont(to, from, ret,    \
                "8:     move.b [%1+],$acr\n"            \
                "       move.b $acr,[%0+]\n",           \
-               "9:     addq 1,%2\n"                    \
-               "       clear.b [%0+]\n",               \
+               "9:     addq 1,%2\n",                   \
                "       .dword 8b,9b\n")
 
 #define __asm_copy_from_user_12x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
@@ -300,8 +289,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
                "6:     move.d [%1+],$acr\n"            \
                "       move.d $acr,[%0+]\n",           \
                        FIXUP                           \
-               "7:     addq 4,%2\n"                    \
-               "       clear.d [%0+]\n",               \
+               "7:     addq 4,%2\n",                   \
                        TENTRY                          \
                "       .dword 6b,7b\n")
 
@@ -312,8 +300,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
        __asm_copy_from_user_12x_cont(to, from, ret,    \
                "8:     move.b [%1+],$acr\n"            \
                "       move.b $acr,[%0+]\n",           \
-               "9:     addq 1,%2\n"                    \
-               "       clear.b [%0+]\n",               \
+               "9:     addq 1,%2\n",                   \
                "       .dword 8b,9b\n")
 
 #define __asm_copy_from_user_14x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
@@ -322,8 +309,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
                "8:     move.w [%1+],$acr\n"            \
                "       move.w $acr,[%0+]\n",           \
                        FIXUP                           \
-               "9:     addq 2,%2\n"                    \
-               "       clear.w [%0+]\n",               \
+               "9:     addq 2,%2\n",                   \
                        TENTRY                          \
                "       .dword 8b,9b\n")
 
@@ -334,8 +320,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
        __asm_copy_from_user_14x_cont(to, from, ret,    \
                "10:    move.b [%1+],$acr\n"            \
                "       move.b $acr,[%0+]\n",           \
-               "11:    addq 1,%2\n"                    \
-               "       clear.b [%0+]\n",               \
+               "11:    addq 1,%2\n",                   \
                "       .dword 10b,11b\n")
 
 #define __asm_copy_from_user_16x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
@@ -344,8 +329,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
                "8:     move.d [%1+],$acr\n"            \
                "       move.d $acr,[%0+]\n",           \
                        FIXUP                           \
-               "9:     addq 4,%2\n"                    \
-               "       clear.d [%0+]\n",               \
+               "9:     addq 4,%2\n",                   \
                        TENTRY                          \
                "       .dword 8b,9b\n")
 
@@ -358,8 +342,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
                "10:    move.d [%1+],$acr\n"            \
                "       move.d $acr,[%0+]\n",           \
                        FIXUP                           \
-               "11:    addq 4,%2\n"                    \
-               "       clear.d [%0+]\n",               \
+               "11:    addq 4,%2\n",                   \
                        TENTRY                          \
                "       .dword 10b,11b\n")
 
@@ -372,8 +355,7 @@ __do_strncpy_from_user(char *dst, const char *src, long count)
                "12:    move.d [%1+],$acr\n"            \
                "       move.d $acr,[%0+]\n",           \
                        FIXUP                           \
-               "13:    addq 4,%2\n"                    \
-               "       clear.d [%0+]\n",               \
+               "13:    addq 4,%2\n",                   \
                        TENTRY                          \
                "       .dword 12b,13b\n")
 
index 0f5132b..2890099 100644 (file)
@@ -9,6 +9,7 @@ generic-y += device.h
 generic-y += div64.h
 generic-y += errno.h
 generic-y += exec.h
+generic-y += extable.h
 generic-y += emergency-restart.h
 generic-y += fcntl.h
 generic-y += futex.h
index 56c7d57..0d473ae 100644 (file)
 #ifndef _CRIS_UACCESS_H
 #define _CRIS_UACCESS_H
 
-#ifndef __ASSEMBLY__
-#include <linux/sched.h>
-#include <linux/errno.h>
 #include <asm/processor.h>
 #include <asm/page.h>
 
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
 /*
  * The fs value determines whether argument validity checking should be
  * performed or not.  If get_fs() == USER_DS, checking is performed, with
 
 #define segment_eq(a, b)       ((a).seg == (b).seg)
 
-#define __kernel_ok (segment_eq(get_fs(), KERNEL_DS))
+#define __kernel_ok (uaccess_kernel())
 #define __user_ok(addr, size) \
        (((size) <= TASK_SIZE) && ((addr) <= TASK_SIZE-(size)))
 #define __access_ok(addr, size) (__kernel_ok || __user_ok((addr), (size)))
 #define access_ok(type, addr, size) __access_ok((unsigned long)(addr), (size))
 
 #include <arch/uaccess.h>
-
-/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-
-struct exception_table_entry {
-       unsigned long insn, fixup;
-};
+#include <asm/extable.h>
 
 /*
  * These are the main single-value transfer routines.  They automatically
@@ -191,7 +169,7 @@ extern long __get_user_bad(void);
    live in lib/usercopy.c  */
 
 extern unsigned long __copy_user(void __user *to, const void *from, unsigned long n);
-extern unsigned long __copy_user_zeroing(void *to, const void __user *from, unsigned long n);
+extern unsigned long __copy_user_in(void *to, const void __user *from, unsigned long n);
 extern unsigned long __do_clear_user(void __user *to, unsigned long n);
 
 static inline long
@@ -258,7 +236,7 @@ __constant_copy_from_user(void *to, const void __user *from, unsigned long n)
        else if (n == 24)
                __asm_copy_from_user_24(to, from, ret);
        else
-               ret = __copy_user_zeroing(to, from, n);
+               ret = __copy_user_in(to, from, n);
 
        return ret;
 }
@@ -358,64 +336,33 @@ static inline size_t clear_user(void __user *to, size_t n)
                return __do_clear_user(to, n);
 }
 
-static inline size_t copy_from_user(void *to, const void __user *from, size_t n)
+static inline unsigned long
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-       if (unlikely(!access_ok(VERIFY_READ, from, n))) {
-               memset(to, 0, n);
-               return n;
-       }
        if (__builtin_constant_p(n))
                return __constant_copy_from_user(to, from, n);
        else
-               return __copy_user_zeroing(to, from, n);
+               return __copy_user_in(to, from, n);
 }
 
-static inline size_t copy_to_user(void __user *to, const void *from, size_t n)
+static inline unsigned long
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-       if (unlikely(!access_ok(VERIFY_WRITE, to, n)))
-               return n;
        if (__builtin_constant_p(n))
                return __constant_copy_to_user(to, from, n);
        else
                return __copy_user(to, from, n);
 }
 
-/* We let the __ versions of copy_from/to_user inline, because they're often
- * used in fast paths and have only a small space overhead.
- */
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
 static inline unsigned long
-__generic_copy_from_user_nocheck(void *to, const void __user *from,
-                                unsigned long n)
-{
-       return __copy_user_zeroing(to, from, n);
-}
-
-static inline unsigned long
-__generic_copy_to_user_nocheck(void __user *to, const void *from,
-                              unsigned long n)
-{
-       return __copy_user(to, from, n);
-}
-
-static inline unsigned long
-__generic_clear_user_nocheck(void __user *to, unsigned long n)
+__clear_user(void __user *to, unsigned long n)
 {
        return __do_clear_user(to, n);
 }
 
-/* without checking */
-
-#define __copy_to_user(to, from, n) \
-       __generic_copy_to_user_nocheck((to), (from), (n))
-#define __copy_from_user(to, from, n) \
-       __generic_copy_from_user_nocheck((to), (from), (n))
-#define __copy_to_user_inatomic __copy_to_user
-#define __copy_from_user_inatomic __copy_from_user
-#define __clear_user(to, n) __generic_clear_user_nocheck((to), (n))
-
 #define strlen_user(str)       strnlen_user((str), 0x7ffffffe)
 
-#endif  /* __ASSEMBLY__ */
-
 #endif /* _CRIS_UACCESS_H */
index c33b467..cce3bc3 100644 (file)
@@ -1,6 +1,7 @@
 
 generic-y += clkdev.h
 generic-y += exec.h
+generic-y += extable.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
index c0f4057..e4e33b4 100644 (file)
 /*
  * User space memory access functions
  */
-#include <linux/sched.h>
 #include <linux/mm.h>
 #include <asm/segment.h>
 #include <asm/sections.h>
+#include <asm/extable.h>
 
 #define __ptr(x) ((unsigned long __force *)(x))
 
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
 /*
  * check that a range of addresses falls within the current address limit
  */
@@ -63,26 +60,6 @@ static inline int ___range_ok(unsigned long addr, unsigned long size)
 #define access_ok(type,addr,size) (__range_ok((void __user *)(addr), (size)) == 0)
 #define __access_ok(addr,size) (__range_ok((addr), (size)) == 0)
 
-/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-struct exception_table_entry
-{
-       unsigned long insn, fixup;
-};
-
-/* Returns 0 if exception not found and fixup otherwise.  */
-extern unsigned long search_exception_table(unsigned long);
-
 
 /*
  * These are the main single-value transfer routines.  They automatically
@@ -256,61 +233,50 @@ do {                                                      \
 /*
  *
  */
+
 #define ____force(x) (__force void *)(void __user *)(x)
 #ifdef CONFIG_MMU
 extern long __memset_user(void *dst, unsigned long count);
 extern long __memcpy_user(void *dst, const void *src, unsigned long count);
 
 #define __clear_user(dst,count)                        __memset_user(____force(dst), (count))
-#define __copy_from_user_inatomic(to, from, n) __memcpy_user((to), ____force(from), (n))
-#define __copy_to_user_inatomic(to, from, n)   __memcpy_user(____force(to), (from), (n))
 
 #else
 
 #define __clear_user(dst,count)                        (memset(____force(dst), 0, (count)), 0)
-#define __copy_from_user_inatomic(to, from, n) (memcpy((to), ____force(from), (n)), 0)
-#define __copy_to_user_inatomic(to, from, n)   (memcpy(____force(to), (from), (n)), 0)
 
 #endif
 
-static inline unsigned long __must_check
-clear_user(void __user *to, unsigned long n)
-{
-       if (likely(__access_ok(to, n)))
-               n = __clear_user(to, n);
-       return n;
-}
-
-static inline unsigned long __must_check
-__copy_to_user(void __user *to, const void *from, unsigned long n)
-{
-       might_fault();
-       return __copy_to_user_inatomic(to, from, n);
-}
-
 static inline unsigned long
-__copy_from_user(void *to, const void __user *from, unsigned long n)
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-       might_fault();
-       return __copy_from_user_inatomic(to, from, n);
+#ifdef CONFIG_MMU
+       return __memcpy_user(to, (__force const void *)from, n);
+#else
+       memcpy(to, (__force const void *)from, n);
+       return 0;
+#endif
 }
 
-static inline long copy_from_user(void *to, const void __user *from, unsigned long n)
+static inline unsigned long
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-       unsigned long ret = n;
-
-       if (likely(__access_ok(from, n)))
-               ret = __copy_from_user(to, from, n);
-
-       if (unlikely(ret != 0))
-               memset(to + (n - ret), 0, ret);
-
-       return ret;
+#ifdef CONFIG_MMU
+       return __memcpy_user((__force void *)to, from, n);
+#else
+       memcpy((__force void *)to, from, n);
+       return 0;
+#endif
 }
+#define INLINE_COPY_TO_USER
+#define INLINE_COPY_FROM_USER
 
-static inline long copy_to_user(void __user *to, const void *from, unsigned long n)
+static inline unsigned long __must_check
+clear_user(void __user *to, unsigned long n)
 {
-       return likely(__access_ok(to, n)) ? __copy_to_user(to, from, n) : n;
+       if (likely(__access_ok(to, n)))
+               n = __clear_user(to, n);
+       return n;
 }
 
 extern long strncpy_from_user(char *dst, const char __user *src, long count);
@@ -318,6 +284,4 @@ extern long strnlen_user(const char __user *src, long count);
 
 #define strlen_user(str) strnlen_user(str, 32767)
 
-extern unsigned long search_exception_table(unsigned long addr);
-
 #endif /* _ASM_UACCESS_H */
index ce29991..fb08ebe 100644 (file)
@@ -360,13 +360,8 @@ asmlinkage void memory_access_exception(unsigned long esr0,
        siginfo_t info;
 
 #ifdef CONFIG_MMU
-       unsigned long fixup;
-
-       fixup = search_exception_table(__frame->pc);
-       if (fixup) {
-               __frame->pc = fixup;
+       if (fixup_exception(__frame))
                return;
-       }
 #endif
 
        die_if_kernel("-- Memory Access Exception --\n"
index a0e8b3e..9198ddd 100644 (file)
@@ -10,40 +10,39 @@ extern const void __memset_end, __memset_user_error_lr, __memset_user_error_hand
 extern const void __memcpy_end, __memcpy_user_error_lr, __memcpy_user_error_handler;
 extern spinlock_t modlist_lock;
 
-
-/*****************************************************************************/
-/*
- * see if there's a fixup handler available to deal with a kernel fault
- */
-unsigned long search_exception_table(unsigned long pc)
+int fixup_exception(struct pt_regs *regs)
 {
        const struct exception_table_entry *extab;
+       unsigned long pc = regs->pc;
 
        /* determine if the fault lay during a memcpy_user or a memset_user */
-       if (__frame->lr == (unsigned long) &__memset_user_error_lr &&
+       if (regs->lr == (unsigned long) &__memset_user_error_lr &&
            (unsigned long) &memset <= pc && pc < (unsigned long) &__memset_end
            ) {
                /* the fault occurred in a protected memset
                 * - we search for the return address (in LR) instead of the program counter
                 * - it was probably during a clear_user()
                 */
-               return (unsigned long) &__memset_user_error_handler;
+               regs->pc = (unsigned long) &__memset_user_error_handler;
+               return 1;
        }
 
-       if (__frame->lr == (unsigned long) &__memcpy_user_error_lr &&
+       if (regs->lr == (unsigned long) &__memcpy_user_error_lr &&
            (unsigned long) &memcpy <= pc && pc < (unsigned long) &__memcpy_end
            ) {
                /* the fault occurred in a protected memset
                 * - we search for the return address (in LR) instead of the program counter
                 * - it was probably during a copy_to/from_user()
                 */
-               return (unsigned long) &__memcpy_user_error_handler;
+               regs->pc = (unsigned long) &__memcpy_user_error_handler;
+               return 1;
        }
 
        extab = search_exception_tables(pc);
-       if (extab)
-               return extab->fixup;
+       if (extab) {
+               regs->pc = extab->fixup;
+               return 1;
+       }
 
        return 0;
-
-} /* end search_exception_table() */
+}
index 614a46c..179e79e 100644 (file)
@@ -33,7 +33,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 {
        struct vm_area_struct *vma;
        struct mm_struct *mm;
-       unsigned long _pme, lrai, lrad, fixup;
+       unsigned long _pme, lrai, lrad;
        unsigned long flags = 0;
        siginfo_t info;
        pgd_t *pge;
@@ -201,10 +201,8 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 
  no_context:
        /* are we prepared to handle this kernel fault? */
-       if ((fixup = search_exception_table(__frame->pc)) != 0) {
-               __frame->pc = fixup;
+       if (fixup_exception(__frame))
                return;
-       }
 
 /*
  * Oops. The kernel tried to access some bad page. We'll have to
index 341740c..757cdeb 100644 (file)
@@ -13,6 +13,7 @@ generic-y += dma.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
+generic-y += extable.h
 generic-y += fb.h
 generic-y += fcntl.h
 generic-y += ftrace.h
@@ -68,7 +69,6 @@ generic-y += tlbflush.h
 generic-y += trace_clock.h
 generic-y += topology.h
 generic-y += types.h
-generic-y += uaccess.h
 generic-y += ucontext.h
 generic-y += unaligned.h
 generic-y += vga.h
diff --git a/arch/h8300/include/asm/uaccess.h b/arch/h8300/include/asm/uaccess.h
new file mode 100644 (file)
index 0000000..6f6144a
--- /dev/null
@@ -0,0 +1,54 @@
+#ifndef _ASM_UACCESS_H
+#define _ASM_UACCESS_H
+
+#include <linux/string.h>
+
+static inline __must_check unsigned long
+raw_copy_from_user(void *to, const void __user * from, unsigned long n)
+{
+       if (__builtin_constant_p(n)) {
+               switch(n) {
+               case 1:
+                       *(u8 *)to = *(u8 __force *)from;
+                       return 0;
+               case 2:
+                       *(u16 *)to = *(u16 __force *)from;
+                       return 0;
+               case 4:
+                       *(u32 *)to = *(u32 __force *)from;
+                       return 0;
+               }
+       }
+
+       memcpy(to, (const void __force *)from, n);
+       return 0;
+}
+
+static inline __must_check unsigned long
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+       if (__builtin_constant_p(n)) {
+               switch(n) {
+               case 1:
+                       *(u8 __force *)to = *(u8 *)from;
+                       return 0;
+               case 2:
+                       *(u16 __force *)to = *(u16 *)from;
+                       return 0;
+               case 4:
+                       *(u32 __force *)to = *(u32 *)from;
+                       return 0;
+               default:
+                       break;
+               }
+       }
+
+       memcpy((void __force *)to, from, n);
+       return 0;
+}
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
+
+#include <asm-generic/uaccess.h>
+
+#endif
index 797b64a..a2036bf 100644 (file)
@@ -11,6 +11,7 @@ generic-y += device.h
 generic-y += div64.h
 generic-y += emergency-restart.h
 generic-y += errno.h
+generic-y += extable.h
 generic-y += fb.h
 generic-y += fcntl.h
 generic-y += ftrace.h
index f61cfb2..458b698 100644 (file)
@@ -23,7 +23,6 @@
 /*
  * User space memory access functions
  */
-#include <linux/sched.h>
 #include <linux/mm.h>
 #include <asm/segment.h>
 #include <asm/sections.h>
@@ -50,8 +49,6 @@
  * reasonably simple and not *too* slow.  After all, we've got the
  * MMU for backup.
  */
-#define VERIFY_READ     0
-#define VERIFY_WRITE    1
 
 #define __access_ok(addr, size) \
        ((get_fs().seg == KERNEL_DS.seg) || \
  */
 
 /*  Assembly somewhat optimized copy routines  */
-unsigned long __copy_from_user_hexagon(void *to, const void __user *from,
+unsigned long raw_copy_from_user(void *to, const void __user *from,
                                     unsigned long n);
-unsigned long __copy_to_user_hexagon(void __user *to, const void *from,
+unsigned long raw_copy_to_user(void __user *to, const void *from,
                                   unsigned long n);
-
-#define __copy_from_user(to, from, n) __copy_from_user_hexagon(to, from, n)
-#define __copy_to_user(to, from, n) __copy_to_user_hexagon(to, from, n)
-
-/*
- * XXX todo: some additonal performance gain is possible by
- * implementing __copy_to/from_user_inatomic, which is much
- * like __copy_to/from_user, but performs slightly less checking.
- */
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
 __kernel_size_t __clear_user_hexagon(void __user *dest, unsigned long count);
 #define __clear_user(a, s) __clear_user_hexagon((a), (s))
@@ -107,10 +97,14 @@ static inline long hexagon_strncpy_from_user(char *dst, const char __user *src,
                return -EFAULT;
 
        if (res > n) {
-               copy_from_user(dst, src, n);
+               long left = raw_copy_from_user(dst, src, n);
+               if (unlikely(left))
+                       memset(dst + (n - left), 0, left);
                return n;
        } else {
-               copy_from_user(dst, src, res);
+               long left = raw_copy_from_user(dst, src, res);
+               if (unlikely(left))
+                       memset(dst + (res - left), 0, left);
                return res-1;
        }
 }
index af9dec4..00bcad9 100644 (file)
@@ -25,8 +25,8 @@
 
 /* Additional functions */
 EXPORT_SYMBOL(__clear_user_hexagon);
-EXPORT_SYMBOL(__copy_from_user_hexagon);
-EXPORT_SYMBOL(__copy_to_user_hexagon);
+EXPORT_SYMBOL(raw_copy_from_user);
+EXPORT_SYMBOL(raw_copy_to_user);
 EXPORT_SYMBOL(__iounmap);
 EXPORT_SYMBOL(__strnlen_user);
 EXPORT_SYMBOL(__vmgetie);
index ff4e9bf..29b1f57 100644 (file)
@@ -199,7 +199,9 @@ void __init time_init_deferred(void)
        clockevents_calc_mult_shift(ce_dev, sleep_clk_freq, 4);
 
        ce_dev->max_delta_ns = clockevent_delta2ns(0x7fffffff, ce_dev);
+       ce_dev->max_delta_ticks = 0x7fffffff;
        ce_dev->min_delta_ns = clockevent_delta2ns(0xf, ce_dev);
+       ce_dev->min_delta_ticks = 0xf;
 
 #ifdef CONFIG_SMP
        setup_percpu_clockdev();
index 7fc94f3..7da066f 100644 (file)
@@ -44,7 +44,7 @@
 #define bytes r2
 #define loopcount r5
 
-#define FUNCNAME __copy_from_user_hexagon
+#define FUNCNAME raw_copy_from_user
 #include "copy_user_template.S"
 
        /* LOAD FAULTS from COPY_FROM_USER */
index 0cfbcc0..a7b7f8d 100644 (file)
@@ -43,7 +43,7 @@
 #define bytes r2
 #define loopcount r5
 
-#define FUNCNAME __copy_to_user_hexagon
+#define FUNCNAME raw_copy_to_user
 #include "copy_user_template.S"
 
        /* STORE FAULTS from COPY_TO_USER */
index 18ca6a9..6a15083 100644 (file)
@@ -52,7 +52,6 @@ config IA64
        select MODULES_USE_ELF_RELA
        select ARCH_USE_CMPXCHG_LOCKREF
        select HAVE_ARCH_AUDITSYSCALL
-       select HAVE_ARCH_HARDENED_USERCOPY
        default y
        help
          The Itanium Processor Family is Intel's 64-bit successor to
diff --git a/arch/ia64/include/asm/asm-prototypes.h b/arch/ia64/include/asm/asm-prototypes.h
new file mode 100644 (file)
index 0000000..a2c1398
--- /dev/null
@@ -0,0 +1,29 @@
+#ifndef _ASM_IA64_ASM_PROTOTYPES_H
+#define _ASM_IA64_ASM_PROTOTYPES_H
+
+#include <asm/cacheflush.h>
+#include <asm/checksum.h>
+#include <asm/esi.h>
+#include <asm/ftrace.h>
+#include <asm/page.h>
+#include <asm/pal.h>
+#include <asm/string.h>
+#include <asm/uaccess.h>
+#include <asm/unwind.h>
+#include <asm/xor.h>
+
+extern const char ia64_ivt[];
+
+signed int __divsi3(signed int, unsigned int);
+signed int __modsi3(signed int, unsigned int);
+
+signed long long __divdi3(signed long long, unsigned long long);
+signed long long __moddi3(signed long long, unsigned long long);
+
+unsigned int __udivsi3(unsigned int, unsigned int);
+unsigned int __umodsi3(unsigned int, unsigned int);
+
+unsigned long long __udivdi3(unsigned long long, unsigned long long);
+unsigned long long __umoddi3(unsigned long long, unsigned long long);
+
+#endif /* _ASM_IA64_ASM_PROTOTYPES_H */
diff --git a/arch/ia64/include/asm/extable.h b/arch/ia64/include/asm/extable.h
new file mode 100644 (file)
index 0000000..20376e7
--- /dev/null
@@ -0,0 +1,11 @@
+#ifndef _ASM_IA64_EXTABLE_H
+#define _ASM_IA64_EXTABLE_H
+
+#define ARCH_HAS_RELATIVE_EXTABLE
+
+struct exception_table_entry {
+       int insn;       /* location-relative address of insn this fixup is for */
+       int fixup;      /* location-relative continuation addr.; if bit 2 is set, r9 is set to 0 */
+};
+
+#endif
index 471044b..82a7646 100644 (file)
  */
 
 #include <linux/compiler.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
 #include <linux/page-flags.h>
 #include <linux/mm.h>
 
 #include <asm/intrinsics.h>
 #include <asm/pgtable.h>
 #include <asm/io.h>
+#include <asm/extable.h>
 
 /*
  * For historical reasons, the following macros are grossly misnamed:
@@ -48,9 +47,6 @@
 #define KERNEL_DS      ((mm_segment_t) { ~0UL })               /* cf. access_ok() */
 #define USER_DS                ((mm_segment_t) { TASK_SIZE-1 })        /* cf. access_ok() */
 
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
 #define get_ds()  (KERNEL_DS)
 #define get_fs()  (current_thread_info()->addr_limit)
 #define set_fs(x) (current_thread_info()->addr_limit = (x))
  * address TASK_SIZE is never valid.  We also need to make sure that the address doesn't
  * point inside the virtually mapped linear page table.
  */
-#define __access_ok(addr, size, segment)                                               \
-({                                                                                     \
-       __chk_user_ptr(addr);                                                           \
-       (likely((unsigned long) (addr) <= (segment).seg)                                \
-        && ((segment).seg == KERNEL_DS.seg                                             \
-            || likely(REGION_OFFSET((unsigned long) (addr)) < RGN_MAP_LIMIT)));        \
-})
-#define access_ok(type, addr, size)    __access_ok((addr), (size), get_fs())
+static inline int __access_ok(const void __user *p, unsigned long size)
+{
+       unsigned long addr = (unsigned long)p;
+       unsigned long seg = get_fs().seg;
+       return likely(addr <= seg) &&
+        (seg == KERNEL_DS.seg || likely(REGION_OFFSET(addr) < RGN_MAP_LIMIT));
+}
+#define access_ok(type, addr, size)    __access_ok((addr), (size))
 
 /*
  * These are the main single-value transfer routines.  They automatically
@@ -80,8 +76,8 @@
  * (a) re-use the arguments for side effects (sizeof/typeof is ok)
  * (b) require any knowledge of processes at this stage
  */
-#define put_user(x, ptr)       __put_user_check((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)), get_fs())
-#define get_user(x, ptr)       __get_user_check((x), (ptr), sizeof(*(ptr)), get_fs())
+#define put_user(x, ptr)       __put_user_check((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)))
+#define get_user(x, ptr)       __get_user_check((x), (ptr), sizeof(*(ptr)))
 
 /*
  * The "__xxx" versions do not do address space checking, useful when
@@ -184,13 +180,13 @@ extern void __get_user_unknown (void);
  * could clobber r8 and r9 (among others).  Thus, be careful not to evaluate it while
  * using r8/r9.
  */
-#define __do_get_user(check, x, ptr, size, segment)                                    \
+#define __do_get_user(check, x, ptr, size)                                             \
 ({                                                                                     \
        const __typeof__(*(ptr)) __user *__gu_ptr = (ptr);                              \
        __typeof__ (size) __gu_size = (size);                                           \
        long __gu_err = -EFAULT;                                                        \
        unsigned long __gu_val = 0;                                                     \
-       if (!check || __access_ok(__gu_ptr, size, segment))                             \
+       if (!check || __access_ok(__gu_ptr, size))                                      \
                switch (__gu_size) {                                                    \
                      case 1: __get_user_size(__gu_val, __gu_ptr, 1, __gu_err); break;  \
                      case 2: __get_user_size(__gu_val, __gu_ptr, 2, __gu_err); break;  \
@@ -202,8 +198,8 @@ extern void __get_user_unknown (void);
        __gu_err;                                                                       \
 })
 
-#define __get_user_nocheck(x, ptr, size)       __do_get_user(0, x, ptr, size, KERNEL_DS)
-#define __get_user_check(x, ptr, size, segment)        __do_get_user(1, x, ptr, size, segment)
+#define __get_user_nocheck(x, ptr, size)       __do_get_user(0, x, ptr, size)
+#define __get_user_check(x, ptr, size) __do_get_user(1, x, ptr, size)
 
 extern void __put_user_unknown (void);
 
@@ -211,14 +207,14 @@ extern void __put_user_unknown (void);
  * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
  * could clobber r8 (among others).  Thus, be careful not to evaluate them while using r8.
  */
-#define __do_put_user(check, x, ptr, size, segment)                                    \
+#define __do_put_user(check, x, ptr, size)                                             \
 ({                                                                                     \
        __typeof__ (x) __pu_x = (x);                                                    \
        __typeof__ (*(ptr)) __user *__pu_ptr = (ptr);                                   \
        __typeof__ (size) __pu_size = (size);                                           \
        long __pu_err = -EFAULT;                                                        \
                                                                                        \
-       if (!check || __access_ok(__pu_ptr, __pu_size, segment))                        \
+       if (!check || __access_ok(__pu_ptr, __pu_size))                                 \
                switch (__pu_size) {                                                    \
                      case 1: __put_user_size(__pu_x, __pu_ptr, 1, __pu_err); break;    \
                      case 2: __put_user_size(__pu_x, __pu_ptr, 2, __pu_err); break;    \
@@ -229,8 +225,8 @@ extern void __put_user_unknown (void);
        __pu_err;                                                                       \
 })
 
-#define __put_user_nocheck(x, ptr, size)       __do_put_user(0, x, ptr, size, KERNEL_DS)
-#define __put_user_check(x, ptr, size, segment)        __do_put_user(1, x, ptr, size, segment)
+#define __put_user_nocheck(x, ptr, size)       __do_put_user(0, x, ptr, size)
+#define __put_user_check(x, ptr, size) __do_put_user(1, x, ptr, size)
 
 /*
  * Complex access routines
@@ -239,56 +235,19 @@ extern unsigned long __must_check __copy_user (void __user *to, const void __use
                                               unsigned long count);
 
 static inline unsigned long
-__copy_to_user (void __user *to, const void *from, unsigned long count)
+raw_copy_to_user(void __user *to, const void *from, unsigned long count)
 {
-       check_object_size(from, count, true);
-
        return __copy_user(to, (__force void __user *) from, count);
 }
 
 static inline unsigned long
-__copy_from_user (void *to, const void __user *from, unsigned long count)
+raw_copy_from_user(void *to, const void __user *from, unsigned long count)
 {
-       check_object_size(to, count, false);
-
        return __copy_user((__force void __user *) to, from, count);
 }
 
-#define __copy_to_user_inatomic                __copy_to_user
-#define __copy_from_user_inatomic      __copy_from_user
-#define copy_to_user(to, from, n)                                                      \
-({                                                                                     \
-       void __user *__cu_to = (to);                                                    \
-       const void *__cu_from = (from);                                                 \
-       long __cu_len = (n);                                                            \
-                                                                                       \
-       if (__access_ok(__cu_to, __cu_len, get_fs())) {                                 \
-               check_object_size(__cu_from, __cu_len, true);                   \
-               __cu_len = __copy_user(__cu_to, (__force void __user *)  __cu_from, __cu_len);  \
-       }                                                                               \
-       __cu_len;                                                                       \
-})
-
-static inline unsigned long
-copy_from_user(void *to, const void __user *from, unsigned long n)
-{
-       check_object_size(to, n, false);
-       if (likely(__access_ok(from, n, get_fs())))
-               n = __copy_user((__force void __user *) to, from, n);
-       else
-               memset(to, 0, n);
-       return n;
-}
-
-#define __copy_in_user(to, from, size) __copy_user((to), (from), (size))
-
-static inline unsigned long
-copy_in_user (void __user *to, const void __user *from, unsigned long n)
-{
-       if (likely(access_ok(VERIFY_READ, from, n) && access_ok(VERIFY_WRITE, to, n)))
-               n = __copy_user(to, from, n);
-       return n;
-}
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
 extern unsigned long __do_clear_user (void __user *, unsigned long);
 
@@ -297,7 +256,7 @@ extern unsigned long __do_clear_user (void __user *, unsigned long);
 #define clear_user(to, n)                                      \
 ({                                                             \
        unsigned long __cu_len = (n);                           \
-       if (__access_ok(to, __cu_len, get_fs()))                \
+       if (__access_ok(to, __cu_len))                          \
                __cu_len = __do_clear_user(to, __cu_len);       \
        __cu_len;                                               \
 })
@@ -313,7 +272,7 @@ extern long __must_check __strncpy_from_user (char *to, const char __user *from,
 ({                                                                     \
        const char __user * __sfu_from = (from);                        \
        long __sfu_ret = -EFAULT;                                       \
-       if (__access_ok(__sfu_from, 0, get_fs()))                       \
+       if (__access_ok(__sfu_from, 0))                                 \
                __sfu_ret = __strncpy_from_user((to), __sfu_from, (n)); \
        __sfu_ret;                                                      \
 })
@@ -325,7 +284,7 @@ extern unsigned long __strlen_user (const char __user *);
 ({                                                     \
        const char __user *__su_str = (str);            \
        unsigned long __su_ret = 0;                     \
-       if (__access_ok(__su_str, 0, get_fs()))         \
+       if (__access_ok(__su_str, 0))                   \
                __su_ret = __strlen_user(__su_str);     \
        __su_ret;                                       \
 })
@@ -341,18 +300,11 @@ extern unsigned long __strnlen_user (const char __user *, long);
 ({                                                             \
        const char __user *__su_str = (str);                    \
        unsigned long __su_ret = 0;                             \
-       if (__access_ok(__su_str, 0, get_fs()))                 \
+       if (__access_ok(__su_str, 0))                           \
                __su_ret = __strnlen_user(__su_str, len);       \
        __su_ret;                                               \
 })
 
-#define ARCH_HAS_RELATIVE_EXTABLE
-
-struct exception_table_entry {
-       int insn;       /* location-relative address of insn this fixup is for */
-       int fixup;      /* location-relative continuation addr.; if bit 2 is set, r9 is set to 0 */
-};
-
 #define ARCH_HAS_TRANSLATE_MEM_PTR     1
 static __inline__ void *
 xlate_dev_mem_ptr(phys_addr_t p)
index 6ab0ae7..d1d945c 100644 (file)
@@ -153,7 +153,7 @@ slot (const struct insn *insn)
 static int
 apply_imm64 (struct module *mod, struct insn *insn, uint64_t val)
 {
-       if (slot(insn) != 2) {
+       if (slot(insn) != 1 && slot(insn) != 2) {
                printk(KERN_ERR "%s: invalid slot number %d for IMM64\n",
                       mod->name, slot(insn));
                return 0;
@@ -165,7 +165,7 @@ apply_imm64 (struct module *mod, struct insn *insn, uint64_t val)
 static int
 apply_imm60 (struct module *mod, struct insn *insn, uint64_t val)
 {
-       if (slot(insn) != 2) {
+       if (slot(insn) != 1 && slot(insn) != 2) {
                printk(KERN_ERR "%s: invalid slot number %d for IMM60\n",
                       mod->name, slot(insn));
                return 0;
index d194d5c..63dc9cd 100644 (file)
@@ -179,14 +179,14 @@ struct salinfo_platform_oemdata_parms {
        const u8 *efi_guid;
        u8 **oemdata;
        u64 *oemdata_size;
-       int ret;
 };
 
-static void
+static long
 salinfo_platform_oemdata_cpu(void *context)
 {
        struct salinfo_platform_oemdata_parms *parms = context;
-       parms->ret = salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, parms->oemdata_size);
+
+       return salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, parms->oemdata_size);
 }
 
 static void
@@ -380,16 +380,7 @@ salinfo_log_release(struct inode *inode, struct file *file)
        return 0;
 }
 
-static void
-call_on_cpu(int cpu, void (*fn)(void *), void *arg)
-{
-       cpumask_t save_cpus_allowed = current->cpus_allowed;
-       set_cpus_allowed_ptr(current, cpumask_of(cpu));
-       (*fn)(arg);
-       set_cpus_allowed_ptr(current, &save_cpus_allowed);
-}
-
-static void
+static long
 salinfo_log_read_cpu(void *context)
 {
        struct salinfo_data *data = context;
@@ -399,6 +390,7 @@ salinfo_log_read_cpu(void *context)
        /* Clear corrected errors as they are read from SAL */
        if (rh->severity == sal_log_severity_corrected)
                ia64_sal_clear_state_info(data->type);
+       return 0;
 }
 
 static void
@@ -430,7 +422,7 @@ retry:
        spin_unlock_irqrestore(&data_saved_lock, flags);
 
        if (!data->saved_num)
-               call_on_cpu(cpu, salinfo_log_read_cpu, data);
+               work_on_cpu_safe(cpu, salinfo_log_read_cpu, data);
        if (!data->log_size) {
                data->state = STATE_NO_DATA;
                cpumask_clear_cpu(cpu, &data->cpu_event);
@@ -459,11 +451,13 @@ salinfo_log_read(struct file *file, char __user *buffer, size_t count, loff_t *p
        return simple_read_from_buffer(buffer, count, ppos, buf, bufsize);
 }
 
-static void
+static long
 salinfo_log_clear_cpu(void *context)
 {
        struct salinfo_data *data = context;
+
        ia64_sal_clear_state_info(data->type);
+       return 0;
 }
 
 static int
@@ -486,7 +480,7 @@ salinfo_log_clear(struct salinfo_data *data, int cpu)
        rh = (sal_log_record_header_t *)(data->log_buffer);
        /* Corrected errors have already been cleared from SAL */
        if (rh->severity != sal_log_severity_corrected)
-               call_on_cpu(cpu, salinfo_log_clear_cpu, data);
+               work_on_cpu_safe(cpu, salinfo_log_clear_cpu, data);
        /* clearing a record may make a new record visible */
        salinfo_log_new_read(cpu, data);
        if (data->state == STATE_LOG_RECORD) {
@@ -531,9 +525,8 @@ salinfo_log_write(struct file *file, const char __user *buffer, size_t count, lo
                                .oemdata = &data->oemdata,
                                .oemdata_size = &data->oemdata_size
                        };
-                       call_on_cpu(cpu, salinfo_platform_oemdata_cpu, &parms);
-                       if (parms.ret)
-                               count = parms.ret;
+                       count = work_on_cpu_safe(cpu, salinfo_platform_oemdata_cpu,
+                                                &parms);
                } else
                        data->oemdata_size = 0;
        } else
index 1a68f01..d76529c 100644 (file)
@@ -355,18 +355,12 @@ static int cache_add_dev(unsigned int cpu)
        unsigned long i, j;
        struct cache_info *this_object;
        int retval = 0;
-       cpumask_t oldmask;
 
        if (all_cpu_cache_info[cpu].kobj.parent)
                return 0;
 
-       oldmask = current->cpus_allowed;
-       retval = set_cpus_allowed_ptr(current, cpumask_of(cpu));
-       if (unlikely(retval))
-               return retval;
 
        retval = cpu_cache_sysfs_init(cpu);
-       set_cpus_allowed_ptr(current, &oldmask);
        if (unlikely(retval < 0))
                return retval;
 
index 1f3d387..0a40b14 100644 (file)
@@ -24,25 +24,25 @@ AFLAGS___modsi3.o   =            -DMODULO
 AFLAGS___umodsi3.o     = -DUNSIGNED -DMODULO
 
 $(obj)/__divdi3.o: $(src)/idiv64.S FORCE
-       $(call if_changed_dep,as_o_S)
+       $(call if_changed_rule,as_o_S)
 
 $(obj)/__udivdi3.o: $(src)/idiv64.S FORCE
-       $(call if_changed_dep,as_o_S)
+       $(call if_changed_rule,as_o_S)
 
 $(obj)/__moddi3.o: $(src)/idiv64.S FORCE
-       $(call if_changed_dep,as_o_S)
+       $(call if_changed_rule,as_o_S)
 
 $(obj)/__umoddi3.o: $(src)/idiv64.S FORCE
-       $(call if_changed_dep,as_o_S)
+       $(call if_changed_rule,as_o_S)
 
 $(obj)/__divsi3.o: $(src)/idiv32.S FORCE
-       $(call if_changed_dep,as_o_S)
+       $(call if_changed_rule,as_o_S)
 
 $(obj)/__udivsi3.o: $(src)/idiv32.S FORCE
-       $(call if_changed_dep,as_o_S)
+       $(call if_changed_rule,as_o_S)
 
 $(obj)/__modsi3.o: $(src)/idiv32.S FORCE
-       $(call if_changed_dep,as_o_S)
+       $(call if_changed_rule,as_o_S)
 
 $(obj)/__umodsi3.o: $(src)/idiv32.S FORCE
-       $(call if_changed_dep,as_o_S)
+       $(call if_changed_rule,as_o_S)
index b264b6a..bbbadc4 100644 (file)
@@ -556,9 +556,6 @@ EK(.ex_handler,  (p17)      st8     [dst1]=r39,8);                                          \
 #define D      r22
 #define F      r28
 
-#define memset_arg0    r32
-#define memset_arg2    r33
-
 #define saved_retval   loc0
 #define saved_rtlink   loc1
 #define saved_pfs_stack        loc2
@@ -622,7 +619,7 @@ EK(.ex_handler,  (p17)      st8     [dst1]=r39,8);                                          \
  *     (faulting_addr - orig_dst)      -> len to faulting st address
  * B = (cur_dst - orig_dst)            -> len copied so far
  * C = A - B                           -> len need to be copied
- * D = orig_len - A                    -> len need to be zeroed
+ * D = orig_len - A                    -> len need to be left alone
  */
 (p6)   sub     A = F, saved_in0
 (p7)   sub     A = F, saved_in1
@@ -638,9 +635,6 @@ EK(.ex_handler,  (p17)      st8     [dst1]=r39,8);                                          \
        sub     D = saved_in2, A
        ;;
        cmp.gt  p8,p0=C,r0              // more than 1 byte?
-       add     memset_arg0=saved_in0, A
-(p6)   mov     memset_arg2=0           // copy_to_user should not call memset
-(p7)   mov     memset_arg2=D           // copy_from_user need to have kbuf zeroed
        mov     r8=0
        mov     saved_retval = D
        mov     saved_rtlink = b0
@@ -652,11 +646,6 @@ EK(.ex_handler,  (p17)     st8     [dst1]=r39,8);                                          \
        ;;
 
        add     saved_retval=saved_retval,r8    // above might return non-zero value
-       cmp.gt  p8,p0=memset_arg2,r0    // more than 1 byte?
-       mov     out0=memset_arg0        // *s
-       mov     out1=r0                 // c
-       mov     out2=memset_arg2        // n
-(p8)   br.call.sptk.few b0=memset
        ;;
 
        mov     retval=saved_retval
index 4edb816..10dd4a6 100644 (file)
@@ -5,7 +5,10 @@
  *     David Mosberger-Tang <davidm@hpl.hp.com>
  */
 
-#include <linux/uaccess.h>
+#include <asm/ptrace.h>
+#include <asm/extable.h>
+#include <asm/errno.h>
+#include <asm/processor.h>
 
 void
 ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e)
index 52704f1..55febd6 100644 (file)
@@ -598,12 +598,17 @@ static void sn_hwperf_call_sal(void *info)
        op_info->ret = r;
 }
 
+static long sn_hwperf_call_sal_work(void *info)
+{
+       sn_hwperf_call_sal(info);
+       return 0;
+}
+
 static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info)
 {
        u32 cpu;
        u32 use_ipi;
        int r = 0;
-       cpumask_t save_allowed;
        
        cpu = (op_info->a->arg & SN_HWPERF_ARG_CPU_MASK) >> 32;
        use_ipi = op_info->a->arg & SN_HWPERF_ARG_USE_IPI_MASK;
@@ -629,13 +634,9 @@ static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info)
                        /* use an interprocessor interrupt to call SAL */
                        smp_call_function_single(cpu, sn_hwperf_call_sal,
                                op_info, 1);
-               }
-               else {
-                       /* migrate the task before calling SAL */ 
-                       save_allowed = current->cpus_allowed;
-                       set_cpus_allowed_ptr(current, cpumask_of(cpu));
-                       sn_hwperf_call_sal(op_info);
-                       set_cpus_allowed_ptr(current, &save_allowed);
+               } else {
+                       /* Call on the target CPU */
+                       work_on_cpu_safe(cpu, sn_hwperf_call_sal_work, op_info);
                }
        }
        r = op_info->ret;
index deb2987..c000ffa 100644 (file)
@@ -2,6 +2,7 @@
 generic-y += clkdev.h
 generic-y += current.h
 generic-y += exec.h
+generic-y += extable.h
 generic-y += irq_work.h
 generic-y += kvm_para.h
 generic-y += mcs_spinlock.h
index 6f89821..07be349 100644 (file)
 /*
  * User space memory access functions
  */
-#include <linux/errno.h>
-#include <linux/thread_info.h>
 #include <asm/page.h>
 #include <asm/setup.h>
-
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1
+#include <linux/prefetch.h>
 
 /*
  * The fs value determines whether argument validity checking should be
@@ -114,25 +110,7 @@ static inline int access_ok(int type, const void *addr, unsigned long size)
 }
 #endif /* CONFIG_MMU */
 
-/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-
-struct exception_table_entry
-{
-       unsigned long insn, fixup;
-};
-
-extern int fixup_exception(struct pt_regs *regs);
+#include <asm/extable.h>
 
 /*
  * These are the main single-value transfer routines.  They automatically
@@ -483,174 +461,25 @@ do {                                                                     \
                : "r14", "memory");                                     \
 } while (0)
 
-#define __copy_user_zeroing(to, from, size)                            \
-do {                                                                   \
-       unsigned long __dst, __src, __c;                                \
-       __asm__ __volatile__ (                                          \
-               "       mv      r14, %0\n"                              \
-               "       or      r14, %1\n"                              \
-               "       beq     %0, %1, 9f\n"                           \
-               "       beqz    %2, 9f\n"                               \
-               "       and3    r14, r14, #3\n"                         \
-               "       bnez    r14, 2f\n"                              \
-               "       and3    %2, %2, #3\n"                           \
-               "       beqz    %3, 2f\n"                               \
-               "       addi    %0, #-4         ; word_copy \n"         \
-               "       .fillinsn\n"                                    \
-               "0:     ld      r14, @%1+\n"                            \
-               "       addi    %3, #-1\n"                              \
-               "       .fillinsn\n"                                    \
-               "1:     st      r14, @+%0\n"                            \
-               "       bnez    %3, 0b\n"                               \
-               "       beqz    %2, 9f\n"                               \
-               "       addi    %0, #4\n"                               \
-               "       .fillinsn\n"                                    \
-               "2:     ldb     r14, @%1        ; byte_copy \n"         \
-               "       .fillinsn\n"                                    \
-               "3:     stb     r14, @%0\n"                             \
-               "       addi    %1, #1\n"                               \
-               "       addi    %2, #-1\n"                              \
-               "       addi    %0, #1\n"                               \
-               "       bnez    %2, 2b\n"                               \
-               "       .fillinsn\n"                                    \
-               "9:\n"                                                  \
-               ".section .fixup,\"ax\"\n"                              \
-               "       .balign 4\n"                                    \
-               "5:     addi    %3, #1\n"                               \
-               "       addi    %1, #-4\n"                              \
-               "       .fillinsn\n"                                    \
-               "6:     slli    %3, #2\n"                               \
-               "       add     %2, %3\n"                               \
-               "       addi    %0, #4\n"                               \
-               "       .fillinsn\n"                                    \
-               "7:     ldi     r14, #0         ; store zero \n"        \
-               "       .fillinsn\n"                                    \
-               "8:     addi    %2, #-1\n"                              \
-               "       stb     r14, @%0        ; ACE? \n"              \
-               "       addi    %0, #1\n"                               \
-               "       bnez    %2, 8b\n"                               \
-               "       seth    r14, #high(9b)\n"                       \
-               "       or3     r14, r14, #low(9b)\n"                   \
-               "       jmp     r14\n"                                  \
-               ".previous\n"                                           \
-               ".section __ex_table,\"a\"\n"                           \
-               "       .balign 4\n"                                    \
-               "       .long 0b,6b\n"                                  \
-               "       .long 1b,5b\n"                                  \
-               "       .long 2b,7b\n"                                  \
-               "       .long 3b,7b\n"                                  \
-               ".previous\n"                                           \
-               : "=&r" (__dst), "=&r" (__src), "=&r" (size),           \
-                 "=&r" (__c)                                           \
-               : "0" (to), "1" (from), "2" (size), "3" (size / 4)      \
-               : "r14", "memory");                                     \
-} while (0)
-
-
 /* We let the __ versions of copy_from/to_user inline, because they're often
  * used in fast paths and have only a small space overhead.
  */
-static inline unsigned long __generic_copy_from_user_nocheck(void *to,
-       const void __user *from, unsigned long n)
+static inline unsigned long
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-       __copy_user_zeroing(to, from, n);
+       prefetchw(to);
+       __copy_user(to, from, n);
        return n;
 }
 
-static inline unsigned long __generic_copy_to_user_nocheck(void __user *to,
-       const void *from, unsigned long n)
+static inline unsigned long
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
+       prefetch(from);
        __copy_user(to, from, n);
        return n;
 }
 
-unsigned long __generic_copy_to_user(void __user *, const void *, unsigned long);
-unsigned long __generic_copy_from_user(void *, const void __user *, unsigned long);
-
-/**
- * __copy_to_user: - Copy a block of data into user space, with less checking.
- * @to:   Destination address, in user space.
- * @from: Source address, in kernel space.
- * @n:    Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Copy data from kernel space to user space.  Caller must check
- * the specified block with access_ok() before calling this function.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- */
-#define __copy_to_user(to, from, n)                    \
-       __generic_copy_to_user_nocheck((to), (from), (n))
-
-#define __copy_to_user_inatomic __copy_to_user
-#define __copy_from_user_inatomic __copy_from_user
-
-/**
- * copy_to_user: - Copy a block of data into user space.
- * @to:   Destination address, in user space.
- * @from: Source address, in kernel space.
- * @n:    Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Copy data from kernel space to user space.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- */
-#define copy_to_user(to, from, n)                      \
-({                                                     \
-       might_fault();                                  \
-       __generic_copy_to_user((to), (from), (n));      \
-})
-
-/**
- * __copy_from_user: - Copy a block of data from user space, with less checking. * @to:   Destination address, in kernel space.
- * @from: Source address, in user space.
- * @n:    Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Copy data from user space to kernel space.  Caller must check
- * the specified block with access_ok() before calling this function.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- *
- * If some data could not be copied, this function will pad the copied
- * data to the requested size using zero bytes.
- */
-#define __copy_from_user(to, from, n)                  \
-       __generic_copy_from_user_nocheck((to), (from), (n))
-
-/**
- * copy_from_user: - Copy a block of data from user space.
- * @to:   Destination address, in kernel space.
- * @from: Source address, in user space.
- * @n:    Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Copy data from user space to kernel space.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- *
- * If some data could not be copied, this function will pad the copied
- * data to the requested size using zero bytes.
- */
-#define copy_from_user(to, from, n)                    \
-({                                                     \
-       might_fault();                                  \
-       __generic_copy_from_user((to), (from), (n));    \
-})
-
 long __must_check strncpy_from_user(char *dst, const char __user *src,
                                long count);
 long __must_check __strncpy_from_user(char *dst,
index d763f0b..a4d43b5 100644 (file)
@@ -26,8 +26,6 @@ EXPORT_SYMBOL(strncpy_from_user);
 EXPORT_SYMBOL(__strncpy_from_user);
 EXPORT_SYMBOL(clear_user);
 EXPORT_SYMBOL(__clear_user);
-EXPORT_SYMBOL(__generic_copy_from_user);
-EXPORT_SYMBOL(__generic_copy_to_user);
 EXPORT_SYMBOL(strnlen_user);
 
 #ifdef CONFIG_SMP
index fd03f27..b3ef2c8 100644 (file)
 #include <linux/thread_info.h>
 #include <linux/uaccess.h>
 
-unsigned long
-__generic_copy_to_user(void __user *to, const void *from, unsigned long n)
-{
-       prefetch(from);
-       if (access_ok(VERIFY_WRITE, to, n))
-               __copy_user(to,from,n);
-       return n;
-}
-
-unsigned long
-__generic_copy_from_user(void *to, const void __user *from, unsigned long n)
-{
-       prefetchw(to);
-       if (access_ok(VERIFY_READ, from, n))
-               __copy_user_zeroing(to,from,n);
-       else
-               memset(to, 0, n);
-       return n;
-}
-
-
 /*
  * Copy a null terminated string from userspace.
  */
index 175553d..6c08780 100644 (file)
@@ -149,8 +149,10 @@ void hw_timer_init(irq_handler_t handler)
        cf_pit_clockevent.mult = div_sc(FREQ, NSEC_PER_SEC, 32);
        cf_pit_clockevent.max_delta_ns =
                clockevent_delta2ns(0xFFFF, &cf_pit_clockevent);
+       cf_pit_clockevent.max_delta_ticks = 0xFFFF;
        cf_pit_clockevent.min_delta_ns =
                clockevent_delta2ns(0x3f, &cf_pit_clockevent);
+       cf_pit_clockevent.min_delta_ticks = 0x3f;
        clockevents_register_device(&cf_pit_clockevent);
 
        setup_irq(MCF_IRQ_PIT1, &pit_irq);
index d4f9ccb..82005d2 100644 (file)
@@ -5,6 +5,7 @@ generic-y += device.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
+generic-y += extable.h
 generic-y += futex.h
 generic-y += hw_irq.h
 generic-y += ioctl.h
index f5f790c..77239e8 100644 (file)
@@ -122,16 +122,6 @@ static inline void start_thread(struct pt_regs * regs, unsigned long pc,
        wrusp(usp);
 }
 
-#ifdef CONFIG_MMU
-extern int handle_kernel_fault(struct pt_regs *regs);
-#else
-static inline  int handle_kernel_fault(struct pt_regs *regs)
-{
-       /* Any fault in kernel is fatal on non-mmu */
-       return 0;
-}
-#endif
-
 /* Forward declaration, a strange C thing */
 struct task_struct;
 
index 3fadc4a..67b3481 100644 (file)
@@ -4,6 +4,7 @@
 #include <asm/uaccess_mm.h>
 #endif
 
+#include <asm/extable.h>
 #ifdef CONFIG_CPU_HAS_NO_UNALIGNED
 #include <asm-generic/uaccess-unaligned.h>
 #else
index d228601..ef856ff 100644 (file)
@@ -5,14 +5,9 @@
  * User space memory access functions
  */
 #include <linux/compiler.h>
-#include <linux/errno.h>
 #include <linux/types.h>
-#include <linux/sched.h>
 #include <asm/segment.h>
 
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
 /* We let the MMU do all checking */
 static inline int access_ok(int type, const void __user *addr,
                            unsigned long size)
@@ -36,24 +31,6 @@ static inline int access_ok(int type, const void __user *addr,
 #define        MOVES   "move"
 #endif
 
-/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-
-struct exception_table_entry
-{
-       unsigned long insn, fixup;
-};
-
 extern int __put_user_bad(void);
 extern int __get_user_bad(void);
 
@@ -202,39 +179,55 @@ asm volatile ("\n"                                        \
 unsigned long __generic_copy_from_user(void *to, const void __user *from, unsigned long n);
 unsigned long __generic_copy_to_user(void __user *to, const void *from, unsigned long n);
 
-#define __constant_copy_from_user_asm(res, to, from, tmp, n, s1, s2, s3)\
+#define __suffix0
+#define __suffix1 b
+#define __suffix2 w
+#define __suffix4 l
+
+#define ____constant_copy_from_user_asm(res, to, from, tmp, n1, n2, n3, s1, s2, s3)\
        asm volatile ("\n"                                              \
                "1:     "MOVES"."#s1"   (%2)+,%3\n"                     \
                "       move."#s1"      %3,(%1)+\n"                     \
+               "       .ifnc   \""#s2"\",\"\"\n"                       \
                "2:     "MOVES"."#s2"   (%2)+,%3\n"                     \
                "       move."#s2"      %3,(%1)+\n"                     \
                "       .ifnc   \""#s3"\",\"\"\n"                       \
                "3:     "MOVES"."#s3"   (%2)+,%3\n"                     \
                "       move."#s3"      %3,(%1)+\n"                     \
                "       .endif\n"                                       \
+               "       .endif\n"                                       \
                "4:\n"                                                  \
                "       .section __ex_table,\"a\"\n"                    \
                "       .align  4\n"                                    \
                "       .long   1b,10f\n"                               \
+               "       .ifnc   \""#s2"\",\"\"\n"                       \
                "       .long   2b,20f\n"                               \
                "       .ifnc   \""#s3"\",\"\"\n"                       \
                "       .long   3b,30f\n"                               \
                "       .endif\n"                                       \
+               "       .endif\n"                                       \
                "       .previous\n"                                    \
                "\n"                                                    \
                "       .section .fixup,\"ax\"\n"                       \
                "       .even\n"                                        \
-               "10:    clr."#s1"       (%1)+\n"                        \
-               "20:    clr."#s2"       (%1)+\n"                        \
+               "10:    addq.l #"#n1",%0\n"                             \
+               "       .ifnc   \""#s2"\",\"\"\n"                       \
+               "20:    addq.l #"#n2",%0\n"                             \
                "       .ifnc   \""#s3"\",\"\"\n"                       \
-               "30:    clr."#s3"       (%1)+\n"                        \
+               "30:    addq.l #"#n3",%0\n"                             \
+               "       .endif\n"                                       \
                "       .endif\n"                                       \
-               "       moveq.l #"#n",%0\n"                             \
                "       jra     4b\n"                                   \
                "       .previous\n"                                    \
                : "+d" (res), "+&a" (to), "+a" (from), "=&d" (tmp)      \
                : : "memory")
 
+#define ___constant_copy_from_user_asm(res, to, from, tmp, n1, n2, n3, s1, s2, s3)\
+       ____constant_copy_from_user_asm(res, to, from, tmp, n1, n2, n3, s1, s2, s3)
+#define __constant_copy_from_user_asm(res, to, from, tmp, n1, n2, n3)  \
+       ___constant_copy_from_user_asm(res, to, from, tmp, n1, n2, n3,  \
+                                       __suffix##n1, __suffix##n2, __suffix##n3)
+
 static __always_inline unsigned long
 __constant_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
@@ -242,37 +235,37 @@ __constant_copy_from_user(void *to, const void __user *from, unsigned long n)
 
        switch (n) {
        case 1:
-               __get_user_asm(res, *(u8 *)to, (u8 __user *)from, u8, b, d, 1);
+               __constant_copy_from_user_asm(res, to, from, tmp, 1, 0, 0);
                break;
        case 2:
-               __get_user_asm(res, *(u16 *)to, (u16 __user *)from, u16, w, r, 2);
+               __constant_copy_from_user_asm(res, to, from, tmp, 2, 0, 0);
                break;
        case 3:
-               __constant_copy_from_user_asm(res, to, from, tmp, 3, w, b,);
+               __constant_copy_from_user_asm(res, to, from, tmp, 2, 1, 0);
                break;
        case 4:
-               __get_user_asm(res, *(u32 *)to, (u32 __user *)from, u32, l, r, 4);
+               __constant_copy_from_user_asm(res, to, from, tmp, 4, 0, 0);
                break;
        case 5:
-               __constant_copy_from_user_asm(res, to, from, tmp, 5, l, b,);
+               __constant_copy_from_user_asm(res, to, from, tmp, 4, 1, 0);
                break;
        case 6:
-               __constant_copy_from_user_asm(res, to, from, tmp, 6, l, w,);
+               __constant_copy_from_user_asm(res, to, from, tmp, 4, 2, 0);
                break;
        case 7:
-               __constant_copy_from_user_asm(res, to, from, tmp, 7, l, w, b);
+               __constant_copy_from_user_asm(res, to, from, tmp, 4, 2, 1);
                break;
        case 8:
-               __constant_copy_from_user_asm(res, to, from, tmp, 8, l, l,);
+               __constant_copy_from_user_asm(res, to, from, tmp, 4, 4, 0);
                break;
        case 9:
-               __constant_copy_from_user_asm(res, to, from, tmp, 9, l, l, b);
+               __constant_copy_from_user_asm(res, to, from, tmp, 4, 4, 1);
                break;
        case 10:
-               __constant_copy_from_user_asm(res, to, from, tmp, 10, l, l, w);
+               __constant_copy_from_user_asm(res, to, from, tmp, 4, 4, 2);
                break;
        case 12:
-               __constant_copy_from_user_asm(res, to, from, tmp, 12, l, l, l);
+               __constant_copy_from_user_asm(res, to, from, tmp, 4, 4, 4);
                break;
        default:
                /* we limit the inlined version to 3 moves */
@@ -363,24 +356,26 @@ __constant_copy_to_user(void __user *to, const void *from, unsigned long n)
        return res;
 }
 
-#define __copy_from_user(to, from, n)          \
-(__builtin_constant_p(n) ?                     \
- __constant_copy_from_user(to, from, n) :      \
- __generic_copy_from_user(to, from, n))
-
-#define __copy_to_user(to, from, n)            \
-(__builtin_constant_p(n) ?                     \
- __constant_copy_to_user(to, from, n) :                \
- __generic_copy_to_user(to, from, n))
-
-#define __copy_to_user_inatomic                __copy_to_user
-#define __copy_from_user_inatomic      __copy_from_user
+static inline unsigned long
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+       if (__builtin_constant_p(n))
+               return __constant_copy_from_user(to, from, n);
+       return __generic_copy_from_user(to, from, n);
+}
 
-#define copy_from_user(to, from, n)    __copy_from_user(to, from, n)
-#define copy_to_user(to, from, n)      __copy_to_user(to, from, n)
+static inline unsigned long
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+       if (__builtin_constant_p(n))
+               return __constant_copy_to_user(to, from, n);
+       return __generic_copy_to_user(to, from, n);
+}
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
 #define user_addr_max() \
-       (segment_eq(get_fs(), USER_DS) ? TASK_SIZE : ~0UL)
+       (uaccess_kernel() ? ~0UL : TASK_SIZE)
 
 extern long strncpy_from_user(char *dst, const char __user *src, long count);
 extern __must_check long strlen_user(const char __user *str);
index 36deeb3..e482c38 100644 (file)
@@ -4,15 +4,11 @@
 /*
  * User space memory access functions
  */
-#include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/string.h>
 
 #include <asm/segment.h>
 
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
 #define access_ok(type,addr,size)      _access_ok((unsigned long)(addr),(size))
 
 /*
@@ -27,25 +23,6 @@ static inline int _access_ok(unsigned long addr, unsigned long size)
 }
 
 /*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-
-struct exception_table_entry
-{
-       unsigned long insn, fixup;
-};
-
-
-/*
  * These are the main single-value transfer routines.  They automatically
  * use the right size if we just have the right pointer type.
  */
@@ -124,13 +101,21 @@ extern int __get_user_bad(void);
                 : "=d" (x)                                     \
                 : "m" (*__ptr(ptr)))
 
-#define copy_from_user(to, from, n)            (memcpy(to, from, n), 0)
-#define copy_to_user(to, from, n)              (memcpy(to, from, n), 0)
+static inline unsigned long
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+       memcpy(to, (__force const void *)from, n);
+       return 0;
+}
 
-#define __copy_from_user(to, from, n) copy_from_user(to, from, n)
-#define __copy_to_user(to, from, n) copy_to_user(to, from, n)
-#define __copy_to_user_inatomic __copy_to_user
-#define __copy_from_user_inatomic __copy_from_user
+static inline unsigned long
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+       memcpy((__force void *)to, from, n);
+       return 0;
+}
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
 /*
  * Copy a null terminated string from userspace.
index 093b7c4..6f945bb 100644 (file)
@@ -88,7 +88,7 @@ static inline int frame_extra_sizes(int f)
        return frame_size_change[f];
 }
 
-int handle_kernel_fault(struct pt_regs *regs)
+int fixup_exception(struct pt_regs *regs)
 {
        const struct exception_table_entry *fixup;
        struct pt_regs *tregs;
index a926d2c..c1cc4e9 100644 (file)
@@ -1016,8 +1016,13 @@ asmlinkage void trap_c(struct frame *fp)
                        /* traced a trapping instruction on a 68020/30,
                         * real exception will be executed afterwards.
                         */
-               } else if (!handle_kernel_fault(&fp->ptregs))
-                       bad_super_trap(fp);
+                       return;
+               }
+#ifdef CONFIG_MMU
+               if (fixup_exception(&fp->ptregs))
+                       return;
+#endif
+               bad_super_trap(fp);
                return;
        }
 
index a76b73a..7646e46 100644 (file)
@@ -30,19 +30,13 @@ unsigned long __generic_copy_from_user(void *to, const void __user *from,
                "6:\n"
                "       .section .fixup,\"ax\"\n"
                "       .even\n"
-               "10:    move.l  %0,%3\n"
-               "7:     clr.l   (%2)+\n"
-               "       subq.l  #1,%3\n"
-               "       jne     7b\n"
-               "       lsl.l   #2,%0\n"
+               "10:    lsl.l   #2,%0\n"
                "       btst    #1,%5\n"
                "       jeq     8f\n"
-               "30:    clr.w   (%2)+\n"
-               "       addq.l  #2,%0\n"
+               "30:    addq.l  #2,%0\n"
                "8:     btst    #0,%5\n"
                "       jeq     6b\n"
-               "50:    clr.b   (%2)+\n"
-               "       addq.l  #1,%0\n"
+               "50:    addq.l  #1,%0\n"
                "       jra     6b\n"
                "       .previous\n"
                "\n"
index bd66a0b..2795e4c 100644 (file)
@@ -32,7 +32,7 @@ int send_fault_sig(struct pt_regs *regs)
                force_sig_info(siginfo.si_signo,
                               &siginfo, current);
        } else {
-               if (handle_kernel_fault(regs))
+               if (fixup_exception(regs))
                        return -1;
 
                //if (siginfo.si_signo == SIGBUS)
index f9b9df5..8f94055 100644 (file)
@@ -8,6 +8,7 @@ generic-y += dma.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
+generic-y += extable.h
 generic-y += fb.h
 generic-y += fcntl.h
 generic-y += futex.h
index 273e612..5ebc285 100644 (file)
@@ -4,10 +4,6 @@
 /*
  * User space memory access functions
  */
-#include <linux/sched.h>
-
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
 
 /*
  * The fs value determines whether argument validity checking should be
@@ -28,7 +24,7 @@
 
 #define segment_eq(a, b)       ((a).seg == (b).seg)
 
-#define __kernel_ok (segment_eq(get_fs(), KERNEL_DS))
+#define __kernel_ok (uaccess_kernel())
 /*
  * Explicitly allow NULL pointers here. Parts of the kernel such
  * as readv/writev use access_ok to validate pointers, but want
@@ -51,28 +47,7 @@ static inline int __access_ok(unsigned long addr, unsigned long size)
 #define access_ok(type, addr, size) __access_ok((unsigned long)(addr), \
                                                (unsigned long)(size))
 
-static inline int verify_area(int type, const void *addr, unsigned long size)
-{
-       return access_ok(type, addr, size) ? 0 : -EFAULT;
-}
-
-/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-struct exception_table_entry {
-       unsigned long insn, fixup;
-};
-
-extern int fixup_exception(struct pt_regs *regs);
+#include <asm/extable.h>
 
 /*
  * These are the main single-value transfer routines.  They automatically
@@ -197,36 +172,10 @@ extern long __must_check strnlen_user(const char __user *src, long count);
 
 #define strlen_user(str) strnlen_user(str, 32767)
 
-extern unsigned long __must_check __copy_user_zeroing(void *to,
-                                                     const void __user *from,
-                                                     unsigned long n);
-
-static inline unsigned long
-copy_from_user(void *to, const void __user *from, unsigned long n)
-{
-       if (likely(access_ok(VERIFY_READ, from, n)))
-               return __copy_user_zeroing(to, from, n);
-       memset(to, 0, n);
-       return n;
-}
-
-#define __copy_from_user(to, from, n) __copy_user_zeroing(to, from, n)
-#define __copy_from_user_inatomic __copy_from_user
-
-extern unsigned long __must_check __copy_user(void __user *to,
-                                             const void *from,
-                                             unsigned long n);
-
-static inline unsigned long copy_to_user(void __user *to, const void *from,
-                                        unsigned long n)
-{
-       if (access_ok(VERIFY_WRITE, to, n))
-               return __copy_user(to, from, n);
-       return n;
-}
-
-#define __copy_to_user(to, from, n) __copy_user(to, from, n)
-#define __copy_to_user_inatomic __copy_to_user
+extern unsigned long raw_copy_from_user(void *to, const void __user *from,
+                                       unsigned long n);
+extern unsigned long raw_copy_to_user(void __user *to, const void *from,
+                                     unsigned long n);
 
 /*
  * Zero Userspace
index b3ebfe9..e8a4ea8 100644 (file)
@@ -29,7 +29,6 @@
                COPY                                             \
                "1:\n"                                           \
                "       .section .fixup,\"ax\"\n"                \
-               "       MOV D1Ar1,#0\n"                          \
                FIXUP                                            \
                "       MOVT    D1Ar1,#HI(1b)\n"                 \
                "       JUMP    D1Ar1,#LO(1b)\n"                 \
                "MGETL  D0FrT, D0.5, D0.6, D0.7, [%1++]\n"              \
                "22:\n"                                                 \
                "MSETL  [%0++], D0FrT, D0.5, D0.6, D0.7\n"              \
-               "SUB    %3, %3, #32\n"                                  \
                "23:\n"                                                 \
-               "MGETL  D0FrT, D0.5, D0.6, D0.7, [%1++]\n"              \
+               "SUB    %3, %3, #32\n"                                  \
                "24:\n"                                                 \
+               "MGETL  D0FrT, D0.5, D0.6, D0.7, [%1++]\n"              \
+               "25:\n"                                                 \
                "MSETL  [%0++], D0FrT, D0.5, D0.6, D0.7\n"              \
+               "26:\n"                                                 \
                "SUB    %3, %3, #32\n"                                  \
                "DCACHE [%1+#-64], D0Ar6\n"                             \
                "BR     $Lloop"id"\n"                                   \
                                                                        \
                "MOV    RAPF, %1\n"                                     \
-               "25:\n"                                                 \
+               "27:\n"                                                 \
                "MGETL  D0FrT, D0.5, D0.6, D0.7, [%1++]\n"              \
-               "26:\n"                                                 \
+               "28:\n"                                                 \
                "MSETL  [%0++], D0FrT, D0.5, D0.6, D0.7\n"              \
+               "29:\n"                                                 \
                "SUB    %3, %3, #32\n"                                  \
-               "27:\n"                                                 \
+               "30:\n"                                                 \
                "MGETL  D0FrT, D0.5, D0.6, D0.7, [%1++]\n"              \
-               "28:\n"                                                 \
+               "31:\n"                                                 \
                "MSETL  [%0++], D0FrT, D0.5, D0.6, D0.7\n"              \
+               "32:\n"                                                 \
                "SUB    %0, %0, #8\n"                                   \
-               "29:\n"                                                 \
+               "33:\n"                                                 \
                "SETL   [%0++], D0.7, D1.7\n"                           \
                "SUB    %3, %3, #32\n"                                  \
                "1:"                                                    \
                "       .long 26b,3b\n"                                 \
                "       .long 27b,3b\n"                                 \
                "       .long 28b,3b\n"                                 \
-               "       .long 29b,4b\n"                                 \
+               "       .long 29b,3b\n"                                 \
+               "       .long 30b,3b\n"                                 \
+               "       .long 31b,3b\n"                                 \
+               "       .long 32b,3b\n"                                 \
+               "       .long 33b,4b\n"                                 \
                "       .previous\n"                                    \
                : "=r" (to), "=r" (from), "=r" (ret), "=d" (n)          \
                : "0" (to), "1" (from), "2" (ret), "3" (n)              \
-               : "D1Ar1", "D0Ar2", "memory")
+               : "D1Ar1", "D0Ar2", "cc", "memory")
 
 /*     rewind 'to' and 'from'  pointers when a fault occurs
  *
 #define __asm_copy_to_user_64bit_rapf_loop(to, from, ret, n, id)\
        __asm_copy_user_64bit_rapf_loop(to, from, ret, n, id,           \
                "LSR    D0Ar2, D0Ar2, #8\n"                             \
-               "AND    D0Ar2, D0Ar2, #0x7\n"                           \
+               "ANDS   D0Ar2, D0Ar2, #0x7\n"                           \
                "ADDZ   D0Ar2, D0Ar2, #4\n"                             \
                "SUB    D0Ar2, D0Ar2, #1\n"                             \
                "MOV    D1Ar1, #4\n"                                    \
                "MGETD  D0FrT, D0.5, D0.6, D0.7, [%1++]\n"              \
                "22:\n"                                                 \
                "MSETD  [%0++], D0FrT, D0.5, D0.6, D0.7\n"              \
-               "SUB    %3, %3, #16\n"                                  \
                "23:\n"                                                 \
-               "MGETD  D0FrT, D0.5, D0.6, D0.7, [%1++]\n"              \
-               "24:\n"                                                 \
-               "MSETD  [%0++], D0FrT, D0.5, D0.6, D0.7\n"              \
                "SUB    %3, %3, #16\n"                                  \
-               "25:\n"                                                 \
+               "24:\n"                                                 \
                "MGETD  D0FrT, D0.5, D0.6, D0.7, [%1++]\n"              \
-               "26:\n"                                                 \
+               "25:\n"                                                 \
                "MSETD  [%0++], D0FrT, D0.5, D0.6, D0.7\n"              \
+               "26:\n"                                                 \
                "SUB    %3, %3, #16\n"                                  \
                "27:\n"                                                 \
                "MGETD  D0FrT, D0.5, D0.6, D0.7, [%1++]\n"              \
                "28:\n"                                                 \
                "MSETD  [%0++], D0FrT, D0.5, D0.6, D0.7\n"              \
+               "29:\n"                                                 \
+               "SUB    %3, %3, #16\n"                                  \
+               "30:\n"                                                 \
+               "MGETD  D0FrT, D0.5, D0.6, D0.7, [%1++]\n"              \
+               "31:\n"                                                 \
+               "MSETD  [%0++], D0FrT, D0.5, D0.6, D0.7\n"              \
+               "32:\n"                                                 \
                "SUB    %3, %3, #16\n"                                  \
                "DCACHE [%1+#-64], D0Ar6\n"                             \
                "BR     $Lloop"id"\n"                                   \
                                                                        \
                "MOV    RAPF, %1\n"                                     \
-               "29:\n"                                                 \
+               "33:\n"                                                 \
                "MGETD  D0FrT, D0.5, D0.6, D0.7, [%1++]\n"              \
-               "30:\n"                                                 \
+               "34:\n"                                                 \
                "MSETD  [%0++], D0FrT, D0.5, D0.6, D0.7\n"              \
+               "35:\n"                                                 \
                "SUB    %3, %3, #16\n"                                  \
-               "31:\n"                                                 \
+               "36:\n"                                                 \
                "MGETD  D0FrT, D0.5, D0.6, D0.7, [%1++]\n"              \
-               "32:\n"                                                 \
+               "37:\n"                                                 \
                "MSETD  [%0++], D0FrT, D0.5, D0.6, D0.7\n"              \
+               "38:\n"                                                 \
                "SUB    %3, %3, #16\n"                                  \
-               "33:\n"                                                 \
+               "39:\n"                                                 \
                "MGETD  D0FrT, D0.5, D0.6, D0.7, [%1++]\n"              \
-               "34:\n"                                                 \
+               "40:\n"                                                 \
                "MSETD  [%0++], D0FrT, D0.5, D0.6, D0.7\n"              \
+               "41:\n"                                                 \
                "SUB    %3, %3, #16\n"                                  \
-               "35:\n"                                                 \
+               "42:\n"                                                 \
                "MGETD  D0FrT, D0.5, D0.6, D0.7, [%1++]\n"              \
-               "36:\n"                                                 \
+               "43:\n"                                                 \
                "MSETD  [%0++], D0FrT, D0.5, D0.6, D0.7\n"              \
+               "44:\n"                                                 \
                "SUB    %0, %0, #4\n"                                   \
-               "37:\n"                                                 \
+               "45:\n"                                                 \
                "SETD   [%0++], D0.7\n"                                 \
                "SUB    %3, %3, #16\n"                                  \
                "1:"                                                    \
                "       .long 34b,3b\n"                                 \
                "       .long 35b,3b\n"                                 \
                "       .long 36b,3b\n"                                 \
-               "       .long 37b,4b\n"                                 \
+               "       .long 37b,3b\n"                                 \
+               "       .long 38b,3b\n"                                 \
+               "       .long 39b,3b\n"                                 \
+               "       .long 40b,3b\n"                                 \
+               "       .long 41b,3b\n"                                 \
+               "       .long 42b,3b\n"                                 \
+               "       .long 43b,3b\n"                                 \
+               "       .long 44b,3b\n"                                 \
+               "       .long 45b,4b\n"                                 \
                "       .previous\n"                                    \
                : "=r" (to), "=r" (from), "=r" (ret), "=d" (n)          \
                : "0" (to), "1" (from), "2" (ret), "3" (n)              \
-               : "D1Ar1", "D0Ar2", "memory")
+               : "D1Ar1", "D0Ar2", "cc", "memory")
 
 /*     rewind 'to' and 'from'  pointers when a fault occurs
  *
 #define __asm_copy_to_user_32bit_rapf_loop(to, from, ret, n, id)\
        __asm_copy_user_32bit_rapf_loop(to, from, ret, n, id,           \
                "LSR    D0Ar2, D0Ar2, #8\n"                             \
-               "AND    D0Ar2, D0Ar2, #0x7\n"                           \
+               "ANDS   D0Ar2, D0Ar2, #0x7\n"                           \
                "ADDZ   D0Ar2, D0Ar2, #4\n"                             \
                "SUB    D0Ar2, D0Ar2, #1\n"                             \
                "MOV    D1Ar1, #4\n"                                    \
                "SUB    %1,     %1,     D0Ar2\n"                        \
                "SUB    %3, %3, D1Ar1\n")
 
-unsigned long __copy_user(void __user *pdst, const void *psrc,
-                         unsigned long n)
+unsigned long raw_copy_to_user(void __user *pdst, const void *psrc,
+                              unsigned long n)
 {
        register char __user *dst asm ("A0.2") = pdst;
        register const char *src asm ("A1.2") = psrc;
@@ -538,23 +561,31 @@ unsigned long __copy_user(void __user *pdst, const void *psrc,
        if ((unsigned long) src & 1) {
                __asm_copy_to_user_1(dst, src, retn);
                n--;
+               if (retn)
+                       return retn + n;
        }
        if ((unsigned long) dst & 1) {
                /* Worst case - byte copy */
                while (n > 0) {
                        __asm_copy_to_user_1(dst, src, retn);
                        n--;
+                       if (retn)
+                               return retn + n;
                }
        }
        if (((unsigned long) src & 2) && n >= 2) {
                __asm_copy_to_user_2(dst, src, retn);
                n -= 2;
+               if (retn)
+                       return retn + n;
        }
        if ((unsigned long) dst & 2) {
                /* Second worst case - word copy */
                while (n >= 2) {
                        __asm_copy_to_user_2(dst, src, retn);
                        n -= 2;
+                       if (retn)
+                               return retn + n;
                }
        }
 
@@ -569,6 +600,8 @@ unsigned long __copy_user(void __user *pdst, const void *psrc,
                while (n >= 8) {
                        __asm_copy_to_user_8x64(dst, src, retn);
                        n -= 8;
+                       if (retn)
+                               return retn + n;
                }
        }
        if (n >= RAPF_MIN_BUF_SIZE) {
@@ -581,6 +614,8 @@ unsigned long __copy_user(void __user *pdst, const void *psrc,
                while (n >= 8) {
                        __asm_copy_to_user_8x64(dst, src, retn);
                        n -= 8;
+                       if (retn)
+                               return retn + n;
                }
        }
 #endif
@@ -588,11 +623,15 @@ unsigned long __copy_user(void __user *pdst, const void *psrc,
        while (n >= 16) {
                __asm_copy_to_user_16(dst, src, retn);
                n -= 16;
+               if (retn)
+                       return retn + n;
        }
 
        while (n >= 4) {
                __asm_copy_to_user_4(dst, src, retn);
                n -= 4;
+               if (retn)
+                       return retn + n;
        }
 
        switch (n) {
@@ -609,24 +648,26 @@ unsigned long __copy_user(void __user *pdst, const void *psrc,
                break;
        }
 
+       /*
+        * If we get here, retn correctly reflects the number of failing
+        * bytes.
+        */
        return retn;
 }
-EXPORT_SYMBOL(__copy_user);
+EXPORT_SYMBOL(raw_copy_to_user);
 
 #define __asm_copy_from_user_1(to, from, ret) \
        __asm_copy_user_cont(to, from, ret,     \
                "       GETB D1Ar1,[%1++]\n"    \
                "2:     SETB [%0++],D1Ar1\n",   \
-               "3:     ADD  %2,%2,#1\n"        \
-               "       SETB [%0++],D1Ar1\n",   \
+               "3:     ADD  %2,%2,#1\n",       \
                "       .long 2b,3b\n")
 
 #define __asm_copy_from_user_2x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
        __asm_copy_user_cont(to, from, ret,             \
                "       GETW D1Ar1,[%1++]\n"            \
                "2:     SETW [%0++],D1Ar1\n" COPY,      \
-               "3:     ADD  %2,%2,#2\n"                \
-               "       SETW [%0++],D1Ar1\n" FIXUP,     \
+               "3:     ADD  %2,%2,#2\n" FIXUP,         \
                "       .long 2b,3b\n" TENTRY)
 
 #define __asm_copy_from_user_2(to, from, ret) \
@@ -636,145 +677,26 @@ EXPORT_SYMBOL(__copy_user);
        __asm_copy_from_user_2x_cont(to, from, ret,     \
                "       GETB D1Ar1,[%1++]\n"            \
                "4:     SETB [%0++],D1Ar1\n",           \
-               "5:     ADD  %2,%2,#1\n"                \
-               "       SETB [%0++],D1Ar1\n",           \
+               "5:     ADD  %2,%2,#1\n",               \
                "       .long 4b,5b\n")
 
 #define __asm_copy_from_user_4x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
        __asm_copy_user_cont(to, from, ret,             \
                "       GETD D1Ar1,[%1++]\n"            \
                "2:     SETD [%0++],D1Ar1\n" COPY,      \
-               "3:     ADD  %2,%2,#4\n"                \
-               "       SETD [%0++],D1Ar1\n" FIXUP,     \
+               "3:     ADD  %2,%2,#4\n" FIXUP,         \
                "       .long 2b,3b\n" TENTRY)
 
 #define __asm_copy_from_user_4(to, from, ret) \
        __asm_copy_from_user_4x_cont(to, from, ret, "", "", "")
 
-#define __asm_copy_from_user_5(to, from, ret) \
-       __asm_copy_from_user_4x_cont(to, from, ret,     \
-               "       GETB D1Ar1,[%1++]\n"            \
-               "4:     SETB [%0++],D1Ar1\n",           \
-               "5:     ADD  %2,%2,#1\n"                \
-               "       SETB [%0++],D1Ar1\n",           \
-               "       .long 4b,5b\n")
-
-#define __asm_copy_from_user_6x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
-       __asm_copy_from_user_4x_cont(to, from, ret,     \
-               "       GETW D1Ar1,[%1++]\n"            \
-               "4:     SETW [%0++],D1Ar1\n" COPY,      \
-               "5:     ADD  %2,%2,#2\n"                \
-               "       SETW [%0++],D1Ar1\n" FIXUP,     \
-               "       .long 4b,5b\n" TENTRY)
-
-#define __asm_copy_from_user_6(to, from, ret) \
-       __asm_copy_from_user_6x_cont(to, from, ret, "", "", "")
-
-#define __asm_copy_from_user_7(to, from, ret) \
-       __asm_copy_from_user_6x_cont(to, from, ret,     \
-               "       GETB D1Ar1,[%1++]\n"            \
-               "6:     SETB [%0++],D1Ar1\n",           \
-               "7:     ADD  %2,%2,#1\n"                \
-               "       SETB [%0++],D1Ar1\n",           \
-               "       .long 6b,7b\n")
-
-#define __asm_copy_from_user_8x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
-       __asm_copy_from_user_4x_cont(to, from, ret,     \
-               "       GETD D1Ar1,[%1++]\n"            \
-               "4:     SETD [%0++],D1Ar1\n" COPY,      \
-               "5:     ADD  %2,%2,#4\n"                        \
-               "       SETD [%0++],D1Ar1\n" FIXUP,             \
-               "       .long 4b,5b\n" TENTRY)
-
-#define __asm_copy_from_user_8(to, from, ret) \
-       __asm_copy_from_user_8x_cont(to, from, ret, "", "", "")
-
-#define __asm_copy_from_user_9(to, from, ret) \
-       __asm_copy_from_user_8x_cont(to, from, ret,     \
-               "       GETB D1Ar1,[%1++]\n"            \
-               "6:     SETB [%0++],D1Ar1\n",           \
-               "7:     ADD  %2,%2,#1\n"                \
-               "       SETB [%0++],D1Ar1\n",           \
-               "       .long 6b,7b\n")
-
-#define __asm_copy_from_user_10x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
-       __asm_copy_from_user_8x_cont(to, from, ret,     \
-               "       GETW D1Ar1,[%1++]\n"            \
-               "6:     SETW [%0++],D1Ar1\n" COPY,      \
-               "7:     ADD  %2,%2,#2\n"                \
-               "       SETW [%0++],D1Ar1\n" FIXUP,     \
-               "       .long 6b,7b\n" TENTRY)
-
-#define __asm_copy_from_user_10(to, from, ret) \
-       __asm_copy_from_user_10x_cont(to, from, ret, "", "", "")
-
-#define __asm_copy_from_user_11(to, from, ret)         \
-       __asm_copy_from_user_10x_cont(to, from, ret,    \
-               "       GETB D1Ar1,[%1++]\n"            \
-               "8:     SETB [%0++],D1Ar1\n",           \
-               "9:     ADD  %2,%2,#1\n"                \
-               "       SETB [%0++],D1Ar1\n",           \
-               "       .long 8b,9b\n")
-
-#define __asm_copy_from_user_12x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
-       __asm_copy_from_user_8x_cont(to, from, ret,     \
-               "       GETD D1Ar1,[%1++]\n"            \
-               "6:     SETD [%0++],D1Ar1\n" COPY,      \
-               "7:     ADD  %2,%2,#4\n"                \
-               "       SETD [%0++],D1Ar1\n" FIXUP,     \
-               "       .long 6b,7b\n" TENTRY)
-
-#define __asm_copy_from_user_12(to, from, ret) \
-       __asm_copy_from_user_12x_cont(to, from, ret, "", "", "")
-
-#define __asm_copy_from_user_13(to, from, ret) \
-       __asm_copy_from_user_12x_cont(to, from, ret,    \
-               "       GETB D1Ar1,[%1++]\n"            \
-               "8:     SETB [%0++],D1Ar1\n",           \
-               "9:     ADD  %2,%2,#1\n"                \
-               "       SETB [%0++],D1Ar1\n",           \
-               "       .long 8b,9b\n")
-
-#define __asm_copy_from_user_14x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
-       __asm_copy_from_user_12x_cont(to, from, ret,    \
-               "       GETW D1Ar1,[%1++]\n"            \
-               "8:     SETW [%0++],D1Ar1\n" COPY,      \
-               "9:     ADD  %2,%2,#2\n"                \
-               "       SETW [%0++],D1Ar1\n" FIXUP,     \
-               "       .long 8b,9b\n" TENTRY)
-
-#define __asm_copy_from_user_14(to, from, ret) \
-       __asm_copy_from_user_14x_cont(to, from, ret, "", "", "")
-
-#define __asm_copy_from_user_15(to, from, ret) \
-       __asm_copy_from_user_14x_cont(to, from, ret,    \
-               "       GETB D1Ar1,[%1++]\n"            \
-               "10:    SETB [%0++],D1Ar1\n",           \
-               "11:    ADD  %2,%2,#1\n"                \
-               "       SETB [%0++],D1Ar1\n",           \
-               "       .long 10b,11b\n")
-
-#define __asm_copy_from_user_16x_cont(to, from, ret, COPY, FIXUP, TENTRY) \
-       __asm_copy_from_user_12x_cont(to, from, ret,    \
-               "       GETD D1Ar1,[%1++]\n"            \
-               "8:     SETD [%0++],D1Ar1\n" COPY,      \
-               "9:     ADD  %2,%2,#4\n"                \
-               "       SETD [%0++],D1Ar1\n" FIXUP,     \
-               "       .long 8b,9b\n" TENTRY)
-
-#define __asm_copy_from_user_16(to, from, ret) \
-       __asm_copy_from_user_16x_cont(to, from, ret, "", "", "")
-
 #define __asm_copy_from_user_8x64(to, from, ret) \
        asm volatile (                          \
                "       GETL D0Ar2,D1Ar1,[%1++]\n"      \
                "2:     SETL [%0++],D0Ar2,D1Ar1\n"      \
                "1:\n"                                  \
                "       .section .fixup,\"ax\"\n"       \
-               "       MOV D1Ar1,#0\n"                 \
-               "       MOV D0Ar2,#0\n"                 \
                "3:     ADD  %2,%2,#8\n"                \
-               "       SETL [%0++],D0Ar2,D1Ar1\n"      \
                "       MOVT    D0Ar2,#HI(1b)\n"        \
                "       JUMP    D0Ar2,#LO(1b)\n"        \
                "       .previous\n"                    \
@@ -789,36 +711,57 @@ EXPORT_SYMBOL(__copy_user);
  *
  *     Rationale:
  *             A fault occurs while reading from user buffer, which is the
- *             source. Since the fault is at a single address, we only
- *             need to rewind by 8 bytes.
+ *             source.
  *             Since we don't write to kernel buffer until we read first,
  *             the kernel buffer is at the right state and needn't be
- *             corrected.
+ *             corrected, but the source must be rewound to the beginning of
+ *             the block, which is LSM_STEP*8 bytes.
+ *             LSM_STEP is bits 10:8 in TXSTATUS which is already read
+ *             and stored in D0Ar2
+ *
+ *             NOTE: If a fault occurs at the last operation in M{G,S}ETL,
+ *                     LSM_STEP will be 0, i.e. we do 4 transfers in our case; if
+ *                     a fault happens at the 4th transfer, LSM_STEP will be 0
+ *                     instead of 4. The code copes with that.
  */
 #define __asm_copy_from_user_64bit_rapf_loop(to, from, ret, n, id)     \
        __asm_copy_user_64bit_rapf_loop(to, from, ret, n, id,           \
-               "SUB    %1, %1, #8\n")
+               "LSR    D0Ar2, D0Ar2, #5\n"                             \
+               "ANDS   D0Ar2, D0Ar2, #0x38\n"                          \
+               "ADDZ   D0Ar2, D0Ar2, #32\n"                            \
+               "SUB    %1, %1, D0Ar2\n")
 
 /*     rewind 'from' pointer when a fault occurs
  *
  *     Rationale:
  *             A fault occurs while reading from user buffer, which is the
- *             source. Since the fault is at a single address, we only
- *             need to rewind by 4 bytes.
+ *             source.
  *             Since we don't write to kernel buffer until we read first,
  *             the kernel buffer is at the right state and needn't be
- *             corrected.
+ *             corrected, but the source must be rewound to the beginning of
+ *             the block, which is LSM_STEP*4 bytes.
+ *             LSM_STEP is bits 10:8 in TXSTATUS which is already read
+ *             and stored in D0Ar2
+ *
+ *             NOTE: If a fault occurs at the last operation in M{G,S}ETD,
+ *                     LSM_STEP will be 0, i.e. we do 4 transfers in our case; if
+ *                     a fault happens at the 4th transfer, LSM_STEP will be 0
+ *                     instead of 4. The code copes with that.
  */
 #define __asm_copy_from_user_32bit_rapf_loop(to, from, ret, n, id)     \
        __asm_copy_user_32bit_rapf_loop(to, from, ret, n, id,           \
-               "SUB    %1, %1, #4\n")
+               "LSR    D0Ar2, D0Ar2, #6\n"                             \
+               "ANDS   D0Ar2, D0Ar2, #0x1c\n"                          \
+               "ADDZ   D0Ar2, D0Ar2, #16\n"                            \
+               "SUB    %1, %1, D0Ar2\n")
 
 
-/* Copy from user to kernel, zeroing the bytes that were inaccessible in
-   userland.  The return-value is the number of bytes that were
-   inaccessible.  */
-unsigned long __copy_user_zeroing(void *pdst, const void __user *psrc,
-                                 unsigned long n)
+/*
+ * Copy from user to kernel. The return-value is the number of bytes that were
+ * inaccessible.
+ */
+unsigned long raw_copy_from_user(void *pdst, const void __user *psrc,
+                                unsigned long n)
 {
        register char *dst asm ("A0.2") = pdst;
        register const char __user *src asm ("A1.2") = psrc;
@@ -830,6 +773,8 @@ unsigned long __copy_user_zeroing(void *pdst, const void __user *psrc,
        if ((unsigned long) src & 1) {
                __asm_copy_from_user_1(dst, src, retn);
                n--;
+               if (retn)
+                       return retn + n;
        }
        if ((unsigned long) dst & 1) {
                /* Worst case - byte copy */
@@ -837,12 +782,14 @@ unsigned long __copy_user_zeroing(void *pdst, const void __user *psrc,
                        __asm_copy_from_user_1(dst, src, retn);
                        n--;
                        if (retn)
-                               goto copy_exception_bytes;
+                               return retn + n;
                }
        }
        if (((unsigned long) src & 2) && n >= 2) {
                __asm_copy_from_user_2(dst, src, retn);
                n -= 2;
+               if (retn)
+                       return retn + n;
        }
        if ((unsigned long) dst & 2) {
                /* Second worst case - word copy */
@@ -850,16 +797,10 @@ unsigned long __copy_user_zeroing(void *pdst, const void __user *psrc,
                        __asm_copy_from_user_2(dst, src, retn);
                        n -= 2;
                        if (retn)
-                               goto copy_exception_bytes;
+                               return retn + n;
                }
        }
 
-       /* We only need one check after the unalignment-adjustments,
-          because if both adjustments were done, either both or
-          neither reference had an exception.  */
-       if (retn != 0)
-               goto copy_exception_bytes;
-
 #ifdef USE_RAPF
        /* 64 bit copy loop */
        if (!(((unsigned long) src | (unsigned long) dst) & 7)) {
@@ -872,7 +813,7 @@ unsigned long __copy_user_zeroing(void *pdst, const void __user *psrc,
                        __asm_copy_from_user_8x64(dst, src, retn);
                        n -= 8;
                        if (retn)
-                               goto copy_exception_bytes;
+                               return retn + n;
                }
        }
 
@@ -888,7 +829,7 @@ unsigned long __copy_user_zeroing(void *pdst, const void __user *psrc,
                        __asm_copy_from_user_8x64(dst, src, retn);
                        n -= 8;
                        if (retn)
-                               goto copy_exception_bytes;
+                               return retn + n;
                }
        }
 #endif
@@ -898,7 +839,7 @@ unsigned long __copy_user_zeroing(void *pdst, const void __user *psrc,
                n -= 4;
 
                if (retn)
-                       goto copy_exception_bytes;
+                       return retn + n;
        }
 
        /* If we get here, there were no memory read faults.  */
@@ -924,21 +865,8 @@ unsigned long __copy_user_zeroing(void *pdst, const void __user *psrc,
        /* If we get here, retn correctly reflects the number of failing
           bytes.  */
        return retn;
-
- copy_exception_bytes:
-       /* We already have "retn" bytes cleared, and need to clear the
-          remaining "n" bytes.  A non-optimized simple byte-for-byte in-line
-          memset is preferred here, since this isn't speed-critical code and
-          we'd rather have this a leaf-function than calling memset.  */
-       {
-               char *endp;
-               for (endp = dst + n; dst < endp; dst++)
-                       *dst = 0;
-       }
-
-       return retn + n;
 }
-EXPORT_SYMBOL(__copy_user_zeroing);
+EXPORT_SYMBOL(raw_copy_from_user);
 
 #define __asm_clear_8x64(to, ret) \
        asm volatile (                                  \
index 1732ec1..56830ff 100644 (file)
@@ -3,6 +3,7 @@ generic-y += barrier.h
 generic-y += clkdev.h
 generic-y += device.h
 generic-y += exec.h
+generic-y += extable.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
index 253a67e..38f2c9c 100644 (file)
 #ifndef _ASM_MICROBLAZE_UACCESS_H
 #define _ASM_MICROBLAZE_UACCESS_H
 
-#ifdef __KERNEL__
-#ifndef __ASSEMBLY__
-
 #include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/sched.h> /* RLIMIT_FSIZE */
 #include <linux/mm.h>
 
 #include <asm/mmu.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
+#include <asm/extable.h>
 #include <linux/string.h>
 
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
 /*
  * On Microblaze the fs value is actually the top of the corresponding
  * address space.
 
 # define segment_eq(a, b)      ((a).seg == (b).seg)
 
-/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue. No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path. This means when everything is well,
- * we don't even have to jump over them. Further, they do not intrude
- * on our cache or tlb entries.
- */
-struct exception_table_entry {
-       unsigned long insn, fixup;
-};
-
 #ifndef CONFIG_MMU
 
 /* Check against bounds of physical memory */
@@ -359,39 +336,19 @@ extern long __user_bad(void);
        __gu_err;                                                       \
 })
 
-
-/* copy_to_from_user */
-#define __copy_from_user(to, from, n)  \
-       __copy_tofrom_user((__force void __user *)(to), \
-                               (void __user *)(from), (n))
-#define __copy_from_user_inatomic(to, from, n) \
-               __copy_from_user((to), (from), (n))
-
-static inline long copy_from_user(void *to,
-               const void __user *from, unsigned long n)
+static inline unsigned long
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-       unsigned long res = n;
-       might_fault();
-       if (likely(access_ok(VERIFY_READ, from, n)))
-               res = __copy_from_user(to, from, n);
-       if (unlikely(res))
-               memset(to + (n - res), 0, res);
-       return res;
+       return __copy_tofrom_user((__force void __user *)to, from, n);
 }
 
-#define __copy_to_user(to, from, n)    \
-               __copy_tofrom_user((void __user *)(to), \
-                       (__force const void __user *)(from), (n))
-#define __copy_to_user_inatomic(to, from, n) __copy_to_user((to), (from), (n))
-
-static inline long copy_to_user(void __user *to,
-               const void *from, unsigned long n)
+static inline unsigned long
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-       might_fault();
-       if (access_ok(VERIFY_WRITE, to, n))
-               return __copy_to_user(to, from, n);
-       return n;
+       return __copy_tofrom_user(to, (__force const void __user *)from, n);
 }
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
 /*
  * Copy a null terminated string from userspace.
@@ -422,7 +379,4 @@ static inline long strnlen_user(const char __user *src, long n)
        return __strnlen_user(src, n);
 }
 
-#endif  /* __ASSEMBLY__ */
-#endif /* __KERNEL__ */
-
 #endif /* _ASM_MICROBLAZE_UACCESS_H */
index a008a9f..d6d545a 100644 (file)
@@ -68,7 +68,6 @@ config MIPS
        select HANDLE_DOMAIN_IRQ
        select HAVE_EXIT_THREAD
        select HAVE_REGS_AND_STACK_ACCESS_API
-       select HAVE_ARCH_HARDENED_USERCOPY
 
 menu "Machine selection"
 
@@ -1531,7 +1530,7 @@ config CPU_MIPS64_R6
        select CPU_SUPPORTS_HIGHMEM
        select CPU_SUPPORTS_MSA
        select GENERIC_CSUM
-       select MIPS_O32_FP64_SUPPORT if MIPS32_O32
+       select MIPS_O32_FP64_SUPPORT if 32BIT || MIPS32_O32
        select HAVE_KVM
        help
          Choose this option to build a kernel for release 6 or later of the
index 8ef9c02..02a1787 100644 (file)
@@ -489,7 +489,7 @@ $(generic_defconfigs):
        $(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh \
                -m -O $(objtree) $(srctree)/arch/$(ARCH)/configs/generic_defconfig $^ \
                $(foreach board,$(BOARDS),$(generic_config_dir)/board-$(board).config)
-       $(Q)$(MAKE) olddefconfig
+       $(Q)$(MAKE) -f $(srctree)/Makefile olddefconfig
 
 #
 # Prevent generic merge_config rules attempting to merge single fragments
@@ -503,8 +503,8 @@ $(generic_config_dir)/%.config: ;
 #
 .PHONY: sead3_defconfig
 sead3_defconfig:
-       $(Q)$(MAKE) 32r2el_defconfig BOARDS=sead-3
+       $(Q)$(MAKE) -f $(srctree)/Makefile 32r2el_defconfig BOARDS=sead-3
 
 .PHONY: sead3micro_defconfig
 sead3micro_defconfig:
-       $(Q)$(MAKE) micro32r2el_defconfig BOARDS=sead-3
+       $(Q)$(MAKE) -f $(srctree)/Makefile micro32r2el_defconfig BOARDS=sead-3
index e1bec5a..32d1333 100644 (file)
@@ -138,7 +138,9 @@ static int __init alchemy_time_init(unsigned int m2int)
        cd->shift = 32;
        cd->mult = div_sc(32768, NSEC_PER_SEC, cd->shift);
        cd->max_delta_ns = clockevent_delta2ns(0xffffffff, cd);
-       cd->min_delta_ns = clockevent_delta2ns(9, cd);  /* ~0.28ms */
+       cd->max_delta_ticks = 0xffffffff;
+       cd->min_delta_ns = clockevent_delta2ns(9, cd);
+       cd->min_delta_ticks = 9;        /* ~0.28ms */
        clockevents_register_device(cd);
        setup_irq(m2int, &au1x_rtcmatch2_irqaction);
 
index cfd97f6..0a7c983 100644 (file)
        .set    noat
 
 /*
- * t7 is used as a flag to note inatomic mode.
- */
-LEAF(__copy_user_inatomic)
-EXPORT_SYMBOL(__copy_user_inatomic)
-       b       __copy_user_common
-        li     t7, 1
-       END(__copy_user_inatomic)
-
-/*
  * A combined memcpy/__copy_user
  * __copy_user sets len to 0 for success; else to an upper bound of
  * the number of uncopied bytes.
@@ -161,8 +152,6 @@ EXPORT_SYMBOL(memcpy)
 __memcpy:
 FEXPORT(__copy_user)
 EXPORT_SYMBOL(__copy_user)
-       li      t7, 0                           /* not inatomic */
-__copy_user_common:
        /*
         * Note: dst & src may be unaligned, len may be 0
         * Temps
@@ -414,25 +403,7 @@ l_exc:
        LOAD    t0, TI_TASK($28)
        LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
        SUB     len, AT, t0             # len number of uncopied bytes
-       bnez    t7, 2f          /* Skip the zeroing out part if inatomic */
-       /*
-        * Here's where we rely on src and dst being incremented in tandem,
-        *   See (3) above.
-        * dst += (fault addr - src) to put dst at first byte to clear
-        */
-       ADD     dst, t0                 # compute start address in a1
-       SUB     dst, src
-       /*
-        * Clear len bytes starting at dst.  Can't call __bzero because it
-        * might modify len.  An inefficient loop for these rare times...
-        */
-       beqz    len, done
-        SUB    src, len, 1
-1:     sb      zero, 0(dst)
-       ADD     dst, dst, 1
-       bnez    src, 1b
-        SUB    src, src, 1
-2:     jr      ra
+       jr      ra
         nop
 
 
index a160cf6..6e28971 100644 (file)
@@ -3,3 +3,4 @@
 #include <asm/fpu.h>
 #include <asm-generic/asm-prototypes.h>
 #include <asm/uaccess.h>
+#include <asm/ftrace.h>
index c8b574f..77cad23 100644 (file)
@@ -50,7 +50,7 @@ __wsum csum_partial_copy_from_user(const void __user *src, void *dst, int len,
                                   __wsum sum, int *err_ptr)
 {
        might_fault();
-       if (segment_eq(get_fs(), get_ds()))
+       if (uaccess_kernel())
                return __csum_partial_copy_kernel((__force void *)src, dst,
                                                  len, sum, err_ptr);
        else
@@ -82,7 +82,7 @@ __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len,
 {
        might_fault();
        if (access_ok(VERIFY_WRITE, dst, len)) {
-               if (segment_eq(get_fs(), get_ds()))
+               if (uaccess_kernel())
                        return __csum_partial_copy_kernel(src,
                                                          (__force void *)dst,
                                                          len, sum, err_ptr);
index f94455f..a2813fe 100644 (file)
@@ -21,6 +21,7 @@
 #include <asm/cpu-features.h>
 #include <asm/fpu_emulator.h>
 #include <asm/hazards.h>
+#include <asm/ptrace.h>
 #include <asm/processor.h>
 #include <asm/current.h>
 #include <asm/msa.h>
index 956db6e..ddd1c91 100644 (file)
 #include <irq.h>
 
 #define IRQ_STACK_SIZE                 THREAD_SIZE
+#define IRQ_STACK_START                        (IRQ_STACK_SIZE - sizeof(unsigned long))
 
 extern void *irq_stack[NR_CPUS];
 
+/*
+ * The highest address on the IRQ stack contains a dummy frame put down in
+ * genex.S (handle_int & except_vec_vi_handler) which is structured as follows:
+ *
+ *   top ------------
+ *       | task sp  | <- irq_stack[cpu] + IRQ_STACK_START
+ *       ------------
+ *       |          | <- First frame of IRQ context
+ *       ------------
+ *
+ * task sp holds a copy of the task stack pointer where the struct pt_regs
+ * from exception entry can be found.
+ */
+
 static inline bool on_irq_stack(int cpu, unsigned long sp)
 {
        unsigned long low = (unsigned long)irq_stack[cpu];
index 55fd94e..7f12d7e 100644 (file)
@@ -20,7 +20,7 @@
 #include <asm/cpu-features.h>
 #include <asm/cpu-type.h>
 #include <asm/mipsmtregs.h>
-#include <linux/uaccess.h> /* for segment_eq() */
+#include <linux/uaccess.h> /* for uaccess_kernel() */
 
 extern void (*r4k_blast_dcache)(void);
 extern void (*r4k_blast_icache)(void);
@@ -714,7 +714,7 @@ static inline void protected_blast_##pfx##cache##_range(unsigned long start,\
                                                                        \
        __##pfx##flush_prologue                                         \
                                                                        \
-       if (segment_eq(get_fs(), USER_DS)) {                            \
+       if (!uaccess_kernel()) {                                        \
                while (1) {                                             \
                        protected_cachee_op(hitop, addr);               \
                        if (addr == aend)                               \
index f485afe..a8df44d 100644 (file)
@@ -127,7 +127,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
                "       andi    %[ticket], %[ticket], 0xffff            \n"
                "       bne     %[ticket], %[my_ticket], 4f             \n"
                "        subu   %[ticket], %[my_ticket], %[ticket]      \n"
-               "2:                                                     \n"
+               "2:     .insn                                           \n"
                "       .subsection 2                                   \n"
                "4:     andi    %[ticket], %[ticket], 0xffff            \n"
                "       sll     %[ticket], 5                            \n"
@@ -202,7 +202,7 @@ static inline unsigned int arch_spin_trylock(arch_spinlock_t *lock)
                "       sc      %[ticket], %[ticket_ptr]                \n"
                "       beqz    %[ticket], 1b                           \n"
                "        li     %[ticket], 1                            \n"
-               "2:                                                     \n"
+               "2:     .insn                                           \n"
                "       .subsection 2                                   \n"
                "3:     b       2b                                      \n"
                "        li     %[ticket], 0                            \n"
@@ -382,7 +382,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
                "       .set    reorder                                 \n"
                __WEAK_LLSC_MB
                "       li      %2, 1                                   \n"
-               "2:                                                     \n"
+               "2:     .insn                                           \n"
                : "=" GCC_OFF_SMALL_ASM() (rw->lock), "=&r" (tmp), "=&r" (ret)
                : GCC_OFF_SMALL_ASM() (rw->lock)
                : "memory");
@@ -422,7 +422,7 @@ static inline int arch_write_trylock(arch_rwlock_t *rw)
                        "       lui     %1, 0x8000                      \n"
                        "       sc      %1, %0                          \n"
                        "       li      %2, 1                           \n"
-                       "2:                                             \n"
+                       "2:     .insn                                   \n"
                        : "=" GCC_OFF_SMALL_ASM() (rw->lock), "=&r" (tmp),
                          "=&r" (ret)
                        : GCC_OFF_SMALL_ASM() (rw->lock)
index 5347cfe..99e629a 100644 (file)
@@ -12,8 +12,6 @@
 #define _ASM_UACCESS_H
 
 #include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/thread_info.h>
 #include <linux/string.h>
 #include <asm/asm-eva.h>
 #include <asm/extable.h>
@@ -71,9 +69,6 @@ extern u64 __ua_limit;
 #define USER_DS                ((mm_segment_t) { __UA_LIMIT })
 #endif
 
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
 #define get_ds()       (KERNEL_DS)
 #define get_fs()       (current_thread_info()->addr_limit)
 #define set_fs(x)      (current_thread_info()->addr_limit = (x))
@@ -93,7 +88,7 @@ static inline bool eva_kernel_access(void)
        if (!IS_ENABLED(CONFIG_EVA))
                return false;
 
-       return segment_eq(get_fs(), get_ds());
+       return uaccess_kernel();
 }
 
 /*
@@ -133,23 +128,14 @@ static inline bool eva_kernel_access(void)
  * this function, memory access functions may still return -EFAULT.
  */
 
-#define __access_mask get_fs().seg
-
-#define __access_ok(addr, size, mask)                                  \
-({                                                                     \
-       unsigned long __addr = (unsigned long) (addr);                  \
-       unsigned long __size = size;                                    \
-       unsigned long __mask = mask;                                    \
-       unsigned long __ok;                                             \
-                                                                       \
-       __chk_user_ptr(addr);                                           \
-       __ok = (signed long)(__mask & (__addr | (__addr + __size) |     \
-               __ua_size(__size)));                                    \
-       __ok == 0;                                                      \
-})
+static inline int __access_ok(const void __user *p, unsigned long size)
+{
+       unsigned long addr = (unsigned long)p;
+       return (get_fs().seg & (addr | (addr + size) | __ua_size(size))) == 0;
+}
 
 #define access_ok(type, addr, size)                                    \
-       likely(__access_ok((addr), (size), __access_mask))
+       likely(__access_ok((addr), (size)))
 
 /*
  * put_user: - Write a simple value into user space.
@@ -811,157 +797,7 @@ extern void __put_user_unaligned_unknown(void);
 
 extern size_t __copy_user(void *__to, const void *__from, size_t __n);
 
-#ifndef CONFIG_EVA
-#define __invoke_copy_to_user(to, from, n)                             \
-({                                                                     \
-       register void __user *__cu_to_r __asm__("$4");                  \
-       register const void *__cu_from_r __asm__("$5");                 \
-       register long __cu_len_r __asm__("$6");                         \
-                                                                       \
-       __cu_to_r = (to);                                               \
-       __cu_from_r = (from);                                           \
-       __cu_len_r = (n);                                               \
-       __asm__ __volatile__(                                           \
-       __MODULE_JAL(__copy_user)                                       \
-       : "+r" (__cu_to_r), "+r" (__cu_from_r), "+r" (__cu_len_r)       \
-       :                                                               \
-       : "$8", "$9", "$10", "$11", "$12", "$14", "$15", "$24", "$31",  \
-         DADDI_SCRATCH, "memory");                                     \
-       __cu_len_r;                                                     \
-})
-
-#define __invoke_copy_to_kernel(to, from, n)                           \
-       __invoke_copy_to_user(to, from, n)
-
-#endif
-
-/*
- * __copy_to_user: - Copy a block of data into user space, with less checking.
- * @to:          Destination address, in user space.
- * @from: Source address, in kernel space.
- * @n:   Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Copy data from kernel space to user space.  Caller must check
- * the specified block with access_ok() before calling this function.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- */
-#define __copy_to_user(to, from, n)                                    \
-({                                                                     \
-       void __user *__cu_to;                                           \
-       const void *__cu_from;                                          \
-       long __cu_len;                                                  \
-                                                                       \
-       __cu_to = (to);                                                 \
-       __cu_from = (from);                                             \
-       __cu_len = (n);                                                 \
-                                                                       \
-       check_object_size(__cu_from, __cu_len, true);                   \
-       might_fault();                                                  \
-                                                                       \
-       if (eva_kernel_access())                                        \
-               __cu_len = __invoke_copy_to_kernel(__cu_to, __cu_from,  \
-                                                  __cu_len);           \
-       else                                                            \
-               __cu_len = __invoke_copy_to_user(__cu_to, __cu_from,    \
-                                                __cu_len);             \
-       __cu_len;                                                       \
-})
-
-extern size_t __copy_user_inatomic(void *__to, const void *__from, size_t __n);
-
-#define __copy_to_user_inatomic(to, from, n)                           \
-({                                                                     \
-       void __user *__cu_to;                                           \
-       const void *__cu_from;                                          \
-       long __cu_len;                                                  \
-                                                                       \
-       __cu_to = (to);                                                 \
-       __cu_from = (from);                                             \
-       __cu_len = (n);                                                 \
-                                                                       \
-       check_object_size(__cu_from, __cu_len, true);                   \
-                                                                       \
-       if (eva_kernel_access())                                        \
-               __cu_len = __invoke_copy_to_kernel(__cu_to, __cu_from,  \
-                                                  __cu_len);           \
-       else                                                            \
-               __cu_len = __invoke_copy_to_user(__cu_to, __cu_from,    \
-                                                __cu_len);             \
-       __cu_len;                                                       \
-})
-
-#define __copy_from_user_inatomic(to, from, n)                         \
-({                                                                     \
-       void *__cu_to;                                                  \
-       const void __user *__cu_from;                                   \
-       long __cu_len;                                                  \
-                                                                       \
-       __cu_to = (to);                                                 \
-       __cu_from = (from);                                             \
-       __cu_len = (n);                                                 \
-                                                                       \
-       check_object_size(__cu_to, __cu_len, false);                    \
-                                                                       \
-       if (eva_kernel_access())                                        \
-               __cu_len = __invoke_copy_from_kernel_inatomic(__cu_to,  \
-                                                             __cu_from,\
-                                                             __cu_len);\
-       else                                                            \
-               __cu_len = __invoke_copy_from_user_inatomic(__cu_to,    \
-                                                           __cu_from,  \
-                                                           __cu_len);  \
-       __cu_len;                                                       \
-})
-
-/*
- * copy_to_user: - Copy a block of data into user space.
- * @to:          Destination address, in user space.
- * @from: Source address, in kernel space.
- * @n:   Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Copy data from kernel space to user space.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- */
-#define copy_to_user(to, from, n)                                      \
-({                                                                     \
-       void __user *__cu_to;                                           \
-       const void *__cu_from;                                          \
-       long __cu_len;                                                  \
-                                                                       \
-       __cu_to = (to);                                                 \
-       __cu_from = (from);                                             \
-       __cu_len = (n);                                                 \
-                                                                       \
-       check_object_size(__cu_from, __cu_len, true);                   \
-                                                                       \
-       if (eva_kernel_access()) {                                      \
-               __cu_len = __invoke_copy_to_kernel(__cu_to,             \
-                                                  __cu_from,           \
-                                                  __cu_len);           \
-       } else {                                                        \
-               if (access_ok(VERIFY_WRITE, __cu_to, __cu_len)) {       \
-                       might_fault();                                  \
-                       __cu_len = __invoke_copy_to_user(__cu_to,       \
-                                                        __cu_from,     \
-                                                        __cu_len);     \
-               }                                                       \
-       }                                                               \
-       __cu_len;                                                       \
-})
-
-#ifndef CONFIG_EVA
-
-#define __invoke_copy_from_user(to, from, n)                           \
+#define __invoke_copy_from(func, to, from, n)                          \
 ({                                                                     \
        register void *__cu_to_r __asm__("$4");                         \
        register const void __user *__cu_from_r __asm__("$5");          \
@@ -972,7 +808,7 @@ extern size_t __copy_user_inatomic(void *__to, const void *__from, size_t __n);
        __cu_len_r = (n);                                               \
        __asm__ __volatile__(                                           \
        ".set\tnoreorder\n\t"                                           \
-       __MODULE_JAL(__copy_user)                                       \
+       __MODULE_JAL(func)                                              \
        ".set\tnoat\n\t"                                                \
        __UA_ADDU "\t$1, %1, %2\n\t"                                    \
        ".set\tat\n\t"                                                  \
@@ -984,33 +820,17 @@ extern size_t __copy_user_inatomic(void *__to, const void *__from, size_t __n);
        __cu_len_r;                                                     \
 })
 
-#define __invoke_copy_from_kernel(to, from, n)                         \
-       __invoke_copy_from_user(to, from, n)
-
-/* For userland <-> userland operations */
-#define ___invoke_copy_in_user(to, from, n)                            \
-       __invoke_copy_from_user(to, from, n)
-
-/* For kernel <-> kernel operations */
-#define ___invoke_copy_in_kernel(to, from, n)                          \
-       __invoke_copy_from_user(to, from, n)
-
-#define __invoke_copy_from_user_inatomic(to, from, n)                  \
+#define __invoke_copy_to(func, to, from, n)                            \
 ({                                                                     \
-       register void *__cu_to_r __asm__("$4");                         \
-       register const void __user *__cu_from_r __asm__("$5");          \
+       register void __user *__cu_to_r __asm__("$4");                  \
+       register const void *__cu_from_r __asm__("$5");                 \
        register long __cu_len_r __asm__("$6");                         \
                                                                        \
        __cu_to_r = (to);                                               \
        __cu_from_r = (from);                                           \
        __cu_len_r = (n);                                               \
        __asm__ __volatile__(                                           \
-       ".set\tnoreorder\n\t"                                           \
-       __MODULE_JAL(__copy_user_inatomic)                              \
-       ".set\tnoat\n\t"                                                \
-       __UA_ADDU "\t$1, %1, %2\n\t"                                    \
-       ".set\tat\n\t"                                                  \
-       ".set\treorder"                                                 \
+       __MODULE_JAL(func)                                              \
        : "+r" (__cu_to_r), "+r" (__cu_from_r), "+r" (__cu_len_r)       \
        :                                                               \
        : "$8", "$9", "$10", "$11", "$12", "$14", "$15", "$24", "$31",  \
@@ -1018,228 +838,79 @@ extern size_t __copy_user_inatomic(void *__to, const void *__from, size_t __n);
        __cu_len_r;                                                     \
 })
 
-#define __invoke_copy_from_kernel_inatomic(to, from, n)                        \
-       __invoke_copy_from_user_inatomic(to, from, n)                   \
+#define __invoke_copy_from_kernel(to, from, n)                         \
+       __invoke_copy_from(__copy_user, to, from, n)
+
+#define __invoke_copy_to_kernel(to, from, n)                           \
+       __invoke_copy_to(__copy_user, to, from, n)
+
+#define ___invoke_copy_in_kernel(to, from, n)                          \
+       __invoke_copy_from(__copy_user, to, from, n)
+
+#ifndef CONFIG_EVA
+#define __invoke_copy_from_user(to, from, n)                           \
+       __invoke_copy_from(__copy_user, to, from, n)
+
+#define __invoke_copy_to_user(to, from, n)                             \
+       __invoke_copy_to(__copy_user, to, from, n)
+
+#define ___invoke_copy_in_user(to, from, n)                            \
+       __invoke_copy_from(__copy_user, to, from, n)
 
 #else
 
 /* EVA specific functions */
 
-extern size_t __copy_user_inatomic_eva(void *__to, const void *__from,
-                                      size_t __n);
 extern size_t __copy_from_user_eva(void *__to, const void *__from,
                                   size_t __n);
 extern size_t __copy_to_user_eva(void *__to, const void *__from,
                                 size_t __n);
 extern size_t __copy_in_user_eva(void *__to, const void *__from, size_t __n);
 
-#define __invoke_copy_from_user_eva_generic(to, from, n, func_ptr)     \
-({                                                                     \
-       register void *__cu_to_r __asm__("$4");                         \
-       register const void __user *__cu_from_r __asm__("$5");          \
-       register long __cu_len_r __asm__("$6");                         \
-                                                                       \
-       __cu_to_r = (to);                                               \
-       __cu_from_r = (from);                                           \
-       __cu_len_r = (n);                                               \
-       __asm__ __volatile__(                                           \
-       ".set\tnoreorder\n\t"                                           \
-       __MODULE_JAL(func_ptr)                                          \
-       ".set\tnoat\n\t"                                                \
-       __UA_ADDU "\t$1, %1, %2\n\t"                                    \
-       ".set\tat\n\t"                                                  \
-       ".set\treorder"                                                 \
-       : "+r" (__cu_to_r), "+r" (__cu_from_r), "+r" (__cu_len_r)       \
-       :                                                               \
-       : "$8", "$9", "$10", "$11", "$12", "$14", "$15", "$24", "$31",  \
-         DADDI_SCRATCH, "memory");                                     \
-       __cu_len_r;                                                     \
-})
-
-#define __invoke_copy_to_user_eva_generic(to, from, n, func_ptr)       \
-({                                                                     \
-       register void *__cu_to_r __asm__("$4");                         \
-       register const void __user *__cu_from_r __asm__("$5");          \
-       register long __cu_len_r __asm__("$6");                         \
-                                                                       \
-       __cu_to_r = (to);                                               \
-       __cu_from_r = (from);                                           \
-       __cu_len_r = (n);                                               \
-       __asm__ __volatile__(                                           \
-       __MODULE_JAL(func_ptr)                                          \
-       : "+r" (__cu_to_r), "+r" (__cu_from_r), "+r" (__cu_len_r)       \
-       :                                                               \
-       : "$8", "$9", "$10", "$11", "$12", "$14", "$15", "$24", "$31",  \
-         DADDI_SCRATCH, "memory");                                     \
-       __cu_len_r;                                                     \
-})
-
 /*
  * Source or destination address is in userland. We need to go through
  * the TLB
  */
 #define __invoke_copy_from_user(to, from, n)                           \
-       __invoke_copy_from_user_eva_generic(to, from, n, __copy_from_user_eva)
-
-#define __invoke_copy_from_user_inatomic(to, from, n)                  \
-       __invoke_copy_from_user_eva_generic(to, from, n,                \
-                                           __copy_user_inatomic_eva)
+       __invoke_copy_from(__copy_from_user_eva, to, from, n)
 
 #define __invoke_copy_to_user(to, from, n)                             \
-       __invoke_copy_to_user_eva_generic(to, from, n, __copy_to_user_eva)
+       __invoke_copy_to(__copy_to_user_eva, to, from, n)
 
 #define ___invoke_copy_in_user(to, from, n)                            \
-       __invoke_copy_from_user_eva_generic(to, from, n, __copy_in_user_eva)
-
-/*
- * Source or destination address in the kernel. We are not going through
- * the TLB
- */
-#define __invoke_copy_from_kernel(to, from, n)                         \
-       __invoke_copy_from_user_eva_generic(to, from, n, __copy_user)
-
-#define __invoke_copy_from_kernel_inatomic(to, from, n)                        \
-       __invoke_copy_from_user_eva_generic(to, from, n, __copy_user_inatomic)
-
-#define __invoke_copy_to_kernel(to, from, n)                           \
-       __invoke_copy_to_user_eva_generic(to, from, n, __copy_user)
-
-#define ___invoke_copy_in_kernel(to, from, n)                          \
-       __invoke_copy_from_user_eva_generic(to, from, n, __copy_user)
+       __invoke_copy_from(__copy_in_user_eva, to, from, n)
 
 #endif /* CONFIG_EVA */
 
-/*
- * __copy_from_user: - Copy a block of data from user space, with less checking.
- * @to:          Destination address, in kernel space.
- * @from: Source address, in user space.
- * @n:   Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Copy data from user space to kernel space.  Caller must check
- * the specified block with access_ok() before calling this function.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- *
- * If some data could not be copied, this function will pad the copied
- * data to the requested size using zero bytes.
- */
-#define __copy_from_user(to, from, n)                                  \
-({                                                                     \
-       void *__cu_to;                                                  \
-       const void __user *__cu_from;                                   \
-       long __cu_len;                                                  \
-                                                                       \
-       __cu_to = (to);                                                 \
-       __cu_from = (from);                                             \
-       __cu_len = (n);                                                 \
-                                                                       \
-       check_object_size(__cu_to, __cu_len, false);                    \
-                                                                       \
-       if (eva_kernel_access()) {                                      \
-               __cu_len = __invoke_copy_from_kernel(__cu_to,           \
-                                                    __cu_from,         \
-                                                    __cu_len);         \
-       } else {                                                        \
-               might_fault();                                          \
-               __cu_len = __invoke_copy_from_user(__cu_to, __cu_from,  \
-                                                  __cu_len);           \
-       }                                                               \
-       __cu_len;                                                       \
-})
+static inline unsigned long
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+       if (eva_kernel_access())
+               return __invoke_copy_to_kernel(to, from, n);
+       else
+               return __invoke_copy_to_user(to, from, n);
+}
 
-/*
- * copy_from_user: - Copy a block of data from user space.
- * @to:          Destination address, in kernel space.
- * @from: Source address, in user space.
- * @n:   Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Copy data from user space to kernel space.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- *
- * If some data could not be copied, this function will pad the copied
- * data to the requested size using zero bytes.
- */
-#define copy_from_user(to, from, n)                                    \
-({                                                                     \
-       void *__cu_to;                                                  \
-       const void __user *__cu_from;                                   \
-       long __cu_len;                                                  \
-                                                                       \
-       __cu_to = (to);                                                 \
-       __cu_from = (from);                                             \
-       __cu_len = (n);                                                 \
-                                                                       \
-       check_object_size(__cu_to, __cu_len, false);                    \
-                                                                       \
-       if (eva_kernel_access()) {                                      \
-               __cu_len = __invoke_copy_from_kernel(__cu_to,           \
-                                                    __cu_from,         \
-                                                    __cu_len);         \
-       } else {                                                        \
-               if (access_ok(VERIFY_READ, __cu_from, __cu_len)) {      \
-                       might_fault();                                  \
-                       __cu_len = __invoke_copy_from_user(__cu_to,     \
-                                                          __cu_from,   \
-                                                          __cu_len);   \
-               } else {                                                \
-                       memset(__cu_to, 0, __cu_len);                   \
-               }                                                       \
-       }                                                               \
-       __cu_len;                                                       \
-})
+static inline unsigned long
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+       if (eva_kernel_access())
+               return __invoke_copy_from_kernel(to, from, n);
+       else
+               return __invoke_copy_from_user(to, from, n);
+}
 
-#define __copy_in_user(to, from, n)                                    \
-({                                                                     \
-       void __user *__cu_to;                                           \
-       const void __user *__cu_from;                                   \
-       long __cu_len;                                                  \
-                                                                       \
-       __cu_to = (to);                                                 \
-       __cu_from = (from);                                             \
-       __cu_len = (n);                                                 \
-       if (eva_kernel_access()) {                                      \
-               __cu_len = ___invoke_copy_in_kernel(__cu_to, __cu_from, \
-                                                   __cu_len);          \
-       } else {                                                        \
-               might_fault();                                          \
-               __cu_len = ___invoke_copy_in_user(__cu_to, __cu_from,   \
-                                                 __cu_len);            \
-       }                                                               \
-       __cu_len;                                                       \
-})
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
-#define copy_in_user(to, from, n)                                      \
-({                                                                     \
-       void __user *__cu_to;                                           \
-       const void __user *__cu_from;                                   \
-       long __cu_len;                                                  \
-                                                                       \
-       __cu_to = (to);                                                 \
-       __cu_from = (from);                                             \
-       __cu_len = (n);                                                 \
-       if (eva_kernel_access()) {                                      \
-               __cu_len = ___invoke_copy_in_kernel(__cu_to,__cu_from,  \
-                                                   __cu_len);          \
-       } else {                                                        \
-               if (likely(access_ok(VERIFY_READ, __cu_from, __cu_len) &&\
-                          access_ok(VERIFY_WRITE, __cu_to, __cu_len))) {\
-                       might_fault();                                  \
-                       __cu_len = ___invoke_copy_in_user(__cu_to,      \
-                                                         __cu_from,    \
-                                                         __cu_len);    \
-               }                                                       \
-       }                                                               \
-       __cu_len;                                                       \
-})
+static inline unsigned long
+raw_copy_in_user(void __user*to, const void __user *from, unsigned long n)
+{
+       if (eva_kernel_access())
+               return ___invoke_copy_in_kernel(to, from, n);
+       else
+               return ___invoke_copy_in_user(to, from, n);
+}
 
 extern __kernel_size_t __bzero_kernel(void __user *addr, __kernel_size_t size);
 extern __kernel_size_t __bzero(void __user *addr, __kernel_size_t size);
index 3e940db..78faf42 100644 (file)
 #define __NR_pkey_mprotect             (__NR_Linux + 363)
 #define __NR_pkey_alloc                        (__NR_Linux + 364)
 #define __NR_pkey_free                 (__NR_Linux + 365)
+#define __NR_statx                     (__NR_Linux + 366)
 
 
 /*
  * Offset of the last Linux o32 flavoured syscall
  */
-#define __NR_Linux_syscalls            365
+#define __NR_Linux_syscalls            366
 
 #endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */
 
 #define __NR_O32_Linux                 4000
-#define __NR_O32_Linux_syscalls                365
+#define __NR_O32_Linux_syscalls                366
 
 #if _MIPS_SIM == _MIPS_SIM_ABI64
 
 #define __NR_pkey_mprotect             (__NR_Linux + 323)
 #define __NR_pkey_alloc                        (__NR_Linux + 324)
 #define __NR_pkey_free                 (__NR_Linux + 325)
+#define __NR_statx                     (__NR_Linux + 326)
 
 /*
  * Offset of the last Linux 64-bit flavoured syscall
  */
-#define __NR_Linux_syscalls            325
+#define __NR_Linux_syscalls            326
 
 #endif /* _MIPS_SIM == _MIPS_SIM_ABI64 */
 
 #define __NR_64_Linux                  5000
-#define __NR_64_Linux_syscalls         325
+#define __NR_64_Linux_syscalls         326
 
 #if _MIPS_SIM == _MIPS_SIM_NABI32
 
 #define __NR_pkey_mprotect             (__NR_Linux + 327)
 #define __NR_pkey_alloc                        (__NR_Linux + 328)
 #define __NR_pkey_free                 (__NR_Linux + 329)
+#define __NR_statx                     (__NR_Linux + 330)
 
 /*
  * Offset of the last N32 flavoured syscall
  */
-#define __NR_Linux_syscalls            329
+#define __NR_Linux_syscalls            330
 
 #endif /* _MIPS_SIM == _MIPS_SIM_NABI32 */
 
 #define __NR_N32_Linux                 6000
-#define __NR_N32_Linux_syscalls                329
+#define __NR_N32_Linux_syscalls                330
 
 #endif /* _UAPI_ASM_UNISTD_H */
index bcf8f8c..bb1ad51 100644 (file)
@@ -145,7 +145,9 @@ void __init plat_time_init(void)
 
        clockevent_set_clock(&jz4740_clockevent, clk_rate);
        jz4740_clockevent.min_delta_ns = clockevent_delta2ns(100, &jz4740_clockevent);
+       jz4740_clockevent.min_delta_ticks = 100;
        jz4740_clockevent.max_delta_ns = clockevent_delta2ns(0xffff, &jz4740_clockevent);
+       jz4740_clockevent.max_delta_ticks = 0xffff;
        jz4740_clockevent.cpumask = cpumask_of(0);
 
        clockevents_register_device(&jz4740_clockevent);
index bb5c5d3..a670c0c 100644 (file)
@@ -102,6 +102,7 @@ void output_thread_info_defines(void)
        DEFINE(_THREAD_SIZE, THREAD_SIZE);
        DEFINE(_THREAD_MASK, THREAD_MASK);
        DEFINE(_IRQ_STACK_SIZE, IRQ_STACK_SIZE);
+       DEFINE(_IRQ_STACK_START, IRQ_STACK_START);
        BLANK();
 }
 
index 940ac00..8f9f2da 100644 (file)
@@ -123,7 +123,9 @@ void sb1480_clockevent_init(void)
                                  CLOCK_EVT_FEAT_ONESHOT;
        clockevent_set_clock(cd, V_SCD_TIMER_FREQ);
        cd->max_delta_ns        = clockevent_delta2ns(0x7fffff, cd);
+       cd->max_delta_ticks     = 0x7fffff;
        cd->min_delta_ns        = clockevent_delta2ns(2, cd);
+       cd->min_delta_ticks     = 2;
        cd->rating              = 200;
        cd->irq                 = irq;
        cd->cpumask             = cpumask_of(cpu);
index 77a5ddf..61ad907 100644 (file)
@@ -128,7 +128,9 @@ int __init ds1287_clockevent_init(int irq)
        cd->irq = irq;
        clockevent_set_clock(cd, 32768);
        cd->max_delta_ns = clockevent_delta2ns(0x7fffffff, cd);
+       cd->max_delta_ticks = 0x7fffffff;
        cd->min_delta_ns = clockevent_delta2ns(0x300, cd);
+       cd->min_delta_ticks = 0x300;
        cd->cpumask = cpumask_of(0);
 
        clockevents_register_device(&ds1287_clockevent);
index 6604005..fd90c82 100644 (file)
@@ -152,7 +152,9 @@ static int __init gt641xx_timer0_clockevent_init(void)
        cd->rating = 200 + gt641xx_base_clock / 10000000;
        clockevent_set_clock(cd, gt641xx_base_clock);
        cd->max_delta_ns = clockevent_delta2ns(0x7fffffff, cd);
+       cd->max_delta_ticks = 0x7fffffff;
        cd->min_delta_ns = clockevent_delta2ns(0x300, cd);
+       cd->min_delta_ticks = 0x300;
        cd->cpumask = cpumask_of(0);
 
        clockevents_register_device(&gt641xx_timer0_clockevent);
index 804d2a2..dd6a18b 100644 (file)
@@ -80,7 +80,7 @@ static unsigned int calculate_min_delta(void)
                }
 
                /* Sorted insert of 75th percentile into buf2 */
-               for (k = 0; k < i; ++k) {
+               for (k = 0; k < i && k < ARRAY_SIZE(buf2); ++k) {
                        if (buf1[ARRAY_SIZE(buf1) - 1] < buf2[k]) {
                                l = min_t(unsigned int,
                                          i, ARRAY_SIZE(buf2) - 1);
index 3d860ef..9d1edb5 100644 (file)
@@ -123,7 +123,9 @@ void sb1250_clockevent_init(void)
                                  CLOCK_EVT_FEAT_ONESHOT;
        clockevent_set_clock(cd, V_SCD_TIMER_FREQ);
        cd->max_delta_ns        = clockevent_delta2ns(0x7fffff, cd);
+       cd->max_delta_ticks     = 0x7fffff;
        cd->min_delta_ns        = clockevent_delta2ns(2, cd);
+       cd->min_delta_ticks     = 2;
        cd->rating              = 200;
        cd->irq                 = irq;
        cd->cpumask             = cpumask_of(cpu);
index aaca60d..7b17c8f 100644 (file)
@@ -196,7 +196,9 @@ void __init txx9_clockevent_init(unsigned long baseaddr, int irq,
        clockevent_set_clock(cd, TIMER_CLK(imbusclk));
        cd->max_delta_ns =
                clockevent_delta2ns(0xffffffff >> (32 - TXX9_TIMER_BITS), cd);
+       cd->max_delta_ticks = 0xffffffff >> (32 - TXX9_TIMER_BITS);
        cd->min_delta_ns = clockevent_delta2ns(0xf, cd);
+       cd->min_delta_ticks = 0xf;
        cd->irq = irq;
        cd->cpumask = cpumask_of(0),
        clockevents_register_device(cd);
index 59476a6..a00e87b 100644 (file)
@@ -361,7 +361,7 @@ LEAF(mips_cps_get_bootcfg)
        END(mips_cps_get_bootcfg)
 
 LEAF(mips_cps_boot_vpes)
-       PTR_L   ta2, COREBOOTCFG_VPEMASK(a0)
+       lw      ta2, COREBOOTCFG_VPEMASK(a0)
        PTR_L   ta3, COREBOOTCFG_VPECONFIG(a0)
 
 #if defined(CONFIG_CPU_MIPSR6)
index 07718bb..12422fd 100644 (file)
@@ -1824,7 +1824,7 @@ static inline void cpu_probe_loongson(struct cpuinfo_mips *c, unsigned int cpu)
                }
 
                decode_configs(c);
-               c->options |= MIPS_CPU_TLBINV | MIPS_CPU_LDPTE;
+               c->options |= MIPS_CPU_FTLB | MIPS_CPU_TLBINV | MIPS_CPU_LDPTE;
                c->writecombine = _CACHE_UNCACHED_ACCELERATED;
                break;
        default:
index 6430bff..5c429d7 100644 (file)
@@ -257,7 +257,7 @@ int arch_check_elf(void *_ehdr, bool has_interpreter, void *_interp_ehdr,
        else if ((prog_req.fr1 && prog_req.frdefault) ||
                 (prog_req.single && !prog_req.frdefault))
                /* Make sure 64-bit MIPS III/IV/64R1 will not pick FR1 */
-               state->overall_fp_mode = ((current_cpu_data.fpu_id & MIPS_FPIR_F64) &&
+               state->overall_fp_mode = ((raw_current_cpu_data.fpu_id & MIPS_FPIR_F64) &&
                                          cpu_has_mips_r2_r6) ?
                                          FP_FR1 : FP_FR0;
        else if (prog_req.fr1)
index 7ec9612..ae810da 100644 (file)
@@ -215,9 +215,11 @@ NESTED(handle_int, PT_SIZE, sp)
        beq     t0, t1, 2f
 
        /* Switch to IRQ stack */
-       li      t1, _IRQ_STACK_SIZE
+       li      t1, _IRQ_STACK_START
        PTR_ADD sp, t0, t1
 
+       /* Save task's sp on IRQ stack so that unwinding can follow it */
+       LONG_S  s1, 0(sp)
 2:
        jal     plat_irq_dispatch
 
@@ -325,9 +327,11 @@ NESTED(except_vec_vi_handler, 0, sp)
        beq     t0, t1, 2f
 
        /* Switch to IRQ stack */
-       li      t1, _IRQ_STACK_SIZE
+       li      t1, _IRQ_STACK_START
        PTR_ADD sp, t0, t1
 
+       /* Save task's sp on IRQ stack so that unwinding can follow it */
+       LONG_S  s1, 0(sp)
 2:
        jalr    v0
 
@@ -519,7 +523,7 @@ NESTED(nmi_handler, PT_SIZE, sp)
        BUILD_HANDLER reserved reserved sti verbose     /* others */
 
        .align  5
-       LEAF(handle_ri_rdhwr_vivt)
+       LEAF(handle_ri_rdhwr_tlbp)
        .set    push
        .set    noat
        .set    noreorder
@@ -538,7 +542,7 @@ NESTED(nmi_handler, PT_SIZE, sp)
        .set    pop
        bltz    k1, handle_ri   /* slow path */
        /* fall thru */
-       END(handle_ri_rdhwr_vivt)
+       END(handle_ri_rdhwr_tlbp)
 
        LEAF(handle_ri_rdhwr)
        .set    push
index 1f4bd22..eb6c0d5 100644 (file)
@@ -244,9 +244,6 @@ static int compute_signal(int tt)
 void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
 {
        int reg;
-       struct thread_info *ti = task_thread_info(p);
-       unsigned long ksp = (unsigned long)ti + THREAD_SIZE - 32;
-       struct pt_regs *regs = (struct pt_regs *)ksp - 1;
 #if (KGDB_GDB_REG_SIZE == 32)
        u32 *ptr = (u32 *)gdb_regs;
 #else
@@ -254,25 +251,46 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
 #endif
 
        for (reg = 0; reg < 16; reg++)
-               *(ptr++) = regs->regs[reg];
+               *(ptr++) = 0;
 
        /* S0 - S7 */
-       for (reg = 16; reg < 24; reg++)
-               *(ptr++) = regs->regs[reg];
+       *(ptr++) = p->thread.reg16;
+       *(ptr++) = p->thread.reg17;
+       *(ptr++) = p->thread.reg18;
+       *(ptr++) = p->thread.reg19;
+       *(ptr++) = p->thread.reg20;
+       *(ptr++) = p->thread.reg21;
+       *(ptr++) = p->thread.reg22;
+       *(ptr++) = p->thread.reg23;
 
        for (reg = 24; reg < 28; reg++)
                *(ptr++) = 0;
 
        /* GP, SP, FP, RA */
-       for (reg = 28; reg < 32; reg++)
-               *(ptr++) = regs->regs[reg];
-
-       *(ptr++) = regs->cp0_status;
-       *(ptr++) = regs->lo;
-       *(ptr++) = regs->hi;
-       *(ptr++) = regs->cp0_badvaddr;
-       *(ptr++) = regs->cp0_cause;
-       *(ptr++) = regs->cp0_epc;
+       *(ptr++) = (long)p;
+       *(ptr++) = p->thread.reg29;
+       *(ptr++) = p->thread.reg30;
+       *(ptr++) = p->thread.reg31;
+
+       *(ptr++) = p->thread.cp0_status;
+
+       /* lo, hi */
+       *(ptr++) = 0;
+       *(ptr++) = 0;
+
+       /*
+        * BadVAddr, Cause
+        * Ideally these would come from the last exception frame up the stack
+        * but that requires unwinding, otherwise we can't know much for sure.
+        */
+       *(ptr++) = 0;
+       *(ptr++) = 0;
+
+       /*
+        * PC
+        * use return address (RA), i.e. the moment after return from resume()
+        */
+       *(ptr++) = p->thread.reg31;
 }
 
 void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc)
index d8f1cf1..550e7d0 100644 (file)
@@ -1200,7 +1200,7 @@ fpu_emul:
        case lwl_op:
                rt = regs->regs[MIPSInst_RT(inst)];
                vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst);
-               if (!access_ok(VERIFY_READ, vaddr, 4)) {
+               if (!access_ok(VERIFY_READ, (void __user *)vaddr, 4)) {
                        current->thread.cp0_baduaddr = vaddr;
                        err = SIGSEGV;
                        break;
@@ -1273,7 +1273,7 @@ fpu_emul:
        case lwr_op:
                rt = regs->regs[MIPSInst_RT(inst)];
                vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst);
-               if (!access_ok(VERIFY_READ, vaddr, 4)) {
+               if (!access_ok(VERIFY_READ, (void __user *)vaddr, 4)) {
                        current->thread.cp0_baduaddr = vaddr;
                        err = SIGSEGV;
                        break;
@@ -1347,7 +1347,7 @@ fpu_emul:
        case swl_op:
                rt = regs->regs[MIPSInst_RT(inst)];
                vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst);
-               if (!access_ok(VERIFY_WRITE, vaddr, 4)) {
+               if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, 4)) {
                        current->thread.cp0_baduaddr = vaddr;
                        err = SIGSEGV;
                        break;
@@ -1417,7 +1417,7 @@ fpu_emul:
        case swr_op:
                rt = regs->regs[MIPSInst_RT(inst)];
                vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst);
-               if (!access_ok(VERIFY_WRITE, vaddr, 4)) {
+               if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, 4)) {
                        current->thread.cp0_baduaddr = vaddr;
                        err = SIGSEGV;
                        break;
@@ -1492,7 +1492,7 @@ fpu_emul:
 
                rt = regs->regs[MIPSInst_RT(inst)];
                vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst);
-               if (!access_ok(VERIFY_READ, vaddr, 8)) {
+               if (!access_ok(VERIFY_READ, (void __user *)vaddr, 8)) {
                        current->thread.cp0_baduaddr = vaddr;
                        err = SIGSEGV;
                        break;
@@ -1611,7 +1611,7 @@ fpu_emul:
 
                rt = regs->regs[MIPSInst_RT(inst)];
                vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst);
-               if (!access_ok(VERIFY_READ, vaddr, 8)) {
+               if (!access_ok(VERIFY_READ, (void __user *)vaddr, 8)) {
                        current->thread.cp0_baduaddr = vaddr;
                        err = SIGSEGV;
                        break;
@@ -1730,7 +1730,7 @@ fpu_emul:
 
                rt = regs->regs[MIPSInst_RT(inst)];
                vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst);
-               if (!access_ok(VERIFY_WRITE, vaddr, 8)) {
+               if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, 8)) {
                        current->thread.cp0_baduaddr = vaddr;
                        err = SIGSEGV;
                        break;
@@ -1848,7 +1848,7 @@ fpu_emul:
 
                rt = regs->regs[MIPSInst_RT(inst)];
                vaddr = regs->regs[MIPSInst_RS(inst)] + MIPSInst_SIMM(inst);
-               if (!access_ok(VERIFY_WRITE, vaddr, 8)) {
+               if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, 8)) {
                        current->thread.cp0_baduaddr = vaddr;
                        err = SIGSEGV;
                        break;
@@ -1965,7 +1965,7 @@ fpu_emul:
                        err = SIGBUS;
                        break;
                }
-               if (!access_ok(VERIFY_READ, vaddr, 4)) {
+               if (!access_ok(VERIFY_READ, (void __user *)vaddr, 4)) {
                        current->thread.cp0_baduaddr = vaddr;
                        err = SIGBUS;
                        break;
@@ -2021,7 +2021,7 @@ fpu_emul:
                        err = SIGBUS;
                        break;
                }
-               if (!access_ok(VERIFY_WRITE, vaddr, 4)) {
+               if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, 4)) {
                        current->thread.cp0_baduaddr = vaddr;
                        err = SIGBUS;
                        break;
@@ -2084,7 +2084,7 @@ fpu_emul:
                        err = SIGBUS;
                        break;
                }
-               if (!access_ok(VERIFY_READ, vaddr, 8)) {
+               if (!access_ok(VERIFY_READ, (void __user *)vaddr, 8)) {
                        current->thread.cp0_baduaddr = vaddr;
                        err = SIGBUS;
                        break;
@@ -2145,7 +2145,7 @@ fpu_emul:
                        err = SIGBUS;
                        break;
                }
-               if (!access_ok(VERIFY_WRITE, vaddr, 8)) {
+               if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, 8)) {
                        current->thread.cp0_baduaddr = vaddr;
                        err = SIGBUS;
                        break;
index 8c35b31..9452b02 100644 (file)
@@ -1446,6 +1446,11 @@ static int mipsxx_pmu_handle_shared_irq(void)
        HANDLE_COUNTER(0)
        }
 
+#ifdef CONFIG_MIPS_PERF_SHARED_TC_COUNTERS
+       read_unlock(&pmuint_rwlock);
+#endif
+       resume_local_counters();
+
        /*
         * Do all the work for the pending perf events. We can do this
         * in here because the performance counter interrupt is a regular
@@ -1454,10 +1459,6 @@ static int mipsxx_pmu_handle_shared_irq(void)
        if (handled == IRQ_HANDLED)
                irq_work_run();
 
-#ifdef CONFIG_MIPS_PERF_SHARED_TC_COUNTERS
-       read_unlock(&pmuint_rwlock);
-#endif
-       resume_local_counters();
        return handled;
 }
 
index fb6b6b6..b68e10f 100644 (file)
@@ -488,31 +488,52 @@ unsigned long notrace unwind_stack_by_address(unsigned long stack_page,
                                              unsigned long pc,
                                              unsigned long *ra)
 {
+       unsigned long low, high, irq_stack_high;
        struct mips_frame_info info;
        unsigned long size, ofs;
+       struct pt_regs *regs;
        int leaf;
-       extern void ret_from_irq(void);
-       extern void ret_from_exception(void);
 
        if (!stack_page)
                return 0;
 
        /*
-        * If we reached the bottom of interrupt context,
-        * return saved pc in pt_regs.
+        * IRQ stacks start at IRQ_STACK_START
+        * task stacks at THREAD_SIZE - 32
         */
-       if (pc == (unsigned long)ret_from_irq ||
-           pc == (unsigned long)ret_from_exception) {
-               struct pt_regs *regs;
-               if (*sp >= stack_page &&
-                   *sp + sizeof(*regs) <= stack_page + THREAD_SIZE - 32) {
-                       regs = (struct pt_regs *)*sp;
-                       pc = regs->cp0_epc;
-                       if (!user_mode(regs) && __kernel_text_address(pc)) {
-                               *sp = regs->regs[29];
-                               *ra = regs->regs[31];
-                               return pc;
-                       }
+       low = stack_page;
+       if (!preemptible() && on_irq_stack(raw_smp_processor_id(), *sp)) {
+               high = stack_page + IRQ_STACK_START;
+               irq_stack_high = high;
+       } else {
+               high = stack_page + THREAD_SIZE - 32;
+               irq_stack_high = 0;
+       }
+
+       /*
+        * If we reached the top of the interrupt stack, start unwinding
+        * the interrupted task stack.
+        */
+       if (unlikely(*sp == irq_stack_high)) {
+               unsigned long task_sp = *(unsigned long *)*sp;
+
+               /*
+                * Check that the pointer saved in the IRQ stack head points to
+                * something within the stack of the current task
+                */
+               if (!object_is_on_stack((void *)task_sp))
+                       return 0;
+
+               /*
+                * Follow pointer to task's kernel stack frame where interrupted
+                * state was saved.
+                */
+               regs = (struct pt_regs *)task_sp;
+               pc = regs->cp0_epc;
+               if (!user_mode(regs) && __kernel_text_address(pc)) {
+                       *sp = regs->regs[29];
+                       *ra = regs->regs[31];
+                       return pc;
                }
                return 0;
        }
@@ -533,8 +554,7 @@ unsigned long notrace unwind_stack_by_address(unsigned long stack_page,
        if (leaf < 0)
                return 0;
 
-       if (*sp < stack_page ||
-           *sp + info.frame_size > stack_page + THREAD_SIZE - 32)
+       if (*sp < low || *sp + info.frame_size > high)
                return 0;
 
        if (leaf)
index 9103beb..2d1a0c4 100644 (file)
@@ -18,7 +18,7 @@
 #include <linux/kernel.h>
 #include <linux/libfdt.h>
 #include <linux/of_fdt.h>
-#include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/start_kernel.h>
 #include <linux/string.h>
 #include <linux/printk.h>
index c29d397..80ed68b 100644 (file)
@@ -600,3 +600,4 @@ EXPORT(sys_call_table)
        PTR     sys_pkey_mprotect
        PTR     sys_pkey_alloc
        PTR     sys_pkey_free                   /* 4365 */
+       PTR     sys_statx
index 0687f96..49765b4 100644 (file)
@@ -438,4 +438,5 @@ EXPORT(sys_call_table)
        PTR     sys_pkey_mprotect
        PTR     sys_pkey_alloc
        PTR     sys_pkey_free                   /* 5325 */
+       PTR     sys_statx
        .size   sys_call_table,.-sys_call_table
index 0331ba3..90bad2d 100644 (file)
@@ -433,4 +433,5 @@ EXPORT(sysn32_call_table)
        PTR     sys_pkey_mprotect
        PTR     sys_pkey_alloc
        PTR     sys_pkey_free
+       PTR     sys_statx                       /* 6330 */
        .size   sysn32_call_table,.-sysn32_call_table
index 5a47042..2dd70bd 100644 (file)
@@ -588,4 +588,5 @@ EXPORT(sys32_call_table)
        PTR     sys_pkey_mprotect
        PTR     sys_pkey_alloc
        PTR     sys_pkey_free                   /* 4365 */
+       PTR     sys_statx
        .size   sys32_call_table,.-sys32_call_table
index 6d45f05..795b4aa 100644 (file)
@@ -422,13 +422,12 @@ void play_dead(void)
        local_irq_disable();
        idle_task_exit();
        cpu = smp_processor_id();
+       core = cpu_data[cpu].core;
        cpu_death = CPU_DEATH_POWER;
 
        pr_debug("CPU%d going offline\n", cpu);
 
        if (cpu_has_mipsmt || cpu_has_vp) {
-               core = cpu_data[cpu].core;
-
                /* Look for another online VPE within the core */
                for_each_online_cpu(cpu_death_sibling) {
                        if (cpu_data[cpu_death_sibling].core != core)
index f1d17ec..1dfa7f5 100644 (file)
@@ -98,7 +98,7 @@ static inline int mips_atomic_set(unsigned long addr, unsigned long new)
        if (unlikely(addr & 3))
                return -EINVAL;
 
-       if (unlikely(!access_ok(VERIFY_WRITE, addr, 4)))
+       if (unlikely(!access_ok(VERIFY_WRITE, (const void __user *)addr, 4)))
                return -EINVAL;
 
        if (cpu_has_llsc && R10000_LLSC_WAR) {
index c7d17cf..b49e7bf 100644 (file)
@@ -83,7 +83,7 @@ extern asmlinkage void handle_dbe(void);
 extern asmlinkage void handle_sys(void);
 extern asmlinkage void handle_bp(void);
 extern asmlinkage void handle_ri(void);
-extern asmlinkage void handle_ri_rdhwr_vivt(void);
+extern asmlinkage void handle_ri_rdhwr_tlbp(void);
 extern asmlinkage void handle_ri_rdhwr(void);
 extern asmlinkage void handle_cpu(void);
 extern asmlinkage void handle_ov(void);
@@ -2408,9 +2408,18 @@ void __init trap_init(void)
 
        set_except_vector(EXCCODE_SYS, handle_sys);
        set_except_vector(EXCCODE_BP, handle_bp);
-       set_except_vector(EXCCODE_RI, rdhwr_noopt ? handle_ri :
-                         (cpu_has_vtag_icache ?
-                          handle_ri_rdhwr_vivt : handle_ri_rdhwr));
+
+       if (rdhwr_noopt)
+               set_except_vector(EXCCODE_RI, handle_ri);
+       else {
+               if (cpu_has_vtag_icache)
+                       set_except_vector(EXCCODE_RI, handle_ri_rdhwr_tlbp);
+               else if (current_cpu_type() == CPU_LOONGSON3)
+                       set_except_vector(EXCCODE_RI, handle_ri_rdhwr_tlbp);
+               else
+                       set_except_vector(EXCCODE_RI, handle_ri_rdhwr);
+       }
+
        set_except_vector(EXCCODE_CPU, handle_cpu);
        set_except_vector(EXCCODE_OV, handle_ov);
        set_except_vector(EXCCODE_TR, handle_tr);
index 7ed9835..f806ee5 100644 (file)
@@ -1026,7 +1026,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
                        goto sigbus;
 
                if (IS_ENABLED(CONFIG_EVA)) {
-                       if (segment_eq(get_fs(), get_ds()))
+                       if (uaccess_kernel())
                                LoadHW(addr, value, res);
                        else
                                LoadHWE(addr, value, res);
@@ -1045,7 +1045,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
                        goto sigbus;
 
                if (IS_ENABLED(CONFIG_EVA)) {
-                       if (segment_eq(get_fs(), get_ds()))
+                       if (uaccess_kernel())
                                LoadW(addr, value, res);
                        else
                                LoadWE(addr, value, res);
@@ -1064,7 +1064,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
                        goto sigbus;
 
                if (IS_ENABLED(CONFIG_EVA)) {
-                       if (segment_eq(get_fs(), get_ds()))
+                       if (uaccess_kernel())
                                LoadHWU(addr, value, res);
                        else
                                LoadHWUE(addr, value, res);
@@ -1132,7 +1132,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
                value = regs->regs[insn.i_format.rt];
 
                if (IS_ENABLED(CONFIG_EVA)) {
-                       if (segment_eq(get_fs(), get_ds()))
+                       if (uaccess_kernel())
                                StoreHW(addr, value, res);
                        else
                                StoreHWE(addr, value, res);
@@ -1152,7 +1152,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
                value = regs->regs[insn.i_format.rt];
 
                if (IS_ENABLED(CONFIG_EVA)) {
-                       if (segment_eq(get_fs(), get_ds()))
+                       if (uaccess_kernel())
                                StoreW(addr, value, res);
                        else
                                StoreWE(addr, value, res);
index 3c3aa05..95bec46 100644 (file)
@@ -467,7 +467,7 @@ void __init ltq_soc_init(void)
 
                if (!np_xbar)
                        panic("Failed to load xbar nodes from devicetree");
-               if (of_address_to_resource(np_pmu, 0, &res_xbar))
+               if (of_address_to_resource(np_xbar, 0, &res_xbar))
                        panic("Failed to get xbar resources");
                if (!request_mem_region(res_xbar.start, resource_size(&res_xbar),
                        res_xbar.name))
index c3031f1..3114a2e 100644 (file)
        LOADK   t0, THREAD_BUADDR(t0)   # t0 is just past last good address
         nop
        SUB     len, AT, t0             # len number of uncopied bytes
-       bnez    t6, .Ldone\@    /* Skip the zeroing part if inatomic */
-       /*
-        * Here's where we rely on src and dst being incremented in tandem,
-        *   See (3) above.
-        * dst += (fault addr - src) to put dst at first byte to clear
-        */
-       ADD     dst, t0                 # compute start address in a1
-       SUB     dst, src
-       /*
-        * Clear len bytes starting at dst.  Can't call __bzero because it
-        * might modify len.  An inefficient loop for these rare times...
-        */
-       .set    reorder                         /* DADDI_WAR */
-       SUB     src, len, 1
-       beqz    len, .Ldone\@
-       .set    noreorder
-1:     sb      zero, 0(dst)
-       ADD     dst, dst, 1
-#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
-       bnez    src, 1b
-        SUB    src, src, 1
-#else
-       .set    push
-       .set    noat
-       li      v1, 1
-       bnez    src, 1b
-        SUB    src, src, v1
-       .set    pop
-#endif
        jr      ra
         nop
 
-
 #define SEXC(n)                                                        \
        .set    reorder;                        /* DADDI_WAR */ \
 .Ls_exc_p ## n ## u\@:                                         \
@@ -673,15 +643,6 @@ LEAF(__rmemcpy)                                    /* a0=dst a1=src a2=len */
        END(__rmemcpy)
 
 /*
- * t6 is used as a flag to note inatomic mode.
- */
-LEAF(__copy_user_inatomic)
-EXPORT_SYMBOL(__copy_user_inatomic)
-       b       __copy_user_common
-       li      t6, 1
-       END(__copy_user_inatomic)
-
-/*
  * A combined memcpy/__copy_user
  * __copy_user sets len to 0 for success; else to an upper bound of
  * the number of uncopied bytes.
@@ -694,8 +655,6 @@ EXPORT_SYMBOL(memcpy)
 .L__memcpy:
 FEXPORT(__copy_user)
 EXPORT_SYMBOL(__copy_user)
-       li      t6, 0   /* not inatomic */
-__copy_user_common:
        /* Legacy Mode, user <-> user */
        __BUILD_COPY_USER LEGACY_MODE USEROP USEROP
 
@@ -708,20 +667,12 @@ __copy_user_common:
  * space
  */
 
-LEAF(__copy_user_inatomic_eva)
-EXPORT_SYMBOL(__copy_user_inatomic_eva)
-       b       __copy_from_user_common
-       li      t6, 1
-       END(__copy_user_inatomic_eva)
-
 /*
  * __copy_from_user (EVA)
  */
 
 LEAF(__copy_from_user_eva)
 EXPORT_SYMBOL(__copy_from_user_eva)
-       li      t6, 0   /* not inatomic */
-__copy_from_user_common:
        __BUILD_COPY_USER EVA_MODE USEROP KERNELOP
 END(__copy_from_user_eva)
 
index e6f972d..1c4332a 100644 (file)
@@ -199,7 +199,9 @@ static void __init ls1x_time_init(void)
 
        clockevent_set_clock(cd, mips_hpt_frequency);
        cd->max_delta_ns = clockevent_delta2ns(0xffffff, cd);
+       cd->max_delta_ticks = 0xffffff;
        cd->min_delta_ns = clockevent_delta2ns(0x000300, cd);
+       cd->min_delta_ticks = 0x000300;
        cd->cpumask = cpumask_of(smp_processor_id());
        clockevents_register_device(cd);
 
index b817d6d..a6adcc4 100644 (file)
@@ -123,7 +123,9 @@ void __init setup_mfgpt0_timer(void)
        cd->cpumask = cpumask_of(cpu);
        clockevent_set_clock(cd, MFGPT_TICK_RATE);
        cd->max_delta_ns = clockevent_delta2ns(0xffff, cd);
+       cd->max_delta_ticks = 0xffff;
        cd->min_delta_ns = clockevent_delta2ns(0xf, cd);
+       cd->min_delta_ticks = 0xf;
 
        /* Enable MFGPT0 Comparator 2 Output to the Interrupt Mapper */
        _wrmsr(DIVIL_MSR_REG(MFGPT_IRQ), 0, 0x100);
index 24afe36..4df9d4b 100644 (file)
@@ -241,7 +241,9 @@ void __init setup_hpet_timer(void)
        cd->cpumask = cpumask_of(cpu);
        clockevent_set_clock(cd, HPET_FREQ);
        cd->max_delta_ns = clockevent_delta2ns(0x7fffffff, cd);
+       cd->max_delta_ticks = 0x7fffffff;
        cd->min_delta_ns = clockevent_delta2ns(HPET_MIN_PROG_DELTA, cd);
+       cd->min_delta_ticks = HPET_MIN_PROG_DELTA;
 
        clockevents_register_device(cd);
        setup_irq(HPET_T0_IRQ, &hpet_irq);
index e7f798d..3fe99cb 100644 (file)
@@ -1562,6 +1562,7 @@ static void probe_vcache(void)
        vcache_size = c->vcache.sets * c->vcache.ways * c->vcache.linesz;
 
        c->vcache.waybit = 0;
+       c->vcache.waysize = vcache_size / c->vcache.ways;
 
        pr_info("Unified victim cache %ldkB %s, linesize %d bytes.\n",
                vcache_size >> 10, way_string[c->vcache.ways], c->vcache.linesz);
@@ -1664,6 +1665,7 @@ static void __init loongson3_sc_init(void)
        /* Loongson-3 has 4 cores, 1MB scache for each. scaches are shared */
        scache_size *= 4;
        c->scache.waybit = 0;
+       c->scache.waysize = scache_size / c->scache.ways;
        pr_info("Unified secondary cache %ldkB %s, linesize %d bytes.\n",
               scache_size >> 10, way_string[c->scache.ways], c->scache.linesz);
        if (scache_size)
index 9bfee89..4f642e0 100644 (file)
@@ -760,7 +760,8 @@ static void build_huge_update_entries(u32 **p, unsigned int pte,
 static void build_huge_handler_tail(u32 **p, struct uasm_reloc **r,
                                    struct uasm_label **l,
                                    unsigned int pte,
-                                   unsigned int ptr)
+                                   unsigned int ptr,
+                                   unsigned int flush)
 {
 #ifdef CONFIG_SMP
        UASM_i_SC(p, pte, 0, ptr);
@@ -769,6 +770,22 @@ static void build_huge_handler_tail(u32 **p, struct uasm_reloc **r,
 #else
        UASM_i_SW(p, pte, 0, ptr);
 #endif
+       if (cpu_has_ftlb && flush) {
+               BUG_ON(!cpu_has_tlbinv);
+
+               UASM_i_MFC0(p, ptr, C0_ENTRYHI);
+               uasm_i_ori(p, ptr, ptr, MIPS_ENTRYHI_EHINV);
+               UASM_i_MTC0(p, ptr, C0_ENTRYHI);
+               build_tlb_write_entry(p, l, r, tlb_indexed);
+
+               uasm_i_xori(p, ptr, ptr, MIPS_ENTRYHI_EHINV);
+               UASM_i_MTC0(p, ptr, C0_ENTRYHI);
+               build_huge_update_entries(p, pte, ptr);
+               build_huge_tlb_write_entry(p, l, r, pte, tlb_random, 0);
+
+               return;
+       }
+
        build_huge_update_entries(p, pte, ptr);
        build_huge_tlb_write_entry(p, l, r, pte, tlb_indexed, 0);
 }
@@ -2199,7 +2216,7 @@ static void build_r4000_tlb_load_handler(void)
                uasm_l_tlbl_goaround2(&l, p);
        }
        uasm_i_ori(&p, wr.r1, wr.r1, (_PAGE_ACCESSED | _PAGE_VALID));
-       build_huge_handler_tail(&p, &r, &l, wr.r1, wr.r2);
+       build_huge_handler_tail(&p, &r, &l, wr.r1, wr.r2, 1);
 #endif
 
        uasm_l_nopage_tlbl(&l, p);
@@ -2254,7 +2271,7 @@ static void build_r4000_tlb_store_handler(void)
        build_tlb_probe_entry(&p);
        uasm_i_ori(&p, wr.r1, wr.r1,
                   _PAGE_ACCESSED | _PAGE_MODIFIED | _PAGE_VALID | _PAGE_DIRTY);
-       build_huge_handler_tail(&p, &r, &l, wr.r1, wr.r2);
+       build_huge_handler_tail(&p, &r, &l, wr.r1, wr.r2, 1);
 #endif
 
        uasm_l_nopage_tlbs(&l, p);
@@ -2310,7 +2327,7 @@ static void build_r4000_tlb_modify_handler(void)
        build_tlb_probe_entry(&p);
        uasm_i_ori(&p, wr.r1, wr.r1,
                   _PAGE_ACCESSED | _PAGE_MODIFIED | _PAGE_VALID | _PAGE_DIRTY);
-       build_huge_handler_tail(&p, &r, &l, wr.r1, wr.r2);
+       build_huge_handler_tail(&p, &r, &l, wr.r1, wr.r2, 0);
 #endif
 
        uasm_l_nopage_tlbm(&l, p);
index cb675ec..54f56d5 100644 (file)
@@ -232,6 +232,17 @@ void __init arch_init_irq(void)
 {
        int corehi_irq;
 
+       /*
+        * Preallocate the i8259's expected virq's here. Since irqchip_init()
+        * will probe the irqchips in hierarchical order, i8259 is probed last.
+        * If anything allocates a virq before the i8259 is probed, it will
+        * be given one of the i8259's expected range and consequently setup
+        * of the i8259 will fail.
+        */
+       WARN(irq_alloc_descs(I8259A_IRQ_BASE, I8259A_IRQ_BASE,
+                           16, numa_node_id()) < 0,
+               "Cannot reserve i8259 virqs at IRQ%d\n", I8259A_IRQ_BASE);
+
        i8259_set_poll(mips_pcibios_iack);
        irqchip_init();
 
index 1829a90..289edcf 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/i8253.h>
 #include <linux/init.h>
 #include <linux/kernel_stat.h>
+#include <linux/libfdt.h>
 #include <linux/math64.h>
 #include <linux/sched.h>
 #include <linux/spinlock.h>
@@ -207,6 +208,33 @@ static void __init init_rtc(void)
                CMOS_WRITE(ctrl & ~RTC_SET, RTC_CONTROL);
 }
 
+#ifdef CONFIG_CLKSRC_MIPS_GIC
+static u32 gic_frequency_dt;
+
+static struct property gic_frequency_prop = {
+       .name = "clock-frequency",
+       .length = sizeof(u32),
+       .value = &gic_frequency_dt,
+};
+
+static void update_gic_frequency_dt(void)
+{
+       struct device_node *node;
+
+       gic_frequency_dt = cpu_to_be32(gic_frequency);
+
+       node = of_find_compatible_node(NULL, NULL, "mti,gic-timer");
+       if (!node) {
+               pr_err("mti,gic-timer device node not found\n");
+               return;
+       }
+
+       if (of_update_property(node, &gic_frequency_prop) < 0)
+               pr_err("error updating gic frequency property\n");
+}
+
+#endif
+
 void __init plat_time_init(void)
 {
        unsigned int prid = read_c0_prid() & (PRID_COMP_MASK | PRID_IMP_MASK);
@@ -236,7 +264,8 @@ void __init plat_time_init(void)
                printk("GIC frequency %d.%02d MHz\n", freq/1000000,
                       (freq%1000000)*100/1000000);
 #ifdef CONFIG_CLKSRC_MIPS_GIC
-               gic_clocksource_init(gic_frequency);
+               update_gic_frequency_dt();
+               clocksource_probe();
 #endif
        }
 #endif
index 5e645c9..16ace55 100644 (file)
@@ -18,7 +18,7 @@ struct stackframe {
 static inline int get_mem(unsigned long addr, unsigned long *result)
 {
        unsigned long *address = (unsigned long *) addr;
-       if (!access_ok(VERIFY_READ, addr, sizeof(unsigned long)))
+       if (!access_ok(VERIFY_READ, address, sizeof(unsigned long)))
                return -1;
        if (__copy_from_user_inatomic(result, address, sizeof(unsigned long)))
                return -3;
index 014649b..3a84f6c 100644 (file)
@@ -190,7 +190,7 @@ void register_pci_controller(struct pci_controller *hose)
        }
 
        INIT_LIST_HEAD(&hose->list);
-       list_add(&hose->list, &controllers);
+       list_add_tail(&hose->list, &controllers);
 
        /*
         * Do not panic here but later - this might happen before console init.
index f24eee0..b8a1376 100644 (file)
@@ -129,7 +129,9 @@ static int __init ralink_systick_init(struct device_node *np)
        systick.dev.name = np->name;
        clockevents_calc_mult_shift(&systick.dev, SYSTICK_FREQ, 60);
        systick.dev.max_delta_ns = clockevent_delta2ns(0x7fff, &systick.dev);
+       systick.dev.max_delta_ticks = 0x7fff;
        systick.dev.min_delta_ns = clockevent_delta2ns(0x3, &systick.dev);
+       systick.dev.min_delta_ticks = 0x3;
        systick.dev.irq = irq_of_parse_and_map(np, 0);
        if (!systick.dev.irq) {
                pr_err("%s: request_irq failed", np->name);
index c4ffd43..48ce701 100644 (file)
@@ -35,7 +35,7 @@ static struct rt2880_pmx_func uartlite_func[] = { FUNC("uartlite", 0, 15, 2) };
 static struct rt2880_pmx_func jtag_func[] = { FUNC("jtag", 0, 17, 5) };
 static struct rt2880_pmx_func mdio_func[] = { FUNC("mdio", 0, 22, 2) };
 static struct rt2880_pmx_func lna_a_func[] = { FUNC("lna a", 0, 32, 3) };
-static struct rt2880_pmx_func lna_g_func[] = { FUNC("lna a", 0, 35, 3) };
+static struct rt2880_pmx_func lna_g_func[] = { FUNC("lna g", 0, 35, 3) };
 static struct rt2880_pmx_func pci_func[] = {
        FUNC("pci-dev", 0, 40, 32),
        FUNC("pci-host2", 1, 40, 32),
@@ -43,7 +43,7 @@ static struct rt2880_pmx_func pci_func[] = {
        FUNC("pci-fnc", 3, 40, 32)
 };
 static struct rt2880_pmx_func ge1_func[] = { FUNC("ge1", 0, 72, 12) };
-static struct rt2880_pmx_func ge2_func[] = { FUNC("ge1", 0, 84, 12) };
+static struct rt2880_pmx_func ge2_func[] = { FUNC("ge2", 0, 84, 12) };
 
 static struct rt2880_pmx_group rt3883_pinmux_data[] = {
        GRP("i2c", i2c_func, 1, RT3883_GPIO_MODE_I2C),
index 695c51b..a53f0c8 100644 (file)
@@ -113,7 +113,9 @@ void hub_rt_clock_event_init(void)
        cd->features            = CLOCK_EVT_FEAT_ONESHOT;
        clockevent_set_clock(cd, CYCLES_PER_SEC);
        cd->max_delta_ns        = clockevent_delta2ns(0xfffffffffffff, cd);
+       cd->max_delta_ticks     = 0xfffffffffffff;
        cd->min_delta_ns        = clockevent_delta2ns(0x300, cd);
+       cd->min_delta_ticks     = 0x300;
        cd->rating              = 200;
        cd->irq                 = irq;
        cd->cpumask             = cpumask_of(cpu);
index 97f64c7..ed810e7 100644 (file)
@@ -2,6 +2,7 @@
 generic-y += barrier.h
 generic-y += clkdev.h
 generic-y += exec.h
+generic-y += extable.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
index 2eedf6f..c696647 100644 (file)
 /*
  * User space memory access functions
  */
-#include <linux/thread_info.h>
 #include <linux/kernel.h>
 #include <asm/page.h>
-#include <asm/errno.h>
-
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1
 
 /*
  * The fs value determines whether argument validity checking should be
@@ -71,26 +66,7 @@ static inline int ___range_ok(unsigned long addr, unsigned int size)
 #define access_ok(type, addr, size) (__range_ok((addr), (size)) == 0)
 #define __access_ok(addr, size)     (__range_ok((addr), (size)) == 0)
 
-/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-
-struct exception_table_entry
-{
-       unsigned long insn, fixup;
-};
-
-/* Returns 0 if exception not found and fixup otherwise.  */
-extern int fixup_exception(struct pt_regs *regs);
+#include <asm/extable.h>
 
 #define put_user(x, ptr) __put_user_check((x), (ptr), sizeof(*(ptr)))
 #define get_user(x, ptr) __get_user_check((x), (ptr), sizeof(*(ptr)))
@@ -299,170 +275,19 @@ do {                                                                     \
        }                                                               \
 } while (0)
 
-#define __copy_user_zeroing(to, from, size)                            \
-do {                                                                   \
-       if (size) {                                                     \
-               void *__to = to;                                        \
-               const void *__from = from;                              \
-               int w;                                                  \
-               asm volatile(                                           \
-                       "0:     movbu   (%0),%3;\n"                     \
-                       "1:     movbu   %3,(%1);\n"                     \
-                       "       inc     %0;\n"                          \
-                       "       inc     %1;\n"                          \
-                       "       add     -1,%2;\n"                       \
-                       "       bne     0b;\n"                          \
-                       "2:\n"                                          \
-                       "       .section .fixup,\"ax\"\n"               \
-                       "3:\n"                                          \
-                       "       mov     %2,%0\n"                        \
-                       "       clr     %3\n"                           \
-                       "4:     movbu   %3,(%1);\n"                     \
-                       "       inc     %1;\n"                          \
-                       "       add     -1,%2;\n"                       \
-                       "       bne     4b;\n"                          \
-                       "       mov     %0,%2\n"                        \
-                       "       jmp     2b\n"                           \
-                       "       .previous\n"                            \
-                       "       .section __ex_table,\"a\"\n"            \
-                       "       .balign 4\n"                            \
-                       "       .long   0b,3b\n"                        \
-                       "       .long   1b,3b\n"                        \
-                       "       .previous\n"                            \
-                       : "=a"(__from), "=a"(__to), "=r"(size), "=&r"(w)\
-                       : "0"(__from), "1"(__to), "2"(size)             \
-                       : "cc", "memory");                              \
-       }                                                               \
-} while (0)
-
-/* We let the __ versions of copy_from/to_user inline, because they're often
- * used in fast paths and have only a small space overhead.
- */
-static inline
-unsigned long __generic_copy_from_user_nocheck(void *to, const void *from,
-                                              unsigned long n)
-{
-       __copy_user_zeroing(to, from, n);
-       return n;
-}
-
-static inline
-unsigned long __generic_copy_to_user_nocheck(void *to, const void *from,
-                                            unsigned long n)
+static inline unsigned long
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
        __copy_user(to, from, n);
        return n;
 }
 
-
-#if 0
-#error "don't use - these macros don't increment to & from pointers"
-/* Optimize just a little bit when we know the size of the move. */
-#define __constant_copy_user(to, from, size)   \
-do {                                           \
-       asm volatile(                           \
-               "       mov %0,a0;\n"           \
-               "0:     movbu (%1),d3;\n"       \
-               "1:     movbu d3,(%2);\n"       \
-               "       add -1,a0;\n"           \
-               "       bne 0b;\n"              \
-               "2:;"                           \
-               ".section .fixup,\"ax\"\n"      \
-               "3:     jmp 2b\n"               \
-               ".previous\n"                   \
-               ".section __ex_table,\"a\"\n"   \
-               "       .balign 4\n"            \
-               "       .long 0b,3b\n"          \
-               "       .long 1b,3b\n"          \
-               ".previous"                     \
-               :                               \
-               : "d"(size), "d"(to), "d"(from) \
-               : "d3", "a0");                  \
-} while (0)
-
-/* Optimize just a little bit when we know the size of the move. */
-#define __constant_copy_user_zeroing(to, from, size)   \
-do {                                                   \
-       asm volatile(                                   \
-               "       mov %0,a0;\n"                   \
-               "0:     movbu (%1),d3;\n"               \
-               "1:     movbu d3,(%2);\n"               \
-               "       add -1,a0;\n"                   \
-               "       bne 0b;\n"                      \
-               "2:;"                                   \
-               ".section .fixup,\"ax\"\n"              \
-               "3:     jmp 2b\n"                       \
-               ".previous\n"                           \
-               ".section __ex_table,\"a\"\n"           \
-               "       .balign 4\n"                    \
-               "       .long 0b,3b\n"                  \
-               "       .long 1b,3b\n"                  \
-               ".previous"                             \
-               :                                       \
-               : "d"(size), "d"(to), "d"(from)         \
-               : "d3", "a0");                          \
-} while (0)
-
-static inline
-unsigned long __constant_copy_to_user(void *to, const void *from,
-                                     unsigned long n)
-{
-       if (access_ok(VERIFY_WRITE, to, n))
-               __constant_copy_user(to, from, n);
-       return n;
-}
-
-static inline
-unsigned long __constant_copy_from_user(void *to, const void *from,
-                                       unsigned long n)
+static inline unsigned long
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-       if (access_ok(VERIFY_READ, from, n))
-               __constant_copy_user_zeroing(to, from, n);
-       return n;
-}
-
-static inline
-unsigned long __constant_copy_to_user_nocheck(void *to, const void *from,
-                                             unsigned long n)
-{
-       __constant_copy_user(to, from, n);
-       return n;
-}
-
-static inline
-unsigned long __constant_copy_from_user_nocheck(void *to, const void *from,
-                                               unsigned long n)
-{
-       __constant_copy_user_zeroing(to, from, n);
+       __copy_user(to, from, n);
        return n;
 }
-#endif
-
-extern unsigned long __generic_copy_to_user(void __user *, const void *,
-                                           unsigned long);
-extern unsigned long __generic_copy_from_user(void *, const void __user *,
-                                             unsigned long);
-
-#define __copy_to_user_inatomic(to, from, n) \
-       __generic_copy_to_user_nocheck((to), (from), (n))
-#define __copy_from_user_inatomic(to, from, n) \
-       __generic_copy_from_user_nocheck((to), (from), (n))
-
-#define __copy_to_user(to, from, n)                    \
-({                                                     \
-       might_fault();                                  \
-       __copy_to_user_inatomic((to), (from), (n));     \
-})
-
-#define __copy_from_user(to, from, n)                  \
-({                                                     \
-       might_fault();                                  \
-       __copy_from_user_inatomic((to), (from), (n));   \
-})
-
-
-#define copy_to_user(to, from, n)   __generic_copy_to_user((to), (from), (n))
-#define copy_from_user(to, from, n) __generic_copy_from_user((to), (from), (n))
 
 extern long strncpy_from_user(char *dst, const char __user *src, long count);
 extern long __strncpy_from_user(char *dst, const char __user *src, long count);
index d9b34dd..2b21bbc 100644 (file)
@@ -98,7 +98,9 @@ int __init init_clockevents(void)
 
        /* Calculate the min / max delta */
        cd->max_delta_ns        = clockevent_delta2ns(TMJCBR_MAX, cd);
+       cd->max_delta_ticks     = TMJCBR_MAX;
        cd->min_delta_ns        = clockevent_delta2ns(100, cd);
+       cd->min_delta_ticks     = 100;
 
        cd->rating              = 200;
        cd->cpumask             = cpumask_of(smp_processor_id());
index ec6c4f8..5e9f919 100644 (file)
@@ -26,8 +26,6 @@ EXPORT_SYMBOL(strncpy_from_user);
 EXPORT_SYMBOL(__strncpy_from_user);
 EXPORT_SYMBOL(clear_user);
 EXPORT_SYMBOL(__clear_user);
-EXPORT_SYMBOL(__generic_copy_from_user);
-EXPORT_SYMBOL(__generic_copy_to_user);
 EXPORT_SYMBOL(strnlen_user);
 
 extern u64 __ashrdi3(u64, unsigned);
index ce8899e..cece179 100644 (file)
  */
 #include <linux/uaccess.h>
 
-unsigned long
-__generic_copy_to_user(void *to, const void *from, unsigned long n)
-{
-       if (access_ok(VERIFY_WRITE, to, n))
-               __copy_user(to, from, n);
-       return n;
-}
-
-unsigned long
-__generic_copy_from_user(void *to, const void *from, unsigned long n)
-{
-       if (access_ok(VERIFY_READ, from, n))
-               __copy_user_zeroing(to, from, n);
-       else
-               memset(to, 0, n);
-       return n;
-}
-
 /*
  * Copy a null terminated string from userspace.
  */
index aaa3c21..87e70f2 100644 (file)
@@ -13,6 +13,7 @@ generic-y += dma.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
+generic-y += extable.h
 generic-y += fb.h
 generic-y += fcntl.h
 generic-y += ftrace.h
index 0ab8232..727bd95 100644 (file)
 #ifndef _ASM_NIOS2_UACCESS_H
 #define _ASM_NIOS2_UACCESS_H
 
-#include <linux/errno.h>
-#include <linux/thread_info.h>
 #include <linux/string.h>
 
 #include <asm/page.h>
 
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
-/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-struct exception_table_entry {
-       unsigned long insn;
-       unsigned long fixup;
-};
-
-extern int fixup_exception(struct pt_regs *regs);
+#include <asm/extable.h>
 
 /*
  * Segment stuff
@@ -95,36 +73,17 @@ static inline unsigned long __must_check clear_user(void __user *to,
        return __clear_user(to, n);
 }
 
-extern long __copy_from_user(void *to, const void __user *from,
-                               unsigned long n);
-extern long __copy_to_user(void __user *to, const void *from, unsigned long n);
-
-static inline long copy_from_user(void *to, const void __user *from,
-                               unsigned long n)
-{
-       unsigned long res = n;
-       if (access_ok(VERIFY_READ, from, n))
-               res = __copy_from_user(to, from, n);
-       if (unlikely(res))
-               memset(to + (n - res), 0, res);
-       return res;
-}
-
-static inline long copy_to_user(void __user *to, const void *from,
-                               unsigned long n)
-{
-       if (!access_ok(VERIFY_WRITE, to, n))
-               return n;
-       return __copy_to_user(to, from, n);
-}
+extern unsigned long
+raw_copy_from_user(void *to, const void __user *from, unsigned long n);
+extern unsigned long
+raw_copy_to_user(void __user *to, const void *from, unsigned long n);
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
 extern long strncpy_from_user(char *__to, const char __user *__from,
                                long __len);
 extern long strnlen_user(const char __user *s, long n);
 
-#define __copy_from_user_inatomic      __copy_from_user
-#define __copy_to_user_inatomic                __copy_to_user
-
 /* Optimized macros */
 #define __get_user_asm(val, insn, addr, err)                           \
 {                                                                      \
index 7663e15..8049833 100644 (file)
@@ -10,9 +10,9 @@
 #include <linux/export.h>
 #include <linux/uaccess.h>
 
-asm(".global   __copy_from_user\n"
-       "   .type __copy_from_user, @function\n"
-       "__copy_from_user:\n"
+asm(".global   raw_copy_from_user\n"
+       "   .type raw_copy_from_user, @function\n"
+       "raw_copy_from_user:\n"
        "   movi  r2,7\n"
        "   mov   r3,r4\n"
        "   bge   r2,r6,1f\n"
@@ -65,12 +65,12 @@ asm(".global        __copy_from_user\n"
        ".word 7b,13b\n"
        ".previous\n"
        );
-EXPORT_SYMBOL(__copy_from_user);
+EXPORT_SYMBOL(raw_copy_from_user);
 
 asm(
-       "   .global __copy_to_user\n"
-       "   .type __copy_to_user, @function\n"
-       "__copy_to_user:\n"
+       "   .global raw_copy_to_user\n"
+       "   .type raw_copy_to_user, @function\n"
+       "raw_copy_to_user:\n"
        "   movi  r2,7\n"
        "   mov   r3,r4\n"
        "   bge   r2,r6,1f\n"
@@ -127,7 +127,7 @@ asm(
        ".word 11b,13b\n"
        ".word 12b,13b\n"
        ".previous\n");
-EXPORT_SYMBOL(__copy_to_user);
+EXPORT_SYMBOL(raw_copy_to_user);
 
 long strncpy_from_user(char *__to, const char __user *__from, long __len)
 {
index fb01873..df8e2f7 100644 (file)
@@ -16,6 +16,7 @@ generic-y += dma.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
+generic-y += extable.h
 generic-y += fb.h
 generic-y += fcntl.h
 generic-y += ftrace.h
index 1311e6b..a557a7c 100644 (file)
 /*
  * User space memory access functions
  */
-#include <linux/errno.h>
-#include <linux/thread_info.h>
 #include <linux/prefetch.h>
 #include <linux/string.h>
 #include <asm/page.h>
-
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
+#include <asm/extable.h>
 
 /*
  * The fs value determines whether argument validity checking should be
        __range_ok((unsigned long)addr, (unsigned long)size)
 
 /*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-
-struct exception_table_entry {
-       unsigned long insn, fixup;
-};
-
-/*
  * These are the main single-value transfer routines.  They automatically
  * use the right size if we just have the right pointer type.
  *
@@ -257,34 +236,18 @@ do {                                                                      \
 
 extern unsigned long __must_check
 __copy_tofrom_user(void *to, const void *from, unsigned long size);
-
-#define __copy_from_user(to, from, size) \
-       __copy_tofrom_user(to, from, size)
-#define __copy_to_user(to, from, size) \
-       __copy_tofrom_user(to, from, size)
-
-#define __copy_to_user_inatomic __copy_to_user
-#define __copy_from_user_inatomic __copy_from_user
-
 static inline unsigned long
-copy_from_user(void *to, const void *from, unsigned long n)
+raw_copy_from_user(void *to, const void __user *from, unsigned long size) /* user -> kernel; no access_ok() or tail zeroing in here */
 {
-       unsigned long res = n;
-
-       if (likely(access_ok(VERIFY_READ, from, n)))
-               res = __copy_tofrom_user(to, from, n);
-       if (unlikely(res))
-               memset(to + (n - res), 0, res);
-       return res;
+       return __copy_tofrom_user(to, (__force const void *)from, size); /* __force drops __user: the shared helper takes plain pointers */
 }
-
 static inline unsigned long
-copy_to_user(void *to, const void *from, unsigned long n)
+raw_copy_to_user(void *to, const void __user *from, unsigned long size)
 {
-       if (likely(access_ok(VERIFY_WRITE, to, n)))
-               n = __copy_tofrom_user(to, from, n);
-       return n;
+       return __copy_tofrom_user((__force void *)to, from, size);
 }
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
 extern unsigned long __clear_user(void *addr, unsigned long size);
 
@@ -297,7 +260,7 @@ clear_user(void *addr, unsigned long size)
 }
 
 #define user_addr_max() \
-       (segment_eq(get_fs(), USER_DS) ? TASK_SIZE : ~0UL)
+       (uaccess_kernel() ? ~0UL : TASK_SIZE)
 
 extern long strncpy_from_user(char *dest, const char __user *src, long count);
 
index ad294b3..531da9e 100644 (file)
@@ -26,7 +26,6 @@ config PARISC
        select SYSCTL_ARCH_UNALIGN_ALLOW
        select SYSCTL_EXCEPTION_TRACE
        select HAVE_MOD_ARCH_SPECIFIC
-       select HAVE_ARCH_HARDENED_USERCOPY
        select VIRT_TO_BUS
        select MODULES_USE_ELF_RELA
        select CLONE_BACKWARDS
index ac8bd58..0ba1430 100644 (file)
@@ -109,7 +109,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
        /* futex.c wants to do a cmpxchg_inatomic on kernel NULL, which is
         * our gateway page, and causes no end of trouble...
         */
-       if (segment_eq(KERNEL_DS, get_fs()) && !uaddr)
+       if (uaccess_kernel() && !uaddr)
                return -EFAULT;
 
        if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
index 8442727..6b113f3 100644 (file)
@@ -6,15 +6,10 @@
  */
 #include <asm/page.h>
 #include <asm/cache.h>
-#include <asm/errno.h>
 #include <asm-generic/uaccess-unaligned.h>
 
 #include <linux/bug.h>
 #include <linux/string.h>
-#include <linux/thread_info.h>
-
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1
 
 #define KERNEL_DS      ((mm_segment_t){0})
 #define USER_DS        ((mm_segment_t){1})
 #define get_user __get_user
 
 #if !defined(CONFIG_64BIT)
-#define LDD_USER(ptr)          __get_user_asm64(ptr)
+#define LDD_USER(val, ptr)     __get_user_asm64(val, ptr)
 #define STD_USER(x, ptr)       __put_user_asm64(x, ptr)
 #else
-#define LDD_USER(ptr)          __get_user_asm("ldd", ptr)
+#define LDD_USER(val, ptr)     __get_user_asm(val, "ldd", ptr)
 #define STD_USER(x, ptr)       __put_user_asm("std", x, ptr)
 #endif
 
@@ -97,63 +92,87 @@ struct exception_data {
                " mtsp %0,%%sr2\n\t"            \
                : : "r"(get_fs()) : )
 
-#define __get_user(x, ptr)                               \
-({                                                       \
-       register long __gu_err __asm__ ("r8") = 0;       \
-       register long __gu_val;                          \
-                                                        \
-       load_sr2();                                      \
-       switch (sizeof(*(ptr))) {                        \
-           case 1: __get_user_asm("ldb", ptr); break;   \
-           case 2: __get_user_asm("ldh", ptr); break;   \
-           case 4: __get_user_asm("ldw", ptr); break;   \
-           case 8: LDD_USER(ptr);  break;               \
-           default: BUILD_BUG(); break;                 \
-       }                                                \
-                                                        \
-       (x) = (__force __typeof__(*(ptr))) __gu_val;     \
-       __gu_err;                                        \
+#define __get_user_internal(val, ptr)                  \
+({                                                     \
+       register long __gu_err __asm__ ("r8") = 0;      \
+                                                       \
+       switch (sizeof(*(ptr))) {                       \
+       case 1: __get_user_asm(val, "ldb", ptr); break; \
+       case 2: __get_user_asm(val, "ldh", ptr); break; \
+       case 4: __get_user_asm(val, "ldw", ptr); break; \
+       case 8: LDD_USER(val, ptr); break;              \
+       default: BUILD_BUG();                           \
+       }                                               \
+                                                       \
+       __gu_err;                                       \
+})
+
+#define __get_user(val, ptr)                           \
+({                                                     \
+       load_sr2();                                     \
+       __get_user_internal(val, ptr);                  \
 })
 
-#define __get_user_asm(ldx, ptr)                        \
+#define __get_user_asm(val, ldx, ptr)                  \
+{                                                      \
+       register long __gu_val;                         \
+                                                       \
        __asm__("1: " ldx " 0(%%sr2,%2),%0\n"           \
                "9:\n"                                  \
                ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \
                : "=r"(__gu_val), "=r"(__gu_err)        \
-               : "r"(ptr), "1"(__gu_err));
+               : "r"(ptr), "1"(__gu_err));             \
+                                                       \
+       (val) = (__force __typeof__(*(ptr))) __gu_val;  \
+}
 
 #if !defined(CONFIG_64BIT)
 
-#define __get_user_asm64(ptr)                          \
+#define __get_user_asm64(val, ptr)                     \
+{                                                      \
+       union {                                         \
+               unsigned long long      l;              \
+               __typeof__(*(ptr))      t;              \
+       } __gu_tmp;                                     \
+                                                       \
        __asm__("   copy %%r0,%R0\n"                    \
                "1: ldw 0(%%sr2,%2),%0\n"               \
                "2: ldw 4(%%sr2,%2),%R0\n"              \
                "9:\n"                                  \
                ASM_EXCEPTIONTABLE_ENTRY_EFAULT(1b, 9b) \
                ASM_EXCEPTIONTABLE_ENTRY_EFAULT(2b, 9b) \
-               : "=r"(__gu_val), "=r"(__gu_err)        \
-               : "r"(ptr), "1"(__gu_err));
+               : "=&r"(__gu_tmp.l), "=r"(__gu_err)     \
+               : "r"(ptr), "1"(__gu_err));             \
+                                                       \
+       (val) = __gu_tmp.t;                             \
+}
 
 #endif /* !defined(CONFIG_64BIT) */
 
 
-#define __put_user(x, ptr)                                      \
+#define __put_user_internal(x, ptr)                            \
 ({                                                             \
        register long __pu_err __asm__ ("r8") = 0;              \
         __typeof__(*(ptr)) __x = (__typeof__(*(ptr)))(x);      \
                                                                \
-       load_sr2();                                             \
        switch (sizeof(*(ptr))) {                               \
-           case 1: __put_user_asm("stb", __x, ptr); break;     \
-           case 2: __put_user_asm("sth", __x, ptr); break;     \
-           case 4: __put_user_asm("stw", __x, ptr); break;     \
-           case 8: STD_USER(__x, ptr); break;                  \
-           default: BUILD_BUG(); break;                        \
-       }                                                       \
+       case 1: __put_user_asm("stb", __x, ptr); break;         \
+       case 2: __put_user_asm("sth", __x, ptr); break;         \
+       case 4: __put_user_asm("stw", __x, ptr); break;         \
+       case 8: STD_USER(__x, ptr); break;                      \
+       default: BUILD_BUG();                                   \
+       }                                                       \
                                                                \
        __pu_err;                                               \
 })
 
+#define __put_user(x, ptr)                                     \
+({                                                             \
+       load_sr2();                                             \
+       __put_user_internal(x, ptr);                            \
+})
+
+
 /*
  * The "__put_user/kernel_asm()" macros tell gcc they read from memory
  * instead of writing. This is because they do not write to any memory
@@ -192,9 +211,6 @@ struct exception_data {
  * Complex access routines -- external declarations
  */
 
-extern unsigned long lcopy_to_user(void __user *, const void *, unsigned long);
-extern unsigned long lcopy_from_user(void *, const void __user *, unsigned long);
-extern unsigned long lcopy_in_user(void __user *, const void __user *, unsigned long);
 extern long strncpy_from_user(char *, const char __user *, long);
 extern unsigned lclear_user(void __user *, unsigned long);
 extern long lstrnlen_user(const char __user *, long);
@@ -208,59 +224,14 @@ extern long lstrnlen_user(const char __user *, long);
 #define clear_user lclear_user
 #define __clear_user lclear_user
 
-unsigned long __must_check __copy_to_user(void __user *dst, const void *src,
-                                         unsigned long len);
-unsigned long __must_check __copy_from_user(void *dst, const void __user *src,
-                                         unsigned long len);
-unsigned long copy_in_user(void __user *dst, const void __user *src,
-                          unsigned long len);
-#define __copy_in_user copy_in_user
-#define __copy_to_user_inatomic __copy_to_user
-#define __copy_from_user_inatomic __copy_from_user
-
-extern void __compiletime_error("usercopy buffer size is too small")
-__bad_copy_user(void);
-
-static inline void copy_user_overflow(int size, unsigned long count)
-{
-       WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
-}
-
-static __always_inline unsigned long __must_check
-copy_from_user(void *to, const void __user *from, unsigned long n)
-{
-       int sz = __compiletime_object_size(to);
-       unsigned long ret = n;
-
-       if (likely(sz < 0 || sz >= n)) {
-               check_object_size(to, n, false);
-               ret = __copy_from_user(to, from, n);
-       } else if (!__builtin_constant_p(n))
-               copy_user_overflow(sz, n);
-       else
-               __bad_copy_user();
-
-       if (unlikely(ret))
-               memset(to + (n - ret), 0, ret);
-
-       return ret;
-}
-
-static __always_inline unsigned long __must_check
-copy_to_user(void __user *to, const void *from, unsigned long n)
-{
-       int sz = __compiletime_object_size(from);
-
-       if (likely(sz < 0 || sz >= n)) {
-               check_object_size(from, n, true);
-               n = __copy_to_user(to, from, n);
-       } else if (!__builtin_constant_p(n))
-               copy_user_overflow(sz, n);
-       else
-               __bad_copy_user();
-
-       return n;
-}
+unsigned long __must_check raw_copy_to_user(void __user *dst, const void *src,
+                                           unsigned long len);
+unsigned long __must_check raw_copy_from_user(void *dst, const void __user *src,
+                                           unsigned long len);
+unsigned long __must_check raw_copy_in_user(void __user *dst, const void __user *src,
+                                           unsigned long len);
+#define INLINE_COPY_TO_USER
+#define INLINE_COPY_FROM_USER
 
 struct pt_regs;
 int fixup_exception(struct pt_regs *regs);
index f01188c..85c28bb 100644 (file)
@@ -201,7 +201,7 @@ ENTRY_CFI(pa_memcpy)
        add     dst,len,end
 
        /* short copy with less than 16 bytes? */
-       cmpib,>>=,n 15,len,.Lbyte_loop
+       cmpib,COND(>>=),n 15,len,.Lbyte_loop
 
        /* same alignment? */
        xor     src,dst,t0
@@ -216,7 +216,7 @@ ENTRY_CFI(pa_memcpy)
        /* loop until we are 64-bit aligned */
 .Lalign_loop64:
        extru   dst,31,3,t1
-       cmpib,=,n       0,t1,.Lcopy_loop_16
+       cmpib,=,n       0,t1,.Lcopy_loop_16_start
 20:    ldb,ma  1(srcspc,src),t1
 21:    stb,ma  t1,1(dstspc,dst)
        b       .Lalign_loop64
@@ -225,6 +225,7 @@ ENTRY_CFI(pa_memcpy)
        ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
        ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)
 
+.Lcopy_loop_16_start:
        ldi     31,t0
 .Lcopy_loop_16:
        cmpb,COND(>>=),n t0,len,.Lword_loop
@@ -267,7 +268,7 @@ ENTRY_CFI(pa_memcpy)
        /* loop until we are 32-bit aligned */
 .Lalign_loop32:
        extru   dst,31,2,t1
-       cmpib,=,n       0,t1,.Lcopy_loop_4
+       cmpib,=,n       0,t1,.Lcopy_loop_8
 20:    ldb,ma  1(srcspc,src),t1
 21:    stb,ma  t1,1(dstspc,dst)
        b       .Lalign_loop32
@@ -277,7 +278,7 @@ ENTRY_CFI(pa_memcpy)
        ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)
 
 
-.Lcopy_loop_4:
+.Lcopy_loop_8:
        cmpib,COND(>>=),n 15,len,.Lbyte_loop
 
 10:    ldw     0(srcspc,src),t1
@@ -299,7 +300,7 @@ ENTRY_CFI(pa_memcpy)
        ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done)
        ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done)
 
-       b       .Lcopy_loop_4
+       b       .Lcopy_loop_8
        ldo     -16(len),len
 
 .Lbyte_loop:
@@ -324,7 +325,7 @@ ENTRY_CFI(pa_memcpy)
 .Lunaligned_copy:
        /* align until dst is 32bit-word-aligned */
        extru   dst,31,2,t1
-       cmpib,COND(=),n 0,t1,.Lcopy_dstaligned
+       cmpib,=,n       0,t1,.Lcopy_dstaligned
 20:    ldb     0(srcspc,src),t1
        ldo     1(src),src
 21:    stb,ma  t1,1(dstspc,dst)
@@ -362,7 +363,7 @@ ENTRY_CFI(pa_memcpy)
        cmpiclr,<> 1,t0,%r0
        b,n .Lcase1
 .Lcase0:
-       cmpb,= %r0,len,.Lcda_finish
+       cmpb,COND(=) %r0,len,.Lcda_finish
        nop
 
 1:     ldw,ma 4(srcspc,src), a3
@@ -376,7 +377,7 @@ ENTRY_CFI(pa_memcpy)
 1:     ldw,ma 4(srcspc,src), a3
        ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
        ldo -1(len),len
-       cmpb,=,n %r0,len,.Ldo0
+       cmpb,COND(=),n %r0,len,.Ldo0
 .Ldo4:
 1:     ldw,ma 4(srcspc,src), a0
        ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
@@ -402,7 +403,7 @@ ENTRY_CFI(pa_memcpy)
 1:     stw,ma t0, 4(dstspc,dst)
        ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
        ldo -4(len),len
-       cmpb,<> %r0,len,.Ldo4
+       cmpb,COND(<>) %r0,len,.Ldo4
        nop
 .Ldo0:
        shrpw a2, a3, %sar, t0
@@ -436,14 +437,14 @@ ENTRY_CFI(pa_memcpy)
        /* fault exception fixup handlers: */
 #ifdef CONFIG_64BIT
 .Lcopy16_fault:
-10:    b       .Lcopy_done
-       std,ma  t1,8(dstspc,dst)
+       b       .Lcopy_done
+10:    std,ma  t1,8(dstspc,dst)
        ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
 #endif
 
 .Lcopy8_fault:
-10:    b       .Lcopy_done
-       stw,ma  t1,4(dstspc,dst)
+       b       .Lcopy_done
+10:    stw,ma  t1,4(dstspc,dst)
        ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
 
        .exit
index b3d47ec..99115cd 100644 (file)
 
 DECLARE_PER_CPU(struct exception_data, exception_data);
 
-#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
+#define get_user_space() (uaccess_kernel() ? 0 : mfsp(3))
 #define get_kernel_space() (0)
 
 /* Returns 0 for success, otherwise, returns number of bytes not transferred. */
 extern unsigned long pa_memcpy(void *dst, const void *src,
                                unsigned long len);
 
-unsigned long __copy_to_user(void __user *dst, const void *src,
-                            unsigned long len)
+unsigned long raw_copy_to_user(void __user *dst, const void *src,
+                              unsigned long len)
 {
        mtsp(get_kernel_space(), 1);
        mtsp(get_user_space(), 2);
        return pa_memcpy((void __force *)dst, src, len);
 }
-EXPORT_SYMBOL(__copy_to_user);
+EXPORT_SYMBOL(raw_copy_to_user);
 
-unsigned long __copy_from_user(void *dst, const void __user *src,
+unsigned long raw_copy_from_user(void *dst, const void __user *src,
                               unsigned long len)
 {
        mtsp(get_user_space(), 1);
        mtsp(get_kernel_space(), 2);
        return pa_memcpy(dst, (void __force *)src, len);
 }
-EXPORT_SYMBOL(__copy_from_user);
+EXPORT_SYMBOL(raw_copy_from_user);
 
-unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
+unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long len)
 {
        mtsp(get_user_space(), 1);
        mtsp(get_user_space(), 2);
@@ -70,7 +70,7 @@ void * memcpy(void * dst,const void *src, size_t count)
        return dst;
 }
 
-EXPORT_SYMBOL(copy_in_user);
+EXPORT_SYMBOL(raw_copy_in_user);
 EXPORT_SYMBOL(memcpy);
 
 long probe_kernel_read(void *dst, const void *src, size_t size)
index 97a8bc8..0533826 100644 (file)
@@ -117,7 +117,6 @@ config PPC
        select GENERIC_STRNLEN_USER
        select GENERIC_TIME_VSYSCALL_OLD
        select HAVE_ARCH_AUDITSYSCALL
-       select HAVE_ARCH_HARDENED_USERCOPY
        select HAVE_ARCH_JUMP_LABEL
        select HAVE_ARCH_KGDB
        select HAVE_ARCH_SECCOMP_FILTER
index 528ff0e..c03d0fb 100644 (file)
@@ -16,9 +16,8 @@ CONFIG_DAVICOM_PHY=y
 CONFIG_DMADEVICES=y
 CONFIG_E1000E=y
 CONFIG_E1000=y
-CONFIG_EDAC_MM_EDAC=y
-CONFIG_EDAC_MPC85XX=y
 CONFIG_EDAC=y
+CONFIG_EDAC_MPC85XX=y
 CONFIG_EEPROM_AT24=y
 CONFIG_EEPROM_LEGACY=y
 CONFIG_FB_FSL_DIU=y
index c79283b..a917f7a 100644 (file)
@@ -155,7 +155,6 @@ CONFIG_USB_OHCI_HCD_PPC_OF_BE=y
 CONFIG_USB_OHCI_HCD_PPC_OF_LE=y
 CONFIG_USB_STORAGE=y
 CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
 CONFIG_EDAC_MPC85XX=y
 CONFIG_RTC_CLASS=y
 # CONFIG_RTC_INTF_PROC is not set
index dbd961d..72900b8 100644 (file)
@@ -116,7 +116,6 @@ CONFIG_LEDS_TRIGGERS=y
 CONFIG_LEDS_TRIGGER_TIMER=y
 CONFIG_LEDS_TRIGGER_HEARTBEAT=y
 CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_DS1307=y
 CONFIG_RTC_DRV_CMOS=y
index 2d7fcbe..aa56459 100644 (file)
@@ -179,7 +179,6 @@ CONFIG_INFINIBAND_MTHCA=m
 CONFIG_INFINIBAND_IPOIB=m
 CONFIG_INFINIBAND_IPOIB_DEBUG_DATA=y
 CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
 CONFIG_EDAC_CELL=y
 CONFIG_UIO=m
 CONFIG_EXT2_FS=y
index 5553c5c..fe43ff4 100644 (file)
@@ -142,7 +142,6 @@ CONFIG_USB_UHCI_HCD=y
 CONFIG_USB_SL811_HCD=y
 CONFIG_USB_STORAGE=y
 CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
 CONFIG_EDAC_PASEMI=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_DS1307=y
index 4f1288b..f2e03f0 100644 (file)
@@ -262,7 +262,6 @@ CONFIG_INFINIBAND_IPOIB_CM=y
 CONFIG_INFINIBAND_SRP=m
 CONFIG_INFINIBAND_ISER=m
 CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
 CONFIG_EDAC_PASEMI=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_DS1307=y
index 11a3473..6340e6c 100644 (file)
@@ -173,7 +173,6 @@ CONFIG_INFINIBAND_MTHCA=m
 CONFIG_INFINIBAND_IPOIB=m
 CONFIG_INFINIBAND_ISER=m
 CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_DS1307=y
 CONFIG_FS_DAX=y
index 1d2d69d..18d0d60 100644 (file)
@@ -988,8 +988,7 @@ CONFIG_LEDS_TRIGGER_BACKLIGHT=m
 CONFIG_LEDS_TRIGGER_DEFAULT_ON=m
 CONFIG_ACCESSIBILITY=y
 CONFIG_A11Y_BRAILLE_CONSOLE=y
-CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=m
+CONFIG_EDAC=m
 CONFIG_RTC_CLASS=y
 # CONFIG_RTC_HCTOSYS is not set
 CONFIG_RTC_DRV_DS1307=m
index 4119945..f058e0c 100644 (file)
@@ -33,10 +33,13 @@ static u32 crc32c_vpmsum(u32 crc, unsigned char const *p, size_t len)
        }
 
        if (len & ~VMX_ALIGN_MASK) {
+               preempt_disable();
                pagefault_disable();
                enable_kernel_altivec();
                crc = __crc32c_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);
+               disable_kernel_altivec();
                pagefault_enable();
+               preempt_enable();
        }
 
        tail = len & VMX_ALIGN_MASK;
index 14752ee..ed3bead 100644 (file)
@@ -236,9 +236,9 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
        mtctr   reg;                                                    \
        bctr
 
-#define BRANCH_LINK_TO_FAR(reg, label)                                 \
-       __LOAD_FAR_HANDLER(reg, label);                                 \
-       mtctr   reg;                                                    \
+#define BRANCH_LINK_TO_FAR(label)                                      \
+       __LOAD_FAR_HANDLER(r12, label);                                 \
+       mtctr   r12;                                                    \
        bctrl
 
 /*
@@ -265,7 +265,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 #define BRANCH_TO_COMMON(reg, label)                                   \
        b       label
 
-#define BRANCH_LINK_TO_FAR(reg, label)                                 \
+#define BRANCH_LINK_TO_FAR(label)                                      \
        bl      label
 
 #define BRANCH_TO_KVM(reg, label)                                      \
diff --git a/arch/powerpc/include/asm/extable.h b/arch/powerpc/include/asm/extable.h
new file mode 100644 (file)
index 0000000..07cc45c
--- /dev/null
@@ -0,0 +1,29 @@
+#ifndef _ARCH_POWERPC_EXTABLE_H
+#define _ARCH_POWERPC_EXTABLE_H
+
+/*
+ * The exception table consists of pairs of relative addresses: the first is
+ * the address of an instruction that is allowed to fault, and the second is
+ * the address at which the program should continue.  No registers are
+ * modified, so it is entirely up to the continuation code to figure out what
+ * to do.
+ *
+ * All the routines below use bits of fixup code that are out of line with the
+ * main instruction path.  This means when everything is well, we don't even
+ * have to jump over them.  Further, they do not intrude on our cache or tlb
+ * entries.
+ */
+
+#define ARCH_HAS_RELATIVE_EXTABLE
+
+struct exception_table_entry {
+       int insn;       /* relative address of the instruction that is allowed to fault */
+       int fixup;      /* relative address to continue at; resolved by extable_fixup() */
+};
+
+static inline unsigned long extable_fixup(const struct exception_table_entry *x)
+{
+       return (unsigned long)&x->fixup + x->fixup;     /* the stored offset is relative to the fixup field's own address */
+}
+
+#endif
index 0e6add3..5c0d8a8 100644 (file)
@@ -1,18 +1,11 @@
 #ifndef _ARCH_POWERPC_UACCESS_H
 #define _ARCH_POWERPC_UACCESS_H
 
-#ifdef __KERNEL__
-#ifndef __ASSEMBLY__
-
-#include <linux/sched.h>
-#include <linux/errno.h>
 #include <asm/asm-compat.h>
 #include <asm/ppc_asm.h>
 #include <asm/processor.h>
 #include <asm/page.h>
-
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
+#include <asm/extable.h>
 
 /*
  * The fs value determines whether argument validity checking should be
         __access_ok((__force unsigned long)(addr), (size), get_fs()))
 
 /*
- * The exception table consists of pairs of relative addresses: the first is
- * the address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out what
- * to do.
- *
- * All the routines below use bits of fixup code that are out of line with the
- * main instruction path.  This means when everything is well, we don't even
- * have to jump over them.  Further, they do not intrude on our cache or tlb
- * entries.
- */
-
-#define ARCH_HAS_RELATIVE_EXTABLE
-
-struct exception_table_entry {
-       int insn;
-       int fixup;
-};
-
-static inline unsigned long extable_fixup(const struct exception_table_entry *x)
-{
-       return (unsigned long)&x->fixup + x->fixup;
-}
-
-/*
  * These are the main single-value transfer routines.  They automatically
  * use the right size if we just have the right pointer type.
  *
@@ -301,42 +269,19 @@ extern unsigned long __copy_tofrom_user(void __user *to,
 
 #ifndef __powerpc64__
 
-static inline unsigned long copy_from_user(void *to,
-               const void __user *from, unsigned long n)
-{
-       if (likely(access_ok(VERIFY_READ, from, n))) {
-               check_object_size(to, n, false);
-               return __copy_tofrom_user((__force void __user *)to, from, n);
-       }
-       memset(to, 0, n);
-       return n;
-}
-
-static inline unsigned long copy_to_user(void __user *to,
-               const void *from, unsigned long n)
-{
-       if (access_ok(VERIFY_WRITE, to, n)) {
-               check_object_size(from, n, true);
-               return __copy_tofrom_user(to, (__force void __user *)from, n);
-       }
-       return n;
-}
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
 #else /* __powerpc64__ */
 
-#define __copy_in_user(to, from, size) \
-       __copy_tofrom_user((to), (from), (size))
-
-extern unsigned long copy_from_user(void *to, const void __user *from,
-                                   unsigned long n);
-extern unsigned long copy_to_user(void __user *to, const void *from,
-                                 unsigned long n);
-extern unsigned long copy_in_user(void __user *to, const void __user *from,
-                                 unsigned long n);
-
+static inline unsigned long
+raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) /* user -> user; only defined in the __powerpc64__ branch */
+{
+       return __copy_tofrom_user(to, from, n); /* both pointers are already __user, so no __force cast is needed */
+}
 #endif /* __powerpc64__ */
 
-static inline unsigned long __copy_from_user_inatomic(void *to,
+static inline unsigned long raw_copy_from_user(void *to,
                const void __user *from, unsigned long n)
 {
        if (__builtin_constant_p(n) && (n <= 8)) {
@@ -360,12 +305,10 @@ static inline unsigned long __copy_from_user_inatomic(void *to,
                        return 0;
        }
 
-       check_object_size(to, n, false);
-
        return __copy_tofrom_user((__force void __user *)to, from, n);
 }
 
-static inline unsigned long __copy_to_user_inatomic(void __user *to,
+static inline unsigned long raw_copy_to_user(void __user *to,
                const void *from, unsigned long n)
 {
        if (__builtin_constant_p(n) && (n <= 8)) {
@@ -389,25 +332,9 @@ static inline unsigned long __copy_to_user_inatomic(void __user *to,
                        return 0;
        }
 
-       check_object_size(from, n, true);
-
        return __copy_tofrom_user(to, (__force const void __user *)from, n);
 }
 
-static inline unsigned long __copy_from_user(void *to,
-               const void __user *from, unsigned long size)
-{
-       might_fault();
-       return __copy_from_user_inatomic(to, from, size);
-}
-
-static inline unsigned long __copy_to_user(void __user *to,
-               const void *from, unsigned long size)
-{
-       might_fault();
-       return __copy_to_user_inatomic(to, from, size);
-}
-
 extern unsigned long __clear_user(void __user *addr, unsigned long size);
 
 static inline unsigned long clear_user(void __user *addr, unsigned long size)
@@ -422,7 +349,4 @@ extern long strncpy_from_user(char *dst, const char __user *src, long count);
 extern __must_check long strlen_user(const char __user *str);
 extern __must_check long strnlen_user(const char __user *str, long n);
 
-#endif  /* __ASSEMBLY__ */
-#endif /* __KERNEL__ */
-
 #endif /* _ARCH_POWERPC_UACCESS_H */
index cbc7c42..ec7a8b0 100644 (file)
@@ -807,14 +807,25 @@ int fix_alignment(struct pt_regs *regs)
        nb = aligninfo[instr].len;
        flags = aligninfo[instr].flags;
 
-       /* ldbrx/stdbrx overlap lfs/stfs in the DSISR unfortunately */
-       if (IS_XFORM(instruction) && ((instruction >> 1) & 0x3ff) == 532) {
-               nb = 8;
-               flags = LD+SW;
-       } else if (IS_XFORM(instruction) &&
-                  ((instruction >> 1) & 0x3ff) == 660) {
-               nb = 8;
-               flags = ST+SW;
+       /*
+        * Handle some cases which give overlaps in the DSISR values.
+        */
+       if (IS_XFORM(instruction)) {
+               switch (get_xop(instruction)) {
+               case 532:       /* ldbrx */
+                       nb = 8;
+                       flags = LD+SW;
+                       break;
+               case 660:       /* stdbrx */
+                       nb = 8;
+                       flags = ST+SW;
+                       break;
+               case 20:        /* lwarx */
+               case 84:        /* ldarx */
+               case 116:       /* lharx */
+               case 276:       /* lqarx */
+                       return 0;       /* not emulated ever */
+               }
        }
 
        /* Byteswap little endian loads and stores */
index 6432d4b..767ef6d 100644 (file)
@@ -689,7 +689,7 @@ resume_kernel:
 
        addi    r8,r1,INT_FRAME_SIZE    /* Get the kprobed function entry */
 
-       lwz     r3,GPR1(r1)
+       ld      r3,GPR1(r1)
        subi    r3,r3,INT_FRAME_SIZE    /* dst: Allocate a trampoline exception frame */
        mr      r4,r1                   /* src:  current exception frame */
        mr      r1,r3                   /* Reroute the trampoline frame to r1 */
@@ -703,8 +703,8 @@ resume_kernel:
        addi    r6,r6,8
        bdnz    2b
 
-       /* Do real store operation to complete stwu */
-       lwz     r5,GPR1(r1)
+       /* Do real store operation to complete stdu */
+       ld      r5,GPR1(r1)
        std     r8,0(r5)
 
        /* Clear _TIF_EMULATE_STACK_STORE flag */
index 857bf7c..6353019 100644 (file)
@@ -982,7 +982,7 @@ TRAMP_REAL_BEGIN(hmi_exception_early)
        EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN)
        EXCEPTION_PROLOG_COMMON_3(0xe60)
        addi    r3,r1,STACK_FRAME_OVERHEAD
-       BRANCH_LINK_TO_FAR(r4, hmi_exception_realmode)
+       BRANCH_LINK_TO_FAR(hmi_exception_realmode) /* Function call ABI */
        /* Windup the stack. */
        /* Move original HSRR0 and HSRR1 into the respective regs */
        ld      r9,_MSR(r1)
index ae179cb..c119044 100644 (file)
@@ -67,7 +67,7 @@ PPC64_CACHES:
  *   flush all bytes from start through stop-1 inclusive
  */
 
-_GLOBAL(flush_icache_range)
+_GLOBAL_TOC(flush_icache_range)
 BEGIN_FTR_SECTION
        PURGE_PREFETCHED_INS
        blr
@@ -120,7 +120,7 @@ EXPORT_SYMBOL(flush_icache_range)
  *
  *    flush all bytes from start to stop-1 inclusive
  */
-_GLOBAL(flush_dcache_range)
+_GLOBAL_TOC(flush_dcache_range)
 
 /*
  * Flush the data cache to memory 
index 9cfaa8b..f997154 100644 (file)
@@ -236,6 +236,15 @@ static void cpu_ready_for_interrupts(void)
                mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3);
        }
 
+       /*
+        * Fixup HFSCR:TM based on CPU features. The bit is set by our
+        * early asm init because at that point we haven't updated our
+        * CPU features from firmware and device-tree. Here we have,
+        * so let's do it.
+        */
+       if (cpu_has_feature(CPU_FTR_HVMODE) && !cpu_has_feature(CPU_FTR_TM_COMP))
+               mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) & ~HFSCR_TM);
+
        /* Set IR and DR in PACA MSR */
        get_paca()->kernel_msr = MSR_KERNEL;
 }
index 46f89e6..d68ed1f 100644 (file)
@@ -787,24 +787,21 @@ static struct sched_domain_topology_level powerpc_topology[] = {
        { NULL, },
 };
 
-void __init smp_cpus_done(unsigned int max_cpus)
+static __init long smp_setup_cpu_workfn(void *data __always_unused)
 {
-       cpumask_var_t old_mask;
+       smp_ops->setup_cpu(boot_cpuid);
+       return 0;
+}
 
-       /* We want the setup_cpu() here to be called from CPU 0, but our
-        * init thread may have been "borrowed" by another CPU in the meantime
-        * se we pin us down to CPU 0 for a short while
+void __init smp_cpus_done(unsigned int max_cpus)
+{
+       /*
+        * We want the setup_cpu() here to be called on the boot CPU, but
+        * init might run on any CPU, so make sure it's invoked on the boot
+        * CPU.
         */
-       alloc_cpumask_var(&old_mask, GFP_NOWAIT);
-       cpumask_copy(old_mask, &current->cpus_allowed);
-       set_cpus_allowed_ptr(current, cpumask_of(boot_cpuid));
-       
        if (smp_ops && smp_ops->setup_cpu)
-               smp_ops->setup_cpu(boot_cpuid);
-
-       set_cpus_allowed_ptr(current, old_mask);
-
-       free_cpumask_var(old_mask);
+               work_on_cpu_safe(boot_cpuid, smp_setup_cpu_workfn, NULL);
 
        if (smp_ops && smp_ops->bringup_done)
                smp_ops->bringup_done();
@@ -812,7 +809,6 @@ void __init smp_cpus_done(unsigned int max_cpus)
        dump_numa_cpu_topology();
 
        set_sched_topology(powerpc_topology);
-
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
index 07b9072..2b33cfa 100644 (file)
@@ -995,8 +995,10 @@ static void __init init_decrementer_clockevent(void)
 
        decrementer_clockevent.max_delta_ns =
                clockevent_delta2ns(decrementer_max, &decrementer_clockevent);
+       decrementer_clockevent.max_delta_ticks = decrementer_max;
        decrementer_clockevent.min_delta_ns =
                clockevent_delta2ns(2, &decrementer_clockevent);
+       decrementer_clockevent.min_delta_ticks = 2;
 
        register_decrementer_clockevent(cpu);
 }
index 8c68145..710e491 100644 (file)
@@ -1487,6 +1487,10 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
        /* start new resize */
 
        resize = kzalloc(sizeof(*resize), GFP_KERNEL);
+       if (!resize) {
+               ret = -ENOMEM;
+               goto out;
+       }
        resize->order = shift;
        resize->kvm = kvm;
        INIT_WORK(&resize->work, resize_hpt_prepare_work);
index 2b5e090..ed7dfce 100644 (file)
@@ -14,7 +14,7 @@ obj-y += string.o alloc.o crtsavres.o code-patching.o \
 
 obj-$(CONFIG_PPC32)    += div64.o copy_32.o
 
-obj64-y        += copypage_64.o copyuser_64.o usercopy_64.o mem_64.o hweight_64.o \
+obj64-y        += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
           copyuser_power7.o string_64.o copypage_power7.o memcpy_power7.o \
           memcpy_64.o memcmp_64.o
 
index ff0d894..8aedbb5 100644 (file)
@@ -477,18 +477,6 @@ _GLOBAL(__copy_tofrom_user)
        bdnz    130b
 /* then clear out the destination: r3 bytes starting at 4(r6) */
 132:   mfctr   r3
-       srwi.   r0,r3,2
-       li      r9,0
-       mtctr   r0
-       beq     113f
-112:   stwu    r9,4(r6)
-       bdnz    112b
-113:   andi.   r0,r3,3
-       mtctr   r0
-       beq     120f
-114:   stb     r9,4(r6)
-       addi    r6,r6,1
-       bdnz    114b
 120:   blr
 
        EX_TABLE(30b,108b)
@@ -497,7 +485,5 @@ _GLOBAL(__copy_tofrom_user)
        EX_TABLE(41b,111b)
        EX_TABLE(130b,132b)
        EX_TABLE(131b,120b)
-       EX_TABLE(112b,120b)
-       EX_TABLE(114b,120b)
 
 EXPORT_SYMBOL(__copy_tofrom_user)
index aee6e24..08da06e 100644 (file)
@@ -319,32 +319,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
        blr
 
 /*
- * here we have trapped again, need to clear ctr bytes starting at r3
+ * here we have trapped again, amount remaining is in ctr.
  */
-143:   mfctr   r5
-       li      r0,0
-       mr      r4,r3
-       mr      r3,r5           /* return the number of bytes not copied */
-1:     andi.   r9,r4,7
-       beq     3f
-90:    stb     r0,0(r4)
-       addic.  r5,r5,-1
-       addi    r4,r4,1
-       bne     1b
-       blr
-3:     cmpldi  cr1,r5,8
-       srdi    r9,r5,3
-       andi.   r5,r5,7
-       blt     cr1,93f
-       mtctr   r9
-91:    std     r0,0(r4)
-       addi    r4,r4,8
-       bdnz    91b
-93:    beqlr
-       mtctr   r5      
-92:    stb     r0,0(r4)
-       addi    r4,r4,1
-       bdnz    92b
+143:   mfctr   r3
        blr
 
 /*
@@ -389,10 +366,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
        ld      r5,-8(r1)
        add     r6,r6,r5
        subf    r3,r3,r6        /* #bytes not copied */
-190:
-191:
-192:
-       blr                     /* #bytes not copied in r3 */
+       blr
 
        EX_TABLE(20b,120b)
        EX_TABLE(220b,320b)
@@ -451,9 +425,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
        EX_TABLE(88b,188b)
        EX_TABLE(43b,143b)
        EX_TABLE(89b,189b)
-       EX_TABLE(90b,190b)
-       EX_TABLE(91b,191b)
-       EX_TABLE(92b,192b)
 
 /*
  * Routine to copy a whole page of data, optimized for POWER4.
diff --git a/arch/powerpc/lib/usercopy_64.c b/arch/powerpc/lib/usercopy_64.c
deleted file mode 100644 (file)
index 9bd3a3d..0000000
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Functions which are too large to be inlined.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/module.h>
-#include <linux/uaccess.h>
-
-unsigned long copy_from_user(void *to, const void __user *from, unsigned long n)
-{
-       if (likely(access_ok(VERIFY_READ, from, n)))
-               n = __copy_from_user(to, from, n);
-       else
-               memset(to, 0, n);
-       return n;
-}
-
-unsigned long copy_to_user(void __user *to, const void *from, unsigned long n)
-{
-       if (likely(access_ok(VERIFY_WRITE, to, n)))
-               n = __copy_to_user(to, from, n);
-       return n;
-}
-
-unsigned long copy_in_user(void __user *to, const void __user *from,
-                          unsigned long n)
-{
-       might_sleep();
-       if (likely(access_ok(VERIFY_READ, from, n) &&
-           access_ok(VERIFY_WRITE, to, n)))
-               n =__copy_tofrom_user(to, from, n);
-       return n;
-}
-
-EXPORT_SYMBOL(copy_from_user);
-EXPORT_SYMBOL(copy_to_user);
-EXPORT_SYMBOL(copy_in_user);
-
index cc33260..65bb8f3 100644 (file)
@@ -638,6 +638,10 @@ static void native_flush_hash_range(unsigned long number, int local)
        unsigned long psize = batch->psize;
        int ssize = batch->ssize;
        int i;
+       unsigned int use_local;
+
+       use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) &&
+               mmu_psize_defs[psize].tlbiel && !cxl_ctx_in_use();
 
        local_irq_save(flags);
 
@@ -667,8 +671,7 @@ static void native_flush_hash_range(unsigned long number, int local)
                } pte_iterate_hashed_end();
        }
 
-       if (mmu_has_feature(MMU_FTR_TLBIEL) &&
-           mmu_psize_defs[psize].tlbiel && local) {
+       if (use_local) {
                asm volatile("ptesync":::"memory");
                for (i = 0; i < number; i++) {
                        vpn = batch->vpn[i];
index a2dcef0..b8b1434 100644 (file)
@@ -124,7 +124,6 @@ config S390
        select HAVE_ALIGNED_STRUCT_PAGE if SLUB
        select HAVE_ARCH_AUDITSYSCALL
        select HAVE_ARCH_EARLY_PFN_TO_NID
-       select HAVE_ARCH_HARDENED_USERCOPY
        select HAVE_ARCH_JUMP_LABEL
        select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES
        select HAVE_ARCH_SECCOMP_FILTER
diff --git a/arch/s390/include/asm/extable.h b/arch/s390/include/asm/extable.h
new file mode 100644 (file)
index 0000000..16cfe2d
--- /dev/null
@@ -0,0 +1,28 @@
+#ifndef __S390_EXTABLE_H
+#define __S390_EXTABLE_H
+/*
+ * The exception table consists of pairs of addresses: the first is the
+ * address of an instruction that is allowed to fault, and the second is
+ * the address at which the program should continue.  No registers are
+ * modified, so it is entirely up to the continuation code to figure out
+ * what to do.
+ *
+ * All the routines below use bits of fixup code that are out of line
+ * with the main instruction path.  This means when everything is well,
+ * we don't even have to jump over them.  Further, they do not intrude
+ * on our cache or tlb entries.
+ */
+
+struct exception_table_entry
+{
+       int insn, fixup;
+};
+
+static inline unsigned long extable_fixup(const struct exception_table_entry *x)
+{
+       return (unsigned long)&x->fixup + x->fixup;
+}
+
+#define ARCH_HAS_RELATIVE_EXTABLE
+
+#endif
index 93e37b1..ecec682 100644 (file)
@@ -1051,6 +1051,8 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 {
        if (!MACHINE_HAS_NX)
                pte_val(entry) &= ~_PAGE_NOEXEC;
+       if (pte_present(entry))
+               pte_val(entry) &= ~_PAGE_UNUSED;
        if (mm_has_pgste(mm))
                ptep_set_pte_at(mm, addr, ptep, entry);
        else
index 3ea1554..78f3f09 100644 (file)
 /*
  * User space memory access functions
  */
-#include <linux/sched.h>
-#include <linux/errno.h>
 #include <asm/processor.h>
 #include <asm/ctl_reg.h>
-
-#define VERIFY_READ     0
-#define VERIFY_WRITE    1
+#include <asm/extable.h>
 
 
 /*
@@ -42,7 +38,7 @@
 static inline void set_fs(mm_segment_t fs)
 {
        current->thread.mm_segment = fs;
-       if (segment_eq(fs, KERNEL_DS)) {
+       if (uaccess_kernel()) {
                set_cpu_flag(CIF_ASCE_SECONDARY);
                __ctl_load(S390_lowcore.kernel_asce, 7, 7);
        } else {
@@ -64,72 +60,14 @@ static inline int __range_ok(unsigned long addr, unsigned long size)
 
 #define access_ok(type, addr, size) __access_ok(addr, size)
 
-/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-
-struct exception_table_entry
-{
-       int insn, fixup;
-};
-
-static inline unsigned long extable_fixup(const struct exception_table_entry *x)
-{
-       return (unsigned long)&x->fixup + x->fixup;
-}
-
-#define ARCH_HAS_RELATIVE_EXTABLE
-
-/**
- * __copy_from_user: - Copy a block of data from user space, with less checking.
- * @to:   Destination address, in kernel space.
- * @from: Source address, in user space.
- * @n:   Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Copy data from user space to kernel space.  Caller must check
- * the specified block with access_ok() before calling this function.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- *
- * If some data could not be copied, this function will pad the copied
- * data to the requested size using zero bytes.
- */
-unsigned long __must_check __copy_from_user(void *to, const void __user *from,
-                                           unsigned long n);
+unsigned long __must_check
+raw_copy_from_user(void *to, const void __user *from, unsigned long n);
 
-/**
- * __copy_to_user: - Copy a block of data into user space, with less checking.
- * @to:   Destination address, in user space.
- * @from: Source address, in kernel space.
- * @n:   Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Copy data from kernel space to user space.  Caller must check
- * the specified block with access_ok() before calling this function.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- */
-unsigned long __must_check __copy_to_user(void __user *to, const void *from,
-                                         unsigned long n);
+unsigned long __must_check
+raw_copy_to_user(void __user *to, const void *from, unsigned long n);
 
-#define __copy_to_user_inatomic __copy_to_user
-#define __copy_from_user_inatomic __copy_from_user
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
 #ifdef CONFIG_HAVE_MARCH_Z10_FEATURES
 
@@ -218,13 +156,13 @@ static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long s
 
 static inline int __put_user_fn(void *x, void __user *ptr, unsigned long size)
 {
-       size = __copy_to_user(ptr, x, size);
+       size = raw_copy_to_user(ptr, x, size);
        return size ? -EFAULT : 0;
 }
 
 static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size)
 {
-       size = __copy_from_user(x, ptr, size);
+       size = raw_copy_from_user(x, ptr, size);
        return size ? -EFAULT : 0;
 }
 
@@ -314,77 +252,8 @@ int __get_user_bad(void) __attribute__((noreturn));
 #define __put_user_unaligned __put_user
 #define __get_user_unaligned __get_user
 
-extern void __compiletime_error("usercopy buffer size is too small")
-__bad_copy_user(void);
-
-static inline void copy_user_overflow(int size, unsigned long count)
-{
-       WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
-}
-
-/**
- * copy_to_user: - Copy a block of data into user space.
- * @to:   Destination address, in user space.
- * @from: Source address, in kernel space.
- * @n:    Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Copy data from kernel space to user space.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- */
-static inline unsigned long __must_check
-copy_to_user(void __user *to, const void *from, unsigned long n)
-{
-       might_fault();
-       return __copy_to_user(to, from, n);
-}
-
-/**
- * copy_from_user: - Copy a block of data from user space.
- * @to:   Destination address, in kernel space.
- * @from: Source address, in user space.
- * @n:    Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Copy data from user space to kernel space.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- *
- * If some data could not be copied, this function will pad the copied
- * data to the requested size using zero bytes.
- */
-static inline unsigned long __must_check
-copy_from_user(void *to, const void __user *from, unsigned long n)
-{
-       unsigned int sz = __compiletime_object_size(to);
-
-       might_fault();
-       if (unlikely(sz != -1 && sz < n)) {
-               if (!__builtin_constant_p(n))
-                       copy_user_overflow(sz, n);
-               else
-                       __bad_copy_user();
-               return n;
-       }
-       return __copy_from_user(to, from, n);
-}
-
 unsigned long __must_check
-__copy_in_user(void __user *to, const void __user *from, unsigned long n);
-
-static inline unsigned long __must_check
-copy_in_user(void __user *to, const void __user *from, unsigned long n)
-{
-       might_fault();
-       return __copy_in_user(to, from, n);
-}
+raw_copy_in_user(void __user *to, const void __user *from, unsigned long n);
 
 /*
  * Copy a null terminated string from userspace.
index c31da46..c3a52f9 100644 (file)
@@ -158,7 +158,9 @@ void init_cpu_timer(void)
        cd->mult                = 16777;
        cd->shift               = 12;
        cd->min_delta_ns        = 1;
+       cd->min_delta_ticks     = 1;
        cd->max_delta_ns        = LONG_MAX;
+       cd->max_delta_ticks     = ULONG_MAX;
        cd->rating              = 400;
        cd->cpumask             = cpumask_of(cpu);
        cd->set_next_event      = s390_next_event;
index d55c829..ddbffb7 100644 (file)
@@ -168,8 +168,7 @@ union page_table_entry {
                unsigned long z  : 1; /* Zero Bit */
                unsigned long i  : 1; /* Page-Invalid Bit */
                unsigned long p  : 1; /* DAT-Protection Bit */
-               unsigned long co : 1; /* Change-Recording Override */
-               unsigned long    : 8;
+               unsigned long    : 9;
        };
 };
 
@@ -745,8 +744,6 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
                return PGM_PAGE_TRANSLATION;
        if (pte.z)
                return PGM_TRANSLATION_SPEC;
-       if (pte.co && !edat1)
-               return PGM_TRANSLATION_SPEC;
        dat_protection |= pte.p;
        raddr.pfra = pte.pfra;
 real_address:
@@ -1182,7 +1179,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
                rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
        if (!rc && pte.i)
                rc = PGM_PAGE_TRANSLATION;
-       if (!rc && (pte.z || (pte.co && sg->edat_level < 1)))
+       if (!rc && pte.z)
                rc = PGM_TRANSLATION_SPEC;
 shadow_page:
        pte.p |= dat_protection;
index f481fcd..1e5bb2b 100644 (file)
@@ -26,7 +26,7 @@ static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr
        tmp1 = -4096UL;
        asm volatile(
                "0: .insn ss,0xc80000000000,0(%0,%2),0(%1),0\n"
-               "9: jz    7f\n"
+               "6: jz    4f\n"
                "1: algr  %0,%3\n"
                "   slgr  %1,%3\n"
                "   slgr  %2,%3\n"
@@ -35,23 +35,13 @@ static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr
                "   nr    %4,%3\n"      /* %4 = (ptr + 4095) & -4096 */
                "   slgr  %4,%1\n"
                "   clgr  %0,%4\n"      /* copy crosses next page boundary? */
-               "   jnh   4f\n"
+               "   jnh   5f\n"
                "3: .insn ss,0xc80000000000,0(%4,%2),0(%1),0\n"
-               "10:slgr  %0,%4\n"
-               "   algr  %2,%4\n"
-               "4: lghi  %4,-1\n"
-               "   algr  %4,%0\n"      /* copy remaining size, subtract 1 */
-               "   bras  %3,6f\n"      /* memset loop */
-               "   xc    0(1,%2),0(%2)\n"
-               "5: xc    0(256,%2),0(%2)\n"
-               "   la    %2,256(%2)\n"
-               "6: aghi  %4,-256\n"
-               "   jnm   5b\n"
-               "   ex    %4,0(%3)\n"
-               "   j     8f\n"
-               "7: slgr  %0,%0\n"
-               "8:\n"
-               EX_TABLE(0b,2b) EX_TABLE(3b,4b) EX_TABLE(9b,2b) EX_TABLE(10b,4b)
+               "7: slgr  %0,%4\n"
+               "   j     5f\n"
+               "4: slgr  %0,%0\n"
+               "5:\n"
+               EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b)
                : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2)
                : "d" (reg0) : "cc", "memory");
        return size;
@@ -67,49 +57,38 @@ static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr,
        asm volatile(
                "   sacf  0\n"
                "0: mvcp  0(%0,%2),0(%1),%3\n"
-               "10:jz    8f\n"
+               "7: jz    5f\n"
                "1: algr  %0,%3\n"
                "   la    %1,256(%1)\n"
                "   la    %2,256(%2)\n"
                "2: mvcp  0(%0,%2),0(%1),%3\n"
-               "11:jnz   1b\n"
-               "   j     8f\n"
+               "8: jnz   1b\n"
+               "   j     5f\n"
                "3: la    %4,255(%1)\n" /* %4 = ptr + 255 */
                "   lghi  %3,-4096\n"
                "   nr    %4,%3\n"      /* %4 = (ptr + 255) & -4096 */
                "   slgr  %4,%1\n"
                "   clgr  %0,%4\n"      /* copy crosses next page boundary? */
-               "   jnh   5f\n"
+               "   jnh   6f\n"
                "4: mvcp  0(%4,%2),0(%1),%3\n"
-               "12:slgr  %0,%4\n"
-               "   algr  %2,%4\n"
-               "5: lghi  %4,-1\n"
-               "   algr  %4,%0\n"      /* copy remaining size, subtract 1 */
-               "   bras  %3,7f\n"      /* memset loop */
-               "   xc    0(1,%2),0(%2)\n"
-               "6: xc    0(256,%2),0(%2)\n"
-               "   la    %2,256(%2)\n"
-               "7: aghi  %4,-256\n"
-               "   jnm   6b\n"
-               "   ex    %4,0(%3)\n"
-               "   j     9f\n"
-               "8: slgr  %0,%0\n"
-               "9: sacf  768\n"
-               EX_TABLE(0b,3b) EX_TABLE(2b,3b) EX_TABLE(4b,5b)
-               EX_TABLE(10b,3b) EX_TABLE(11b,3b) EX_TABLE(12b,5b)
+               "9: slgr  %0,%4\n"
+               "   j     6f\n"
+               "5: slgr  %0,%0\n"
+               "6: sacf  768\n"
+               EX_TABLE(0b,3b) EX_TABLE(2b,3b) EX_TABLE(4b,6b)
+               EX_TABLE(7b,3b) EX_TABLE(8b,3b) EX_TABLE(9b,6b)
                : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2)
                : : "cc", "memory");
        return size;
 }
 
-unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n)
+unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-       check_object_size(to, n, false);
        if (static_branch_likely(&have_mvcos))
                return copy_from_user_mvcos(to, from, n);
        return copy_from_user_mvcp(to, from, n);
 }
-EXPORT_SYMBOL(__copy_from_user);
+EXPORT_SYMBOL(raw_copy_from_user);
 
 static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x,
                                               unsigned long size)
@@ -176,14 +155,13 @@ static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x,
        return size;
 }
 
-unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n)
+unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-       check_object_size(from, n, true);
        if (static_branch_likely(&have_mvcos))
                return copy_to_user_mvcos(to, from, n);
        return copy_to_user_mvcs(to, from, n);
 }
-EXPORT_SYMBOL(__copy_to_user);
+EXPORT_SYMBOL(raw_copy_to_user);
 
 static inline unsigned long copy_in_user_mvcos(void __user *to, const void __user *from,
                                               unsigned long size)
@@ -240,13 +218,13 @@ static inline unsigned long copy_in_user_mvc(void __user *to, const void __user
        return size;
 }
 
-unsigned long __copy_in_user(void __user *to, const void __user *from, unsigned long n)
+unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n)
 {
        if (static_branch_likely(&have_mvcos))
                return copy_in_user_mvcos(to, from, n);
        return copy_in_user_mvc(to, from, n);
 }
-EXPORT_SYMBOL(__copy_in_user);
+EXPORT_SYMBOL(raw_copy_in_user);
 
 static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size)
 {
index 926943a..e3a8d0f 100644 (file)
@@ -4,6 +4,7 @@ header-y +=
 generic-y += barrier.h
 generic-y += clkdev.h
 generic-y += current.h
+generic-y += extable.h
 generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
diff --git a/arch/score/include/asm/extable.h b/arch/score/include/asm/extable.h
deleted file mode 100644 (file)
index c4423cc..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef _ASM_SCORE_EXTABLE_H
-#define _ASM_SCORE_EXTABLE_H
-
-struct exception_table_entry {
-       unsigned long insn;
-       unsigned long fixup;
-};
-
-struct pt_regs;
-extern int fixup_exception(struct pt_regs *regs);
-#endif
index db58ab9..916e5db 100644 (file)
@@ -2,13 +2,8 @@
 #define __SCORE_UACCESS_H
 
 #include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/thread_info.h>
 #include <asm/extable.h>
 
-#define VERIFY_READ            0
-#define VERIFY_WRITE           1
-
 #define get_ds()               (KERNEL_DS)
 #define get_fs()               (current_thread_info()->addr_limit)
 #define segment_eq(a, b)       ((a).seg == (b).seg)
@@ -300,61 +295,19 @@ extern void __put_user_unknown(void);
 extern int __copy_tofrom_user(void *to, const void *from, unsigned long len);
 
 static inline unsigned long
-copy_from_user(void *to, const void *from, unsigned long len)
-{
-       unsigned long res = len;
-
-       if (likely(access_ok(VERIFY_READ, from, len)))
-               res = __copy_tofrom_user(to, from, len);
-
-       if (unlikely(res))
-               memset(to + (len - res), 0, res);
-
-       return res;
-}
-
-static inline unsigned long
-copy_to_user(void *to, const void *from, unsigned long len)
-{
-       if (likely(access_ok(VERIFY_WRITE, to, len)))
-               len = __copy_tofrom_user(to, from, len);
-
-       return len;
-}
-
-static inline unsigned long
-__copy_from_user(void *to, const void *from, unsigned long len)
+raw_copy_from_user(void *to, const void __user *from, unsigned long len)
 {
-       unsigned long left = __copy_tofrom_user(to, from, len);
-       if (unlikely(left))
-               memset(to + (len - left), 0, left);
-       return left;
+       return __copy_tofrom_user(to, (__force const void *)from, len);
 }
 
-#define __copy_to_user(to, from, len)          \
-               __copy_tofrom_user((to), (from), (len))
-
 static inline unsigned long
-__copy_to_user_inatomic(void *to, const void *from, unsigned long len)
+raw_copy_to_user(void __user *to, const void *from, unsigned long len)
 {
-       return __copy_to_user(to, from, len);
+       return __copy_tofrom_user((__force void *)to, from, len);
 }
 
-static inline unsigned long
-__copy_from_user_inatomic(void *to, const void *from, unsigned long len)
-{
-       return __copy_tofrom_user(to, from, len);
-}
-
-#define __copy_in_user(to, from, len)  __copy_tofrom_user(to, from, len)
-
-static inline unsigned long
-copy_in_user(void *to, const void *from, unsigned long len)
-{
-       if (access_ok(VERIFY_READ, from, len) &&
-                     access_ok(VERFITY_WRITE, to, len))
-               return __copy_tofrom_user(to, from, len);
-}
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
 /*
  * __clear_user: - Zero a block of memory in user space, with less checking.
index 679b8d7..29aafc7 100644 (file)
@@ -81,8 +81,10 @@ void __init time_init(void)
                                        score_clockevent.shift);
        score_clockevent.max_delta_ns = clockevent_delta2ns((u32)~0,
                                        &score_clockevent);
+       score_clockevent.max_delta_ticks = (u32)~0;
        score_clockevent.min_delta_ns = clockevent_delta2ns(50,
                                                &score_clockevent) + 1;
+       score_clockevent.min_delta_ticks = 50;
        score_clockevent.cpumask = cpumask_of(0);
        clockevents_register_device(&score_clockevent);
 }
diff --git a/arch/sh/include/asm/extable.h b/arch/sh/include/asm/extable.h
new file mode 100644 (file)
index 0000000..df2ee2f
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef __ASM_SH_EXTABLE_H
+#define __ASM_SH_EXTABLE_H
+
+#include <asm-generic/extable.h>
+
+#if defined(CONFIG_SUPERH64) && defined(CONFIG_MMU)
+#define ARCH_HAS_SEARCH_EXTABLE
+#endif
+
+#endif
index c4f0fee..2722b61 100644 (file)
@@ -1,12 +1,8 @@
 #ifndef __ASM_SH_UACCESS_H
 #define __ASM_SH_UACCESS_H
 
-#include <linux/errno.h>
-#include <linux/sched.h>
 #include <asm/segment.h>
-
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
+#include <asm/extable.h>
 
 #define __addr_ok(addr) \
        ((unsigned long __force)(addr) < current_thread_info()->addr_limit.seg)
@@ -112,19 +108,18 @@ extern __must_check long strnlen_user(const char __user *str, long n);
 __kernel_size_t __copy_user(void *to, const void *from, __kernel_size_t n);
 
 static __always_inline unsigned long
-__copy_from_user(void *to, const void __user *from, unsigned long n)
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
        return __copy_user(to, (__force void *)from, n);
 }
 
 static __always_inline unsigned long __must_check
-__copy_to_user(void __user *to, const void *from, unsigned long n)
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
        return __copy_user((__force void *)to, from, n);
 }
-
-#define __copy_to_user_inatomic __copy_to_user
-#define __copy_from_user_inatomic __copy_from_user
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
 /*
  * Clear the area and return remaining number of bytes
@@ -144,55 +139,6 @@ __kernel_size_t __clear_user(void *addr, __kernel_size_t size);
        __cl_size;                                                      \
 })
 
-static inline unsigned long
-copy_from_user(void *to, const void __user *from, unsigned long n)
-{
-       unsigned long __copy_from = (unsigned long) from;
-       __kernel_size_t __copy_size = (__kernel_size_t) n;
-
-       if (__copy_size && __access_ok(__copy_from, __copy_size))
-               __copy_size = __copy_user(to, from, __copy_size);
-
-       if (unlikely(__copy_size))
-               memset(to + (n - __copy_size), 0, __copy_size);
-
-       return __copy_size;
-}
-
-static inline unsigned long
-copy_to_user(void __user *to, const void *from, unsigned long n)
-{
-       unsigned long __copy_to = (unsigned long) to;
-       __kernel_size_t __copy_size = (__kernel_size_t) n;
-
-       if (__copy_size && __access_ok(__copy_to, __copy_size))
-               return __copy_user(to, from, __copy_size);
-
-       return __copy_size;
-}
-
-/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-struct exception_table_entry {
-       unsigned long insn, fixup;
-};
-
-#if defined(CONFIG_SUPERH64) && defined(CONFIG_MMU)
-#define ARCH_HAS_SEARCH_EXTABLE
-#endif
-
-int fixup_exception(struct pt_regs *regs);
-
 extern void *set_exception_table_vec(unsigned int vec, void *handler);
 
 static inline void *set_exception_table_evt(unsigned int evt, void *handler)
index 68ac5c7..ed96869 100644 (file)
@@ -42,8 +42,7 @@ config SPARC
        select OLD_SIGSUSPEND
        select ARCH_HAS_SG_CHAIN
        select CPU_NO_EFFICIENT_FFS
-       select HAVE_ARCH_HARDENED_USERCOPY
-       select PROVE_LOCKING_SMALL if PROVE_LOCKING
+       select LOCKDEP_SMALL if LOCKDEP
        select ARCH_WANT_RELAX_ORDER
 
 config SPARC32
@@ -82,6 +81,7 @@ config SPARC64
        select HAVE_ARCH_AUDITSYSCALL
        select ARCH_SUPPORTS_ATOMIC_RMW
        select HAVE_NMI
+       select HAVE_REGS_AND_STACK_ACCESS_API
 
 config ARCH_DEFCONFIG
        string
index f294dd4..5961b2d 100644 (file)
@@ -17,6 +17,7 @@
 
 #define HPAGE_SHIFT            23
 #define REAL_HPAGE_SHIFT       22
+#define HPAGE_2GB_SHIFT                31
 #define HPAGE_256MB_SHIFT      28
 #define HPAGE_64K_SHIFT                16
 #define REAL_HPAGE_SIZE                (_AC(1,UL) << REAL_HPAGE_SHIFT)
@@ -27,7 +28,7 @@
 #define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 #define REAL_HPAGE_PER_HPAGE   (_AC(1,UL) << (HPAGE_SHIFT - REAL_HPAGE_SHIFT))
-#define HUGE_MAX_HSTATE                3
+#define HUGE_MAX_HSTATE                4
 #endif
 
 #ifndef __ASSEMBLY__
index 8a59852..6fbd931 100644 (file)
@@ -679,26 +679,27 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
        return pte_pfn(pte);
 }
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline unsigned long pmd_dirty(pmd_t pmd)
+#define __HAVE_ARCH_PMD_WRITE
+static inline unsigned long pmd_write(pmd_t pmd)
 {
        pte_t pte = __pte(pmd_val(pmd));
 
-       return pte_dirty(pte);
+       return pte_write(pte);
 }
 
-static inline unsigned long pmd_young(pmd_t pmd)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline unsigned long pmd_dirty(pmd_t pmd)
 {
        pte_t pte = __pte(pmd_val(pmd));
 
-       return pte_young(pte);
+       return pte_dirty(pte);
 }
 
-static inline unsigned long pmd_write(pmd_t pmd)
+static inline unsigned long pmd_young(pmd_t pmd)
 {
        pte_t pte = __pte(pmd_val(pmd));
 
-       return pte_write(pte);
+       return pte_young(pte);
 }
 
 static inline unsigned long pmd_trans_huge(pmd_t pmd)
index 365d4cb..dd27159 100644 (file)
 #include <asm/signal.h>
 #include <asm/page.h>
 
-/*
- * The sparc has no problems with write protection
- */
-#define wp_works_ok 1
-#define wp_works_ok__is_a_macro /* for versions in ksyms.c */
-
 /* Whee, this is STACK_TOP + PAGE_SIZE and the lowest kernel address too...
  * That one page is used to protect kernel from intruders, so that
  * we can make our access_ok test faster
index 6448cfc..b58ee90 100644 (file)
 #include <asm/ptrace.h>
 #include <asm/page.h>
 
-/* The sparc has no problems with write protection */
-#define wp_works_ok 1
-#define wp_works_ok__is_a_macro /* for versions in ksyms.c */
-
 /*
  * User lives in his very own context, and cannot reference us. Note
  * that TASK_SIZE is a misnomer, it really gives maximum user virtual
index ca57f08..d73428e 100644 (file)
@@ -83,7 +83,8 @@ unsigned long profile_pc(struct pt_regs *);
 
 #define MAX_REG_OFFSET (offsetof(struct pt_regs, magic))
 
-extern int regs_query_register_offset(const char *name);
+int regs_query_register_offset(const char *name);
+unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n);
 
 /**
  * regs_get_register() - get register value from its offset
index bd56c28..9e068bf 100644 (file)
@@ -7,7 +7,7 @@
 #endif
 
 #define user_addr_max() \
-       (segment_eq(get_fs(), USER_DS) ? TASK_SIZE : ~0UL)
+       (uaccess_kernel() ? ~0UL : TASK_SIZE)
 
 long strncpy_from_user(char *dest, const char __user *src, long count);
 
index ea55f86..12ebee2 100644 (file)
@@ -7,14 +7,8 @@
 #ifndef _ASM_UACCESS_H
 #define _ASM_UACCESS_H
 
-#ifdef __KERNEL__
 #include <linux/compiler.h>
-#include <linux/sched.h>
 #include <linux/string.h>
-#include <linux/errno.h>
-#endif
-
-#ifndef __ASSEMBLY__
 
 #include <asm/processor.h>
 
@@ -30,9 +24,6 @@
 #define KERNEL_DS   ((mm_segment_t) { 0 })
 #define USER_DS     ((mm_segment_t) { -1 })
 
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
 #define get_ds()       (KERNEL_DS)
 #define get_fs()       (current->thread.current_ds)
 #define set_fs(val)    ((current->thread.current_ds) = (val))
@@ -45,7 +36,7 @@
  * large size and address near to PAGE_OFFSET - a fault will break his intentions.
  */
 #define __user_ok(addr, size) ({ (void)(size); (addr) < STACK_TOP; })
-#define __kernel_ok (segment_eq(get_fs(), KERNEL_DS))
+#define __kernel_ok (uaccess_kernel())
 #define __access_ok(addr, size) (__user_ok((addr) & get_fs().seg, (size)))
 #define access_ok(type, addr, size) \
        ({ (void)(type); __access_ok((unsigned long)(addr), size); })
@@ -80,8 +71,6 @@ struct exception_table_entry
 /* Returns 0 if exception not found and fixup otherwise.  */
 unsigned long search_extables_range(unsigned long addr, unsigned long *g2);
 
-void __ret_efault(void);
-
 /* Uh, these should become the main single-value transfer routines..
  * They automatically use the right size if we just have the right
  * pointer type..
@@ -246,39 +235,18 @@ int __get_user_bad(void);
 
 unsigned long __copy_user(void __user *to, const void __user *from, unsigned long size);
 
-static inline unsigned long copy_to_user(void __user *to, const void *from, unsigned long n)
-{
-       if (n && __access_ok((unsigned long) to, n)) {
-               check_object_size(from, n, true);
-               return __copy_user(to, (__force void __user *) from, n);
-       } else
-               return n;
-}
-
-static inline unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n)
+static inline unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-       check_object_size(from, n, true);
        return __copy_user(to, (__force void __user *) from, n);
 }
 
-static inline unsigned long copy_from_user(void *to, const void __user *from, unsigned long n)
-{
-       if (n && __access_ok((unsigned long) from, n)) {
-               check_object_size(to, n, false);
-               return __copy_user((__force void __user *) to, from, n);
-       } else {
-               memset(to, 0, n);
-               return n;
-       }
-}
-
-static inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n)
+static inline unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
        return __copy_user((__force void __user *) to, from, n);
 }
 
-#define __copy_to_user_inatomic __copy_to_user
-#define __copy_from_user_inatomic __copy_from_user
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
 static inline unsigned long __clear_user(void __user *addr, unsigned long size)
 {
@@ -312,6 +280,4 @@ static inline unsigned long clear_user(void __user *addr, unsigned long n)
 __must_check long strlen_user(const char __user *str);
 __must_check long strnlen_user(const char __user *str, long n);
 
-#endif  /* __ASSEMBLY__ */
-
 #endif /* _ASM_UACCESS_H */
index 5373136..6096d67 100644 (file)
@@ -5,18 +5,12 @@
  * User space memory access functions
  */
 
-#ifdef __KERNEL__
-#include <linux/errno.h>
 #include <linux/compiler.h>
 #include <linux/string.h>
-#include <linux/thread_info.h>
 #include <asm/asi.h>
 #include <asm/spitfire.h>
 #include <asm-generic/uaccess-unaligned.h>
 #include <asm/extable_64.h>
-#endif
-
-#ifndef __ASSEMBLY__
 
 #include <asm/processor.h>
 
@@ -36,9 +30,6 @@
 #define KERNEL_DS   ((mm_segment_t) { ASI_P })
 #define USER_DS     ((mm_segment_t) { ASI_AIUS })      /* har har har */
 
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
 #define get_fs() ((mm_segment_t){(current_thread_info()->current_ds)})
 #define get_ds() (KERNEL_DS)
 
@@ -185,39 +176,19 @@ __asm__ __volatile__(                                                     \
 
 int __get_user_bad(void);
 
-unsigned long __must_check ___copy_from_user(void *to,
+unsigned long __must_check raw_copy_from_user(void *to,
                                             const void __user *from,
                                             unsigned long size);
-static inline unsigned long __must_check
-copy_from_user(void *to, const void __user *from, unsigned long size)
-{
-       check_object_size(to, size, false);
 
-       return ___copy_from_user(to, from, size);
-}
-#define __copy_from_user copy_from_user
-
-unsigned long __must_check ___copy_to_user(void __user *to,
+unsigned long __must_check raw_copy_to_user(void __user *to,
                                           const void *from,
                                           unsigned long size);
-static inline unsigned long __must_check
-copy_to_user(void __user *to, const void *from, unsigned long size)
-{
-       check_object_size(from, size, true);
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
-       return ___copy_to_user(to, from, size);
-}
-#define __copy_to_user copy_to_user
-
-unsigned long __must_check ___copy_in_user(void __user *to,
+unsigned long __must_check raw_copy_in_user(void __user *to,
                                           const void __user *from,
                                           unsigned long size);
-static inline unsigned long __must_check
-copy_in_user(void __user *to, void __user *from, unsigned long size)
-{
-       return ___copy_in_user(to, from, size);
-}
-#define __copy_in_user copy_in_user
 
 unsigned long __must_check __clear_user(void __user *, unsigned long);
 
@@ -226,14 +197,9 @@ unsigned long __must_check __clear_user(void __user *, unsigned long);
 __must_check long strlen_user(const char __user *str);
 __must_check long strnlen_user(const char __user *str, long n);
 
-#define __copy_to_user_inatomic __copy_to_user
-#define __copy_from_user_inatomic __copy_from_user
-
 struct pt_regs;
 unsigned long compute_effective_address(struct pt_regs *,
                                        unsigned int insn,
                                        unsigned int rd);
 
-#endif  /* __ASSEMBLY__ */
-
 #endif /* _ASM_UACCESS_H */
index 36eee81..ae77df7 100644 (file)
 #define __NR_copy_file_range   357
 #define __NR_preadv2           358
 #define __NR_pwritev2          359
+#define __NR_statx             360
 
-#define NR_syscalls            360
+#define NR_syscalls            361
 
 /* Bitmask values returned from kern_features system call.  */
 #define KERN_FEATURE_MIXED_MODE_STACK  0x00000001
 #define __IGNORE_getresgid
 #endif
 
+/* Sparc doesn't have protection keys. */
+#define __IGNORE_pkey_mprotect
+#define __IGNORE_pkey_alloc
+#define __IGNORE_pkey_free
+
 #endif /* _UAPI_SPARC_UNISTD_H */
index 7bb317b..7274e43 100644 (file)
@@ -809,10 +809,3 @@ lvl14_save:
        .word   0
        .word   0
        .word   t_irq14
-
-        .section        ".fixup",#alloc,#execinstr
-        .globl  __ret_efault
-__ret_efault:
-        ret
-         restore %g0, -EFAULT, %o0
-EXPORT_SYMBOL(__ret_efault)
index 6aa3da1..4410119 100644 (file)
@@ -96,6 +96,7 @@ sparc64_boot:
        andn    %g1, PSTATE_AM, %g1
        wrpr    %g1, 0x0, %pstate
        ba,a,pt %xcc, 1f
+        nop
 
        .globl  prom_finddev_name, prom_chosen_path, prom_root_node
        .globl  prom_getprop_name, prom_mmu_name, prom_peer_name
@@ -613,6 +614,7 @@ niagara_tlb_fixup:
         nop
 
        ba,a,pt %xcc, 80f
+        nop
 niagara4_patch:
        call    niagara4_patch_copyops
         nop
@@ -622,6 +624,7 @@ niagara4_patch:
         nop
 
        ba,a,pt %xcc, 80f
+        nop
 
 niagara2_patch:
        call    niagara2_patch_copyops
@@ -632,6 +635,7 @@ niagara2_patch:
         nop
 
        ba,a,pt %xcc, 80f
+        nop
 
 niagara_patch:
        call    niagara_patch_copyops
index 34b4933..9276d2f 100644 (file)
@@ -82,6 +82,7 @@ do_stdfmna:
        call            handle_stdfmna
         add            %sp, PTREGS_OFF, %o0
        ba,a,pt         %xcc, rtrap
+        nop
        .size           do_stdfmna,.-do_stdfmna
 
        .type           breakpoint_trap,#function
index fc5124c..e1d965e 100644 (file)
@@ -1162,3 +1162,39 @@ int regs_query_register_offset(const char *name)
                        return roff->offset;
        return -EINVAL;
 }
+
+/**
+ * regs_within_kernel_stack() - check the address in the stack
+ * @regs:      pt_regs which contains kernel stack pointer.
+ * @addr:      address which is checked.
+ *
+ * regs_within_kernel_stack() checks @addr is within the kernel stack page(s).
+ * If @addr is within the kernel stack, it returns true. If not, returns false.
+ */
+static inline int regs_within_kernel_stack(struct pt_regs *regs,
+                                          unsigned long addr)
+{
+       unsigned long ksp = kernel_stack_pointer(regs) + STACK_BIAS;
+       return ((addr & ~(THREAD_SIZE - 1))  ==
+               (ksp & ~(THREAD_SIZE - 1)));
+}
+
+/**
+ * regs_get_kernel_stack_nth() - get Nth entry of the stack
+ * @regs:      pt_regs which contains kernel stack pointer.
+ * @n:         stack entry number.
+ *
+ * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
+ * is specified by @regs. If the @n th entry is NOT in the kernel stack,
+ * this returns 0.
+ */
+unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n)
+{
+       unsigned long ksp = kernel_stack_pointer(regs) + STACK_BIAS;
+       unsigned long *addr = (unsigned long *)ksp;
+       addr += n;
+       if (regs_within_kernel_stack(regs, (unsigned long)addr))
+               return *addr;
+       else
+               return 0;
+}
index 216948c..709a82e 100644 (file)
@@ -237,6 +237,7 @@ rt_continue:        ldx                     [%sp + PTREGS_OFF + PT_V9_G1], %g1
                bne,pt                  %xcc, user_rtt_fill_32bit
                 wrpr                   %g1, %cwp
                ba,a,pt                 %xcc, user_rtt_fill_64bit
+                nop
 
 user_rtt_fill_fixup_dax:
                ba,pt   %xcc, user_rtt_fill_fixup_common
index 4a73009..d7e5408 100644 (file)
@@ -86,6 +86,7 @@ __spitfire_cee_trap_continue:
         rd             %pc, %g7
 
        ba,a,pt         %xcc, 2f
+        nop
 
 1:     ba,pt           %xcc, etrap_irq
         rd             %pc, %g7
index 6179e19..c19f352 100644 (file)
@@ -352,6 +352,7 @@ sun4v_mna:
        call    sun4v_do_mna
         add    %sp, PTREGS_OFF, %o0
        ba,a,pt %xcc, rtrap
+        nop
 
        /* Privileged Action.  */
 sun4v_privact:
index d63fc61..5fd352b 100644 (file)
@@ -98,27 +98,7 @@ static struct attribute_group mmu_stat_group = {
        .name = "mmu_stats",
 };
 
-/* XXX convert to rusty's on_one_cpu */
-static unsigned long run_on_cpu(unsigned long cpu,
-                               unsigned long (*func)(unsigned long),
-                               unsigned long arg)
-{
-       cpumask_t old_affinity;
-       unsigned long ret;
-
-       cpumask_copy(&old_affinity, &current->cpus_allowed);
-       /* should return -EINVAL to userspace */
-       if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
-               return 0;
-
-       ret = func(arg);
-
-       set_cpus_allowed_ptr(current, &old_affinity);
-
-       return ret;
-}
-
-static unsigned long read_mmustat_enable(unsigned long junk)
+static long read_mmustat_enable(void *data __maybe_unused)
 {
        unsigned long ra = 0;
 
@@ -127,11 +107,11 @@ static unsigned long read_mmustat_enable(unsigned long junk)
        return ra != 0;
 }
 
-static unsigned long write_mmustat_enable(unsigned long val)
+static long write_mmustat_enable(void *data)
 {
-       unsigned long ra, orig_ra;
+       unsigned long ra, orig_ra, *val = data;
 
-       if (val)
+       if (*val)
                ra = __pa(&per_cpu(mmu_stats, smp_processor_id()));
        else
                ra = 0UL;
@@ -142,7 +122,8 @@ static unsigned long write_mmustat_enable(unsigned long val)
 static ssize_t show_mmustat_enable(struct device *s,
                                struct device_attribute *attr, char *buf)
 {
-       unsigned long val = run_on_cpu(s->id, read_mmustat_enable, 0);
+       long val = work_on_cpu(s->id, read_mmustat_enable, NULL);
+
        return sprintf(buf, "%lx\n", val);
 }
 
@@ -150,13 +131,15 @@ static ssize_t store_mmustat_enable(struct device *s,
                        struct device_attribute *attr, const char *buf,
                        size_t count)
 {
-       unsigned long val, err;
-       int ret = sscanf(buf, "%lu", &val);
+       unsigned long val;
+       long err;
+       int ret;
 
+       ret = sscanf(buf, "%lu", &val);
        if (ret != 1)
                return -EINVAL;
 
-       err = run_on_cpu(s->id, write_mmustat_enable, val);
+       err = work_on_cpu(s->id, write_mmustat_enable, &val);
        if (err)
                return -EIO;
 
index eac7f0d..5253e89 100644 (file)
@@ -89,3 +89,4 @@ sys_call_table:
 /*345*/        .long sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
 /*350*/        .long sys_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen
 /*355*/        .long sys_setsockopt, sys_mlock2, sys_copy_file_range, sys_preadv2, sys_pwritev2
+/*360*/        .long sys_statx
index b0f17ff..82339f6 100644 (file)
@@ -90,6 +90,7 @@ sys_call_table32:
        .word sys32_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
 /*350*/        .word sys32_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen
        .word compat_sys_setsockopt, sys_mlock2, sys_copy_file_range, compat_sys_preadv2, compat_sys_pwritev2
+/*360*/        .word sys_statx
 
 #endif /* CONFIG_COMPAT */
 
@@ -171,3 +172,4 @@ sys_call_table:
        .word sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
 /*350*/        .word sys64_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen
        .word sys_setsockopt, sys_mlock2, sys_copy_file_range, sys_preadv2, sys_pwritev2
+/*360*/        .word sys_statx
index 244062b..9f575df 100644 (file)
@@ -228,7 +228,9 @@ void register_percpu_ce(int cpu)
        ce->mult           = div_sc(sparc_config.clock_rate, NSEC_PER_SEC,
                                    ce->shift);
        ce->max_delta_ns   = clockevent_delta2ns(sparc_config.clock_rate, ce);
+       ce->max_delta_ticks = (unsigned long)sparc_config.clock_rate;
        ce->min_delta_ns   = clockevent_delta2ns(100, ce);
+       ce->min_delta_ticks = 100;
 
        clockevents_register_device(ce);
 }
index 12a6d35..98d05de 100644 (file)
@@ -796,8 +796,10 @@ void __init time_init(void)
 
        sparc64_clockevent.max_delta_ns =
                clockevent_delta2ns(0x7fffffffffffffffUL, &sparc64_clockevent);
+       sparc64_clockevent.max_delta_ticks = 0x7fffffffffffffffUL;
        sparc64_clockevent.min_delta_ns =
                clockevent_delta2ns(0xF, &sparc64_clockevent);
+       sparc64_clockevent.min_delta_ticks = 0xF;
 
        printk("clockevent: mult[%x] shift[%d]\n",
               sparc64_clockevent.mult, sparc64_clockevent.shift);
index 5604a2b..364af32 100644 (file)
@@ -92,6 +92,7 @@ user_rtt_fill_fixup_common:
                call    sun4v_data_access_exception
                 nop
                ba,a,pt %xcc, rtrap
+                nop
 
 1:             call    spitfire_data_access_exception
                 nop
index 855019a..1ee173c 100644 (file)
@@ -152,6 +152,8 @@ fill_fixup_dax:
        call    sun4v_data_access_exception
         nop
        ba,a,pt %xcc, rtrap
+        nop
 1:     call    spitfire_data_access_exception
         nop
        ba,a,pt %xcc, rtrap
+        nop
index 69a439f..8aa16ef 100644 (file)
@@ -23,7 +23,7 @@
 #define PREAMBLE                                       \
        rd              %asi, %g1;                      \
        cmp             %g1, ASI_AIUS;                  \
-       bne,pn          %icc, ___copy_in_user;          \
+       bne,pn          %icc, raw_copy_in_user;         \
         nop
 #endif
 
index 9947427..311c8fa 100644 (file)
@@ -27,7 +27,7 @@
 #define PREAMBLE                                       \
        rd              %asi, %g1;                      \
        cmp             %g1, ASI_AIUS;                  \
-       bne,pn          %icc, ___copy_in_user;          \
+       bne,pn          %icc, raw_copy_in_user;         \
         nop
 #endif
 
index fab9e89..95e2f1f 100644 (file)
@@ -26,8 +26,8 @@
        .type   generic_patch_copyops,#function
 generic_patch_copyops:
        GEN_DO_PATCH(memcpy, GENmemcpy)
-       GEN_DO_PATCH(___copy_from_user, GENcopy_from_user)
-       GEN_DO_PATCH(___copy_to_user, GENcopy_to_user)
+       GEN_DO_PATCH(raw_copy_from_user, GENcopy_from_user)
+       GEN_DO_PATCH(raw_copy_to_user, GENcopy_to_user)
        retl
         nop
        .size   generic_patch_copyops,.-generic_patch_copyops
index b79a699..0d8a018 100644 (file)
@@ -36,7 +36,7 @@
 #define PREAMBLE                                       \
        rd              %asi, %g1;                      \
        cmp             %g1, ASI_AIUS;                  \
-       bne,pn          %icc, ___copy_in_user;          \
+       bne,pn          %icc, raw_copy_in_user;         \
         nop
 #endif
 
index dcec55f..a7a0ea0 100644 (file)
@@ -45,7 +45,7 @@
 #define PREAMBLE                                       \
        rd              %asi, %g1;                      \
        cmp             %g1, ASI_AIUS;                  \
-       bne,pn          %icc, ___copy_in_user;          \
+       bne,pn          %icc, raw_copy_in_user;         \
         nop
 #endif
 
index c629dbd..64dcd6c 100644 (file)
@@ -326,11 +326,13 @@ FUNC_NAME:        /* %o0=dst, %o1=src, %o2=len */
        blu             170f
         nop
        ba,a,pt         %xcc, 180f
+        nop
 
 4:     /* 32 <= low bits < 48 */
        blu             150f
         nop
        ba,a,pt         %xcc, 160f
+        nop
 5:     /* 0 < low bits < 32 */
        blu,a           6f
         cmp            %g2, 8
@@ -338,6 +340,7 @@ FUNC_NAME:  /* %o0=dst, %o1=src, %o2=len */
        blu             130f
         nop
        ba,a,pt         %xcc, 140f
+        nop
 6:     /* 0 < low bits < 16 */
        bgeu            120f
         nop
@@ -475,6 +478,7 @@ FUNC_NAME:  /* %o0=dst, %o1=src, %o2=len */
        brz,pt          %o2, 85f
         sub            %o0, %o1, GLOBAL_SPARE
        ba,a,pt         %XCC, 90f
+        nop
 
        .align          64
 75: /* 16 < len <= 64 */
index 28c36f0..56ccc19 100644 (file)
@@ -26,8 +26,8 @@
        .type   niagara2_patch_copyops,#function
 niagara2_patch_copyops:
        NG_DO_PATCH(memcpy, NG2memcpy)
-       NG_DO_PATCH(___copy_from_user, NG2copy_from_user)
-       NG_DO_PATCH(___copy_to_user, NG2copy_to_user)
+       NG_DO_PATCH(raw_copy_from_user, NG2copy_from_user)
+       NG_DO_PATCH(raw_copy_to_user, NG2copy_to_user)
        retl
         nop
        .size   niagara2_patch_copyops,.-niagara2_patch_copyops
index 16a286c..5bb506b 100644 (file)
@@ -31,7 +31,7 @@
 #define PREAMBLE                                       \
        rd              %asi, %g1;                      \
        cmp             %g1, ASI_AIUS;                  \
-       bne,pn          %icc, ___copy_in_user;          \
+       bne,pn          %icc, raw_copy_in_user;         \
         nop
 #endif
 
index 6b0276f..a82d4d4 100644 (file)
@@ -40,7 +40,7 @@
 #define PREAMBLE                                       \
        rd              %asi, %g1;                      \
        cmp             %g1, ASI_AIUS;                  \
-       bne,pn          %icc, ___copy_in_user;          \
+       bne,pn          %icc, raw_copy_in_user;         \
         nop
 #endif
 
index 75bb93b..78ea962 100644 (file)
@@ -530,4 +530,5 @@ FUNC_NAME:  /* %o0=dst, %o1=src, %o2=len */
        bne,pt          %icc, 1b
         EX_ST(STORE(stb, %g1, %o0 - 0x01), NG4_retl_o2_plus_1)
        ba,a,pt         %icc, .Lexit
+        nop
        .size           FUNC_NAME, .-FUNC_NAME
index 41da4bd..7c0c81f 100644 (file)
@@ -102,4 +102,5 @@ NG4bzero:
        bne,pt          %icc, 1b
         add            %o0, 0x30, %o0
        ba,a,pt         %icc, .Lpostloop
+        nop
        .size           NG4bzero,.-NG4bzero
index a114cbc..3cc0f8c 100644 (file)
@@ -26,8 +26,8 @@
        .type   niagara4_patch_copyops,#function
 niagara4_patch_copyops:
        NG_DO_PATCH(memcpy, NG4memcpy)
-       NG_DO_PATCH(___copy_from_user, NG4copy_from_user)
-       NG_DO_PATCH(___copy_to_user, NG4copy_to_user)
+       NG_DO_PATCH(raw_copy_from_user, NG4copy_from_user)
+       NG_DO_PATCH(raw_copy_to_user, NG4copy_to_user)
        retl
         nop
        .size   niagara4_patch_copyops,.-niagara4_patch_copyops
index 9cd42fc..2333b6f 100644 (file)
@@ -25,7 +25,7 @@
 #define PREAMBLE                                       \
        rd              %asi, %g1;                      \
        cmp             %g1, ASI_AIUS;                  \
-       bne,pn          %icc, ___copy_in_user;          \
+       bne,pn          %icc, raw_copy_in_user;         \
         nop
 #endif
 
index 5c358af..07ba20b 100644 (file)
@@ -28,7 +28,7 @@
 #define PREAMBLE                                       \
        rd              %asi, %g1;                      \
        cmp             %g1, ASI_AIUS;                  \
-       bne,pn          %icc, ___copy_in_user;          \
+       bne,pn          %icc, raw_copy_in_user;         \
         nop
 #endif
 
index d88c4ed..cd654a7 100644 (file)
@@ -394,6 +394,7 @@ FUNC_NAME:  /* %i0=dst, %i1=src, %i2=len */
        brz,pt          %i2, 85f
         sub            %o0, %i1, %i3
        ba,a,pt         %XCC, 90f
+        nop
 
        .align          64
 70: /* 16 < len <= 64 */
index 3b0674f..62ccda7 100644 (file)
@@ -26,8 +26,8 @@
        .type   niagara_patch_copyops,#function
 niagara_patch_copyops:
        NG_DO_PATCH(memcpy, NGmemcpy)
-       NG_DO_PATCH(___copy_from_user, NGcopy_from_user)
-       NG_DO_PATCH(___copy_to_user, NGcopy_to_user)
+       NG_DO_PATCH(raw_copy_from_user, NGcopy_from_user)
+       NG_DO_PATCH(raw_copy_to_user, NGcopy_to_user)
        retl
         nop
        .size   niagara_patch_copyops,.-niagara_patch_copyops
index bb6ff73..9a6e68a 100644 (file)
@@ -19,7 +19,7 @@
        .text;                  \
        .align 4;
 
-#define FUNC_NAME              ___copy_from_user
+#define FUNC_NAME              raw_copy_from_user
 #define LOAD(type,addr,dest)   type##a [addr] %asi, dest
 #define LOAD_BLK(addr,dest)    ldda [addr] ASI_BLK_AIUS, dest
 #define EX_RETVAL(x)           0
@@ -31,7 +31,7 @@
 #define PREAMBLE                                       \
        rd              %asi, %g1;                      \
        cmp             %g1, ASI_AIUS;                  \
-       bne,pn          %icc, ___copy_in_user;          \
+       bne,pn          %icc, raw_copy_in_user;         \
         nop;                                           \
 
 #include "U1memcpy.S"
index ed92ce7..d7b2849 100644 (file)
@@ -19,7 +19,7 @@
        .text;                  \
        .align 4;
 
-#define FUNC_NAME              ___copy_to_user
+#define FUNC_NAME              raw_copy_to_user
 #define STORE(type,src,addr)   type##a src, [addr] ASI_AIUS
 #define STORE_BLK(src,addr)    stda src, [addr] ASI_BLK_AIUS
 #define EX_RETVAL(x)           0
@@ -31,7 +31,7 @@
 #define PREAMBLE                                       \
        rd              %asi, %g1;                      \
        cmp             %g1, ASI_AIUS;                  \
-       bne,pn          %icc, ___copy_in_user;          \
+       bne,pn          %icc, raw_copy_in_user;         \
         nop;                                           \
 
 #include "U1memcpy.S"
index c4ee858..f48fb87 100644 (file)
@@ -31,7 +31,7 @@
 #define PREAMBLE                                       \
        rd              %asi, %g1;                      \
        cmp             %g1, ASI_AIUS;                  \
-       bne,pn          %icc, ___copy_in_user;          \
+       bne,pn          %icc, raw_copy_in_user;         \
         nop;                                           \
 
 #include "U3memcpy.S"
index ecc3026..91cd653 100644 (file)
@@ -26,8 +26,8 @@
        .type   cheetah_patch_copyops,#function
 cheetah_patch_copyops:
        ULTRA3_DO_PATCH(memcpy, U3memcpy)
-       ULTRA3_DO_PATCH(___copy_from_user, U3copy_from_user)
-       ULTRA3_DO_PATCH(___copy_to_user, U3copy_to_user)
+       ULTRA3_DO_PATCH(raw_copy_from_user, U3copy_from_user)
+       ULTRA3_DO_PATCH(raw_copy_to_user, U3copy_to_user)
        retl
         nop
        .size   cheetah_patch_copyops,.-cheetah_patch_copyops
index 0252b21..1b73bb8 100644 (file)
@@ -44,7 +44,7 @@ __retl_o2_plus_1:
         * to copy register windows around during thread cloning.
         */
 
-ENTRY(___copy_in_user) /* %o0=dst, %o1=src, %o2=len */
+ENTRY(raw_copy_in_user)        /* %o0=dst, %o1=src, %o2=len */
        cmp             %o2, 0
        be,pn           %XCC, 85f
         or             %o0, %o1, %o3
@@ -105,5 +105,5 @@ ENTRY(___copy_in_user)      /* %o0=dst, %o1=src, %o2=len */
         add            %o0, 1, %o0
        retl
         clr            %o0
-ENDPROC(___copy_in_user)
-EXPORT_SYMBOL(___copy_in_user)
+ENDPROC(raw_copy_in_user)
+EXPORT_SYMBOL(raw_copy_in_user)
index cea644d..bc243ee 100644 (file)
@@ -364,21 +364,7 @@ short_aligned_end:
 97:
        mov     %o2, %g3
 fixupretl:
-       sethi   %hi(PAGE_OFFSET), %g1
-       cmp     %o0, %g1
-       blu     1f
-        cmp    %o1, %g1
-       bgeu    1f
-        ld     [%g6 + TI_PREEMPT], %g1
-       cmp     %g1, 0
-       bne     1f
-        nop
-       save    %sp, -64, %sp
-       mov     %i0, %o0
-       call    __bzero
-        mov    %g3, %o1
-       restore
-1:     retl
+       retl
         mov    %g3, %o0
 
 /* exception routine sets %g2 to (broken_insn - first_insn)>>2 */
index 323bc6b..7c29d38 100644 (file)
@@ -143,6 +143,10 @@ static pte_t sun4v_hugepage_shift_to_tte(pte_t entry, unsigned int shift)
        pte_val(entry) = pte_val(entry) & ~_PAGE_SZALL_4V;
 
        switch (shift) {
+       case HPAGE_2GB_SHIFT:
+               hugepage_size = _PAGE_SZ2GB_4V;
+               pte_val(entry) |= _PAGE_PMD_HUGE;
+               break;
        case HPAGE_256MB_SHIFT:
                hugepage_size = _PAGE_SZ256MB_4V;
                pte_val(entry) |= _PAGE_PMD_HUGE;
@@ -183,6 +187,9 @@ static unsigned int sun4v_huge_tte_to_shift(pte_t entry)
        unsigned int shift;
 
        switch (tte_szbits) {
+       case _PAGE_SZ2GB_4V:
+               shift = HPAGE_2GB_SHIFT;
+               break;
        case _PAGE_SZ256MB_4V:
                shift = HPAGE_256MB_SHIFT;
                break;
@@ -261,7 +268,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
                if (!pmd)
                        return NULL;
 
-               if (sz == PMD_SHIFT)
+               if (sz >= PMD_SIZE)
                        pte = (pte_t *)pmd;
                else
                        pte = pte_alloc_map(mm, pmd, addr);
@@ -454,6 +461,22 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
        pgd_t *pgd;
        unsigned long next;
 
+       addr &= PMD_MASK;
+       if (addr < floor) {
+               addr += PMD_SIZE;
+               if (!addr)
+                       return;
+       }
+       if (ceiling) {
+               ceiling &= PMD_MASK;
+               if (!ceiling)
+                       return;
+       }
+       if (end - 1 > ceiling - 1)
+               end -= PMD_SIZE;
+       if (addr > end - 1)
+               return;
+
        pgd = pgd_offset(tlb->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
index ccd4553..0cda653 100644 (file)
@@ -337,6 +337,10 @@ static int __init setup_hugepagesz(char *string)
        hugepage_shift = ilog2(hugepage_size);
 
        switch (hugepage_shift) {
+       case HPAGE_2GB_SHIFT:
+               hv_pgsz_mask = HV_PGSZ_MASK_2GB;
+               hv_pgsz_idx = HV_PGSZ_IDX_2GB;
+               break;
        case HPAGE_256MB_SHIFT:
                hv_pgsz_mask = HV_PGSZ_MASK_256MB;
                hv_pgsz_idx = HV_PGSZ_IDX_256MB;
@@ -1563,7 +1567,7 @@ bool kern_addr_valid(unsigned long addr)
        if ((long)addr < 0L) {
                unsigned long pa = __pa(addr);
 
-               if ((addr >> max_phys_bits) != 0UL)
+               if ((pa >> max_phys_bits) != 0UL)
                        return false;
 
                return pfn_valid(pa >> PAGE_SHIFT);
index def82f6..8e76ebb 100644 (file)
@@ -54,6 +54,7 @@
 enum mbus_module srmmu_modtype;
 static unsigned int hwbug_bitmask;
 int vac_cache_size;
+EXPORT_SYMBOL(vac_cache_size);
 int vac_line_size;
 
 extern struct resource sparc_iomap;
index afda3bb..ee8066c 100644 (file)
@@ -154,7 +154,7 @@ static void tlb_batch_pmd_scan(struct mm_struct *mm, unsigned long vaddr,
                if (pte_val(*pte) & _PAGE_VALID) {
                        bool exec = pte_exec(*pte);
 
-                       tlb_batch_add_one(mm, vaddr, exec, false);
+                       tlb_batch_add_one(mm, vaddr, exec, PAGE_SHIFT);
                }
                pte++;
                vaddr += PAGE_SIZE;
@@ -209,9 +209,9 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                        pte_t orig_pte = __pte(pmd_val(orig));
                        bool exec = pte_exec(orig_pte);
 
-                       tlb_batch_add_one(mm, addr, exec, true);
+                       tlb_batch_add_one(mm, addr, exec, REAL_HPAGE_SHIFT);
                        tlb_batch_add_one(mm, addr + REAL_HPAGE_SIZE, exec,
-                                       true);
+                                         REAL_HPAGE_SHIFT);
                } else {
                        tlb_batch_pmd_scan(mm, addr, orig);
                }
index 0a04811..bedf08b 100644 (file)
@@ -122,7 +122,7 @@ void flush_tsb_user(struct tlb_batch *tb)
 
        spin_lock_irqsave(&mm->context.lock, flags);
 
-       if (tb->hugepage_shift < HPAGE_SHIFT) {
+       if (tb->hugepage_shift < REAL_HPAGE_SHIFT) {
                base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb;
                nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries;
                if (tlb_type == cheetah_plus || tlb_type == hypervisor)
@@ -155,7 +155,7 @@ void flush_tsb_user_page(struct mm_struct *mm, unsigned long vaddr,
 
        spin_lock_irqsave(&mm->context.lock, flags);
 
-       if (hugepage_shift < HPAGE_SHIFT) {
+       if (hugepage_shift < REAL_HPAGE_SHIFT) {
                base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb;
                nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries;
                if (tlb_type == cheetah_plus || tlb_type == hypervisor)
index fd122ef..0d925fa 100644 (file)
@@ -249,7 +249,6 @@ CONFIG_USB_EHCI_HCD=y
 CONFIG_USB_OHCI_HCD=y
 CONFIG_USB_STORAGE=y
 CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_TILE=y
 CONFIG_EXT2_FS=y
index eb6a559..149d8e8 100644 (file)
@@ -358,7 +358,6 @@ CONFIG_WATCHDOG_NOWAYOUT=y
 # CONFIG_VGA_ARB is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_TILE=y
 CONFIG_EXT2_FS=y
index aa48b6e..24c44e9 100644 (file)
@@ -7,6 +7,7 @@ generic-y += clkdev.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
+generic-y += extable.h
 generic-y += fb.h
 generic-y += fcntl.h
 generic-y += hw_irq.h
index a77369e..a803f6b 100644 (file)
 /*
  * User space memory access functions
  */
-#include <linux/sched.h>
 #include <linux/mm.h>
 #include <asm-generic/uaccess-unaligned.h>
 #include <asm/processor.h>
 #include <asm/page.h>
 
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
 /*
  * The fs value determines whether argument validity checking should be
  * performed or not.  If get_fs() == USER_DS, checking is performed, with
@@ -102,24 +98,7 @@ int __range_ok(unsigned long addr, unsigned long size);
        likely(__range_ok((unsigned long)(addr), (size)) == 0); \
 })
 
-/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-
-struct exception_table_entry {
-       unsigned long insn, fixup;
-};
-
-extern int fixup_exception(struct pt_regs *regs);
+#include <asm/extable.h>
 
 /*
  * This is a type: either unsigned long, if the argument fits into
@@ -334,145 +313,16 @@ extern int __put_user_bad(void)
                ((x) = 0, -EFAULT);                                     \
 })
 
-/**
- * __copy_to_user() - copy data into user space, with less checking.
- * @to:   Destination address, in user space.
- * @from: Source address, in kernel space.
- * @n:    Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Copy data from kernel space to user space.  Caller must check
- * the specified block with access_ok() before calling this function.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- *
- * An alternate version - __copy_to_user_inatomic() - is designed
- * to be called from atomic context, typically bracketed by calls
- * to pagefault_disable() and pagefault_enable().
- */
-extern unsigned long __must_check __copy_to_user_inatomic(
-       void __user *to, const void *from, unsigned long n);
-
-static inline unsigned long __must_check
-__copy_to_user(void __user *to, const void *from, unsigned long n)
-{
-       might_fault();
-       return __copy_to_user_inatomic(to, from, n);
-}
-
-static inline unsigned long __must_check
-copy_to_user(void __user *to, const void *from, unsigned long n)
-{
-       if (access_ok(VERIFY_WRITE, to, n))
-               n = __copy_to_user(to, from, n);
-       return n;
-}
-
-/**
- * __copy_from_user() - copy data from user space, with less checking.
- * @to:   Destination address, in kernel space.
- * @from: Source address, in user space.
- * @n:    Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Copy data from user space to kernel space.  Caller must check
- * the specified block with access_ok() before calling this function.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- *
- * If some data could not be copied, this function will pad the copied
- * data to the requested size using zero bytes.
- *
- * An alternate version - __copy_from_user_inatomic() - is designed
- * to be called from atomic context, typically bracketed by calls
- * to pagefault_disable() and pagefault_enable().  This version
- * does *NOT* pad with zeros.
- */
-extern unsigned long __must_check __copy_from_user_inatomic(
-       void *to, const void __user *from, unsigned long n);
-extern unsigned long __must_check __copy_from_user_zeroing(
-       void *to, const void __user *from, unsigned long n);
-
-static inline unsigned long __must_check
-__copy_from_user(void *to, const void __user *from, unsigned long n)
-{
-       might_fault();
-       return __copy_from_user_zeroing(to, from, n);
-}
-
-static inline unsigned long __must_check
-_copy_from_user(void *to, const void __user *from, unsigned long n)
-{
-       if (access_ok(VERIFY_READ, from, n))
-               n = __copy_from_user(to, from, n);
-       else
-               memset(to, 0, n);
-       return n;
-}
-
-extern void __compiletime_error("usercopy buffer size is too small")
-__bad_copy_user(void);
-
-static inline void copy_user_overflow(int size, unsigned long count)
-{
-       WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
-}
-
-static inline unsigned long __must_check copy_from_user(void *to,
-                                         const void __user *from,
-                                         unsigned long n)
-{
-       int sz = __compiletime_object_size(to);
-
-       if (likely(sz == -1 || sz >= n))
-               n = _copy_from_user(to, from, n);
-       else if (!__builtin_constant_p(n))
-               copy_user_overflow(sz, n);
-       else
-               __bad_copy_user();
-
-       return n;
-}
+extern unsigned long __must_check
+raw_copy_to_user(void __user *to, const void *from, unsigned long n);
+extern unsigned long __must_check
+raw_copy_from_user(void *to, const void __user *from, unsigned long n);
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
 #ifdef __tilegx__
-/**
- * __copy_in_user() - copy data within user space, with less checking.
- * @to:   Destination address, in user space.
- * @from: Source address, in user space.
- * @n:    Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Copy data from user space to user space.  Caller must check
- * the specified blocks with access_ok() before calling this function.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- */
-extern unsigned long __copy_in_user_inatomic(
+extern unsigned long raw_copy_in_user(
        void __user *to, const void __user *from, unsigned long n);
-
-static inline unsigned long __must_check
-__copy_in_user(void __user *to, const void __user *from, unsigned long n)
-{
-       might_fault();
-       return __copy_in_user_inatomic(to, from, n);
-}
-
-static inline unsigned long __must_check
-copy_in_user(void __user *to, const void __user *from, unsigned long n)
-{
-       if (access_ok(VERIFY_WRITE, to, n) && access_ok(VERIFY_READ, from, n))
-               n = __copy_in_user(to, from, n);
-       return n;
-}
 #endif
 
 
index 5bd4e88..6643ffb 100644 (file)
@@ -155,6 +155,8 @@ static DEFINE_PER_CPU(struct clock_event_device, tile_timer) = {
        .name = "tile timer",
        .features = CLOCK_EVT_FEAT_ONESHOT,
        .min_delta_ns = 1000,
+       .min_delta_ticks = 1,
+       .max_delta_ticks = MAX_TICK,
        .rating = 100,
        .irq = -1,
        .set_next_event = tile_timer_set_next_event,
index c5369fe..ecce8e1 100644 (file)
@@ -38,11 +38,10 @@ EXPORT_SYMBOL(__mcount);
 
 /* arch/tile/lib/, various memcpy files */
 EXPORT_SYMBOL(memcpy);
-EXPORT_SYMBOL(__copy_to_user_inatomic);
-EXPORT_SYMBOL(__copy_from_user_inatomic);
-EXPORT_SYMBOL(__copy_from_user_zeroing);
+EXPORT_SYMBOL(raw_copy_to_user);
+EXPORT_SYMBOL(raw_copy_from_user);
 #ifdef __tilegx__
-EXPORT_SYMBOL(__copy_in_user_inatomic);
+EXPORT_SYMBOL(raw_copy_in_user);
 #endif
 
 /* hypervisor glue */
index a2771ae..270f126 100644 (file)
@@ -24,7 +24,6 @@
 
 #define IS_MEMCPY        0
 #define IS_COPY_FROM_USER  1
-#define IS_COPY_FROM_USER_ZEROING  2
 #define IS_COPY_TO_USER   -1
 
        .section .text.memcpy_common, "ax"
        9
 
 
-/* __copy_from_user_inatomic takes the kernel target address in r0,
+/* raw_copy_from_user takes the kernel target address in r0,
  * the user source in r1, and the bytes to copy in r2.
  * It returns the number of uncopiable bytes (hopefully zero) in r0.
  */
-ENTRY(__copy_from_user_inatomic)
-.type __copy_from_user_inatomic, @function
-       FEEDBACK_ENTER_EXPLICIT(__copy_from_user_inatomic, \
+ENTRY(raw_copy_from_user)
+.type raw_copy_from_user, @function
+       FEEDBACK_ENTER_EXPLICIT(raw_copy_from_user, \
          .text.memcpy_common, \
-         .Lend_memcpy_common - __copy_from_user_inatomic)
+         .Lend_memcpy_common - raw_copy_from_user)
        { movei r29, IS_COPY_FROM_USER; j memcpy_common }
-       .size __copy_from_user_inatomic, . - __copy_from_user_inatomic
+       .size raw_copy_from_user, . - raw_copy_from_user
 
-/* __copy_from_user_zeroing is like __copy_from_user_inatomic, but
- * any uncopiable bytes are zeroed in the target.
- */
-ENTRY(__copy_from_user_zeroing)
-.type __copy_from_user_zeroing, @function
-       FEEDBACK_REENTER(__copy_from_user_inatomic)
-       { movei r29, IS_COPY_FROM_USER_ZEROING; j memcpy_common }
-       .size __copy_from_user_zeroing, . - __copy_from_user_zeroing
-
-/* __copy_to_user_inatomic takes the user target address in r0,
+/* raw_copy_to_user takes the user target address in r0,
  * the kernel source in r1, and the bytes to copy in r2.
  * It returns the number of uncopiable bytes (hopefully zero) in r0.
  */
-ENTRY(__copy_to_user_inatomic)
-.type __copy_to_user_inatomic, @function
-       FEEDBACK_REENTER(__copy_from_user_inatomic)
+ENTRY(raw_copy_to_user)
+.type raw_copy_to_user, @function
+       FEEDBACK_REENTER(raw_copy_from_user)
        { movei r29, IS_COPY_TO_USER; j memcpy_common }
-       .size __copy_to_user_inatomic, . - __copy_to_user_inatomic
+       .size raw_copy_to_user, . - raw_copy_to_user
 
 ENTRY(memcpy)
 .type memcpy, @function
-       FEEDBACK_REENTER(__copy_from_user_inatomic)
+       FEEDBACK_REENTER(raw_copy_from_user)
        { movei r29, IS_MEMCPY }
        .size memcpy, . - memcpy
        /* Fall through */
@@ -520,12 +510,7 @@ copy_from_user_fixup_loop:
        { bnzt r2, copy_from_user_fixup_loop }
 
 .Lcopy_from_user_fixup_zero_remainder:
-       { bbs r29, 2f }  /* low bit set means IS_COPY_FROM_USER */
-       /* byte-at-a-time loop faulted, so zero the rest. */
-       { move r3, r2; bz r2, 2f /* should be impossible, but handle it. */ }
-1:      { sb r0, zero; addi r0, r0, 1; addi r3, r3, -1 }
-       { bnzt r3, 1b }
-2:     move lr, r27
+       move lr, r27
        { move r0, r2; jrp lr }
 
 copy_to_user_fixup_loop:
index 97bbb60..a3fea9f 100644 (file)
@@ -51,7 +51,7 @@
                __v;                                            \
        })
 
-#define USERCOPY_FUNC __copy_to_user_inatomic
+#define USERCOPY_FUNC raw_copy_to_user
 #define ST1(p, v) _ST((p), st1, (v))
 #define ST2(p, v) _ST((p), st2, (v))
 #define ST4(p, v) _ST((p), st4, (v))
@@ -62,7 +62,7 @@
 #define LD8 LD
 #include "memcpy_64.c"
 
-#define USERCOPY_FUNC __copy_from_user_inatomic
+#define USERCOPY_FUNC raw_copy_from_user
 #define ST1 ST
 #define ST2 ST
 #define ST4 ST
@@ -73,7 +73,7 @@
 #define LD8(p) _LD((p), ld)
 #include "memcpy_64.c"
 
-#define USERCOPY_FUNC __copy_in_user_inatomic
+#define USERCOPY_FUNC raw_copy_in_user
 #define ST1(p, v) _ST((p), st1, (v))
 #define ST2(p, v) _ST((p), st2, (v))
 #define ST4(p, v) _ST((p), st4, (v))
 #define LD4(p) _LD((p), ld4u)
 #define LD8(p) _LD((p), ld)
 #include "memcpy_64.c"
-
-unsigned long __copy_from_user_zeroing(void *to, const void __user *from,
-                                      unsigned long n)
-{
-       unsigned long rc = __copy_from_user_inatomic(to, from, n);
-       if (unlikely(rc))
-               memset(to + n - rc, 0, rc);
-       return rc;
-}
index e9d42aa..50a32c3 100644 (file)
@@ -6,6 +6,7 @@ generic-y += delay.h
 generic-y += device.h
 generic-y += emergency-restart.h
 generic-y += exec.h
+generic-y += extable.h
 generic-y += ftrace.h
 generic-y += futex.h
 generic-y += hardirq.h
index 3705620..cc00fc5 100644 (file)
@@ -7,7 +7,6 @@
 #ifndef __UM_UACCESS_H
 #define __UM_UACCESS_H
 
-#include <asm/thread_info.h>
 #include <asm/elf.h>
 
 #define __under_task_size(addr, size) \
@@ -22,8 +21,8 @@
 #define __addr_range_nowrap(addr, size) \
        ((unsigned long) (addr) <= ((unsigned long) (addr) + (size)))
 
-extern long __copy_from_user(void *to, const void __user *from, unsigned long n);
-extern long __copy_to_user(void __user *to, const void *from, unsigned long n);
+extern unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n);
+extern unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n);
 extern long __strncpy_from_user(char *dst, const char __user *src, long count);
 extern long __strnlen_user(const void __user *str, long len);
 extern unsigned long __clear_user(void __user *mem, unsigned long len);
@@ -32,12 +31,10 @@ static inline int __access_ok(unsigned long addr, unsigned long size);
 /* Teach asm-generic/uaccess.h that we have C functions for these. */
 #define __access_ok __access_ok
 #define __clear_user __clear_user
-#define __copy_to_user __copy_to_user
-#define __copy_from_user __copy_from_user
 #define __strnlen_user __strnlen_user
 #define __strncpy_from_user __strncpy_from_user
-#define __copy_to_user_inatomic __copy_to_user
-#define __copy_from_user_inatomic __copy_from_user
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
 #include <asm-generic/uaccess.h>
 
@@ -46,7 +43,7 @@ static inline int __access_ok(unsigned long addr, unsigned long size)
        return __addr_range_nowrap(addr, size) &&
                (__under_task_size(addr, size) ||
                __access_ok_vsyscall(addr, size) ||
-               segment_eq(get_fs(), KERNEL_DS));
+               uaccess_kernel());
 }
 
 #endif
index de5d572..cd1fa97 100644 (file)
@@ -302,8 +302,8 @@ extern int ignore_sigio_fd(int fd);
 extern void maybe_sigio_broken(int fd, int read);
 extern void sigio_broken(int fd, int read);
 
-/* sys-x86_64/prctl.c */
-extern int os_arch_prctl(int pid, int code, unsigned long *addr);
+/* prctl.c */
+extern int os_arch_prctl(int pid, int option, unsigned long *arg2);
 
 /* tty.c */
 extern int get_pty(void);
index 85ac8ad..d450797 100644 (file)
@@ -139,16 +139,16 @@ static int copy_chunk_from_user(unsigned long from, int len, void *arg)
        return 0;
 }
 
-long __copy_from_user(void *to, const void __user *from, unsigned long n)
+unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-       if (segment_eq(get_fs(), KERNEL_DS)) {
+       if (uaccess_kernel()) {
                memcpy(to, (__force void*)from, n);
                return 0;
        }
 
        return buffer_op((unsigned long) from, n, 0, copy_chunk_from_user, &to);
 }
-EXPORT_SYMBOL(__copy_from_user);
+EXPORT_SYMBOL(raw_copy_from_user);
 
 static int copy_chunk_to_user(unsigned long to, int len, void *arg)
 {
@@ -159,16 +159,16 @@ static int copy_chunk_to_user(unsigned long to, int len, void *arg)
        return 0;
 }
 
-long __copy_to_user(void __user *to, const void *from, unsigned long n)
+unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-       if (segment_eq(get_fs(), KERNEL_DS)) {
+       if (uaccess_kernel()) {
                memcpy((__force void *) to, from, n);
                return 0;
        }
 
        return buffer_op((unsigned long) to, n, 1, copy_chunk_to_user, &from);
 }
-EXPORT_SYMBOL(__copy_to_user);
+EXPORT_SYMBOL(raw_copy_to_user);
 
 static int strncpy_chunk_from_user(unsigned long from, int len, void *arg)
 {
@@ -189,7 +189,7 @@ long __strncpy_from_user(char *dst, const char __user *src, long count)
        long n;
        char *ptr = dst;
 
-       if (segment_eq(get_fs(), KERNEL_DS)) {
+       if (uaccess_kernel()) {
                strncpy(dst, (__force void *) src, count);
                return strnlen(dst, count);
        }
@@ -210,7 +210,7 @@ static int clear_chunk(unsigned long addr, int len, void *unused)
 
 unsigned long __clear_user(void __user *mem, unsigned long len)
 {
-       if (segment_eq(get_fs(), KERNEL_DS)) {
+       if (uaccess_kernel()) {
                memset((__force void*)mem, 0, len);
                return 0;
        }
@@ -235,7 +235,7 @@ long __strnlen_user(const void __user *str, long len)
 {
        int count = 0, n;
 
-       if (segment_eq(get_fs(), KERNEL_DS))
+       if (uaccess_kernel())
                return strnlen((__force char*)str, len) + 1;
 
        n = buffer_op((unsigned long) str, len, 0, strnlen_chunk, &count);
index ba87a27..0b034eb 100644 (file)
@@ -65,7 +65,9 @@ static struct clock_event_device timer_clockevent = {
        .set_next_event         = itimer_next_event,
        .shift                  = 0,
        .max_delta_ns           = 0xffffffff,
-       .min_delta_ns           = TIMER_MIN_DELTA, //microsecond resolution should be enough for anyone, same as 640K RAM
+       .max_delta_ticks        = 0xffffffff,
+       .min_delta_ns           = TIMER_MIN_DELTA,
+       .min_delta_ticks        = TIMER_MIN_DELTA, // microsecond resolution should be enough for anyone, same as 640K RAM
        .irq                    = 0,
        .mult                   = 1,
 };
index 84205fe..e9ad511 100644 (file)
@@ -10,6 +10,7 @@ generic-y += div64.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
+generic-y += extable.h
 generic-y += fb.h
 generic-y += fcntl.h
 generic-y += ftrace.h
index 897e11a..1d55f2f 100644 (file)
 #ifndef __UNICORE_UACCESS_H__
 #define __UNICORE_UACCESS_H__
 
-#include <linux/thread_info.h>
-#include <linux/errno.h>
-
 #include <asm/memory.h>
 
-#define __copy_from_user       __copy_from_user
-#define __copy_to_user         __copy_to_user
 #define __strncpy_from_user    __strncpy_from_user
 #define __strnlen_user         __strnlen_user
 #define __clear_user           __clear_user
 
-#define __kernel_ok            (segment_eq(get_fs(), KERNEL_DS))
+#define __kernel_ok            (uaccess_kernel())
 #define __user_ok(addr, size)  (((size) <= TASK_SIZE)                  \
                                && ((addr) <= TASK_SIZE - (size)))
 #define __access_ok(addr, size)        (__kernel_ok || __user_ok((addr), (size)))
 
 extern unsigned long __must_check
-__copy_from_user(void *to, const void __user *from, unsigned long n);
+raw_copy_from_user(void *to, const void __user *from, unsigned long n);
 extern unsigned long __must_check
-__copy_to_user(void __user *to, const void *from, unsigned long n);
+raw_copy_to_user(void __user *to, const void *from, unsigned long n);
 extern unsigned long __must_check
 __clear_user(void __user *addr, unsigned long n);
 extern unsigned long __must_check
 __strncpy_from_user(char *to, const char __user *from, unsigned long count);
 extern unsigned long
 __strnlen_user(const char __user *s, long n);
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
 #include <asm-generic/uaccess.h>
 
-extern int fixup_exception(struct pt_regs *regs);
-
 #endif /* __UNICORE_UACCESS_H__ */
index 0323528..dcc72ee 100644 (file)
@@ -46,8 +46,8 @@ EXPORT_SYMBOL(__strncpy_from_user);
 
 EXPORT_SYMBOL(copy_page);
 
-EXPORT_SYMBOL(__copy_from_user);
-EXPORT_SYMBOL(__copy_to_user);
+EXPORT_SYMBOL(raw_copy_from_user);
+EXPORT_SYMBOL(raw_copy_to_user);
 EXPORT_SYMBOL(__clear_user);
 
 EXPORT_SYMBOL(__ashldi3);
index d22c1dc..ddaf78a 100644 (file)
@@ -178,7 +178,7 @@ void __show_regs(struct pt_regs *regs)
                buf, interrupts_enabled(regs) ? "n" : "ff",
                fast_interrupts_enabled(regs) ? "n" : "ff",
                processor_modes[processor_mode(regs)],
-               segment_eq(get_fs(), get_ds()) ? "kernel" : "user");
+               uaccess_kernel() ? "kernel" : "user");
        {
                unsigned int ctrl;
 
index fceaa67..c6b3fa3 100644 (file)
@@ -91,8 +91,10 @@ void __init time_init(void)
 
        ckevt_puv3_osmr0.max_delta_ns =
                clockevent_delta2ns(0x7fffffff, &ckevt_puv3_osmr0);
+       ckevt_puv3_osmr0.max_delta_ticks = 0x7fffffff;
        ckevt_puv3_osmr0.min_delta_ns =
                clockevent_delta2ns(MIN_OSCR_DELTA * 2, &ckevt_puv3_osmr0) + 1;
+       ckevt_puv3_osmr0.min_delta_ticks = MIN_OSCR_DELTA * 2;
        ckevt_puv3_osmr0.cpumask = cpumask_of(0);
 
        setup_irq(IRQ_TIMER0, &puv3_timer_irq);
index ab0767e..5f80fcb 100644 (file)
@@ -16,7 +16,7 @@
 /*
  * Prototype:
  *
- *     size_t __copy_from_user(void *to, const void *from, size_t n)
+ *     size_t raw_copy_from_user(void *to, const void *from, size_t n)
  *
  * Purpose:
  *
 
        .text
 
-ENTRY(__copy_from_user)
+ENTRY(raw_copy_from_user)
 
 #include "copy_template.S"
 
-ENDPROC(__copy_from_user)
+ENDPROC(raw_copy_from_user)
 
        .pushsection .fixup,"ax"
        .align 0
        copy_abort_preamble
-       ldm.w   (r1, r2), [sp]+
-       sub     r3, r0, r1
-       rsub    r2, r3, r2
-       stw     r2, [sp]
-       mov     r1, #0
-       b.l     memset
-       ldw.w   r0, [sp]+, #4
+       ldm.w   (r1, r2, r3), [sp]+
+       sub     r0, r0, r1
+       rsub    r0, r0, r2
        copy_abort_end
        .popsection
 
index 6e22151..857c681 100644 (file)
@@ -16,7 +16,7 @@
 /*
  * Prototype:
  *
- *     size_t __copy_to_user(void *to, const void *from, size_t n)
+ *     size_t raw_copy_to_user(void *to, const void *from, size_t n)
  *
  * Purpose:
  *
 
        .text
 
-WEAK(__copy_to_user)
+WEAK(raw_copy_to_user)
 
 #include "copy_template.S"
 
-ENDPROC(__copy_to_user)
+ENDPROC(raw_copy_to_user)
 
        .pushsection .fixup,"ax"
        .align 0
index cc98d5a..a055719 100644 (file)
@@ -98,7 +98,6 @@ config X86
        select HAVE_ACPI_APEI_NMI               if ACPI
        select HAVE_ALIGNED_STRUCT_PAGE         if SLUB
        select HAVE_ARCH_AUDITSYSCALL
-       select HAVE_ARCH_HARDENED_USERCOPY
        select HAVE_ARCH_HUGE_VMAP              if X86_64 || X86_PAE
        select HAVE_ARCH_JUMP_LABEL
        select HAVE_ARCH_KASAN                  if X86_64 && SPARSEMEM_VMEMMAP
@@ -1043,6 +1042,14 @@ config X86_MCE
          The action the kernel takes depends on the severity of the problem,
          ranging from warning messages to halting the machine.
 
+config X86_MCELOG_LEGACY
+       bool "Support for deprecated /dev/mcelog character device"
+       depends on X86_MCE
+       ---help---
+         Enable support for /dev/mcelog which is needed by the old mcelog
+         userspace logging daemon. Consider switching to the new generation
+         rasdaemon solution.
+
 config X86_MCE_INTEL
        def_bool y
        prompt "Intel MCE features"
@@ -1072,7 +1079,7 @@ config X86_MCE_THRESHOLD
        def_bool y
 
 config X86_MCE_INJECT
-       depends on X86_MCE && X86_LOCAL_APIC
+       depends on X86_MCE && X86_LOCAL_APIC && X86_MCELOG_LEGACY
        tristate "Machine check injector support"
        ---help---
          Provide support for injecting machine checks for testing purposes.
@@ -1966,7 +1973,7 @@ config RELOCATABLE
 config RANDOMIZE_BASE
        bool "Randomize the address of the kernel image (KASLR)"
        depends on RELOCATABLE
-       default n
+       default y
        ---help---
          In support of Kernel Address Space Layout Randomization (KASLR),
          this randomizes the physical address at which the kernel image
@@ -1996,7 +2003,7 @@ config RANDOMIZE_BASE
          theoretically possible, but the implementations are further
          limited due to memory layouts.
 
-         If unsure, say N.
+         If unsure, say Y.
 
 # Relocation on x86 needs some additional build support
 config X86_NEED_RELOCS
@@ -2045,7 +2052,7 @@ config RANDOMIZE_MEMORY
           configuration have in average 30,000 different possible virtual
           addresses for each memory section.
 
-          If unsure, say N.
+          If unsure, say Y.
 
 config RANDOMIZE_MEMORY_PHYSICAL_PADDING
        hex "Physical memory mapping padding" if EXPERT
index a94a4d1..49d160b 100644 (file)
@@ -154,6 +154,14 @@ ifdef CONFIG_FUNCTION_GRAPH_TRACER
   else
     ifeq ($(call cc-option-yn, -mfentry), n)
        ACCUMULATE_OUTGOING_ARGS := 1
+
+       # GCC ignores '-maccumulate-outgoing-args' when used with '-Os'.
+       # If '-Os' is enabled, disable it and print a warning.
+        ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
+          undefine CONFIG_CC_OPTIMIZE_FOR_SIZE
+         $(warning Disabling CONFIG_CC_OPTIMIZE_FOR_SIZE.  Your compiler does not have -mfentry so you cannot optimize for size with CONFIG_FUNCTION_GRAPH_TRACER.)
+        endif
+
     endif
   endif
 endif
index 9b42b6d..ef5a9cc 100644 (file)
@@ -16,7 +16,7 @@
 #ifndef BOOT_BOOT_H
 #define BOOT_BOOT_H
 
-#define STACK_SIZE     512     /* Minimum number of bytes for stack */
+#define STACK_SIZE     1024    /* Minimum number of bytes for stack */
 
 #ifndef __ASSEMBLY__
 
index 801c7a1..cbf4b87 100644 (file)
@@ -9,7 +9,9 @@
 
 #include <linux/efi.h>
 #include <linux/pci.h>
+
 #include <asm/efi.h>
+#include <asm/e820/types.h>
 #include <asm/setup.h>
 #include <asm/desc.h>
 
@@ -729,7 +731,7 @@ static void add_e820ext(struct boot_params *params,
        unsigned long size;
 
        e820ext->type = SETUP_E820_EXT;
-       e820ext->len = nr_entries * sizeof(struct e820entry);
+       e820ext->len = nr_entries * sizeof(struct boot_e820_entry);
        e820ext->next = 0;
 
        data = (struct setup_data *)(unsigned long)params->hdr.setup_data;
@@ -746,9 +748,9 @@ static void add_e820ext(struct boot_params *params,
 static efi_status_t setup_e820(struct boot_params *params,
                               struct setup_data *e820ext, u32 e820ext_size)
 {
-       struct e820entry *e820_map = &params->e820_map[0];
+       struct boot_e820_entry *entry = params->e820_table;
        struct efi_info *efi = &params->efi_info;
-       struct e820entry *prev = NULL;
+       struct boot_e820_entry *prev = NULL;
        u32 nr_entries;
        u32 nr_desc;
        int i;
@@ -773,15 +775,15 @@ static efi_status_t setup_e820(struct boot_params *params,
                case EFI_MEMORY_MAPPED_IO:
                case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
                case EFI_PAL_CODE:
-                       e820_type = E820_RESERVED;
+                       e820_type = E820_TYPE_RESERVED;
                        break;
 
                case EFI_UNUSABLE_MEMORY:
-                       e820_type = E820_UNUSABLE;
+                       e820_type = E820_TYPE_UNUSABLE;
                        break;
 
                case EFI_ACPI_RECLAIM_MEMORY:
-                       e820_type = E820_ACPI;
+                       e820_type = E820_TYPE_ACPI;
                        break;
 
                case EFI_LOADER_CODE:
@@ -789,15 +791,15 @@ static efi_status_t setup_e820(struct boot_params *params,
                case EFI_BOOT_SERVICES_CODE:
                case EFI_BOOT_SERVICES_DATA:
                case EFI_CONVENTIONAL_MEMORY:
-                       e820_type = E820_RAM;
+                       e820_type = E820_TYPE_RAM;
                        break;
 
                case EFI_ACPI_MEMORY_NVS:
-                       e820_type = E820_NVS;
+                       e820_type = E820_TYPE_NVS;
                        break;
 
                case EFI_PERSISTENT_MEMORY:
-                       e820_type = E820_PMEM;
+                       e820_type = E820_TYPE_PMEM;
                        break;
 
                default:
@@ -811,26 +813,26 @@ static efi_status_t setup_e820(struct boot_params *params,
                        continue;
                }
 
-               if (nr_entries == ARRAY_SIZE(params->e820_map)) {
-                       u32 need = (nr_desc - i) * sizeof(struct e820entry) +
+               if (nr_entries == ARRAY_SIZE(params->e820_table)) {
+                       u32 need = (nr_desc - i) * sizeof(struct e820_entry) +
                                   sizeof(struct setup_data);
 
                        if (!e820ext || e820ext_size < need)
                                return EFI_BUFFER_TOO_SMALL;
 
                        /* boot_params map full, switch to e820 extended */
-                       e820_map = (struct e820entry *)e820ext->data;
+                       entry = (struct boot_e820_entry *)e820ext->data;
                }
 
-               e820_map->addr = d->phys_addr;
-               e820_map->size = d->num_pages << PAGE_SHIFT;
-               e820_map->type = e820_type;
-               prev = e820_map++;
+               entry->addr = d->phys_addr;
+               entry->size = d->num_pages << PAGE_SHIFT;
+               entry->type = e820_type;
+               prev = entry++;
                nr_entries++;
        }
 
-       if (nr_entries > ARRAY_SIZE(params->e820_map)) {
-               u32 nr_e820ext = nr_entries - ARRAY_SIZE(params->e820_map);
+       if (nr_entries > ARRAY_SIZE(params->e820_table)) {
+               u32 nr_e820ext = nr_entries - ARRAY_SIZE(params->e820_table);
 
                add_e820ext(params, e820ext, nr_e820ext);
                nr_entries -= nr_e820ext;
@@ -848,7 +850,7 @@ static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext,
        unsigned long size;
 
        size = sizeof(struct setup_data) +
-               sizeof(struct e820entry) * nr_desc;
+               sizeof(struct e820_entry) * nr_desc;
 
        if (*e820ext) {
                efi_call_early(free_pool, *e820ext);
@@ -884,9 +886,9 @@ static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg,
 
        if (first) {
                nr_desc = *map->buff_size / *map->desc_size;
-               if (nr_desc > ARRAY_SIZE(p->boot_params->e820_map)) {
+               if (nr_desc > ARRAY_SIZE(p->boot_params->e820_table)) {
                        u32 nr_e820ext = nr_desc -
-                                       ARRAY_SIZE(p->boot_params->e820_map);
+                                       ARRAY_SIZE(p->boot_params->e820_table);
 
                        status = alloc_e820ext(nr_e820ext, &p->e820ext,
                                               &p->e820ext_size);
index 8b7c9e7..54c24f0 100644 (file)
@@ -426,7 +426,7 @@ static unsigned long slots_fetch_random(void)
        return 0;
 }
 
-static void process_e820_entry(struct e820entry *entry,
+static void process_e820_entry(struct boot_e820_entry *entry,
                               unsigned long minimum,
                               unsigned long image_size)
 {
@@ -435,7 +435,7 @@ static void process_e820_entry(struct e820entry *entry,
        unsigned long start_orig;
 
        /* Skip non-RAM entries. */
-       if (entry->type != E820_RAM)
+       if (entry->type != E820_TYPE_RAM)
                return;
 
        /* On 32-bit, ignore entries entirely above our maximum. */
@@ -518,7 +518,7 @@ static unsigned long find_random_phys_addr(unsigned long minimum,
 
        /* Verify potential e820 positions, appending to slots list. */
        for (i = 0; i < boot_params->e820_entries; i++) {
-               process_e820_entry(&boot_params->e820_map[i], minimum,
+               process_e820_entry(&boot_params->e820_table[i], minimum,
                                   image_size);
                if (slot_area_index == MAX_SLOT_AREA) {
                        debug_putstr("Aborted e820 scan (slot_areas full)!\n");
@@ -597,10 +597,17 @@ void choose_random_location(unsigned long input,
                        add_identity_map(random_addr, output_size);
                        *output = random_addr;
                }
+
+               /*
+                * This loads the identity mapping page table.
+                * This should only be done if a new physical address
+                * is found for the kernel, otherwise we should keep
+                * the old page table to make it be like the "nokaslr"
+                * case.
+                */
+               finalize_identity_maps();
        }
 
-       /* This actually loads the identity pagetable on x86_64. */
-       finalize_identity_maps();
 
        /* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */
        if (IS_ENABLED(CONFIG_X86_64))
index 3dd5be3..2ed8f0c 100644 (file)
@@ -18,7 +18,6 @@
 #include <asm/segment.h>
 #include <generated/utsrelease.h>
 #include <asm/boot.h>
-#include <asm/e820.h>
 #include <asm/page_types.h>
 #include <asm/setup.h>
 #include <asm/bootparam.h>
index db75d07..d9c28c8 100644 (file)
@@ -21,8 +21,8 @@ static int detect_memory_e820(void)
 {
        int count = 0;
        struct biosregs ireg, oreg;
-       struct e820entry *desc = boot_params.e820_map;
-       static struct e820entry buf; /* static so it is zeroed */
+       struct boot_e820_entry *desc = boot_params.e820_table;
+       static struct boot_e820_entry buf; /* static so it is zeroed */
 
        initregs(&ireg);
        ireg.ax  = 0xe820;
@@ -66,7 +66,7 @@ static int detect_memory_e820(void)
 
                *desc++ = buf;
                count++;
-       } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map));
+       } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_table));
 
        return boot_params.e820_entries = count;
 }
index 5fa6ee2..6cf79e1 100644 (file)
@@ -57,6 +57,8 @@ CONFIG_EFI=y
 CONFIG_HZ_1000=y
 CONFIG_KEXEC=y
 CONFIG_CRASH_DUMP=y
+CONFIG_RANDOMIZE_BASE=y
+CONFIG_RANDOMIZE_MEMORY=y
 # CONFIG_COMPAT_VDSO is not set
 CONFIG_HIBERNATION=y
 CONFIG_PM_DEBUG=y
index 6205d3b..de45f57 100644 (file)
@@ -55,6 +55,8 @@ CONFIG_EFI=y
 CONFIG_HZ_1000=y
 CONFIG_KEXEC=y
 CONFIG_CRASH_DUMP=y
+CONFIG_RANDOMIZE_BASE=y
+CONFIG_RANDOMIZE_MEMORY=y
 # CONFIG_COMPAT_VDSO is not set
 CONFIG_HIBERNATION=y
 CONFIG_PM_DEBUG=y
index 9ba050f..0af59fa 100644 (file)
 381    i386    pkey_alloc              sys_pkey_alloc
 382    i386    pkey_free               sys_pkey_free
 383    i386    statx                   sys_statx
+384    i386    arch_prctl              sys_arch_prctl                  compat_sys_arch_prctl
index b28200d..3641e24 100644 (file)
@@ -11,6 +11,8 @@
  * published by the Free Software Foundation.
  */
 
+#define pr_fmt(fmt)    "perf/amd_iommu: " fmt
+
 #include <linux/perf_event.h>
 #include <linux/init.h>
 #include <linux/cpumask.h>
 
 #define COUNTER_SHIFT          16
 
-#define _GET_BANK(ev)       ((u8)(ev->hw.extra_reg.reg >> 8))
-#define _GET_CNTR(ev)       ((u8)(ev->hw.extra_reg.reg))
+/* iommu pmu conf masks */
+#define GET_CSOURCE(x)     ((x)->conf & 0xFFULL)
+#define GET_DEVID(x)       (((x)->conf >> 8)  & 0xFFFFULL)
+#define GET_DOMID(x)       (((x)->conf >> 24) & 0xFFFFULL)
+#define GET_PASID(x)       (((x)->conf >> 40) & 0xFFFFFULL)
 
-/* iommu pmu config masks */
-#define _GET_CSOURCE(ev)    ((ev->hw.config & 0xFFULL))
-#define _GET_DEVID(ev)      ((ev->hw.config >> 8)  & 0xFFFFULL)
-#define _GET_PASID(ev)      ((ev->hw.config >> 24) & 0xFFFFULL)
-#define _GET_DOMID(ev)      ((ev->hw.config >> 40) & 0xFFFFULL)
-#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config)  & 0xFFFFULL)
-#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL)
-#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL)
+/* iommu pmu conf1 masks */
+#define GET_DEVID_MASK(x)  ((x)->conf1  & 0xFFFFULL)
+#define GET_DOMID_MASK(x)  (((x)->conf1 >> 16) & 0xFFFFULL)
+#define GET_PASID_MASK(x)  (((x)->conf1 >> 32) & 0xFFFFFULL)
 
-static struct perf_amd_iommu __perf_iommu;
+#define IOMMU_NAME_SIZE 16
 
 struct perf_amd_iommu {
+       struct list_head list;
        struct pmu pmu;
+       struct amd_iommu *iommu;
+       char name[IOMMU_NAME_SIZE];
        u8 max_banks;
        u8 max_counters;
        u64 cntr_assign_mask;
        raw_spinlock_t lock;
-       const struct attribute_group *attr_groups[4];
 };
 
-#define format_group   attr_groups[0]
-#define cpumask_group  attr_groups[1]
-#define events_group   attr_groups[2]
-#define null_group     attr_groups[3]
+static LIST_HEAD(perf_amd_iommu_list);
 
 /*---------------------------------------------
  * sysfs format attributes
  *---------------------------------------------*/
 PMU_FORMAT_ATTR(csource,    "config:0-7");
 PMU_FORMAT_ATTR(devid,      "config:8-23");
-PMU_FORMAT_ATTR(pasid,      "config:24-39");
-PMU_FORMAT_ATTR(domid,      "config:40-55");
+PMU_FORMAT_ATTR(domid,      "config:24-39");
+PMU_FORMAT_ATTR(pasid,      "config:40-59");
 PMU_FORMAT_ATTR(devid_mask, "config1:0-15");
-PMU_FORMAT_ATTR(pasid_mask, "config1:16-31");
-PMU_FORMAT_ATTR(domid_mask, "config1:32-47");
+PMU_FORMAT_ATTR(domid_mask, "config1:16-31");
+PMU_FORMAT_ATTR(pasid_mask, "config1:32-51");
 
 static struct attribute *iommu_format_attrs[] = {
        &format_attr_csource.attr,
@@ -79,6 +79,10 @@ static struct attribute_group amd_iommu_format_group = {
 /*---------------------------------------------
  * sysfs events attributes
  *---------------------------------------------*/
+static struct attribute_group amd_iommu_events_group = {
+       .name = "events",
+};
+
 struct amd_iommu_event_desc {
        struct kobj_attribute attr;
        const char *event;
@@ -150,30 +154,34 @@ static struct attribute_group amd_iommu_cpumask_group = {
 
 /*---------------------------------------------*/
 
-static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu)
+static int get_next_avail_iommu_bnk_cntr(struct perf_event *event)
 {
+       struct perf_amd_iommu *piommu = container_of(event->pmu, struct perf_amd_iommu, pmu);
+       int max_cntrs = piommu->max_counters;
+       int max_banks = piommu->max_banks;
+       u32 shift, bank, cntr;
        unsigned long flags;
-       int shift, bank, cntr, retval;
-       int max_banks = perf_iommu->max_banks;
-       int max_cntrs = perf_iommu->max_counters;
+       int retval;
 
-       raw_spin_lock_irqsave(&perf_iommu->lock, flags);
+       raw_spin_lock_irqsave(&piommu->lock, flags);
 
        for (bank = 0, shift = 0; bank < max_banks; bank++) {
                for (cntr = 0; cntr < max_cntrs; cntr++) {
                        shift = bank + (bank*3) + cntr;
-                       if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) {
+                       if (piommu->cntr_assign_mask & BIT_ULL(shift)) {
                                continue;
                        } else {
-                               perf_iommu->cntr_assign_mask |= (1ULL<<shift);
-                               retval = ((u16)((u16)bank<<8) | (u8)(cntr));
+                               piommu->cntr_assign_mask |= BIT_ULL(shift);
+                               event->hw.iommu_bank = bank;
+                               event->hw.iommu_cntr = cntr;
+                               retval = 0;
                                goto out;
                        }
                }
        }
        retval = -ENOSPC;
 out:
-       raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
+       raw_spin_unlock_irqrestore(&piommu->lock, flags);
        return retval;
 }
 
@@ -202,8 +210,6 @@ static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu,
 static int perf_iommu_event_init(struct perf_event *event)
 {
        struct hw_perf_event *hwc = &event->hw;
-       struct perf_amd_iommu *perf_iommu;
-       u64 config, config1;
 
        /* test the event attr type check for PMU enumeration */
        if (event->attr.type != event->pmu->type)
@@ -225,80 +231,62 @@ static int perf_iommu_event_init(struct perf_event *event)
        if (event->cpu < 0)
                return -EINVAL;
 
-       perf_iommu = &__perf_iommu;
-
-       if (event->pmu != &perf_iommu->pmu)
-               return -ENOENT;
-
-       if (perf_iommu) {
-               config = event->attr.config;
-               config1 = event->attr.config1;
-       } else {
-               return -EINVAL;
-       }
-
-       /* integrate with iommu base devid (0000), assume one iommu */
-       perf_iommu->max_banks =
-               amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID);
-       perf_iommu->max_counters =
-               amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID);
-       if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0))
-               return -EINVAL;
-
        /* update the hw_perf_event struct with the iommu config data */
-       hwc->config = config;
-       hwc->extra_reg.config = config1;
+       hwc->conf  = event->attr.config;
+       hwc->conf1 = event->attr.config1;
 
        return 0;
 }
 
+static inline struct amd_iommu *perf_event_2_iommu(struct perf_event *ev)
+{
+       return (container_of(ev->pmu, struct perf_amd_iommu, pmu))->iommu;
+}
+
 static void perf_iommu_enable_event(struct perf_event *ev)
 {
-       u8 csource = _GET_CSOURCE(ev);
-       u16 devid = _GET_DEVID(ev);
+       struct amd_iommu *iommu = perf_event_2_iommu(ev);
+       struct hw_perf_event *hwc = &ev->hw;
+       u8 bank = hwc->iommu_bank;
+       u8 cntr = hwc->iommu_cntr;
        u64 reg = 0ULL;
 
-       reg = csource;
-       amd_iommu_pc_get_set_reg_val(devid,
-                       _GET_BANK(ev), _GET_CNTR(ev) ,
-                        IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+       reg = GET_CSOURCE(hwc);
+       amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_COUNTER_SRC_REG, &reg);
 
-       reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32);
+       reg = GET_DEVID_MASK(hwc);
+       reg = GET_DEVID(hwc) | (reg << 32);
        if (reg)
-               reg |= (1UL << 31);
-       amd_iommu_pc_get_set_reg_val(devid,
-                       _GET_BANK(ev), _GET_CNTR(ev) ,
-                        IOMMU_PC_DEVID_MATCH_REG, &reg, true);
+               reg |= BIT(31);
+       amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DEVID_MATCH_REG, &reg);
 
-       reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32);
+       reg = GET_PASID_MASK(hwc);
+       reg = GET_PASID(hwc) | (reg << 32);
        if (reg)
-               reg |= (1UL << 31);
-       amd_iommu_pc_get_set_reg_val(devid,
-                       _GET_BANK(ev), _GET_CNTR(ev) ,
-                        IOMMU_PC_PASID_MATCH_REG, &reg, true);
+               reg |= BIT(31);
+       amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_PASID_MATCH_REG, &reg);
 
-       reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32);
+       reg = GET_DOMID_MASK(hwc);
+       reg = GET_DOMID(hwc) | (reg << 32);
        if (reg)
-               reg |= (1UL << 31);
-       amd_iommu_pc_get_set_reg_val(devid,
-                       _GET_BANK(ev), _GET_CNTR(ev) ,
-                        IOMMU_PC_DOMID_MATCH_REG, &reg, true);
+               reg |= BIT(31);
+       amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DOMID_MATCH_REG, &reg);
 }
 
 static void perf_iommu_disable_event(struct perf_event *event)
 {
+       struct amd_iommu *iommu = perf_event_2_iommu(event);
+       struct hw_perf_event *hwc = &event->hw;
        u64 reg = 0ULL;
 
-       amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
-                       _GET_BANK(event), _GET_CNTR(event),
-                       IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+       amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
+                            IOMMU_PC_COUNTER_SRC_REG, &reg);
 }
 
 static void perf_iommu_start(struct perf_event *event, int flags)
 {
        struct hw_perf_event *hwc = &event->hw;
 
-       pr_debug("perf: amd_iommu:perf_iommu_start\n");
        if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
                return;
 
@@ -306,10 +294,11 @@ static void perf_iommu_start(struct perf_event *event, int flags)
        hwc->state = 0;
 
        if (flags & PERF_EF_RELOAD) {
-               u64 prev_raw_count =  local64_read(&hwc->prev_count);
-               amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
-                               _GET_BANK(event), _GET_CNTR(event),
-                               IOMMU_PC_COUNTER_REG, &prev_raw_count, true);
+               u64 prev_raw_count = local64_read(&hwc->prev_count);
+               struct amd_iommu *iommu = perf_event_2_iommu(event);
+
+               amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
+                                    IOMMU_PC_COUNTER_REG, &prev_raw_count);
        }
 
        perf_iommu_enable_event(event);
@@ -319,37 +308,30 @@ static void perf_iommu_start(struct perf_event *event, int flags)
 
 static void perf_iommu_read(struct perf_event *event)
 {
-       u64 count = 0ULL;
-       u64 prev_raw_count = 0ULL;
-       u64 delta = 0ULL;
+       u64 count, prev, delta;
        struct hw_perf_event *hwc = &event->hw;
-       pr_debug("perf: amd_iommu:perf_iommu_read\n");
+       struct amd_iommu *iommu = perf_event_2_iommu(event);
 
-       amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
-                               _GET_BANK(event), _GET_CNTR(event),
-                               IOMMU_PC_COUNTER_REG, &count, false);
+       if (amd_iommu_pc_get_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
+                                IOMMU_PC_COUNTER_REG, &count))
+               return;
 
        /* IOMMU pc counter register is only 48 bits */
-       count &= 0xFFFFFFFFFFFFULL;
+       count &= GENMASK_ULL(47, 0);
 
-       prev_raw_count =  local64_read(&hwc->prev_count);
-       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-                                       count) != prev_raw_count)
+       prev = local64_read(&hwc->prev_count);
+       if (local64_cmpxchg(&hwc->prev_count, prev, count) != prev)
                return;
 
-       /* Handling 48-bit counter overflowing */
-       delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT);
+       /* Handle 48-bit counter overflow */
+       delta = (count << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
        delta >>= COUNTER_SHIFT;
        local64_add(delta, &event->count);
-
 }
 
 static void perf_iommu_stop(struct perf_event *event, int flags)
 {
        struct hw_perf_event *hwc = &event->hw;
-       u64 config;
-
-       pr_debug("perf: amd_iommu:perf_iommu_stop\n");
 
        if (hwc->state & PERF_HES_UPTODATE)
                return;
@@ -361,7 +343,6 @@ static void perf_iommu_stop(struct perf_event *event, int flags)
        if (hwc->state & PERF_HES_UPTODATE)
                return;
 
-       config = hwc->config;
        perf_iommu_read(event);
        hwc->state |= PERF_HES_UPTODATE;
 }
@@ -369,17 +350,12 @@ static void perf_iommu_stop(struct perf_event *event, int flags)
 static int perf_iommu_add(struct perf_event *event, int flags)
 {
        int retval;
-       struct perf_amd_iommu *perf_iommu =
-                       container_of(event->pmu, struct perf_amd_iommu, pmu);
 
-       pr_debug("perf: amd_iommu:perf_iommu_add\n");
        event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
 
        /* request an iommu bank/counter */
-       retval = get_next_avail_iommu_bnk_cntr(perf_iommu);
-       if (retval != -ENOSPC)
-               event->hw.extra_reg.reg = (u16)retval;
-       else
+       retval = get_next_avail_iommu_bnk_cntr(event);
+       if (retval)
                return retval;
 
        if (flags & PERF_EF_START)
@@ -390,115 +366,124 @@ static int perf_iommu_add(struct perf_event *event, int flags)
 
 static void perf_iommu_del(struct perf_event *event, int flags)
 {
+       struct hw_perf_event *hwc = &event->hw;
        struct perf_amd_iommu *perf_iommu =
                        container_of(event->pmu, struct perf_amd_iommu, pmu);
 
-       pr_debug("perf: amd_iommu:perf_iommu_del\n");
        perf_iommu_stop(event, PERF_EF_UPDATE);
 
        /* clear the assigned iommu bank/counter */
        clear_avail_iommu_bnk_cntr(perf_iommu,
-                                    _GET_BANK(event),
-                                    _GET_CNTR(event));
+                                  hwc->iommu_bank, hwc->iommu_cntr);
 
        perf_event_update_userpage(event);
 }
 
-static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu)
+static __init int _init_events_attrs(void)
 {
-       struct attribute **attrs;
-       struct attribute_group *attr_group;
        int i = 0, j;
+       struct attribute **attrs;
 
        while (amd_iommu_v2_event_descs[i].attr.attr.name)
                i++;
 
-       attr_group = kzalloc(sizeof(struct attribute *)
-               * (i + 1) + sizeof(*attr_group), GFP_KERNEL);
-       if (!attr_group)
+       attrs = kzalloc(sizeof(struct attribute **) * (i + 1), GFP_KERNEL);
+       if (!attrs)
                return -ENOMEM;
 
-       attrs = (struct attribute **)(attr_group + 1);
        for (j = 0; j < i; j++)
                attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr;
 
-       attr_group->name = "events";
-       attr_group->attrs = attrs;
-       perf_iommu->events_group = attr_group;
-
+       amd_iommu_events_group.attrs = attrs;
        return 0;
 }
 
-static __init void amd_iommu_pc_exit(void)
-{
-       if (__perf_iommu.events_group != NULL) {
-               kfree(__perf_iommu.events_group);
-               __perf_iommu.events_group = NULL;
-       }
-}
+const struct attribute_group *amd_iommu_attr_groups[] = {
+       &amd_iommu_format_group,
+       &amd_iommu_cpumask_group,
+       &amd_iommu_events_group,
+       NULL,
+};
+
+static struct pmu iommu_pmu = {
+       .event_init     = perf_iommu_event_init,
+       .add            = perf_iommu_add,
+       .del            = perf_iommu_del,
+       .start          = perf_iommu_start,
+       .stop           = perf_iommu_stop,
+       .read           = perf_iommu_read,
+       .task_ctx_nr    = perf_invalid_context,
+       .attr_groups    = amd_iommu_attr_groups,
+};
 
-static __init int _init_perf_amd_iommu(
-       struct perf_amd_iommu *perf_iommu, char *name)
+static __init int init_one_iommu(unsigned int idx)
 {
+       struct perf_amd_iommu *perf_iommu;
        int ret;
 
+       perf_iommu = kzalloc(sizeof(struct perf_amd_iommu), GFP_KERNEL);
+       if (!perf_iommu)
+               return -ENOMEM;
+
        raw_spin_lock_init(&perf_iommu->lock);
 
-       /* Init format attributes */
-       perf_iommu->format_group = &amd_iommu_format_group;
+       perf_iommu->pmu          = iommu_pmu;
+       perf_iommu->iommu        = get_amd_iommu(idx);
+       perf_iommu->max_banks    = amd_iommu_pc_get_max_banks(idx);
+       perf_iommu->max_counters = amd_iommu_pc_get_max_counters(idx);
 
-       /* Init cpumask attributes to only core 0 */
-       cpumask_set_cpu(0, &iommu_cpumask);
-       perf_iommu->cpumask_group = &amd_iommu_cpumask_group;
-
-       /* Init events attributes */
-       if (_init_events_attrs(perf_iommu) != 0)
-               pr_err("perf: amd_iommu: Only support raw events.\n");
+       if (!perf_iommu->iommu ||
+           !perf_iommu->max_banks ||
+           !perf_iommu->max_counters) {
+               kfree(perf_iommu);
+               return -EINVAL;
+       }
 
-       /* Init null attributes */
-       perf_iommu->null_group = NULL;
-       perf_iommu->pmu.attr_groups = perf_iommu->attr_groups;
+       snprintf(perf_iommu->name, IOMMU_NAME_SIZE, "amd_iommu_%u", idx);
 
-       ret = perf_pmu_register(&perf_iommu->pmu, name, -1);
-       if (ret) {
-               pr_err("perf: amd_iommu: Failed to initialized.\n");
-               amd_iommu_pc_exit();
+       ret = perf_pmu_register(&perf_iommu->pmu, perf_iommu->name, -1);
+       if (!ret) {
+               pr_info("Detected AMD IOMMU #%d (%d banks, %d counters/bank).\n",
+                       idx, perf_iommu->max_banks, perf_iommu->max_counters);
+               list_add_tail(&perf_iommu->list, &perf_amd_iommu_list);
        } else {
-               pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n",
-                       amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID),
-                       amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID));
+               pr_warn("Error initializing IOMMU %d.\n", idx);
+               kfree(perf_iommu);
        }
-
        return ret;
 }
 
-static struct perf_amd_iommu __perf_iommu = {
-       .pmu = {
-               .task_ctx_nr    = perf_invalid_context,
-               .event_init     = perf_iommu_event_init,
-               .add            = perf_iommu_add,
-               .del            = perf_iommu_del,
-               .start          = perf_iommu_start,
-               .stop           = perf_iommu_stop,
-               .read           = perf_iommu_read,
-       },
-       .max_banks              = 0x00,
-       .max_counters           = 0x00,
-       .cntr_assign_mask       = 0ULL,
-       .format_group           = NULL,
-       .cpumask_group          = NULL,
-       .events_group           = NULL,
-       .null_group             = NULL,
-};
-
 static __init int amd_iommu_pc_init(void)
 {
+       unsigned int i, cnt = 0;
+       int ret;
+
        /* Make sure the IOMMU PC resource is available */
        if (!amd_iommu_pc_supported())
                return -ENODEV;
 
-       _init_perf_amd_iommu(&__perf_iommu, "amd_iommu");
+       ret = _init_events_attrs();
+       if (ret)
+               return ret;
+
+       /*
+        * An IOMMU PMU is specific to an IOMMU, and can function independently.
+        * So we go through all IOMMUs and ignore the one that fails init
+        * unless all IOMMU are failing.
+        */
+       for (i = 0; i < amd_iommu_get_num_iommus(); i++) {
+               ret = init_one_iommu(i);
+               if (!ret)
+                       cnt++;
+       }
+
+       if (!cnt) {
+               kfree(amd_iommu_events_group.attrs);
+               return -ENODEV;
+       }
 
+       /* Init cpumask attributes to only core 0 */
+       cpumask_set_cpu(0, &iommu_cpumask);
        return 0;
 }
 
index 845d173..62e0702 100644 (file)
 #define PC_MAX_SPEC_BNKS                       64
 #define PC_MAX_SPEC_CNTRS                      16
 
-/* iommu pc reg masks*/
-#define IOMMU_BASE_DEVID                       0x0000
+struct amd_iommu;
 
 /* amd_iommu_init.c external support functions */
+extern int amd_iommu_get_num_iommus(void);
+
 extern bool amd_iommu_pc_supported(void);
 
-extern u8 amd_iommu_pc_get_max_banks(u16 devid);
+extern u8 amd_iommu_pc_get_max_banks(unsigned int idx);
+
+extern u8 amd_iommu_pc_get_max_counters(unsigned int idx);
+
+extern int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
+                               u8 fxn, u64 *value);
 
-extern u8 amd_iommu_pc_get_max_counters(u16 devid);
+extern int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
+                               u8 fxn, u64 *value);
 
-extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr,
-                       u8 fxn, u64 *value, bool is_write);
+extern struct amd_iommu *get_amd_iommu(int idx);
 
 #endif /*_PERF_EVENT_AMD_IOMMU_H_*/
index 4d1f7f2..ad44af0 100644 (file)
@@ -30,6 +30,9 @@
 
 #define COUNTER_SHIFT          16
 
+#undef pr_fmt
+#define pr_fmt(fmt)    "amd_uncore: " fmt
+
 static int num_counters_llc;
 static int num_counters_nb;
 
@@ -509,51 +512,34 @@ static int __init amd_uncore_init(void)
        int ret = -ENODEV;
 
        if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
-               goto fail_nodev;
-
-       switch(boot_cpu_data.x86) {
-               case 23:
-                       /* Family 17h: */
-                       num_counters_nb = NUM_COUNTERS_NB;
-                       num_counters_llc = NUM_COUNTERS_L3;
-                       /*
-                        * For Family17h, the NorthBridge counters are
-                        * re-purposed as Data Fabric counters. Also, support is
-                        * added for L3 counters. The pmus are exported based on
-                        * family as either L2 or L3 and NB or DF.
-                        */
-                       amd_nb_pmu.name = "amd_df";
-                       amd_llc_pmu.name = "amd_l3";
-                       format_attr_event_df.show = &event_show_df;
-                       format_attr_event_l3.show = &event_show_l3;
-                       break;
-               case 22:
-                       /* Family 16h - may change: */
-                       num_counters_nb = NUM_COUNTERS_NB;
-                       num_counters_llc = NUM_COUNTERS_L2;
-                       amd_nb_pmu.name = "amd_nb";
-                       amd_llc_pmu.name = "amd_l2";
-                       format_attr_event_df = format_attr_event;
-                       format_attr_event_l3 = format_attr_event;
-                       break;
-               default:
-                       /*
-                        * All prior families have the same number of
-                        * NorthBridge and Last Level Cache counters
-                        */
-                       num_counters_nb = NUM_COUNTERS_NB;
-                       num_counters_llc = NUM_COUNTERS_L2;
-                       amd_nb_pmu.name = "amd_nb";
-                       amd_llc_pmu.name = "amd_l2";
-                       format_attr_event_df = format_attr_event;
-                       format_attr_event_l3 = format_attr_event;
-                       break;
-       }
-       amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df;
-       amd_llc_pmu.attr_groups = amd_uncore_attr_groups_l3;
+               return -ENODEV;
 
        if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
-               goto fail_nodev;
+               return -ENODEV;
+
+       if (boot_cpu_data.x86 == 0x17) {
+               /*
+                * For F17h, the Northbridge counters are repurposed as Data
+                * Fabric counters. L3 counters are supported as well. The PMUs
+                * are exported based on family as either L2 or L3 and NB or DF.
+                */
+               num_counters_nb           = NUM_COUNTERS_NB;
+               num_counters_llc          = NUM_COUNTERS_L3;
+               amd_nb_pmu.name           = "amd_df";
+               amd_llc_pmu.name          = "amd_l3";
+               format_attr_event_df.show = &event_show_df;
+               format_attr_event_l3.show = &event_show_l3;
+       } else {
+               num_counters_nb           = NUM_COUNTERS_NB;
+               num_counters_llc          = NUM_COUNTERS_L2;
+               amd_nb_pmu.name           = "amd_nb";
+               amd_llc_pmu.name          = "amd_l2";
+               format_attr_event_df      = format_attr_event;
+               format_attr_event_l3      = format_attr_event;
+       }
+
+       amd_nb_pmu.attr_groups  = amd_uncore_attr_groups_df;
+       amd_llc_pmu.attr_groups = amd_uncore_attr_groups_l3;
 
        if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
                amd_uncore_nb = alloc_percpu(struct amd_uncore *);
@@ -565,7 +551,7 @@ static int __init amd_uncore_init(void)
                if (ret)
                        goto fail_nb;
 
-               pr_info("perf: AMD NB counters detected\n");
+               pr_info("AMD NB counters detected\n");
                ret = 0;
        }
 
@@ -579,7 +565,7 @@ static int __init amd_uncore_init(void)
                if (ret)
                        goto fail_llc;
 
-               pr_info("perf: AMD LLC counters detected\n");
+               pr_info("AMD LLC counters detected\n");
                ret = 0;
        }
 
@@ -615,7 +601,6 @@ fail_nb:
        if (amd_uncore_nb)
                free_percpu(amd_uncore_nb);
 
-fail_nodev:
        return ret;
 }
 device_initcall(amd_uncore_init);
index 982c9e3..8ae8c5c 100644 (file)
@@ -63,7 +63,6 @@ struct bts_buffer {
        unsigned int    cur_buf;
        bool            snapshot;
        local_t         data_size;
-       local_t         lost;
        local_t         head;
        unsigned long   end;
        void            **data_pages;
@@ -199,7 +198,8 @@ static void bts_update(struct bts_ctx *bts)
                        return;
 
                if (ds->bts_index >= ds->bts_absolute_maximum)
-                       local_inc(&buf->lost);
+                       perf_aux_output_flag(&bts->handle,
+                                            PERF_AUX_FLAG_TRUNCATED);
 
                /*
                 * old and head are always in the same physical buffer, so we
@@ -276,7 +276,7 @@ static void bts_event_start(struct perf_event *event, int flags)
        return;
 
 fail_end_stop:
-       perf_aux_output_end(&bts->handle, 0, false);
+       perf_aux_output_end(&bts->handle, 0);
 
 fail_stop:
        event->hw.state = PERF_HES_STOPPED;
@@ -319,9 +319,8 @@ static void bts_event_stop(struct perf_event *event, int flags)
                                bts->handle.head =
                                        local_xchg(&buf->data_size,
                                                   buf->nr_pages << PAGE_SHIFT);
-
-                       perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
-                                           !!local_xchg(&buf->lost, 0));
+                       perf_aux_output_end(&bts->handle,
+                                           local_xchg(&buf->data_size, 0));
                }
 
                cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
@@ -484,8 +483,7 @@ int intel_bts_interrupt(void)
        if (old_head == local_read(&buf->head))
                return handled;
 
-       perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
-                           !!local_xchg(&buf->lost, 0));
+       perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0));
 
        buf = perf_aux_output_begin(&bts->handle, event);
        if (buf)
@@ -500,7 +498,7 @@ int intel_bts_interrupt(void)
                         * cleared handle::event
                         */
                        barrier();
-                       perf_aux_output_end(&bts->handle, 0, false);
+                       perf_aux_output_end(&bts->handle, 0);
                }
        }
 
index eb1484c..a6d91d4 100644 (file)
@@ -1553,6 +1553,27 @@ static __initconst const u64 slm_hw_cache_event_ids
  },
 };
 
+EVENT_ATTR_STR(topdown-total-slots, td_total_slots_glm, "event=0x3c");
+EVENT_ATTR_STR(topdown-total-slots.scale, td_total_slots_scale_glm, "3");
+/* UOPS_NOT_DELIVERED.ANY */
+EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles_glm, "event=0x9c");
+/* ISSUE_SLOTS_NOT_CONSUMED.RECOVERY */
+EVENT_ATTR_STR(topdown-recovery-bubbles, td_recovery_bubbles_glm, "event=0xca,umask=0x02");
+/* UOPS_RETIRED.ANY */
+EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired_glm, "event=0xc2");
+/* UOPS_ISSUED.ANY */
+EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued_glm, "event=0x0e");
+
+static struct attribute *glm_events_attrs[] = {
+       EVENT_PTR(td_total_slots_glm),
+       EVENT_PTR(td_total_slots_scale_glm),
+       EVENT_PTR(td_fetch_bubbles_glm),
+       EVENT_PTR(td_recovery_bubbles_glm),
+       EVENT_PTR(td_slots_issued_glm),
+       EVENT_PTR(td_slots_retired_glm),
+       NULL
+};
+
 static struct extra_reg intel_glm_extra_regs[] __read_mostly = {
        /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
        INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x760005ffbfull, RSP_0),
@@ -2130,7 +2151,7 @@ again:
         * counters from the GLOBAL_STATUS mask and we always process PEBS
         * events via drain_pebs().
         */
-       status &= ~cpuc->pebs_enabled;
+       status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
 
        /*
         * PEBS overflow sets bit 62 in the global status register
@@ -3750,6 +3771,7 @@ __init int intel_pmu_init(void)
                x86_pmu.pebs_prec_dist = true;
                x86_pmu.lbr_pt_coexist = true;
                x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+               x86_pmu.cpu_events = glm_events_attrs;
                pr_cont("Goldmont events, ");
                break;
 
index 9dfeeec..c6d23ff 100644 (file)
@@ -1222,7 +1222,7 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
 
                        /* clear non-PEBS bit and re-check */
                        pebs_status = p->status & cpuc->pebs_enabled;
-                       pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
+                       pebs_status &= PEBS_COUNTER_MASK;
                        if (pebs_status == (1 << bit))
                                return at;
                }
index 81b321a..f924629 100644 (file)
@@ -507,6 +507,9 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
                cpuc->lbr_entries[i].to         = msr_lastbranch.to;
                cpuc->lbr_entries[i].mispred    = 0;
                cpuc->lbr_entries[i].predicted  = 0;
+               cpuc->lbr_entries[i].in_tx      = 0;
+               cpuc->lbr_entries[i].abort      = 0;
+               cpuc->lbr_entries[i].cycles     = 0;
                cpuc->lbr_entries[i].reserved   = 0;
        }
        cpuc->lbr_stack.nr = i;
index 5900471..ae8324d 100644 (file)
@@ -28,6 +28,7 @@
 #include <asm/insn.h>
 #include <asm/io.h>
 #include <asm/intel_pt.h>
+#include <asm/intel-family.h>
 
 #include "../perf_event.h"
 #include "pt.h"
@@ -98,6 +99,7 @@ static struct attribute_group pt_cap_group = {
        .name   = "caps",
 };
 
+PMU_FORMAT_ATTR(pt,            "config:0"      );
 PMU_FORMAT_ATTR(cyc,           "config:1"      );
 PMU_FORMAT_ATTR(pwr_evt,       "config:4"      );
 PMU_FORMAT_ATTR(fup_on_ptw,    "config:5"      );
@@ -105,11 +107,13 @@ PMU_FORMAT_ATTR(mtc,              "config:9"      );
 PMU_FORMAT_ATTR(tsc,           "config:10"     );
 PMU_FORMAT_ATTR(noretcomp,     "config:11"     );
 PMU_FORMAT_ATTR(ptw,           "config:12"     );
+PMU_FORMAT_ATTR(branch,                "config:13"     );
 PMU_FORMAT_ATTR(mtc_period,    "config:14-17"  );
 PMU_FORMAT_ATTR(cyc_thresh,    "config:19-22"  );
 PMU_FORMAT_ATTR(psb_period,    "config:24-27"  );
 
 static struct attribute *pt_formats_attr[] = {
+       &format_attr_pt.attr,
        &format_attr_cyc.attr,
        &format_attr_pwr_evt.attr,
        &format_attr_fup_on_ptw.attr,
@@ -117,6 +121,7 @@ static struct attribute *pt_formats_attr[] = {
        &format_attr_tsc.attr,
        &format_attr_noretcomp.attr,
        &format_attr_ptw.attr,
+       &format_attr_branch.attr,
        &format_attr_mtc_period.attr,
        &format_attr_cyc_thresh.attr,
        &format_attr_psb_period.attr,
@@ -197,6 +202,19 @@ static int __init pt_pmu_hw_init(void)
                pt_pmu.tsc_art_den = eax;
        }
 
+       /* model-specific quirks */
+       switch (boot_cpu_data.x86_model) {
+       case INTEL_FAM6_BROADWELL_CORE:
+       case INTEL_FAM6_BROADWELL_XEON_D:
+       case INTEL_FAM6_BROADWELL_GT3E:
+       case INTEL_FAM6_BROADWELL_X:
+               /* not setting BRANCH_EN will #GP, erratum BDM106 */
+               pt_pmu.branch_en_always_on = true;
+               break;
+       default:
+               break;
+       }
+
        if (boot_cpu_has(X86_FEATURE_VMX)) {
                /*
                 * Intel SDM, 36.5 "Tracing post-VMXON" says that
@@ -263,8 +281,20 @@ fail:
 #define RTIT_CTL_PTW   (RTIT_CTL_PTW_EN        | \
                         RTIT_CTL_FUP_ON_PTW)
 
-#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN                | \
+/*
+ * Bit 0 (TraceEn) in the attr.config is meaningless as the
+ * corresponding bit in the RTIT_CTL can only be controlled
+ * by the driver; therefore, repurpose it to mean: pass
+ * through the bit that was previously assumed to be always
+ * on for PT, thereby allowing the user to *not* set it if
+ * they so wish. See also pt_event_valid() and pt_config().
+ */
+#define RTIT_CTL_PASSTHROUGH RTIT_CTL_TRACEEN
+
+#define PT_CONFIG_MASK (RTIT_CTL_TRACEEN       | \
+                       RTIT_CTL_TSC_EN         | \
                        RTIT_CTL_DISRETC        | \
+                       RTIT_CTL_BRANCH_EN      | \
                        RTIT_CTL_CYC_PSB        | \
                        RTIT_CTL_MTC            | \
                        RTIT_CTL_PWR_EVT_EN     | \
@@ -332,6 +362,33 @@ static bool pt_event_valid(struct perf_event *event)
                        return false;
        }
 
+       /*
+        * Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config
+        * clears the assumption that BranchEn must always be enabled,
+        * as was the case with the first implementation of PT.
+        * If this bit is not set, the legacy behavior is preserved
+        * for compatibility with the older userspace.
+        *
+        * Re-using bit 0 for this purpose is fine because it is never
+        * directly set by the user; previous attempts at setting it in
+        * the attr.config resulted in -EINVAL.
+        */
+       if (config & RTIT_CTL_PASSTHROUGH) {
+               /*
+                * Disallow not setting BRANCH_EN where BRANCH_EN is
+                * always required.
+                */
+               if (pt_pmu.branch_en_always_on &&
+                   !(config & RTIT_CTL_BRANCH_EN))
+                       return false;
+       } else {
+               /*
+                * Disallow BRANCH_EN without the PASSTHROUGH.
+                */
+               if (config & RTIT_CTL_BRANCH_EN)
+                       return false;
+       }
+
        return true;
 }
 
@@ -411,6 +468,7 @@ static u64 pt_config_filters(struct perf_event *event)
 
 static void pt_config(struct perf_event *event)
 {
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
        u64 reg;
 
        if (!event->hw.itrace_started) {
@@ -419,7 +477,20 @@ static void pt_config(struct perf_event *event)
        }
 
        reg = pt_config_filters(event);
-       reg |= RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
+       reg |= RTIT_CTL_TOPA | RTIT_CTL_TRACEEN;
+
+       /*
+        * Previously, we had BRANCH_EN on by default, but now that PT has
+        * grown features outside of branch tracing, it is useful to allow
+        * the user to disable it. Setting bit 0 in the event's attr.config
+        * allows BRANCH_EN to pass through instead of being always on. See
+        * also the comment in pt_event_valid().
+        */
+       if (event->attr.config & BIT(0)) {
+               reg |= event->attr.config & RTIT_CTL_BRANCH_EN;
+       } else {
+               reg |= RTIT_CTL_BRANCH_EN;
+       }
 
        if (!event->attr.exclude_kernel)
                reg |= RTIT_CTL_OS;
@@ -429,11 +500,15 @@ static void pt_config(struct perf_event *event)
        reg |= (event->attr.config & PT_CONFIG_MASK);
 
        event->hw.config = reg;
-       wrmsrl(MSR_IA32_RTIT_CTL, reg);
+       if (READ_ONCE(pt->vmx_on))
+               perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL);
+       else
+               wrmsrl(MSR_IA32_RTIT_CTL, reg);
 }
 
 static void pt_config_stop(struct perf_event *event)
 {
+       struct pt *pt = this_cpu_ptr(&pt_ctx);
        u64 ctl = READ_ONCE(event->hw.config);
 
        /* may be already stopped by a PMI */
@@ -441,7 +516,8 @@ static void pt_config_stop(struct perf_event *event)
                return;
 
        ctl &= ~RTIT_CTL_TRACEEN;
-       wrmsrl(MSR_IA32_RTIT_CTL, ctl);
+       if (!READ_ONCE(pt->vmx_on))
+               wrmsrl(MSR_IA32_RTIT_CTL, ctl);
 
        WRITE_ONCE(event->hw.config, ctl);
 
@@ -753,7 +829,8 @@ static void pt_handle_status(struct pt *pt)
                 */
                if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
                    buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
-                       local_inc(&buf->lost);
+                       perf_aux_output_flag(&pt->handle,
+                                            PERF_AUX_FLAG_TRUNCATED);
                        advance++;
                }
        }
@@ -846,8 +923,10 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf,
 
        /* can't stop in the middle of an output region */
        if (buf->output_off + handle->size + 1 <
-           sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
+           sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
+               perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
                return -EINVAL;
+       }
 
 
        /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
@@ -1171,12 +1250,6 @@ void intel_pt_interrupt(void)
        if (!READ_ONCE(pt->handle_nmi))
                return;
 
-       /*
-        * If VMX is on and PT does not support it, don't touch anything.
-        */
-       if (READ_ONCE(pt->vmx_on))
-               return;
-
        if (!event)
                return;
 
@@ -1192,8 +1265,7 @@ void intel_pt_interrupt(void)
 
        pt_update_head(pt);
 
-       perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
-                           local_xchg(&buf->lost, 0));
+       perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
 
        if (!event->hw.state) {
                int ret;
@@ -1208,7 +1280,7 @@ void intel_pt_interrupt(void)
                /* snapshot counters don't use PMI, so it's safe */
                ret = pt_buffer_reset_markers(buf, &pt->handle);
                if (ret) {
-                       perf_aux_output_end(&pt->handle, 0, true);
+                       perf_aux_output_end(&pt->handle, 0);
                        return;
                }
 
@@ -1237,12 +1309,19 @@ void intel_pt_handle_vmx(int on)
        local_irq_save(flags);
        WRITE_ONCE(pt->vmx_on, on);
 
-       if (on) {
-               /* prevent pt_config_stop() from writing RTIT_CTL */
-               event = pt->handle.event;
-               if (event)
-                       event->hw.config = 0;
-       }
+       /*
+        * If an AUX transaction is in progress, it will contain
+        * gap(s), so flag it PARTIAL to inform the user.
+        */
+       event = pt->handle.event;
+       if (event)
+               perf_aux_output_flag(&pt->handle,
+                                    PERF_AUX_FLAG_PARTIAL);
+
+       /* Turn PTs back on */
+       if (!on && event)
+               wrmsrl(MSR_IA32_RTIT_CTL, event->hw.config);
+
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
@@ -1257,9 +1336,6 @@ static void pt_event_start(struct perf_event *event, int mode)
        struct pt *pt = this_cpu_ptr(&pt_ctx);
        struct pt_buffer *buf;
 
-       if (READ_ONCE(pt->vmx_on))
-               return;
-
        buf = perf_aux_output_begin(&pt->handle, event);
        if (!buf)
                goto fail_stop;
@@ -1280,7 +1356,7 @@ static void pt_event_start(struct perf_event *event, int mode)
        return;
 
 fail_end_stop:
-       perf_aux_output_end(&pt->handle, 0, true);
+       perf_aux_output_end(&pt->handle, 0);
 fail_stop:
        hwc->state = PERF_HES_STOPPED;
 }
@@ -1321,8 +1397,7 @@ static void pt_event_stop(struct perf_event *event, int mode)
                        pt->handle.head =
                                local_xchg(&buf->data_size,
                                           buf->nr_pages << PAGE_SHIFT);
-               perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
-                                   local_xchg(&buf->lost, 0));
+               perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
        }
 }
 
index 53473c2..0eb41d0 100644 (file)
@@ -110,6 +110,7 @@ struct pt_pmu {
        struct pmu              pmu;
        u32                     caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
        bool                    vmx;
+       bool                    branch_en_always_on;
        unsigned long           max_nonturbo_ratio;
        unsigned int            tsc_art_num;
        unsigned int            tsc_art_den;
@@ -143,7 +144,6 @@ struct pt_buffer {
        size_t                  output_off;
        unsigned long           nr_pages;
        local_t                 data_size;
-       local_t                 lost;
        local64_t               head;
        bool                    snapshot;
        unsigned long           stop_pos, intr_pos;
index bcbb1d2..be3d362 100644 (file)
@@ -79,6 +79,7 @@ struct amd_nb {
 
 /* The maximal number of PEBS events: */
 #define MAX_PEBS_EVENTS                8
+#define PEBS_COUNTER_MASK      ((1ULL << MAX_PEBS_EVENTS) - 1)
 
 /*
  * Flags PEBS can handle without an PMI.
index 395b695..2efc768 100644 (file)
@@ -52,6 +52,8 @@ extern u8 acpi_sci_flags;
 extern int acpi_sci_override_gsi;
 void acpi_pic_sci_set_trigger(unsigned int, u16);
 
+struct device;
+
 extern int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
                                  int trigger, int polarity);
 extern void (*__acpi_unregister_gsi)(u32 gsi);
index 14635c5..caa5798 100644 (file)
@@ -186,6 +186,12 @@ static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new)
        return cmpxchg(&v->counter, old, new);
 }
 
+#define atomic_try_cmpxchg atomic_try_cmpxchg
+static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new)
+{
+       return try_cmpxchg(&v->counter, old, new);
+}
+
 static inline int atomic_xchg(atomic_t *v, int new)
 {
        return xchg(&v->counter, new);
@@ -201,16 +207,12 @@ static inline void atomic_##op(int i, atomic_t *v)                        \
 }
 
 #define ATOMIC_FETCH_OP(op, c_op)                                      \
-static inline int atomic_fetch_##op(int i, atomic_t *v)                \
+static inline int atomic_fetch_##op(int i, atomic_t *v)                        \
 {                                                                      \
-       int old, val = atomic_read(v);                                  \
-       for (;;) {                                                      \
-               old = atomic_cmpxchg(v, val, val c_op i);               \
-               if (old == val)                                         \
-                       break;                                          \
-               val = old;                                              \
-       }                                                               \
-       return old;                                                     \
+       int val = atomic_read(v);                                       \
+       do {                                                            \
+       } while (!atomic_try_cmpxchg(v, &val, val c_op i));             \
+       return val;                                                     \
 }
 
 #define ATOMIC_OPS(op, c_op)                                           \
@@ -236,16 +238,11 @@ ATOMIC_OPS(xor, ^)
  */
 static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
 {
-       int c, old;
-       c = atomic_read(v);
-       for (;;) {
-               if (unlikely(c == (u)))
-                       break;
-               old = atomic_cmpxchg((v), c, c + (a));
-               if (likely(old == c))
+       int c = atomic_read(v);
+       do {
+               if (unlikely(c == u))
                        break;
-               c = old;
-       }
+       } while (!atomic_try_cmpxchg(v, &c, c + a));
        return c;
 }
 
index 89ed2f6..6189a43 100644 (file)
@@ -176,6 +176,12 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
        return cmpxchg(&v->counter, old, new);
 }
 
+#define atomic64_try_cmpxchg atomic64_try_cmpxchg
+static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new)
+{
+       return try_cmpxchg(&v->counter, old, new);
+}
+
 static inline long atomic64_xchg(atomic64_t *v, long new)
 {
        return xchg(&v->counter, new);
@@ -192,17 +198,12 @@ static inline long atomic64_xchg(atomic64_t *v, long new)
  */
 static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
 {
-       long c, old;
-       c = atomic64_read(v);
-       for (;;) {
-               if (unlikely(c == (u)))
-                       break;
-               old = atomic64_cmpxchg((v), c, c + (a));
-               if (likely(old == c))
-                       break;
-               c = old;
-       }
-       return c != (u);
+       long c = atomic64_read(v);
+       do {
+               if (unlikely(c == u))
+                       return false;
+       } while (!atomic64_try_cmpxchg(v, &c, c + a));
+       return true;
 }
 
 #define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
@@ -216,17 +217,12 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
  */
 static inline long atomic64_dec_if_positive(atomic64_t *v)
 {
-       long c, old, dec;
-       c = atomic64_read(v);
-       for (;;) {
+       long dec, c = atomic64_read(v);
+       do {
                dec = c - 1;
                if (unlikely(dec < 0))
                        break;
-               old = atomic64_cmpxchg((v), c, dec);
-               if (likely(old == c))
-                       break;
-               c = old;
-       }
+       } while (!atomic64_try_cmpxchg(v, &c, dec));
        return dec;
 }
 
@@ -242,14 +238,10 @@ static inline void atomic64_##op(long i, atomic64_t *v)                   \
 #define ATOMIC64_FETCH_OP(op, c_op)                                    \
 static inline long atomic64_fetch_##op(long i, atomic64_t *v)          \
 {                                                                      \
-       long old, val = atomic64_read(v);                               \
-       for (;;) {                                                      \
-               old = atomic64_cmpxchg(v, val, val c_op i);             \
-               if (old == val)                                         \
-                       break;                                          \
-               val = old;                                              \
-       }                                                               \
-       return old;                                                     \
+       long val = atomic64_read(v);                                    \
+       do {                                                            \
+       } while (!atomic64_try_cmpxchg(v, &val, val c_op i));           \
+       return val;                                                     \
 }
 
 #define ATOMIC64_OPS(op, c_op)                                         \
index 97848cd..d90296d 100644 (file)
@@ -153,6 +153,76 @@ extern void __add_wrong_size(void)
 #define cmpxchg_local(ptr, old, new)                                   \
        __cmpxchg_local(ptr, old, new, sizeof(*(ptr)))
 
+
+#define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock)               \
+({                                                                     \
+       bool success;                                                   \
+       __typeof__(_ptr) _old = (_pold);                                \
+       __typeof__(*(_ptr)) __old = *_old;                              \
+       __typeof__(*(_ptr)) __new = (_new);                             \
+       switch (size) {                                                 \
+       case __X86_CASE_B:                                              \
+       {                                                               \
+               volatile u8 *__ptr = (volatile u8 *)(_ptr);             \
+               asm volatile(lock "cmpxchgb %[new], %[ptr]"             \
+                            CC_SET(z)                                  \
+                            : CC_OUT(z) (success),                     \
+                              [ptr] "+m" (*__ptr),                     \
+                              [old] "+a" (__old)                       \
+                            : [new] "q" (__new)                        \
+                            : "memory");                               \
+               break;                                                  \
+       }                                                               \
+       case __X86_CASE_W:                                              \
+       {                                                               \
+               volatile u16 *__ptr = (volatile u16 *)(_ptr);           \
+               asm volatile(lock "cmpxchgw %[new], %[ptr]"             \
+                            CC_SET(z)                                  \
+                            : CC_OUT(z) (success),                     \
+                              [ptr] "+m" (*__ptr),                     \
+                              [old] "+a" (__old)                       \
+                            : [new] "r" (__new)                        \
+                            : "memory");                               \
+               break;                                                  \
+       }                                                               \
+       case __X86_CASE_L:                                              \
+       {                                                               \
+               volatile u32 *__ptr = (volatile u32 *)(_ptr);           \
+               asm volatile(lock "cmpxchgl %[new], %[ptr]"             \
+                            CC_SET(z)                                  \
+                            : CC_OUT(z) (success),                     \
+                              [ptr] "+m" (*__ptr),                     \
+                              [old] "+a" (__old)                       \
+                            : [new] "r" (__new)                        \
+                            : "memory");                               \
+               break;                                                  \
+       }                                                               \
+       case __X86_CASE_Q:                                              \
+       {                                                               \
+               volatile u64 *__ptr = (volatile u64 *)(_ptr);           \
+               asm volatile(lock "cmpxchgq %[new], %[ptr]"             \
+                            CC_SET(z)                                  \
+                            : CC_OUT(z) (success),                     \
+                              [ptr] "+m" (*__ptr),                     \
+                              [old] "+a" (__old)                       \
+                            : [new] "r" (__new)                        \
+                            : "memory");                               \
+               break;                                                  \
+       }                                                               \
+       default:                                                        \
+               __cmpxchg_wrong_size();                                 \
+       }                                                               \
+       if (unlikely(!success))                                         \
+               *_old = __old;                                          \
+       likely(success);                                                \
+})
+
+#define __try_cmpxchg(ptr, pold, new, size)                            \
+       __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX)
+
+#define try_cmpxchg(ptr, pold, new)                                    \
+       __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))
+
 /*
  * xadd() adds "inc" to "*ptr" and atomically returns the previous
  * value of "*ptr".
index 25d7f52..2701e5f 100644 (file)
  * Reuse free bits when adding new feature flags!
  */
 #define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */
+#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */
 #define X86_FEATURE_CPB                ( 7*32+ 2) /* AMD Core Performance Boost */
 #define X86_FEATURE_EPB                ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
 #define X86_FEATURE_CAT_L3     ( 7*32+ 4) /* Cache Allocation Technology L3 */
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
deleted file mode 100644 (file)
index 67313f3..0000000
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef _ASM_X86_E820_H
-#define _ASM_X86_E820_H
-
-/*
- * E820_X_MAX is the maximum size of the extended E820 table.  The extended
- * table may contain up to 3 extra E820 entries per possible NUMA node, so we
- * make room for 3 * MAX_NUMNODES possible entries, beyond the standard 128.
- * Also note that E820_X_MAX *must* be defined before we include uapi/asm/e820.h.
- */
-#include <linux/numa.h>
-#define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES)
-
-#include <uapi/asm/e820.h>
-
-#ifndef __ASSEMBLY__
-/* see comment in arch/x86/kernel/e820.c */
-extern struct e820map *e820;
-extern struct e820map *e820_saved;
-
-extern unsigned long pci_mem_start;
-extern int e820_any_mapped(u64 start, u64 end, unsigned type);
-extern int e820_all_mapped(u64 start, u64 end, unsigned type);
-extern void e820_add_region(u64 start, u64 size, int type);
-extern void e820_print_map(char *who);
-extern int
-sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map);
-extern u64 e820_update_range(u64 start, u64 size, unsigned old_type,
-                              unsigned new_type);
-extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type,
-                            int checktype);
-extern void update_e820(void);
-extern void e820_setup_gap(void);
-struct setup_data;
-extern void parse_e820_ext(u64 phys_addr, u32 data_len);
-
-#if defined(CONFIG_X86_64) || \
-       (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
-extern void e820_mark_nosave_regions(unsigned long limit_pfn);
-#else
-static inline void e820_mark_nosave_regions(unsigned long limit_pfn)
-{
-}
-#endif
-
-extern unsigned long e820_end_of_ram_pfn(void);
-extern unsigned long e820_end_of_low_ram_pfn(void);
-extern u64 early_reserve_e820(u64 sizet, u64 align);
-
-void memblock_x86_fill(void);
-void memblock_find_dma_reserve(void);
-
-extern void finish_e820_parsing(void);
-extern void e820_reserve_resources(void);
-extern void e820_reserve_resources_late(void);
-extern void setup_memory_map(void);
-extern char *default_machine_specific_memory_setup(void);
-
-extern void e820_reallocate_tables(void);
-
-/*
- * Returns true iff the specified range [s,e) is completely contained inside
- * the ISA region.
- */
-static inline bool is_ISA_range(u64 s, u64 e)
-{
-       return s >= ISA_START_ADDRESS && e <= ISA_END_ADDRESS;
-}
-
-#endif /* __ASSEMBLY__ */
-#include <linux/ioport.h>
-
-#define HIGH_MEMORY    (1024*1024)
-#endif /* _ASM_X86_E820_H */
diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
new file mode 100644 (file)
index 0000000..8e0f8b8
--- /dev/null
@@ -0,0 +1,50 @@
+#ifndef _ASM_E820_API_H
+#define _ASM_E820_API_H
+
+#include <asm/e820/types.h>
+
+extern struct e820_table *e820_table;
+extern struct e820_table *e820_table_firmware;
+
+extern unsigned long pci_mem_start;
+
+extern bool e820__mapped_any(u64 start, u64 end, enum e820_type type);
+extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type);
+
+extern void e820__range_add   (u64 start, u64 size, enum e820_type type);
+extern u64  e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type);
+extern u64  e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type);
+
+extern void e820__print_table(char *who);
+extern int  e820__update_table(struct e820_table *table);
+extern void e820__update_table_print(void);
+
+extern unsigned long e820__end_of_ram_pfn(void);
+extern unsigned long e820__end_of_low_ram_pfn(void);
+
+extern u64  e820__memblock_alloc_reserved(u64 size, u64 align);
+extern void e820__memblock_setup(void);
+
+extern void e820__reserve_setup_data(void);
+extern void e820__finish_early_params(void);
+extern void e820__reserve_resources(void);
+extern void e820__reserve_resources_late(void);
+
+extern void e820__memory_setup(void);
+extern void e820__memory_setup_extended(u64 phys_addr, u32 data_len);
+extern char *e820__memory_setup_default(void);
+extern void e820__setup_pci_gap(void);
+
+extern void e820__reallocate_tables(void);
+extern void e820__register_nosave_regions(unsigned long limit_pfn);
+
+/*
+ * Returns true iff the half-open range [start,end) lies completely inside the
+ * ISA region, i.e. start >= ISA_START_ADDRESS and end <= ISA_END_ADDRESS.
+ */
+static inline bool is_ISA_range(u64 start, u64 end)
+{
+       return start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS;
+}
+
+#endif /* _ASM_E820_API_H */
diff --git a/arch/x86/include/asm/e820/types.h b/arch/x86/include/asm/e820/types.h
new file mode 100644 (file)
index 0000000..4adeed0
--- /dev/null
@@ -0,0 +1,104 @@
+#ifndef _ASM_E820_TYPES_H
+#define _ASM_E820_TYPES_H
+
+#include <uapi/asm/bootparam.h>
+
+/*
+ * These are the E820 types known to the kernel (there is no entry for 6):
+ */
+enum e820_type {
+       E820_TYPE_RAM           = 1,
+       E820_TYPE_RESERVED      = 2,
+       E820_TYPE_ACPI          = 3,
+       E820_TYPE_NVS           = 4,
+       E820_TYPE_UNUSABLE      = 5,
+       E820_TYPE_PMEM          = 7,
+
+       /*
+        * This is a non-standardized way to represent ADR or
+        * NVDIMM regions that persist over a reboot.
+        *
+        * The kernel will ignore their special capabilities
+        * unless the CONFIG_X86_PMEM_LEGACY=y option is set.
+        *
+        * ( Note that older platforms also used 6 for the same
+        *   type of memory, but newer versions switched to 12 as
+        *   6 was assigned differently. Some time they will learn... )
+        */
+       E820_TYPE_PRAM          = 12,
+
+       /*
+        * Reserved RAM used by the kernel itself. If
+        * CONFIG_INTEL_TXT=y is enabled, memory of this type
+        * will be included in the S3 integrity calculation
+        * and so should not include any memory that the BIOS
+        * might alter over the S3 transition:
+        */
+       E820_TYPE_RESERVED_KERN = 128,
+};
+
+/*
+ * A single E820 map entry, describing a memory range of [addr...addr+size-1],
+ * of 'type' memory type:
+ *
+ * (We pack it because there can be thousands of them on large systems.)
+ */
+struct e820_entry {
+       u64                     addr;   /* first address covered by this entry */
+       u64                     size;   /* length of the range; last covered address is addr + size - 1 */
+       enum e820_type          type;   /* memory type of the whole range */
+} __attribute__((packed));
+
+/*
+ * The legacy E820 BIOS limits us to 128 (E820_MAX_ENTRIES_ZEROPAGE) nodes
+ * due to the constrained space in the zeropage.
+ *
+ * On large systems we can easily have thousands of nodes with RAM,
+ * which cannot be fit into so few entries - so we have a mechanism
+ * to extend the e820 table size at build-time, via the E820_MAX_ENTRIES
+ * define below.
+ *
+ * ( Those extra entries are enumerated via the EFI memory map, not
+ *   via the legacy zeropage mechanism. )
+ *
+ * Size our internal memory map tables to have room for these additional
+ * entries, based on a heuristic calculation: up to three entries per
+ * NUMA node, plus E820_MAX_ENTRIES_ZEROPAGE for some extra space.
+ *
+ * This allows for bootstrap/firmware quirks such as possible duplicate
+ * E820 entries that might need room in the same arrays, prior to the
+ * call to e820__update_table() to remove duplicates.  The allowance
+ * of three memory map entries per node is "enough" entries for
+ * the initial hardware platform motivating this mechanism to make
+ * use of additional EFI map entries.  Future platforms may want
+ * to allow more than three entries per node or otherwise refine
+ * this size.
+ */
+
+#include <linux/numa.h>
+
+#define E820_MAX_ENTRIES       (E820_MAX_ENTRIES_ZEROPAGE + 3*MAX_NUMNODES)
+
+/*
+ * The whole array of E820 entries:
+ */
+struct e820_table {
+       __u32 nr_entries;       /* presumably the number of entries[] slots in use - TODO confirm against users */
+       struct e820_entry entries[E820_MAX_ENTRIES];    /* fixed capacity: E820_MAX_ENTRIES elements */
+};
+
+/*
+ * Various well-known legacy memory ranges in physical memory:
+ */
+#define ISA_START_ADDRESS      0x000a0000
+#define ISA_END_ADDRESS                0x00100000
+
+#define BIOS_BEGIN             0x000a0000
+#define BIOS_END               0x00100000
+
+#define HIGH_MEMORY            0x00100000
+
+#define BIOS_ROM_BASE          0xffe00000
+#define BIOS_ROM_END           0xffffffff
+
+#endif /* _ASM_E820_TYPES_H */
index 156cd5d..1d26809 100644 (file)
@@ -1,7 +1,7 @@
 #ifndef _ASM_X86_GART_H
 #define _ASM_X86_GART_H
 
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 
 extern void set_up_gart_resume(u32, u32);
 
@@ -97,7 +97,7 @@ static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
                printk(KERN_INFO "Aperture beyond 4GB. Ignoring.\n");
                return 0;
        }
-       if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
+       if (e820__mapped_any(aper_base, aper_base + aper_size, E820_TYPE_RAM)) {
                printk(KERN_INFO "Aperture pointing to e820 RAM. Ignoring.\n");
                return 0;
        }
index 2005816..34b984c 100644 (file)
@@ -72,14 +72,13 @@ struct arch_specific_insn {
        /* copy of the original instruction */
        kprobe_opcode_t *insn;
        /*
-        * boostable = -1: This instruction type is not boostable.
-        * boostable = 0: This instruction type is boostable.
-        * boostable = 1: This instruction has been boosted: we have
+        * boostable = false: This instruction type is not boostable.
+        * boostable = true: This instruction has been boosted: we have
         * added a relative jump after the instruction copy in insn,
         * so no single-step and fixup are needed (unless there's
         * a post_handler or break_handler).
         */
-       int boostable;
+       bool boostable;
        bool if_modifier;
 };
 
index e638736..4fd5195 100644 (file)
  * debugging tools.  Each entry is only valid when its finished flag
  * is set.
  */
-struct mce_log {
+struct mce_log_buffer {
        char signature[12]; /* "MACHINECHECK" */
        unsigned len;       /* = MCE_LOG_LEN */
        unsigned next;
@@ -191,10 +191,12 @@ extern struct mca_config mca_cfg;
 extern struct mca_msr_regs msr_ops;
 
 enum mce_notifier_prios {
-       MCE_PRIO_SRAO           = INT_MAX,
-       MCE_PRIO_EXTLOG         = INT_MAX - 1,
-       MCE_PRIO_NFIT           = INT_MAX - 2,
-       MCE_PRIO_EDAC           = INT_MAX - 3,
+       MCE_PRIO_FIRST          = INT_MAX,
+       MCE_PRIO_SRAO           = INT_MAX - 1,
+       MCE_PRIO_EXTLOG         = INT_MAX - 2,
+       MCE_PRIO_NFIT           = INT_MAX - 3,
+       MCE_PRIO_EDAC           = INT_MAX - 4,
+       MCE_PRIO_MCELOG         = 1,
        MCE_PRIO_LOWEST         = 0,
 };
 
index 3200704..831eb78 100644 (file)
@@ -64,7 +64,7 @@ static inline void find_smp_config(void)
 }
 
 #ifdef CONFIG_X86_MPPARSE
-extern void early_reserve_e820_mpc_new(void);
+extern void e820__memblock_alloc_reserved_mpc_new(void);
 extern int enable_update_mptable;
 extern int default_mpc_apic_id(struct mpc_cpu *m);
 extern void default_smp_read_mpc_oem(struct mpc_table *mpc);
@@ -76,7 +76,7 @@ extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str);
 extern void default_find_smp_config(void);
 extern void default_get_smp_config(unsigned int early);
 #else
-static inline void early_reserve_e820_mpc_new(void) { }
+static inline void e820__memblock_alloc_reserved_mpc_new(void) { }
 #define enable_update_mptable 0
 #define default_mpc_apic_id NULL
 #define default_smp_read_mpc_oem NULL
index d8b5f8a..673f9ac 100644 (file)
@@ -45,6 +45,8 @@
 #define MSR_IA32_PERFCTR1              0x000000c2
 #define MSR_FSB_FREQ                   0x000000cd
 #define MSR_PLATFORM_INFO              0x000000ce
+#define MSR_PLATFORM_INFO_CPUID_FAULT_BIT      31
+#define MSR_PLATFORM_INFO_CPUID_FAULT          BIT_ULL(MSR_PLATFORM_INFO_CPUID_FAULT_BIT)
 
 #define MSR_PKG_CST_CONFIG_CONTROL     0x000000e2
 #define NHM_C3_AUTO_DEMOTE             (1UL << 25)
 
 /* DEBUGCTLMSR bits (others vary by model): */
 #define DEBUGCTLMSR_LBR                        (1UL <<  0) /* last branch recording */
+#define DEBUGCTLMSR_BTF_SHIFT          1
 #define DEBUGCTLMSR_BTF                        (1UL <<  1) /* single-step on branches */
 #define DEBUGCTLMSR_TR                 (1UL <<  6)
 #define DEBUGCTLMSR_BTS                        (1UL <<  7)
 #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT       39
 #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE           (1ULL << MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT)
 
-/* MISC_FEATURE_ENABLES non-architectural features */
-#define MSR_MISC_FEATURE_ENABLES       0x00000140
+/* MISC_FEATURES_ENABLES non-architectural features */
+#define MSR_MISC_FEATURES_ENABLES      0x00000140
 
-#define MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT                1
+#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT      0
+#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT          BIT_ULL(MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT)
+#define MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT       1
 
 #define MSR_IA32_TSC_DEADLINE          0x000006E0
 
index d08eacd..9f1b21f 100644 (file)
@@ -4,6 +4,8 @@
  *     (c) 1999 Martin Mares <mj@ucw.cz>
  */
 
+#include <linux/ioport.h>
+
 #undef DEBUG
 
 #ifdef DEBUG
index 585ee0d..2197e53 100644 (file)
@@ -2,8 +2,6 @@
 #define _ASM_X86_PGTABLE_H
 
 #include <asm/page.h>
-#include <asm/e820.h>
-
 #include <asm/pgtable_types.h>
 
 /*
@@ -845,6 +843,7 @@ static inline int pgd_none(pgd_t pgd)
 extern int direct_gbpages;
 void init_mem_mapping(void);
 void early_alloc_pgt_buf(void);
+extern void memblock_find_dma_reserve(void);
 
 #ifdef CONFIG_X86_64
 /* Realmode trampoline initialization. */
index 2c1ebeb..529bb4a 100644 (file)
@@ -55,7 +55,8 @@ static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n)
  * @size:      number of bytes to write back
  *
  * Write back a cache range using the CLWB (cache line write back)
- * instruction.
+ * instruction. Note that @size is internally rounded up to be cache
+ * line size aligned.
  */
 static inline void arch_wb_cache_pmem(void *addr, size_t size)
 {
@@ -69,15 +70,6 @@ static inline void arch_wb_cache_pmem(void *addr, size_t size)
                clwb(p);
 }
 
-/*
- * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec
- * iterators, so for other types (bvec & kvec) we must do a cache write-back.
- */
-static inline bool __iter_needs_pmem_wb(struct iov_iter *i)
-{
-       return iter_is_iovec(i) == false;
-}
-
 /**
  * arch_copy_from_iter_pmem - copy data from an iterator to PMEM
  * @addr:      PMEM destination address
@@ -94,7 +86,35 @@ static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes,
        /* TODO: skip the write-back by always using non-temporal stores */
        len = copy_from_iter_nocache(addr, bytes, i);
 
-       if (__iter_needs_pmem_wb(i))
+       /*
+        * In the iovec case on x86_64 copy_from_iter_nocache() uses
+        * non-temporal stores for the bulk of the transfer, but we need
+        * to manually flush if the transfer is unaligned. A cached
+        * memory copy is used when destination or size is not naturally
+        * aligned. That is:
+        *   - Require 8-byte alignment when size is 8 bytes or larger.
+        *   - Require 4-byte alignment when size is 4 bytes.
+        *
+        * In the non-iovec case the entire destination needs to be
+        * flushed.
+        */
+       if (iter_is_iovec(i)) {
+               unsigned long flushed, dest = (unsigned long) addr;
+
+               if (bytes < 8) {
+                       if (!IS_ALIGNED(dest, 4) || (bytes != 4))
+                               arch_wb_cache_pmem(addr, 1);
+               } else {
+                       if (!IS_ALIGNED(dest, 8)) {
+                               dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
+                               arch_wb_cache_pmem(addr, 1);
+                       }
+
+                       flushed = dest - (unsigned long) addr;
+                       if (bytes > flushed && !IS_ALIGNED(bytes - flushed, 8))
+                               arch_wb_cache_pmem(addr + bytes - 1, 1);
+               }
+       } else
                arch_wb_cache_pmem(addr, bytes);
 
        return len;
index 4aa93b5..78defd0 100644 (file)
@@ -877,6 +877,8 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
 extern int get_tsc_mode(unsigned long adr);
 extern int set_tsc_mode(unsigned int val);
 
+DECLARE_PER_CPU(u64, msr_misc_features_shadow);
+
 /* Register/unregister a process' MPX related resource */
 #define MPX_ENABLE_MANAGEMENT()        mpx_enable_management()
 #define MPX_DISABLE_MANAGEMENT()       mpx_disable_management()
index 9b9b30b..8d3964f 100644 (file)
@@ -9,6 +9,7 @@ void syscall_init(void);
 
 #ifdef CONFIG_X86_64
 void entry_SYSCALL_64(void);
+long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2);
 #endif
 
 #ifdef CONFIG_X86_32
@@ -30,6 +31,7 @@ void x86_report_nx(void);
 
 extern int reboot_force;
 
-long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
+long do_arch_prctl_common(struct task_struct *task, int option,
+                         unsigned long cpuid_enabled);
 
 #endif /* _ASM_X86_PROTO_H */
index 2cb1cc2..fc62ba8 100644 (file)
@@ -15,6 +15,7 @@ struct machine_ops {
 };
 
 extern struct machine_ops machine_ops;
+extern int crashing_cpu;
 
 void native_machine_crash_shutdown(struct pt_regs *regs);
 void native_machine_shutdown(void);
index ad6f5eb..9fc44b9 100644 (file)
@@ -87,6 +87,7 @@ struct thread_info {
 #define TIF_SECCOMP            8       /* secure computing */
 #define TIF_USER_RETURN_NOTIFY 11      /* notify kernel of userspace return */
 #define TIF_UPROBE             12      /* breakpointed or singlestepping */
+#define TIF_NOCPUID            15      /* CPUID is not accessible in userland */
 #define TIF_NOTSC              16      /* TSC is not accessible in userland */
 #define TIF_IA32               17      /* IA32 compatibility process */
 #define TIF_NOHZ               19      /* in adaptive nohz mode */
@@ -110,6 +111,7 @@ struct thread_info {
 #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
 #define _TIF_USER_RETURN_NOTIFY        (1 << TIF_USER_RETURN_NOTIFY)
 #define _TIF_UPROBE            (1 << TIF_UPROBE)
+#define _TIF_NOCPUID           (1 << TIF_NOCPUID)
 #define _TIF_NOTSC             (1 << TIF_NOTSC)
 #define _TIF_IA32              (1 << TIF_IA32)
 #define _TIF_NOHZ              (1 << TIF_NOHZ)
@@ -138,7 +140,7 @@ struct thread_info {
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW                                                        \
-       (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
+       (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP)
 
 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
@@ -239,6 +241,8 @@ static inline int arch_within_stack_frames(const void * const stack,
 extern void arch_task_cache_init(void);
 extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
 extern void arch_release_task_struct(struct task_struct *tsk);
+extern void arch_setup_new_exec(void);
+#define arch_setup_new_exec arch_setup_new_exec
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_X86_THREAD_INFO_H */
index fc5abff..75d002b 100644 (file)
@@ -110,6 +110,16 @@ static inline void cr4_clear_bits(unsigned long mask)
        }
 }
 
+static inline void cr4_toggle_bits(unsigned long mask) /* inverts every CR4 bit that is set in mask */
+{
+       unsigned long cr4;
+
+       cr4 = this_cpu_read(cpu_tlbstate.cr4);  /* start from this CPU's cached cr4 value */
+       cr4 ^= mask;                            /* XOR: set bits in mask are flipped, all others kept */
+       this_cpu_write(cpu_tlbstate.cr4, cr4);  /* store the new value back into the per-CPU copy ... */
+       __write_cr4(cr4);                       /* ... and pass the same value to __write_cr4() */
+}
+
 /* Read the CR4 shadow. */
 static inline unsigned long cr4_read_shadow(void)
 {
index ea14831..68766b2 100644 (file)
@@ -3,19 +3,14 @@
 /*
  * User space memory access functions
  */
-#include <linux/errno.h>
 #include <linux/compiler.h>
 #include <linux/kasan-checks.h>
-#include <linux/thread_info.h>
 #include <linux/string.h>
 #include <asm/asm.h>
 #include <asm/page.h>
 #include <asm/smap.h>
 #include <asm/extable.h>
 
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1
-
 /*
  * The fs value determines whether argument validity checking should be
  * performed or not.  If get_fs() == USER_DS, checking is performed, with
@@ -384,6 +379,18 @@ do {                                                                       \
                     : "=r" (err), ltype(x)                             \
                     : "m" (__m(addr)), "i" (errret), "0" (err))
 
+#define __get_user_asm_nozero(x, addr, err, itype, rtype, ltype, errret)       \
+       asm volatile("\n"                                               \
+                    "1:        mov"itype" %2,%"rtype"1\n" /* load: x (%1) <- memory at addr (%2) */ \
+                    "2:\n"                     /* common exit label, also the fixup return point */ \
+                    ".section .fixup,\"ax\"\n"                         \
+                    "3:        mov %3,%0\n"    /* fixup: err (%0) <- errret (%3); %1 (x) is not written here */ \
+                    "  jmp 2b\n"                                       \
+                    ".previous\n"                                      \
+                    _ASM_EXTABLE(1b, 3b)       /* presumably routes a fault at 1: to the fixup at 3: - confirm */ \
+                    : "=r" (err), ltype(x)                             \
+                    : "m" (__m(addr)), "i" (errret), "0" (err))
+
 /*
  * This doesn't do __uaccess_begin/end - the exception handling
  * around it must do that.
@@ -675,59 +682,6 @@ extern struct movsl_mask {
 # include <asm/uaccess_64.h>
 #endif
 
-unsigned long __must_check _copy_from_user(void *to, const void __user *from,
-                                          unsigned n);
-unsigned long __must_check _copy_to_user(void __user *to, const void *from,
-                                        unsigned n);
-
-extern void __compiletime_error("usercopy buffer size is too small")
-__bad_copy_user(void);
-
-static inline void copy_user_overflow(int size, unsigned long count)
-{
-       WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
-}
-
-static __always_inline unsigned long __must_check
-copy_from_user(void *to, const void __user *from, unsigned long n)
-{
-       int sz = __compiletime_object_size(to);
-
-       might_fault();
-
-       kasan_check_write(to, n);
-
-       if (likely(sz < 0 || sz >= n)) {
-               check_object_size(to, n, false);
-               n = _copy_from_user(to, from, n);
-       } else if (!__builtin_constant_p(n))
-               copy_user_overflow(sz, n);
-       else
-               __bad_copy_user();
-
-       return n;
-}
-
-static __always_inline unsigned long __must_check
-copy_to_user(void __user *to, const void *from, unsigned long n)
-{
-       int sz = __compiletime_object_size(from);
-
-       kasan_check_read(from, n);
-
-       might_fault();
-
-       if (likely(sz < 0 || sz >= n)) {
-               check_object_size(from, n, true);
-               n = _copy_to_user(to, from, n);
-       } else if (!__builtin_constant_p(n))
-               copy_user_overflow(sz, n);
-       else
-               __bad_copy_user();
-
-       return n;
-}
-
 /*
  * We rely on the nested NMI work to allow atomic faults from the NMI path; the
  * nested NMI paths are careful to preserve CR2.
index 7d3bdd1..aeda9bb 100644 (file)
 /*
  * User space memory access functions
  */
-#include <linux/errno.h>
-#include <linux/thread_info.h>
 #include <linux/string.h>
 #include <asm/asm.h>
 #include <asm/page.h>
 
-unsigned long __must_check __copy_to_user_ll
-               (void __user *to, const void *from, unsigned long n);
-unsigned long __must_check __copy_from_user_ll
-               (void *to, const void __user *from, unsigned long n);
-unsigned long __must_check __copy_from_user_ll_nozero
-               (void *to, const void __user *from, unsigned long n);
-unsigned long __must_check __copy_from_user_ll_nocache
-               (void *to, const void __user *from, unsigned long n);
+unsigned long __must_check __copy_user_ll
+               (void *to, const void *from, unsigned long n);
 unsigned long __must_check __copy_from_user_ll_nocache_nozero
                (void *to, const void __user *from, unsigned long n);
 
-/**
- * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking.
- * @to:   Destination address, in user space.
- * @from: Source address, in kernel space.
- * @n:    Number of bytes to copy.
- *
- * Context: User context only.
- *
- * Copy data from kernel space to user space.  Caller must check
- * the specified block with access_ok() before calling this function.
- * The caller should also make sure he pins the user space address
- * so that we don't result in page fault and sleep.
- */
-static __always_inline unsigned long __must_check
-__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
-{
-       check_object_size(from, n, true);
-       return __copy_to_user_ll(to, from, n);
-}
-
-/**
- * __copy_to_user: - Copy a block of data into user space, with less checking.
- * @to:   Destination address, in user space.
- * @from: Source address, in kernel space.
- * @n:    Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Copy data from kernel space to user space.  Caller must check
- * the specified block with access_ok() before calling this function.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- */
 static __always_inline unsigned long __must_check
-__copy_to_user(void __user *to, const void *from, unsigned long n)
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-       might_fault();
-       return __copy_to_user_inatomic(to, from, n);
+       return __copy_user_ll((__force void *)to, from, n);
 }
 
 static __always_inline unsigned long
-__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
-{
-       return __copy_from_user_ll_nozero(to, from, n);
-}
-
-/**
- * __copy_from_user: - Copy a block of data from user space, with less checking.
- * @to:   Destination address, in kernel space.
- * @from: Source address, in user space.
- * @n:    Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Copy data from user space to kernel space.  Caller must check
- * the specified block with access_ok() before calling this function.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- *
- * If some data could not be copied, this function will pad the copied
- * data to the requested size using zero bytes.
- *
- * An alternate version - __copy_from_user_inatomic() - may be called from
- * atomic context and will fail rather than sleep.  In this case the
- * uncopied bytes will *NOT* be padded with zeros.  See fs/filemap.h
- * for explanation of why this is needed.
- */
-static __always_inline unsigned long
-__copy_from_user(void *to, const void __user *from, unsigned long n)
-{
-       might_fault();
-       check_object_size(to, n, false);
-       if (__builtin_constant_p(n)) {
-               unsigned long ret;
-
-               switch (n) {
-               case 1:
-                       __uaccess_begin();
-                       __get_user_size(*(u8 *)to, from, 1, ret, 1);
-                       __uaccess_end();
-                       return ret;
-               case 2:
-                       __uaccess_begin();
-                       __get_user_size(*(u16 *)to, from, 2, ret, 2);
-                       __uaccess_end();
-                       return ret;
-               case 4:
-                       __uaccess_begin();
-                       __get_user_size(*(u32 *)to, from, 4, ret, 4);
-                       __uaccess_end();
-                       return ret;
-               }
-       }
-       return __copy_from_user_ll(to, from, n);
-}
-
-static __always_inline unsigned long __copy_from_user_nocache(void *to,
-                               const void __user *from, unsigned long n)
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-       might_fault();
        if (__builtin_constant_p(n)) {
                unsigned long ret;
 
                switch (n) {
                case 1:
+                       ret = 0;
                        __uaccess_begin();
-                       __get_user_size(*(u8 *)to, from, 1, ret, 1);
+                       __get_user_asm_nozero(*(u8 *)to, from, ret,
+                                             "b", "b", "=q", 1);
                        __uaccess_end();
                        return ret;
                case 2:
+                       ret = 0;
                        __uaccess_begin();
-                       __get_user_size(*(u16 *)to, from, 2, ret, 2);
+                       __get_user_asm_nozero(*(u16 *)to, from, ret,
+                                             "w", "w", "=r", 2);
                        __uaccess_end();
                        return ret;
                case 4:
+                       ret = 0;
                        __uaccess_begin();
-                       __get_user_size(*(u32 *)to, from, 4, ret, 4);
+                       __get_user_asm_nozero(*(u32 *)to, from, ret,
+                                             "l", "k", "=r", 4);
                        __uaccess_end();
                        return ret;
                }
        }
-       return __copy_from_user_ll_nocache(to, from, n);
+       return __copy_user_ll(to, (__force const void *)from, n);
 }
 
 static __always_inline unsigned long
index 673059a..c5504b9 100644 (file)
@@ -5,7 +5,6 @@
  * User space memory access functions
  */
 #include <linux/compiler.h>
-#include <linux/errno.h>
 #include <linux/lockdep.h>
 #include <linux/kasan-checks.h>
 #include <asm/alternative.h>
@@ -46,58 +45,54 @@ copy_user_generic(void *to, const void *from, unsigned len)
        return ret;
 }
 
-__must_check unsigned long
-copy_in_user(void __user *to, const void __user *from, unsigned len);
-
-static __always_inline __must_check
-int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size)
+static __always_inline __must_check unsigned long
+raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
 {
        int ret = 0;
 
-       check_object_size(dst, size, false);
        if (!__builtin_constant_p(size))
                return copy_user_generic(dst, (__force void *)src, size);
        switch (size) {
        case 1:
                __uaccess_begin();
-               __get_user_asm(*(u8 *)dst, (u8 __user *)src,
+               __get_user_asm_nozero(*(u8 *)dst, (u8 __user *)src,
                              ret, "b", "b", "=q", 1);
                __uaccess_end();
                return ret;
        case 2:
                __uaccess_begin();
-               __get_user_asm(*(u16 *)dst, (u16 __user *)src,
+               __get_user_asm_nozero(*(u16 *)dst, (u16 __user *)src,
                              ret, "w", "w", "=r", 2);
                __uaccess_end();
                return ret;
        case 4:
                __uaccess_begin();
-               __get_user_asm(*(u32 *)dst, (u32 __user *)src,
+               __get_user_asm_nozero(*(u32 *)dst, (u32 __user *)src,
                              ret, "l", "k", "=r", 4);
                __uaccess_end();
                return ret;
        case 8:
                __uaccess_begin();
-               __get_user_asm(*(u64 *)dst, (u64 __user *)src,
+               __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
                              ret, "q", "", "=r", 8);
                __uaccess_end();
                return ret;
        case 10:
                __uaccess_begin();
-               __get_user_asm(*(u64 *)dst, (u64 __user *)src,
+               __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
                               ret, "q", "", "=r", 10);
                if (likely(!ret))
-                       __get_user_asm(*(u16 *)(8 + (char *)dst),
+                       __get_user_asm_nozero(*(u16 *)(8 + (char *)dst),
                                       (u16 __user *)(8 + (char __user *)src),
                                       ret, "w", "w", "=r", 2);
                __uaccess_end();
                return ret;
        case 16:
                __uaccess_begin();
-               __get_user_asm(*(u64 *)dst, (u64 __user *)src,
+               __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
                               ret, "q", "", "=r", 16);
                if (likely(!ret))
-                       __get_user_asm(*(u64 *)(8 + (char *)dst),
+                       __get_user_asm_nozero(*(u64 *)(8 + (char *)dst),
                                       (u64 __user *)(8 + (char __user *)src),
                                       ret, "q", "", "=r", 8);
                __uaccess_end();
@@ -107,20 +102,11 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size)
        }
 }
 
-static __always_inline __must_check
-int __copy_from_user(void *dst, const void __user *src, unsigned size)
-{
-       might_fault();
-       kasan_check_write(dst, size);
-       return __copy_from_user_nocheck(dst, src, size);
-}
-
-static __always_inline __must_check
-int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size)
+static __always_inline __must_check unsigned long
+raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
 {
        int ret = 0;
 
-       check_object_size(src, size, true);
        if (!__builtin_constant_p(size))
                return copy_user_generic((__force void *)dst, src, size);
        switch (size) {
@@ -176,100 +162,16 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size)
 }
 
 static __always_inline __must_check
-int __copy_to_user(void __user *dst, const void *src, unsigned size)
-{
-       might_fault();
-       kasan_check_read(src, size);
-       return __copy_to_user_nocheck(dst, src, size);
-}
-
-static __always_inline __must_check
-int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
-{
-       int ret = 0;
-
-       might_fault();
-       if (!__builtin_constant_p(size))
-               return copy_user_generic((__force void *)dst,
-                                        (__force void *)src, size);
-       switch (size) {
-       case 1: {
-               u8 tmp;
-               __uaccess_begin();
-               __get_user_asm(tmp, (u8 __user *)src,
-                              ret, "b", "b", "=q", 1);
-               if (likely(!ret))
-                       __put_user_asm(tmp, (u8 __user *)dst,
-                                      ret, "b", "b", "iq", 1);
-               __uaccess_end();
-               return ret;
-       }
-       case 2: {
-               u16 tmp;
-               __uaccess_begin();
-               __get_user_asm(tmp, (u16 __user *)src,
-                              ret, "w", "w", "=r", 2);
-               if (likely(!ret))
-                       __put_user_asm(tmp, (u16 __user *)dst,
-                                      ret, "w", "w", "ir", 2);
-               __uaccess_end();
-               return ret;
-       }
-
-       case 4: {
-               u32 tmp;
-               __uaccess_begin();
-               __get_user_asm(tmp, (u32 __user *)src,
-                              ret, "l", "k", "=r", 4);
-               if (likely(!ret))
-                       __put_user_asm(tmp, (u32 __user *)dst,
-                                      ret, "l", "k", "ir", 4);
-               __uaccess_end();
-               return ret;
-       }
-       case 8: {
-               u64 tmp;
-               __uaccess_begin();
-               __get_user_asm(tmp, (u64 __user *)src,
-                              ret, "q", "", "=r", 8);
-               if (likely(!ret))
-                       __put_user_asm(tmp, (u64 __user *)dst,
-                                      ret, "q", "", "er", 8);
-               __uaccess_end();
-               return ret;
-       }
-       default:
-               return copy_user_generic((__force void *)dst,
-                                        (__force void *)src, size);
-       }
-}
-
-static __must_check __always_inline int
-__copy_from_user_inatomic(void *dst, const void __user *src, unsigned size)
-{
-       kasan_check_write(dst, size);
-       return __copy_from_user_nocheck(dst, src, size);
-}
-
-static __must_check __always_inline int
-__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
+unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long size)
 {
-       kasan_check_read(src, size);
-       return __copy_to_user_nocheck(dst, src, size);
+       return copy_user_generic((__force void *)dst,
+                                (__force void *)src, size);
 }
 
 extern long __copy_user_nocache(void *dst, const void __user *src,
                                unsigned size, int zerorest);
 
 static inline int
-__copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
-{
-       might_fault();
-       kasan_check_write(dst, size);
-       return __copy_user_nocache(dst, src, size, 1);
-}
-
-static inline int
 __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
                                  unsigned size)
 {
index 33cbd3d..64c5e74 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/spinlock.h>
 #include <linux/pfn.h>
 #include <linux/mm.h>
+#include <linux/device.h>
 
 #include <linux/uaccess.h>
 #include <asm/page.h>
index 07244ea..ddef37b 100644 (file)
@@ -34,7 +34,6 @@
 #include <linux/screen_info.h>
 #include <linux/apm_bios.h>
 #include <linux/edd.h>
-#include <asm/e820.h>
 #include <asm/ist.h>
 #include <video/edid.h>
 
@@ -111,6 +110,21 @@ struct efi_info {
        __u32 efi_memmap_hi;
 };
 
+/*
+ * This is the maximum number of entries in struct boot_params::e820_table
+ * (the zeropage), which is part of the x86 boot protocol ABI:
+ */
+#define E820_MAX_ENTRIES_ZEROPAGE 128
+
+/*
+ * The E820 memory region entry of the boot protocol ABI:
+ */
+struct boot_e820_entry {
+       __u64 addr;     /* presumably the region's start address - confirm */
+       __u64 size;     /* presumably the region's length in bytes - confirm */
+       __u32 type;     /* region type code */
+} __attribute__((packed)); /* packed: 8 + 8 + 4 = 20 bytes, so 128 entries fill 0x2d0..0xcd0 of boot_params */
+
 /* The so-called "zeropage" */
 struct boot_params {
        struct screen_info screen_info;                 /* 0x000 */
@@ -153,7 +167,7 @@ struct boot_params {
        struct setup_header hdr;    /* setup header */  /* 0x1f1 */
        __u8  _pad7[0x290-0x1f1-sizeof(struct setup_header)];
        __u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX];      /* 0x290 */
-       struct e820entry e820_map[E820MAX];             /* 0x2d0 */
+       struct boot_e820_entry e820_table[E820_MAX_ENTRIES_ZEROPAGE]; /* 0x2d0 */
        __u8  _pad8[48];                                /* 0xcd0 */
        struct edd_info eddbuf[EDDMAXNR];               /* 0xd00 */
        __u8  _pad9[276];                               /* 0xeec */
index 835aa51..c457655 100644 (file)
@@ -1,10 +1,13 @@
 #ifndef _ASM_X86_PRCTL_H
 #define _ASM_X86_PRCTL_H
 
-#define ARCH_SET_GS 0x1001
-#define ARCH_SET_FS 0x1002
-#define ARCH_GET_FS 0x1003
-#define ARCH_GET_GS 0x1004
+#define ARCH_SET_GS            0x1001
+#define ARCH_SET_FS            0x1002
+#define ARCH_GET_FS            0x1003
+#define ARCH_GET_GS            0x1004
+
+#define ARCH_GET_CPUID         0x1011
+#define ARCH_SET_CPUID         0x1012
 
 #define ARCH_MAP_VDSO_X32      0x2001
 #define ARCH_MAP_VDSO_32       0x2002
index b2879cc..6bb6806 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/pci.h>
 #include <linux/efi-bgrt.h>
 
+#include <asm/e820/api.h>
 #include <asm/irqdomain.h>
 #include <asm/pci_x86.h>
 #include <asm/pgtable.h>
@@ -1564,12 +1565,6 @@ int __init early_acpi_boot_init(void)
        return 0;
 }
 
-static int __init acpi_parse_bgrt(struct acpi_table_header *table)
-{
-       efi_bgrt_init(table);
-       return 0;
-}
-
 int __init acpi_boot_init(void)
 {
        /* those are executed after early-quirks are executed */
@@ -1729,6 +1724,6 @@ int __acpi_release_global_lock(unsigned int *lock)
 
 void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size)
 {
-       e820_add_region(addr, size, E820_ACPI);
-       update_e820();
+       e820__range_add(addr, size, E820_TYPE_ACPI);
+       e820__update_table_print();
 }
index 0a2bb1f..ef2859f 100644 (file)
@@ -21,7 +21,7 @@
 #include <linux/pci.h>
 #include <linux/bitops.h>
 #include <linux/suspend.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/io.h>
 #include <asm/iommu.h>
 #include <asm/gart.h>
@@ -306,13 +306,13 @@ void __init early_gart_iommu_check(void)
                fix = 1;
 
        if (gart_fix_e820 && !fix && aper_enabled) {
-               if (e820_any_mapped(aper_base, aper_base + aper_size,
-                                   E820_RAM)) {
+               if (e820__mapped_any(aper_base, aper_base + aper_size,
+                                   E820_TYPE_RAM)) {
                        /* reserve it, so we can reuse it in second kernel */
                        pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n",
                                aper_base, aper_base + aper_size - 1);
-                       e820_add_region(aper_base, aper_size, E820_RESERVED);
-                       update_e820();
+                       e820__range_add(aper_base, aper_size, E820_TYPE_RESERVED);
+                       e820__update_table_print();
                }
        }
 
index 8ccb7ef..847650b 100644 (file)
@@ -731,8 +731,10 @@ static int __init calibrate_APIC_clock(void)
                                        TICK_NSEC, lapic_clockevent.shift);
                lapic_clockevent.max_delta_ns =
                        clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
+               lapic_clockevent.max_delta_ticks = 0x7FFFFF;
                lapic_clockevent.min_delta_ns =
                        clockevent_delta2ns(0xF, &lapic_clockevent);
+               lapic_clockevent.min_delta_ticks = 0xF;
                lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
                return 0;
        }
@@ -778,8 +780,10 @@ static int __init calibrate_APIC_clock(void)
                                       lapic_clockevent.shift);
        lapic_clockevent.max_delta_ns =
                clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent);
+       lapic_clockevent.max_delta_ticks = 0x7FFFFFFF;
        lapic_clockevent.min_delta_ns =
                clockevent_delta2ns(0xF, &lapic_clockevent);
+       lapic_clockevent.min_delta_ticks = 0xF;
 
        lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
 
@@ -2627,7 +2631,7 @@ static int __init lapic_insert_resource(void)
 }
 
 /*
- * need call insert after e820_reserve_resources()
+ * need call insert after e820__reserve_resources()
  * that is using request_resource
  */
 late_initcall(lapic_insert_resource);
index b109e43..2262eb6 100644 (file)
@@ -26,7 +26,7 @@
 
 #include <linux/interrupt.h>
 #include <asm/acpi.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 
 static void noop_init_apic_ldr(void) { }
 static void noop_send_IPI(int cpu, int vector) { }
index c48264e..2e8f7f0 100644 (file)
@@ -25,7 +25,7 @@
 
 #include <linux/interrupt.h>
 #include <asm/acpi.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 
 #ifdef CONFIG_HOTPLUG_CPU
 #define DEFAULT_SEND_IPI       (1)
index 86f20cc..b487b3a 100644 (file)
@@ -34,6 +34,7 @@
 #include <asm/uv/bios.h>
 #include <asm/uv/uv.h>
 #include <asm/apic.h>
+#include <asm/e820/api.h>
 #include <asm/ipi.h>
 #include <asm/smp.h>
 #include <asm/x86_init.h>
index 43955ee..44207b7 100644 (file)
@@ -3,7 +3,7 @@
 #include <linux/sched/clock.h>
 
 #include <asm/cpufeature.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/mtrr.h>
 #include <asm/msr.h>
 
index 0631977..dfa90a3 100644 (file)
@@ -90,16 +90,12 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
                return;
        }
 
-       if (ring3mwait_disabled) {
-               msr_clear_bit(MSR_MISC_FEATURE_ENABLES,
-                             MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT);
+       if (ring3mwait_disabled)
                return;
-       }
-
-       msr_set_bit(MSR_MISC_FEATURE_ENABLES,
-                   MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT);
 
        set_cpu_cap(c, X86_FEATURE_RING3MWAIT);
+       this_cpu_or(msr_misc_features_shadow,
+                   1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT);
 
        if (c == &boot_cpu_data)
                ELF_HWCAP2 |= HWCAP2_RING3MWAIT;
@@ -488,6 +484,34 @@ static void intel_bsp_resume(struct cpuinfo_x86 *c)
        init_intel_energy_perf(c);
 }
 
+static void init_cpuid_fault(struct cpuinfo_x86 *c) /* flag CPUID faulting support on @c */
+{
+       u64 msr;
+
+       if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) { /* zero return is treated as a successful read */
+               if (msr & MSR_PLATFORM_INFO_CPUID_FAULT)
+                       set_cpu_cap(c, X86_FEATURE_CPUID_FAULT);
+       }
+}
+
+static void init_intel_misc_features(struct cpuinfo_x86 *c) /* rebuild the per-CPU shadow and write it to the MSR */
+{
+       u64 msr;
+
+       if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr)) /* probe only: the value read is overwritten below */
+               return; /* MSR not readable: shadow and MSR are left untouched */
+
+       /* Clear all MISC features */
+       this_cpu_write(msr_misc_features_shadow, 0);
+
+       /* Check features and update capabilities and shadow control bits */
+       init_cpuid_fault(c);
+       probe_xeon_phi_r3mwait(c); /* may OR the RING3MWAIT enable bit into the shadow */
+
+       msr = this_cpu_read(msr_misc_features_shadow);
+       wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); /* hardware now matches the shadow copy */
+}
+
 static void init_intel(struct cpuinfo_x86 *c)
 {
        unsigned int l2 = 0;
@@ -602,7 +626,7 @@ static void init_intel(struct cpuinfo_x86 *c)
 
        init_intel_energy_perf(c);
 
-       probe_xeon_phi_r3mwait(c);
+       init_intel_misc_features(c);
 }
 
 #ifdef CONFIG_X86_32
index a3311c8..43051f0 100644 (file)
@@ -9,3 +9,5 @@ obj-$(CONFIG_X86_MCE_INJECT)    += mce-inject.o
 obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
 
 obj-$(CONFIG_ACPI_APEI)                += mce-apei.o
+
+obj-$(CONFIG_X86_MCELOG_LEGACY)        += dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
new file mode 100644 (file)
index 0000000..9c632cb
--- /dev/null
@@ -0,0 +1,397 @@
+/*
+ * /dev/mcelog driver
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+
+#include "mce-internal.h"
+
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
+
+static char mce_helper[128];
+static char *mce_helper_argv[2] = { mce_helper, NULL };
+
+#define mce_log_get_idx_check(p) \
+({ \
+       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+                        !lockdep_is_held(&mce_chrdev_read_mutex), \
+                        "suspicious mce_log_get_idx_check() usage"); \
+       smp_load_acquire(&(p)); \
+}) /* acquire-load of (p); the lockdep check fires when neither an RCU-sched section nor mce_chrdev_read_mutex is held */
+
+/*
+ * Lockless MCE logging infrastructure.
+ * This avoids deadlocks on printk locks without having to break locks. Also
+ * separate MCEs from kernel messages to avoid bogus bug reports.
+ */
+
+static struct mce_log_buffer mcelog = { /* the single log buffer read through /dev/mcelog */
+       .signature      = MCE_LOG_SIGNATURE,
+       .len            = MCE_LOG_LEN, /* same bound dev_mce_log() checks before claiming a slot */
+       .recordlen      = sizeof(struct mce), /* size of one record */
+};
+
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+
+/* User mode helper program triggered by machine check event */
+extern char                    mce_helper[128];
+
+static int dev_mce_log(struct notifier_block *nb, unsigned long val,
+                               void *data) /* notifier callback: @data is the struct mce to log; @nb and @val are unused */
+{
+       struct mce *mce = (struct mce *)data;
+       unsigned int next, entry;
+
+       wmb();
+       for (;;) { /* retry until a free slot has been claimed with cmpxchg() */
+               entry = mce_log_get_idx_check(mcelog.next);
+               for (;;) {
+
+                       /*
+                        * When the buffer fills up discard new entries.
+                        * Assume that the earlier errors are the more
+                        * interesting ones:
+                        */
+                       if (entry >= MCE_LOG_LEN) {
+                               set_bit(MCE_OVERFLOW,
+                                       (unsigned long *)&mcelog.flags);
+                               return NOTIFY_OK; /* record dropped; overflow is reported via the flags word */
+                       }
+                       /* Old left over entry. Skip: */
+                       if (mcelog.entry[entry].finished) {
+                               entry++;
+                               continue;
+                       }
+                       break;
+               }
+               smp_rmb();
+               next = entry + 1;
+               if (cmpxchg(&mcelog.next, entry, next) == entry) /* NOTE(review): entry may have been advanced past the loaded index - confirm the retry terminates */
+                       break;
+       }
+       memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+       wmb(); /* record contents must be visible before ->finished is set */
+       mcelog.entry[entry].finished = 1; /* readers wait on this flag before copying the slot */
+       wmb();
+
+       /* wake processes polling /dev/mcelog */
+       wake_up_interruptible(&mce_chrdev_wait);
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block dev_mcelog_nb = { /* registered on the MCE decode chain by dev_mcelog_init_device() */
+       .notifier_call  = dev_mce_log,
+       .priority       = MCE_PRIO_MCELOG,
+};
+
+static void mce_do_trigger(struct work_struct *work) /* work handler behind mce_trigger_work; @work is unused */
+{
+       call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); /* return value is ignored */
+}
+
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
+
+void mce_work_trigger(void) /* queue the user-mode helper, if one has been configured */
+{
+       if (mce_helper[0]) /* an empty helper path means no trigger is set */
+               schedule_work(&mce_trigger_work);
+}
+
+static ssize_t
+show_trigger(struct device *s, struct device_attribute *attr, char *buf) /* "trigger" attribute read handler */
+{
+       strcpy(buf, mce_helper); /* mce_helper is at most 127 chars plus NUL; assumes buf is larger - confirm */
+       strcat(buf, "\n");
+       return strlen(mce_helper) + 1; /* helper path length plus the appended newline */
+}
+
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
+                               const char *buf, size_t siz) /* "trigger" attribute write handler; @siz is not consulted */
+{
+       char *p;
+
+       strncpy(mce_helper, buf, sizeof(mce_helper));
+       mce_helper[sizeof(mce_helper)-1] = 0; /* strncpy() does not terminate when buf fills the array */
+       p = strchr(mce_helper, '\n');
+
+       if (p)
+               *p = 0; /* keep only the text before the first newline */
+
+       return strlen(mce_helper) + !!p; /* NOTE(review): stored length (+1 for a stripped newline), which can be smaller than siz - confirm callers accept a short count */
+}
+
+DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+
+/*
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
+ */
+
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count;      /* #times opened */
+static int mce_chrdev_open_exclu;      /* already open exclusive? */
+
+static int mce_chrdev_open(struct inode *inode, struct file *file) /* open handler enforcing O_EXCL exclusivity */
+{
+       spin_lock(&mce_chrdev_state_lock);
+
+       if (mce_chrdev_open_exclu ||
+           (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { /* held exclusively, or O_EXCL requested while already open */
+               spin_unlock(&mce_chrdev_state_lock);
+
+               return -EBUSY;
+       }
+
+       if (file->f_flags & O_EXCL)
+               mce_chrdev_open_exclu = 1;
+       mce_chrdev_open_count++;
+
+       spin_unlock(&mce_chrdev_state_lock);
+
+       return nonseekable_open(inode, file);
+}
+
+static int mce_chrdev_release(struct inode *inode, struct file *file) /* release handler: undo the bookkeeping done in open */
+{
+       spin_lock(&mce_chrdev_state_lock);
+
+       mce_chrdev_open_count--;
+       mce_chrdev_open_exclu = 0; /* cleared on every release, not only for an O_EXCL opener */
+
+       spin_unlock(&mce_chrdev_state_lock);
+
+       return 0;
+}
+
+static void collect_tscs(void *data) /* per-CPU callback run via on_each_cpu() from mce_chrdev_read() */
+{
+       unsigned long *cpu_tsc = (unsigned long *)data; /* caller's array, indexed by CPU id */
+
+       cpu_tsc[smp_processor_id()] = rdtsc(); /* record the executing CPU's current TSC */
+}
+
+static int mce_apei_read_done;
+
+/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
+static int __mce_read_apei(char __user **ubuf, size_t usize) /* returns 0 or a negative errno; advances *ubuf only when a record was delivered and cleared */
+{
+       int rc;
+       u64 record_id;
+       struct mce m;
+
+       if (usize < sizeof(struct mce)) /* user buffer cannot hold even one record */
+               return -EINVAL;
+
+       rc = apei_read_mce(&m, &record_id);
+       /* Error or no more MCE record */
+       if (rc <= 0) {
+               mce_apei_read_done = 1; /* later reads skip the APEI path */
+               /*
+                * When ERST is disabled, mce_chrdev_read() should return
+                * "no record" instead of "no device."
+                */
+               if (rc == -ENODEV)
+                       return 0;
+               return rc;
+       }
+       rc = -EFAULT;
+       if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
+               return rc; /* record is not cleared and mce_apei_read_done stays unset */
+       /*
+        * In fact, we should have cleared the record after that has
+        * been flushed to the disk or sent to network in
+        * /sbin/mcelog, but we have no interface to support that now,
+        * so just clear it to avoid duplication.
+        */
+       rc = apei_clear_mce(record_id);
+       if (rc) {
+               mce_apei_read_done = 1;
+               return rc;
+       }
+       *ubuf += sizeof(struct mce); /* the caller detects a delivered record by the moved pointer */
+
+       return 0;
+}
+
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+                               size_t usize, loff_t *off) /* read handler: returns bytes copied to @ubuf or a negative errno */
+{
+       char __user *buf = ubuf; /* write cursor; buf - ubuf is the byte count returned */
+       unsigned long *cpu_tsc;
+       unsigned prev, next;
+       int i, err;
+
+       cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); /* one slot per CPU id, filled by collect_tscs() */
+       if (!cpu_tsc)
+               return -ENOMEM;
+
+       mutex_lock(&mce_chrdev_read_mutex);
+
+       if (!mce_apei_read_done) {
+               err = __mce_read_apei(&buf, usize);
+               if (err || buf != ubuf) /* stop on error, or once one APEI record has been delivered */
+                       goto out;
+       }
+
+       next = mce_log_get_idx_check(mcelog.next);
+
+       /* Only supports full reads right now */
+       err = -EINVAL;
+       if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
+               goto out;
+
+       err = 0;
+       prev = 0;
+       do {
+               for (i = prev; i < next; i++) {
+                       unsigned long start = jiffies;
+                       struct mce *m = &mcelog.entry[i];
+
+                       while (!m->finished) { /* slot claimed but not yet marked finished by its writer */
+                               if (time_after_eq(jiffies, start + 2)) {
+                                       memset(m, 0, sizeof(*m)); /* give up on this slot: wipe it and skip the copy */
+                                       goto timeout;
+                               }
+                               cpu_relax();
+                       }
+                       smp_rmb();
+                       err |= copy_to_user(buf, m, sizeof(*m)); /* any nonzero result becomes -EFAULT below */
+                       buf += sizeof(*m);
+timeout:
+                       ;
+               }
+
+               memset(mcelog.entry + prev, 0,
+                      (next - prev) * sizeof(struct mce));
+               prev = next;
+               next = cmpxchg(&mcelog.next, prev, 0); /* reset the index to 0 only if it still equals prev */
+       } while (next != prev); /* index moved meanwhile: go round again for the new entries */
+
+       synchronize_sched();
+
+       /*
+        * Collect entries that were still getting written before the
+        * synchronize.
+        */
+       on_each_cpu(collect_tscs, cpu_tsc, 1);
+
+       for (i = next; i < MCE_LOG_LEN; i++) {
+               struct mce *m = &mcelog.entry[i];
+
+               if (m->finished && m->tsc < cpu_tsc[m->cpu]) { /* finished, and stamped earlier than the TSC just sampled for its CPU */
+                       err |= copy_to_user(buf, m, sizeof(*m));
+                       smp_rmb();
+                       buf += sizeof(*m);
+                       memset(m, 0, sizeof(*m));
+               }
+       }
+
+       if (err)
+               err = -EFAULT;
+
+out:
+       mutex_unlock(&mce_chrdev_read_mutex);
+       kfree(cpu_tsc);
+
+       return err ? err : buf - ubuf; /* negative errno, else the number of bytes copied out */
+}
+
+static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) /* poll handler: readable when records are pending */
+{
+       poll_wait(file, &mce_chrdev_wait, wait); /* same wait queue dev_mce_log() wakes */
+       if (READ_ONCE(mcelog.next)) /* nonzero index: at least one slot has been claimed */
+               return POLLIN | POLLRDNORM;
+       if (!mce_apei_read_done && apei_check_mce()) /* APEI path not yet marked done and apei_check_mce() is nonzero */
+               return POLLIN | POLLRDNORM;
+       return 0;
+}
+
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+                               unsigned long arg) /* ioctl handler: @arg is a user pointer to an int result */
+{
+       int __user *p = (int __user *)arg;
+
+       if (!capable(CAP_SYS_ADMIN)) /* every command requires CAP_SYS_ADMIN */
+               return -EPERM;
+
+       switch (cmd) {
+       case MCE_GET_RECORD_LEN:
+               return put_user(sizeof(struct mce), p);
+       case MCE_GET_LOG_LEN:
+               return put_user(MCE_LOG_LEN, p);
+       case MCE_GETCLEAR_FLAGS: {
+               unsigned flags;
+
+               do { /* fetch and zero the flags as one atomic step; retry if they change in between */
+                       flags = mcelog.flags;
+               } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
+
+               return put_user(flags, p);
+       }
+       default:
+               return -ENOTTY;
+       }
+}
+
+static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
+                           size_t usize, loff_t *off);
+
+void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
+                            const char __user *ubuf,
+                            size_t usize, loff_t *off)) /* install @fn as the handler used by mce_chrdev_write() */
+{
+       mce_write = fn; /* plain store; replaces any previously registered callback */
+}
+EXPORT_SYMBOL_GPL(register_mce_write_callback);
+
+static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+                               size_t usize, loff_t *off) /* write handler: forwards to the registered callback */
+{
+       if (mce_write)
+               return mce_write(filp, ubuf, usize, off);
+       else
+               return -EINVAL; /* no callback has been registered */
+}
+
+static const struct file_operations mce_chrdev_ops = { /* file operations installed on mce_chrdev_device */
+       .open                   = mce_chrdev_open,
+       .release                = mce_chrdev_release,
+       .read                   = mce_chrdev_read,
+       .write                  = mce_chrdev_write,
+       .poll                   = mce_chrdev_poll,
+       .unlocked_ioctl         = mce_chrdev_ioctl,
+       .llseek                 = no_llseek, /* open also goes through nonseekable_open() */
+};
+
+static struct miscdevice mce_chrdev_device = { /* positional initializer: presumably minor, name, fops - confirm against struct miscdevice */
+       MISC_MCELOG_MINOR,
+       "mcelog",
+       &mce_chrdev_ops,
+};
+
+static __init int dev_mcelog_init_device(void) /* initcall: register the misc device, then hook the logger into the decode chain */
+{
+       int err;
+
+       /* register character device /dev/mcelog */
+       err = misc_register(&mce_chrdev_device);
+       if (err) {
+               pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+               return err; /* the notifier is not registered when the device could not be created */
+       }
+       mce_register_decode_chain(&dev_mcelog_nb); /* from here on dev_mce_log() receives MCE records */
+       return 0;
+}
+device_initcall_sync(dev_mcelog_init_device);
index 1e5a50c..217cd44 100644 (file)
@@ -85,7 +85,7 @@ void mce_gen_pool_process(struct work_struct *__unused)
        head = llist_reverse_order(head);
        llist_for_each_entry_safe(node, tmp, head, llnode) {
                mce = &node->mce;
-               atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
+               blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
                gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
        }
 }
index 903043e..654ad06 100644 (file)
@@ -13,7 +13,7 @@ enum severity_level {
        MCE_PANIC_SEVERITY,
 };
 
-extern struct atomic_notifier_head x86_mce_decoder_chain;
+extern struct blocking_notifier_head x86_mce_decoder_chain;
 
 #define ATTR_LEN               16
 #define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
@@ -96,3 +96,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2)
                m1->addr != m2->addr ||
                m1->misc != m2->misc;
 }
+
+extern struct device_attribute dev_attr_trigger;
+
+#ifdef CONFIG_X86_MCELOG_LEGACY
+extern void mce_work_trigger(void);
+#else
+static inline void mce_work_trigger(void)      { }
+#endif
index 5accfbd..5abd4bf 100644 (file)
@@ -35,6 +35,7 @@
 #include <linux/poll.h>
 #include <linux/nmi.h>
 #include <linux/cpu.h>
+#include <linux/ras.h>
 #include <linux/smp.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <asm/tlbflush.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
+#include <asm/reboot.h>
 
 #include "mce-internal.h"
 
-static DEFINE_MUTEX(mce_chrdev_read_mutex);
-
-static int mce_chrdev_open_count;      /* #times opened */
-
-#define mce_log_get_idx_check(p) \
-({ \
-       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
-                        !lockdep_is_held(&mce_chrdev_read_mutex), \
-                        "suspicious mce_log_get_idx_check() usage"); \
-       smp_load_acquire(&(p)); \
-})
+static DEFINE_MUTEX(mce_log_mutex);
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/mce.h>
@@ -87,15 +79,9 @@ struct mca_config mca_cfg __read_mostly = {
        .monarch_timeout = -1
 };
 
-/* User mode helper program triggered by machine check event */
-static unsigned long           mce_need_notify;
-static char                    mce_helper[128];
-static char                    *mce_helper_argv[2] = { mce_helper, NULL };
-
-static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
-
 static DEFINE_PER_CPU(struct mce, mces_seen);
-static int                     cpu_missing;
+static unsigned long mce_need_notify;
+static int cpu_missing;
 
 /*
  * MCA banks polled by the period polling timer for corrected events.
@@ -123,7 +109,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
  * CPU/chipset specific EDAC code can register a notifier call here to print
  * MCE errors in a human-readable form.
  */
-ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
 
 /* Do initial initialization of a struct mce */
 void mce_setup(struct mce *m)
@@ -145,82 +131,38 @@ void mce_setup(struct mce *m)
 DEFINE_PER_CPU(struct mce, injectm);
 EXPORT_PER_CPU_SYMBOL_GPL(injectm);
 
-/*
- * Lockless MCE logging infrastructure.
- * This avoids deadlocks on printk locks without having to break locks. Also
- * separate MCEs from kernel messages to avoid bogus bug reports.
- */
-
-static struct mce_log mcelog = {
-       .signature      = MCE_LOG_SIGNATURE,
-       .len            = MCE_LOG_LEN,
-       .recordlen      = sizeof(struct mce),
-};
-
-void mce_log(struct mce *mce)
+void mce_log(struct mce *m)
 {
-       unsigned next, entry;
-
-       /* Emit the trace record: */
-       trace_mce_record(mce);
-
-       if (!mce_gen_pool_add(mce))
+       if (!mce_gen_pool_add(m))
                irq_work_queue(&mce_irq_work);
-
-       wmb();
-       for (;;) {
-               entry = mce_log_get_idx_check(mcelog.next);
-               for (;;) {
-
-                       /*
-                        * When the buffer fills up discard new entries.
-                        * Assume that the earlier errors are the more
-                        * interesting ones:
-                        */
-                       if (entry >= MCE_LOG_LEN) {
-                               set_bit(MCE_OVERFLOW,
-                                       (unsigned long *)&mcelog.flags);
-                               return;
-                       }
-                       /* Old left over entry. Skip: */
-                       if (mcelog.entry[entry].finished) {
-                               entry++;
-                               continue;
-                       }
-                       break;
-               }
-               smp_rmb();
-               next = entry + 1;
-               if (cmpxchg(&mcelog.next, entry, next) == entry)
-                       break;
-       }
-       memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
-       wmb();
-       mcelog.entry[entry].finished = 1;
-       wmb();
-
-       set_bit(0, &mce_need_notify);
 }
 
 void mce_inject_log(struct mce *m)
 {
-       mutex_lock(&mce_chrdev_read_mutex);
+       mutex_lock(&mce_log_mutex);
        mce_log(m);
-       mutex_unlock(&mce_chrdev_read_mutex);
+       mutex_unlock(&mce_log_mutex);
 }
 EXPORT_SYMBOL_GPL(mce_inject_log);
 
 static struct notifier_block mce_srao_nb;
 
+/*
+ * We run the default notifier if we have only the SRAO, the first and the
+ * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
+ * notifiers registered on the chain.
+ */
+#define NUM_DEFAULT_NOTIFIERS  3
 static atomic_t num_notifiers;
 
 void mce_register_decode_chain(struct notifier_block *nb)
 {
-       atomic_inc(&num_notifiers);
+       if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
+               return;
 
-       WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC);
+       atomic_inc(&num_notifiers);
 
-       atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
+       blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
 }
 EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 
@@ -228,7 +170,7 @@ void mce_unregister_decode_chain(struct notifier_block *nb)
 {
        atomic_dec(&num_notifiers);
 
-       atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
+       blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
 }
 EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
 
@@ -321,18 +263,7 @@ static void __print_mce(struct mce *m)
 
 static void print_mce(struct mce *m)
 {
-       int ret = 0;
-
        __print_mce(m);
-
-       /*
-        * Print out human-readable details about the MCE error,
-        * (if the CPU has an implementation for that)
-        */
-       ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
-       if (ret == NOTIFY_STOP)
-               return;
-
        pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
 }
 
@@ -521,7 +452,6 @@ static void mce_schedule_work(void)
 
 static void mce_irq_work_cb(struct irq_work *entry)
 {
-       mce_notify_irq();
        mce_schedule_work();
 }
 
@@ -550,20 +480,97 @@ static void mce_report_event(struct pt_regs *regs)
  */
 static int mce_usable_address(struct mce *m)
 {
-       if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
+       if (!(m->status & MCI_STATUS_ADDRV))
                return 0;
 
        /* Checks after this one are Intel-specific: */
        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
                return 1;
 
+       if (!(m->status & MCI_STATUS_MISCV))
+               return 0;
+
        if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
                return 0;
+
        if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
                return 0;
+
        return 1;
 }
 
+static bool memory_error(struct mce *m)
+{
+       struct cpuinfo_x86 *c = &boot_cpu_data;
+
+       if (c->x86_vendor == X86_VENDOR_AMD) {
+               /* ErrCodeExt[20:16] */
+               u8 xec = (m->status >> 16) & 0x1f;
+
+               return (xec == 0x0 || xec == 0x8);
+       } else if (c->x86_vendor == X86_VENDOR_INTEL) {
+               /*
+                * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
+                *
+                * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
+                * indicating a memory error. Bit 8 is used for indicating a
+                * cache hierarchy error. The combination of bit 2 and bit 3
+                * is used for indicating a `generic' cache hierarchy error.
+                * But we can't just blindly check the above bits, because if
+                * bit 11 is set, then it is a bus/interconnect error - and
+                * either way the above bits just give more detail on what
+                * bus/interconnect error happened. Note that bit 12 can be
+                * ignored, as it's the "filter" bit.
+                */
+               return (m->status & 0xef80) == BIT(7) ||
+                      (m->status & 0xef00) == BIT(8) ||
+                      (m->status & 0xeffc) == 0xc;
+       }
+
+       return false;
+}
+
+static bool cec_add_mce(struct mce *m)
+{
+       if (!m)
+               return false;
+
+       /* We eat only correctable DRAM errors with usable addresses. */
+       if (memory_error(m) &&
+           !(m->status & MCI_STATUS_UC) &&
+           mce_usable_address(m))
+               if (!cec_add_elem(m->addr >> PAGE_SHIFT))
+                       return true;
+
+       return false;
+}
+
+static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
+                             void *data)
+{
+       struct mce *m = (struct mce *)data;
+
+       if (!m)
+               return NOTIFY_DONE;
+
+       if (cec_add_mce(m))
+               return NOTIFY_STOP;
+
+       /* Emit the trace record: */
+       trace_mce_record(m);
+
+       set_bit(0, &mce_need_notify);
+
+       mce_notify_irq();
+
+       return NOTIFY_DONE;
+}
+
+static struct notifier_block first_nb = {
+       .notifier_call  = mce_first_notifier,
+       .priority       = MCE_PRIO_FIRST,
+};
+
 static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
                                void *data)
 {
@@ -593,15 +600,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
        if (!m)
                return NOTIFY_DONE;
 
-       /*
-        * Run the default notifier if we have only the SRAO
-        * notifier and us registered.
-        */
-       if (atomic_read(&num_notifiers) > 2)
-               return NOTIFY_DONE;
-
-       /* Don't print when mcelog is running */
-       if (mce_chrdev_open_count > 0)
+       if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
                return NOTIFY_DONE;
 
        __print_mce(m);
@@ -654,37 +653,6 @@ static void mce_read_aux(struct mce *m, int i)
        }
 }
 
-static bool memory_error(struct mce *m)
-{
-       struct cpuinfo_x86 *c = &boot_cpu_data;
-
-       if (c->x86_vendor == X86_VENDOR_AMD) {
-               /* ErrCodeExt[20:16] */
-               u8 xec = (m->status >> 16) & 0x1f;
-
-               return (xec == 0x0 || xec == 0x8);
-       } else if (c->x86_vendor == X86_VENDOR_INTEL) {
-               /*
-                * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
-                *
-                * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
-                * indicating a memory error. Bit 8 is used for indicating a
-                * cache hierarchy error. The combination of bit 2 and bit 3
-                * is used for indicating a `generic' cache hierarchy error
-                * But we can't just blindly check the above bits, because if
-                * bit 11 is set, then it is a bus/interconnect error - and
-                * either way the above bits just gives more detail on what
-                * bus/interconnect error happened. Note that bit 12 can be
-                * ignored, as it's the "filter" bit.
-                */
-               return (m->status & 0xef80) == BIT(7) ||
-                      (m->status & 0xef00) == BIT(8) ||
-                      (m->status & 0xeffc) == 0xc;
-       }
-
-       return false;
-}
-
 DEFINE_PER_CPU(unsigned, mce_poll_count);
 
 /*
@@ -1133,9 +1101,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
         * on Intel.
         */
        int lmce = 1;
+       int cpu = smp_processor_id();
 
-       /* If this CPU is offline, just bail out. */
-       if (cpu_is_offline(smp_processor_id())) {
+       /*
+        * Cases where we avoid rendezvous handler timeout:
+        * 1) If this CPU is offline.
+        *
+        * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
+        *  skip those CPUs which remain looping in the 1st kernel - see
+        *  crash_nmi_callback().
+        *
+        * Note: there still is a small window between kexec-ing and the new,
+        * kdump kernel establishing a new #MC handler where a broadcasted MCE
+        * might not get handled properly.
+        */
+       if (cpu_is_offline(cpu) ||
+           (crashing_cpu != -1 && crashing_cpu != cpu)) {
                u64 mcgstatus;
 
                mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
@@ -1405,13 +1386,6 @@ static void mce_timer_delete_all(void)
                del_timer_sync(&per_cpu(mce_timer, cpu));
 }
 
-static void mce_do_trigger(struct work_struct *work)
-{
-       call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
-}
-
-static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
-
 /*
  * Notify the user(s) about new machine check events.
  * Can be called from interrupt context, but not from machine check/NMI
@@ -1423,11 +1397,7 @@ int mce_notify_irq(void)
        static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
 
        if (test_and_clear_bit(0, &mce_need_notify)) {
-               /* wake processes polling /dev/mcelog */
-               wake_up_interruptible(&mce_chrdev_wait);
-
-               if (mce_helper[0])
-                       schedule_work(&mce_trigger_work);
+               mce_work_trigger();
 
                if (__ratelimit(&ratelimit))
                        pr_info(HW_ERR "Machine check events logged\n");
@@ -1694,30 +1664,35 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
        return 0;
 }
 
-static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
+/*
+ * Init basic CPU features needed for early decoding of MCEs.
+ */
+static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
 {
-       switch (c->x86_vendor) {
-       case X86_VENDOR_INTEL:
-               mce_intel_feature_init(c);
-               mce_adjust_timer = cmci_intel_adjust_timer;
-               break;
-
-       case X86_VENDOR_AMD: {
+       if (c->x86_vendor == X86_VENDOR_AMD) {
                mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
                mce_flags.succor         = !!cpu_has(c, X86_FEATURE_SUCCOR);
                mce_flags.smca           = !!cpu_has(c, X86_FEATURE_SMCA);
 
-               /*
-                * Install proper ops for Scalable MCA enabled processors
-                */
                if (mce_flags.smca) {
                        msr_ops.ctl     = smca_ctl_reg;
                        msr_ops.status  = smca_status_reg;
                        msr_ops.addr    = smca_addr_reg;
                        msr_ops.misc    = smca_misc_reg;
                }
-               mce_amd_feature_init(c);
+       }
+}
 
+static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
+{
+       switch (c->x86_vendor) {
+       case X86_VENDOR_INTEL:
+               mce_intel_feature_init(c);
+               mce_adjust_timer = cmci_intel_adjust_timer;
+               break;
+
+       case X86_VENDOR_AMD: {
+               mce_amd_feature_init(c);
                break;
                }
 
@@ -1804,6 +1779,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
 
        machine_check_vector = do_machine_check;
 
+       __mcheck_cpu_init_early(c);
        __mcheck_cpu_init_generic();
        __mcheck_cpu_init_vendor(c);
        __mcheck_cpu_init_clear_banks();
@@ -1829,251 +1805,6 @@ void mcheck_cpu_clear(struct cpuinfo_x86 *c)
 
 }
 
-/*
- * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
- */
-
-static DEFINE_SPINLOCK(mce_chrdev_state_lock);
-static int mce_chrdev_open_exclu;      /* already open exclusive? */
-
-static int mce_chrdev_open(struct inode *inode, struct file *file)
-{
-       spin_lock(&mce_chrdev_state_lock);
-
-       if (mce_chrdev_open_exclu ||
-           (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
-               spin_unlock(&mce_chrdev_state_lock);
-
-               return -EBUSY;
-       }
-
-       if (file->f_flags & O_EXCL)
-               mce_chrdev_open_exclu = 1;
-       mce_chrdev_open_count++;
-
-       spin_unlock(&mce_chrdev_state_lock);
-
-       return nonseekable_open(inode, file);
-}
-
-static int mce_chrdev_release(struct inode *inode, struct file *file)
-{
-       spin_lock(&mce_chrdev_state_lock);
-
-       mce_chrdev_open_count--;
-       mce_chrdev_open_exclu = 0;
-
-       spin_unlock(&mce_chrdev_state_lock);
-
-       return 0;
-}
-
-static void collect_tscs(void *data)
-{
-       unsigned long *cpu_tsc = (unsigned long *)data;
-
-       cpu_tsc[smp_processor_id()] = rdtsc();
-}
-
-static int mce_apei_read_done;
-
-/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
-static int __mce_read_apei(char __user **ubuf, size_t usize)
-{
-       int rc;
-       u64 record_id;
-       struct mce m;
-
-       if (usize < sizeof(struct mce))
-               return -EINVAL;
-
-       rc = apei_read_mce(&m, &record_id);
-       /* Error or no more MCE record */
-       if (rc <= 0) {
-               mce_apei_read_done = 1;
-               /*
-                * When ERST is disabled, mce_chrdev_read() should return
-                * "no record" instead of "no device."
-                */
-               if (rc == -ENODEV)
-                       return 0;
-               return rc;
-       }
-       rc = -EFAULT;
-       if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
-               return rc;
-       /*
-        * In fact, we should have cleared the record after that has
-        * been flushed to the disk or sent to network in
-        * /sbin/mcelog, but we have no interface to support that now,
-        * so just clear it to avoid duplication.
-        */
-       rc = apei_clear_mce(record_id);
-       if (rc) {
-               mce_apei_read_done = 1;
-               return rc;
-       }
-       *ubuf += sizeof(struct mce);
-
-       return 0;
-}
-
-static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
-                               size_t usize, loff_t *off)
-{
-       char __user *buf = ubuf;
-       unsigned long *cpu_tsc;
-       unsigned prev, next;
-       int i, err;
-
-       cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
-       if (!cpu_tsc)
-               return -ENOMEM;
-
-       mutex_lock(&mce_chrdev_read_mutex);
-
-       if (!mce_apei_read_done) {
-               err = __mce_read_apei(&buf, usize);
-               if (err || buf != ubuf)
-                       goto out;
-       }
-
-       next = mce_log_get_idx_check(mcelog.next);
-
-       /* Only supports full reads right now */
-       err = -EINVAL;
-       if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
-               goto out;
-
-       err = 0;
-       prev = 0;
-       do {
-               for (i = prev; i < next; i++) {
-                       unsigned long start = jiffies;
-                       struct mce *m = &mcelog.entry[i];
-
-                       while (!m->finished) {
-                               if (time_after_eq(jiffies, start + 2)) {
-                                       memset(m, 0, sizeof(*m));
-                                       goto timeout;
-                               }
-                               cpu_relax();
-                       }
-                       smp_rmb();
-                       err |= copy_to_user(buf, m, sizeof(*m));
-                       buf += sizeof(*m);
-timeout:
-                       ;
-               }
-
-               memset(mcelog.entry + prev, 0,
-                      (next - prev) * sizeof(struct mce));
-               prev = next;
-               next = cmpxchg(&mcelog.next, prev, 0);
-       } while (next != prev);
-
-       synchronize_sched();
-
-       /*
-        * Collect entries that were still getting written before the
-        * synchronize.
-        */
-       on_each_cpu(collect_tscs, cpu_tsc, 1);
-
-       for (i = next; i < MCE_LOG_LEN; i++) {
-               struct mce *m = &mcelog.entry[i];
-
-               if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
-                       err |= copy_to_user(buf, m, sizeof(*m));
-                       smp_rmb();
-                       buf += sizeof(*m);
-                       memset(m, 0, sizeof(*m));
-               }
-       }
-
-       if (err)
-               err = -EFAULT;
-
-out:
-       mutex_unlock(&mce_chrdev_read_mutex);
-       kfree(cpu_tsc);
-
-       return err ? err : buf - ubuf;
-}
-
-static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
-{
-       poll_wait(file, &mce_chrdev_wait, wait);
-       if (READ_ONCE(mcelog.next))
-               return POLLIN | POLLRDNORM;
-       if (!mce_apei_read_done && apei_check_mce())
-               return POLLIN | POLLRDNORM;
-       return 0;
-}
-
-static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
-                               unsigned long arg)
-{
-       int __user *p = (int __user *)arg;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       switch (cmd) {
-       case MCE_GET_RECORD_LEN:
-               return put_user(sizeof(struct mce), p);
-       case MCE_GET_LOG_LEN:
-               return put_user(MCE_LOG_LEN, p);
-       case MCE_GETCLEAR_FLAGS: {
-               unsigned flags;
-
-               do {
-                       flags = mcelog.flags;
-               } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
-
-               return put_user(flags, p);
-       }
-       default:
-               return -ENOTTY;
-       }
-}
-
-static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
-                           size_t usize, loff_t *off);
-
-void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
-                            const char __user *ubuf,
-                            size_t usize, loff_t *off))
-{
-       mce_write = fn;
-}
-EXPORT_SYMBOL_GPL(register_mce_write_callback);
-
-static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
-                               size_t usize, loff_t *off)
-{
-       if (mce_write)
-               return mce_write(filp, ubuf, usize, off);
-       else
-               return -EINVAL;
-}
-
-static const struct file_operations mce_chrdev_ops = {
-       .open                   = mce_chrdev_open,
-       .release                = mce_chrdev_release,
-       .read                   = mce_chrdev_read,
-       .write                  = mce_chrdev_write,
-       .poll                   = mce_chrdev_poll,
-       .unlocked_ioctl         = mce_chrdev_ioctl,
-       .llseek                 = no_llseek,
-};
-
-static struct miscdevice mce_chrdev_device = {
-       MISC_MCELOG_MINOR,
-       "mcelog",
-       &mce_chrdev_ops,
-};
-
 static void __mce_disable_bank(void *arg)
 {
        int bank = *((int *)arg);
@@ -2147,6 +1878,7 @@ __setup("mce", mcheck_enable);
 int __init mcheck_init(void)
 {
        mcheck_intel_therm_init();
+       mce_register_decode_chain(&first_nb);
        mce_register_decode_chain(&mce_srao_nb);
        mce_register_decode_chain(&mce_default_nb);
        mcheck_vendor_init_severity();
@@ -2291,29 +2023,6 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
        return size;
 }
 
-static ssize_t
-show_trigger(struct device *s, struct device_attribute *attr, char *buf)
-{
-       strcpy(buf, mce_helper);
-       strcat(buf, "\n");
-       return strlen(mce_helper) + 1;
-}
-
-static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
-                               const char *buf, size_t siz)
-{
-       char *p;
-
-       strncpy(mce_helper, buf, sizeof(mce_helper));
-       mce_helper[sizeof(mce_helper)-1] = 0;
-       p = strchr(mce_helper, '\n');
-
-       if (p)
-               *p = 0;
-
-       return strlen(mce_helper) + !!p;
-}
-
 static ssize_t set_ignore_ce(struct device *s,
                             struct device_attribute *attr,
                             const char *buf, size_t size)
@@ -2370,7 +2079,6 @@ static ssize_t store_int_with_restart(struct device *s,
        return ret;
 }
 
-static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
 static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
 static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
 static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
@@ -2393,7 +2101,9 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = {
 static struct device_attribute *mce_device_attrs[] = {
        &dev_attr_tolerant.attr,
        &dev_attr_check_interval.attr,
+#ifdef CONFIG_X86_MCELOG_LEGACY
        &dev_attr_trigger,
+#endif
        &dev_attr_monarch_timeout.attr,
        &dev_attr_dont_log_ce.attr,
        &dev_attr_ignore_ce.attr,
@@ -2567,7 +2277,6 @@ static __init void mce_init_banks(void)
 
 static __init int mcheck_init_device(void)
 {
-       enum cpuhp_state hp_online;
        int err;
 
        if (!mce_available(&boot_cpu_data)) {
@@ -2595,21 +2304,11 @@ static __init int mcheck_init_device(void)
                                mce_cpu_online, mce_cpu_pre_down);
        if (err < 0)
                goto err_out_online;
-       hp_online = err;
 
        register_syscore_ops(&mce_syscore_ops);
 
-       /* register character device /dev/mcelog */
-       err = misc_register(&mce_chrdev_device);
-       if (err)
-               goto err_register;
-
        return 0;
 
-err_register:
-       unregister_syscore_ops(&mce_syscore_ops);
-       cpuhp_remove_state(hp_online);
-
 err_out_online:
        cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
 
@@ -2617,7 +2316,7 @@ err_out_mem:
        free_cpumask_var(mce_device_initialized);
 
 err_out:
-       pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+       pr_err("Unable to init MCE device (rc: %d)\n", err);
 
        return err;
 }
@@ -2696,6 +2395,7 @@ static int __init mcheck_late_init(void)
                static_branch_inc(&mcsafe_key);
 
        mcheck_debugfs_init();
+       cec_init();
 
        /*
         * Flush out everything that has been logged during early boot, now that
index 190b3e6..e84db79 100644 (file)
@@ -481,6 +481,9 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
        case INTEL_FAM6_BROADWELL_XEON_D:
        case INTEL_FAM6_BROADWELL_X:
        case INTEL_FAM6_SKYLAKE_X:
+       case INTEL_FAM6_XEON_PHI_KNL:
+       case INTEL_FAM6_XEON_PHI_KNM:
+
                if (rdmsrl_safe(MSR_PPIN_CTL, &val))
                        return;
 
index 3b442b6..765afd5 100644 (file)
@@ -27,7 +27,7 @@
 #include <linux/range.h>
 
 #include <asm/processor.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/mtrr.h>
 #include <asm/msr.h>
 
@@ -860,7 +860,7 @@ real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn)
        trim_size <<= PAGE_SHIFT;
        trim_size -= trim_start;
 
-       return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED);
+       return e820__range_update(trim_start, trim_size, E820_TYPE_RAM, E820_TYPE_RESERVED);
 }
 
 /**
@@ -978,7 +978,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
                        WARN_ON(1);
 
                pr_info("update e820 for mtrr\n");
-               update_e820();
+               e820__update_table_print();
 
                return 1;
        }
index 24e87e7..2bce84d 100644 (file)
@@ -48,7 +48,7 @@
 #include <linux/syscore_ops.h>
 
 #include <asm/cpufeature.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/mtrr.h>
 #include <asm/msr.h>
 #include <asm/pat.h>
index 3741461..22217ec 100644 (file)
@@ -29,6 +29,7 @@
 #include <asm/nmi.h>
 #include <asm/hw_irq.h>
 #include <asm/apic.h>
+#include <asm/e820/types.h>
 #include <asm/io_apic.h>
 #include <asm/hpet.h>
 #include <linux/kdebug.h>
@@ -503,16 +504,16 @@ static int prepare_elf_headers(struct kimage *image, void **addr,
        return ret;
 }
 
-static int add_e820_entry(struct boot_params *params, struct e820entry *entry)
+static int add_e820_entry(struct boot_params *params, struct e820_entry *entry)
 {
        unsigned int nr_e820_entries;
 
        nr_e820_entries = params->e820_entries;
-       if (nr_e820_entries >= E820MAX)
+       if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE)
                return 1;
 
-       memcpy(&params->e820_map[nr_e820_entries], entry,
-                       sizeof(struct e820entry));
+       memcpy(&params->e820_table[nr_e820_entries], entry,
+                       sizeof(struct e820_entry));
        params->e820_entries++;
        return 0;
 }
@@ -521,7 +522,7 @@ static int memmap_entry_callback(u64 start, u64 end, void *arg)
 {
        struct crash_memmap_data *cmd = arg;
        struct boot_params *params = cmd->params;
-       struct e820entry ei;
+       struct e820_entry ei;
 
        ei.addr = start;
        ei.size = end - start + 1;
@@ -560,7 +561,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
 {
        int i, ret = 0;
        unsigned long flags;
-       struct e820entry ei;
+       struct e820_entry ei;
        struct crash_memmap_data cmd;
        struct crash_mem *cmem;
 
@@ -574,17 +575,17 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
        /* Add first 640K segment */
        ei.addr = image->arch.backup_src_start;
        ei.size = image->arch.backup_src_sz;
-       ei.type = E820_RAM;
+       ei.type = E820_TYPE_RAM;
        add_e820_entry(params, &ei);
 
        /* Add ACPI tables */
-       cmd.type = E820_ACPI;
+       cmd.type = E820_TYPE_ACPI;
        flags = IORESOURCE_MEM | IORESOURCE_BUSY;
        walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd,
                       memmap_entry_callback);
 
        /* Add ACPI Non-volatile Storage */
-       cmd.type = E820_NVS;
+       cmd.type = E820_TYPE_NVS;
        walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
                        memmap_entry_callback);
 
@@ -592,7 +593,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
        if (crashk_low_res.end) {
                ei.addr = crashk_low_res.start;
                ei.size = crashk_low_res.end - crashk_low_res.start + 1;
-               ei.type = E820_RAM;
+               ei.type = E820_TYPE_RAM;
                add_e820_entry(params, &ei);
        }
 
@@ -609,7 +610,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
                if (ei.size < PAGE_SIZE)
                        continue;
                ei.addr = cmem->ranges[i].start;
-               ei.type = E820_RAM;
+               ei.type = E820_TYPE_RAM;
                add_e820_entry(params, &ei);
        }
 
index b2bbad6..6e9b26f 100644 (file)
@@ -1,49 +1,55 @@
 /*
- * Handle the memory map.
- * The functions here do the job until bootmem takes over.
+ * Low level x86 E820 memory map handling functions.
  *
- *  Getting sanitize_e820_map() in sync with i386 version by applying change:
- *  -  Provisions for empty E820 memory regions (reported by certain BIOSes).
- *     Alex Achenbach <xela@slit.de>, December 2002.
- *  Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * The firmware and bootloader pass us the "E820 table", which is the primary
+ * physical memory layout description available about x86 systems.
  *
+ * The kernel takes the E820 memory layout and optionally modifies it with
+ * quirks and other tweaks, and feeds that into the generic Linux memory
+ * allocation code routines via a platform independent interface (memblock, etc.).
  */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
 #include <linux/crash_dump.h>
-#include <linux/export.h>
 #include <linux/bootmem.h>
-#include <linux/pfn.h>
 #include <linux/suspend.h>
 #include <linux/acpi.h>
 #include <linux/firmware-map.h>
 #include <linux/memblock.h>
 #include <linux/sort.h>
 
-#include <asm/e820.h>
-#include <asm/proto.h>
+#include <asm/e820/api.h>
 #include <asm/setup.h>
-#include <asm/cpufeature.h>
 
 /*
- * The e820 map is the map that gets modified e.g. with command line parameters
- * and that is also registered with modifications in the kernel resource tree
- * with the iomem_resource as parent.
+ * We organize the E820 table into two main data structures:
  *
- * The e820_saved is directly saved after the BIOS-provided memory map is
- * copied. It doesn't get modified afterwards. It's registered for the
- * /sys/firmware/memmap interface.
+ * - 'e820_table_firmware': the original firmware version passed to us by the
+ *   bootloader - not modified by the kernel. We use this to:
  *
- * That memory map is not modified and is used as base for kexec. The kexec'd
- * kernel should get the same memory map as the firmware provides. Then the
- * user can e.g. boot the original kernel with mem=1G while still booting the
- * next kernel with full memory.
+ *       - inform the user about the firmware's notion of memory layout
+ *         via /sys/firmware/memmap
+ *
+ *       - the hibernation code uses it to generate a kernel-independent MD5
+ *         fingerprint of the physical memory layout of a system.
+ *
+ *       - kexec, which is a bootloader in disguise, uses the original E820
+ *         layout to pass to the kexec-ed kernel. This way the original kernel
+ *         can have a restricted E820 map while the kexec()-ed kexec-kernel
+ *         can have access to full memory - etc.
+ *
+ * - 'e820_table': this is the main E820 table that is massaged by the
+ *   low level x86 platform code, or modified by boot parameters, before
+ *   passed on to higher level MM layers.
+ *
+ * Once the E820 map has been converted to the standard Linux memory layout
+ * information its role stops - modifying it has no effect and does not get
+ * re-propagated. So its main role is a temporary bootstrap storage of firmware
+ * specific memory layout data during early bootup.
  */
-static struct e820map initial_e820  __initdata;
-static struct e820map initial_e820_saved  __initdata;
-struct e820map *e820 __refdata = &initial_e820;
-struct e820map *e820_saved __refdata = &initial_e820_saved;
+static struct e820_table e820_table_init               __initdata;
+static struct e820_table e820_table_firmware_init      __initdata;
+
+struct e820_table *e820_table __refdata                        = &e820_table_init;
+struct e820_table *e820_table_firmware __refdata       = &e820_table_firmware_init;
 
 /* For PCI or other memory-mapped resources */
 unsigned long pci_mem_start = 0xaeedbabe;
@@ -55,51 +61,53 @@ EXPORT_SYMBOL(pci_mem_start);
  * This function checks if any part of the range <start,end> is mapped
  * with type.
  */
-int
-e820_any_mapped(u64 start, u64 end, unsigned type)
+bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
 {
        int i;
 
-       for (i = 0; i < e820->nr_map; i++) {
-               struct e820entry *ei = &e820->map[i];
+       for (i = 0; i < e820_table->nr_entries; i++) {
+               struct e820_entry *entry = &e820_table->entries[i];
 
-               if (type && ei->type != type)
+               if (type && entry->type != type)
                        continue;
-               if (ei->addr >= end || ei->addr + ei->size <= start)
+               if (entry->addr >= end || entry->addr + entry->size <= start)
                        continue;
                return 1;
        }
        return 0;
 }
-EXPORT_SYMBOL_GPL(e820_any_mapped);
+EXPORT_SYMBOL_GPL(e820__mapped_any);
 
 /*
- * This function checks if the entire range <start,end> is mapped with type.
+ * This function checks if the entire <start,end> range is mapped with 'type'.
  *
- * Note: this function only works correct if the e820 table is sorted and
- * not-overlapping, which is the case
+ * Note: this function only works correctly once the E820 table is sorted and
+ * not-overlapping (at least for the range specified), which is the case normally.
  */
-int __init e820_all_mapped(u64 start, u64 end, unsigned type)
+bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
 {
        int i;
 
-       for (i = 0; i < e820->nr_map; i++) {
-               struct e820entry *ei = &e820->map[i];
+       for (i = 0; i < e820_table->nr_entries; i++) {
+               struct e820_entry *entry = &e820_table->entries[i];
 
-               if (type && ei->type != type)
+               if (type && entry->type != type)
                        continue;
-               /* is the region (part) in overlap with the current region ?*/
-               if (ei->addr >= end || ei->addr + ei->size <= start)
+
+               /* Is the region (part) in overlap with the current region? */
+               if (entry->addr >= end || entry->addr + entry->size <= start)
                        continue;
 
-               /* if the region is at the beginning of <start,end> we move
-                * start to the end of the region since it's ok until there
+               /*
+                * If the region is at the beginning of <start,end> we move
+                * 'start' to the end of the region since it's ok until there
                 */
-               if (ei->addr <= start)
-                       start = ei->addr + ei->size;
+               if (entry->addr <= start)
+                       start = entry->addr + entry->size;
+
                /*
-                * if start is now at or beyond end, we're done, full
-                * coverage
+                * If 'start' is now at or beyond 'end', we're done, full
+                * coverage of the desired range exists:
                 */
                if (start >= end)
                        return 1;
@@ -108,94 +116,77 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
 }
 
 /*
- * Add a memory region to the kernel e820 map.
+ * Add a memory region to the kernel E820 map.
  */
-static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
-                                        int type)
+static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type)
 {
-       int x = e820x->nr_map;
+       int x = table->nr_entries;
 
-       if (x >= ARRAY_SIZE(e820x->map)) {
-               printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n",
-                      (unsigned long long) start,
-                      (unsigned long long) (start + size - 1));
+       if (x >= ARRAY_SIZE(table->entries)) {
+               pr_err("e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", start, start + size - 1);
                return;
        }
 
-       e820x->map[x].addr = start;
-       e820x->map[x].size = size;
-       e820x->map[x].type = type;
-       e820x->nr_map++;
+       table->entries[x].addr = start;
+       table->entries[x].size = size;
+       table->entries[x].type = type;
+       table->nr_entries++;
 }
 
-void __init e820_add_region(u64 start, u64 size, int type)
+void __init e820__range_add(u64 start, u64 size, enum e820_type type)
 {
-       __e820_add_region(e820, start, size, type);
+       __e820__range_add(e820_table, start, size, type);
 }
 
-static void __init e820_print_type(u32 type)
+static void __init e820_print_type(enum e820_type type)
 {
        switch (type) {
-       case E820_RAM:
-       case E820_RESERVED_KERN:
-               printk(KERN_CONT "usable");
-               break;
-       case E820_RESERVED:
-               printk(KERN_CONT "reserved");
-               break;
-       case E820_ACPI:
-               printk(KERN_CONT "ACPI data");
-               break;
-       case E820_NVS:
-               printk(KERN_CONT "ACPI NVS");
-               break;
-       case E820_UNUSABLE:
-               printk(KERN_CONT "unusable");
-               break;
-       case E820_PMEM:
-       case E820_PRAM:
-               printk(KERN_CONT "persistent (type %u)", type);
-               break;
-       default:
-               printk(KERN_CONT "type %u", type);
-               break;
+       case E820_TYPE_RAM:             /* Fall through: */
+       case E820_TYPE_RESERVED_KERN:   pr_cont("usable");                      break;
+       case E820_TYPE_RESERVED:        pr_cont("reserved");                    break;
+       case E820_TYPE_ACPI:            pr_cont("ACPI data");                   break;
+       case E820_TYPE_NVS:             pr_cont("ACPI NVS");                    break;
+       case E820_TYPE_UNUSABLE:        pr_cont("unusable");                    break;
+       case E820_TYPE_PMEM:            /* Fall through: */
+       case E820_TYPE_PRAM:            pr_cont("persistent (type %u)", type);  break;
+       default:                        pr_cont("type %u", type);               break;
        }
 }
 
-void __init e820_print_map(char *who)
+void __init e820__print_table(char *who)
 {
        int i;
 
-       for (i = 0; i < e820->nr_map; i++) {
-               printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who,
-                      (unsigned long long) e820->map[i].addr,
-                      (unsigned long long)
-                      (e820->map[i].addr + e820->map[i].size - 1));
-               e820_print_type(e820->map[i].type);
-               printk(KERN_CONT "\n");
+       for (i = 0; i < e820_table->nr_entries; i++) {
+               pr_info("%s: [mem %#018Lx-%#018Lx] ", who,
+                      e820_table->entries[i].addr,
+                      e820_table->entries[i].addr + e820_table->entries[i].size - 1);
+
+               e820_print_type(e820_table->entries[i].type);
+               pr_cont("\n");
        }
 }
 
 /*
- * Sanitize the BIOS e820 map.
+ * Sanitize an E820 map.
  *
- * Some e820 responses include overlapping entries. The following
- * replaces the original e820 map with a new one, removing overlaps,
+ * Some E820 layouts include overlapping entries. The following
+ * replaces the original E820 map with a new one, removing overlaps,
  * and resolving conflicting memory types in favor of highest
  * numbered type.
  *
- * The input parameter biosmap points to an array of 'struct
- * e820entry' which on entry has elements in the range [0, *pnr_map)
- * valid, and which has space for up to max_nr_map entries.
- * On return, the resulting sanitized e820 map entries will be in
- * overwritten in the same location, starting at biosmap.
+ * The input parameter 'table' points to a 'struct e820_table' whose
+ * entries[] array has elements in the range [0, table->nr_entries)
+ * valid on entry, and has space for up to ARRAY_SIZE(table->entries)
+ * entries. On return, the resulting sanitized E820 map entries will
+ * have been written back to the same location, table->entries[].
  *
- * The integer pointed to by pnr_map must be valid on entry (the
- * current number of valid entries located at biosmap). If the
- * sanitizing succeeds the *pnr_map will be updated with the new
- * number of valid entries (something no more than max_nr_map).
+ * table->nr_entries must be valid on entry (the current number of
+ * valid entries located in table->entries[]). If the sanitizing
+ * succeeds, table->nr_entries will be updated with the new number
+ * of valid entries (no more than ARRAY_SIZE(table->entries)).
  *
- * The return value from sanitize_e820_map() is zero if it
+ * The return value from e820__update_table() is zero if it
  * successfully 'sanitized' the map entries passed in, and is -1
  * if it did nothing, which can happen if either of (1) it was
  * only passed one map entry, or (2) any of the input map entries
@@ -238,10 +229,17 @@ void __init e820_print_map(char *who)
  *        ______________________4_
  */
 struct change_member {
-       struct e820entry *pbios; /* pointer to original bios entry */
-       unsigned long long addr; /* address for this change point */
+       /* Pointer to the original entry: */
+       struct e820_entry       *entry;
+       /* Address for this change point: */
+       unsigned long long      addr;
 };
 
+static struct change_member    change_point_list[2*E820_MAX_ENTRIES]   __initdata;
+static struct change_member    *change_point[2*E820_MAX_ENTRIES]       __initdata;
+static struct e820_entry       *overlap_list[E820_MAX_ENTRIES]         __initdata;
+static struct e820_entry       new_entries[E820_MAX_ENTRIES]           __initdata;
+
 static int __init cpcompare(const void *a, const void *b)
 {
        struct change_member * const *app = a, * const *bpp = b;
@@ -249,164 +247,141 @@ static int __init cpcompare(const void *a, const void *b)
 
        /*
         * Inputs are pointers to two elements of change_point[].  If their
-        * addresses are unequal, their difference dominates.  If the addresses
+        * addresses are not equal, their difference dominates.  If the addresses
         * are equal, then consider one that represents the end of its region
         * to be greater than one that does not.
         */
        if (ap->addr != bp->addr)
                return ap->addr > bp->addr ? 1 : -1;
 
-       return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr);
+       return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr);
 }
 
-int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
-                            u32 *pnr_map)
+int __init e820__update_table(struct e820_table *table)
 {
-       static struct change_member change_point_list[2*E820_X_MAX] __initdata;
-       static struct change_member *change_point[2*E820_X_MAX] __initdata;
-       static struct e820entry *overlap_list[E820_X_MAX] __initdata;
-       static struct e820entry new_bios[E820_X_MAX] __initdata;
-       unsigned long current_type, last_type;
+       struct e820_entry *entries = table->entries;
+       u32 max_nr_entries = ARRAY_SIZE(table->entries);
+       enum e820_type current_type, last_type;
        unsigned long long last_addr;
-       int chgidx;
-       int overlap_entries;
-       int new_bios_entry;
-       int old_nr, new_nr, chg_nr;
-       int i;
+       u32 new_nr_entries, overlap_entries;
+       u32 i, chg_idx, chg_nr;
 
-       /* if there's only one memory region, don't bother */
-       if (*pnr_map < 2)
+       /* If there's only one memory region, don't bother: */
+       if (table->nr_entries < 2)
                return -1;
 
-       old_nr = *pnr_map;
-       BUG_ON(old_nr > max_nr_map);
+       table->nr_entries = table->nr_entries;
+       BUG_ON(table->nr_entries > max_nr_entries);
 
-       /* bail out if we find any unreasonable addresses in bios map */
-       for (i = 0; i < old_nr; i++)
-               if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
+       /* Bail out if we find any unreasonable addresses in the map: */
+       for (i = 0; i < table->nr_entries; i++) {
+               if (entries[i].addr + entries[i].size < entries[i].addr)
                        return -1;
+       }
 
-       /* create pointers for initial change-point information (for sorting) */
-       for (i = 0; i < 2 * old_nr; i++)
+       /* Create pointers for initial change-point information (for sorting): */
+       for (i = 0; i < 2 * table->nr_entries; i++)
                change_point[i] = &change_point_list[i];
 
-       /* record all known change-points (starting and ending addresses),
-          omitting those that are for empty memory regions */
-       chgidx = 0;
-       for (i = 0; i < old_nr; i++)    {
-               if (biosmap[i].size != 0) {
-                       change_point[chgidx]->addr = biosmap[i].addr;
-                       change_point[chgidx++]->pbios = &biosmap[i];
-                       change_point[chgidx]->addr = biosmap[i].addr +
-                               biosmap[i].size;
-                       change_point[chgidx++]->pbios = &biosmap[i];
+       /*
+        * Record all known change-points (starting and ending addresses),
+        * omitting empty memory regions:
+        */
+       chg_idx = 0;
+       for (i = 0; i < table->nr_entries; i++) {
+               if (entries[i].size != 0) {
+                       change_point[chg_idx]->addr     = entries[i].addr;
+                       change_point[chg_idx++]->entry  = &entries[i];
+                       change_point[chg_idx]->addr     = entries[i].addr + entries[i].size;
+                       change_point[chg_idx++]->entry  = &entries[i];
                }
        }
-       chg_nr = chgidx;
-
-       /* sort change-point list by memory addresses (low -> high) */
-       sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL);
-
-       /* create a new bios memory map, removing overlaps */
-       overlap_entries = 0;     /* number of entries in the overlap table */
-       new_bios_entry = 0;      /* index for creating new bios map entries */
-       last_type = 0;           /* start with undefined memory type */
-       last_addr = 0;           /* start with 0 as last starting address */
-
-       /* loop through change-points, determining affect on the new bios map */
-       for (chgidx = 0; chgidx < chg_nr; chgidx++) {
-               /* keep track of all overlapping bios entries */
-               if (change_point[chgidx]->addr ==
-                   change_point[chgidx]->pbios->addr) {
-                       /*
-                        * add map entry to overlap list (> 1 entry
-                        * implies an overlap)
-                        */
-                       overlap_list[overlap_entries++] =
-                               change_point[chgidx]->pbios;
+       chg_nr = chg_idx;
+
+       /* Sort change-point list by memory addresses (low -> high): */
+       sort(change_point, chg_nr, sizeof(*change_point), cpcompare, NULL);
+
+       /* Create a new memory map, removing overlaps: */
+       overlap_entries = 0;     /* Number of entries in the overlap table */
+       new_nr_entries = 0;      /* Index for creating new map entries */
+       last_type = 0;           /* Start with undefined memory type */
+       last_addr = 0;           /* Start with 0 as last starting address */
+
+       /* Loop through change-points, determining effect on the new map: */
+       for (chg_idx = 0; chg_idx < chg_nr; chg_idx++) {
+               /* Keep track of all overlapping entries */
+               if (change_point[chg_idx]->addr == change_point[chg_idx]->entry->addr) {
+                       /* Add map entry to overlap list (> 1 entry implies an overlap) */
+                       overlap_list[overlap_entries++] = change_point[chg_idx]->entry;
                } else {
-                       /*
-                        * remove entry from list (order independent,
-                        * so swap with last)
-                        */
+                       /* Remove entry from list (order independent, so swap with last): */
                        for (i = 0; i < overlap_entries; i++) {
-                               if (overlap_list[i] ==
-                                   change_point[chgidx]->pbios)
-                                       overlap_list[i] =
-                                               overlap_list[overlap_entries-1];
+                               if (overlap_list[i] == change_point[chg_idx]->entry)
+                                       overlap_list[i] = overlap_list[overlap_entries-1];
                        }
                        overlap_entries--;
                }
                /*
-                * if there are overlapping entries, decide which
+                * If there are overlapping entries, decide which
                 * "type" to use (larger value takes precedence --
                 * 1=usable, 2,3,4,4+=unusable)
                 */
                current_type = 0;
-               for (i = 0; i < overlap_entries; i++)
+               for (i = 0; i < overlap_entries; i++) {
                        if (overlap_list[i]->type > current_type)
                                current_type = overlap_list[i]->type;
-               /*
-                * continue building up new bios map based on this
-                * information
-                */
-               if (current_type != last_type || current_type == E820_PRAM) {
+               }
+
+               /* Continue building up new map based on this information: */
+               if (current_type != last_type || current_type == E820_TYPE_PRAM) {
                        if (last_type != 0)      {
-                               new_bios[new_bios_entry].size =
-                                       change_point[chgidx]->addr - last_addr;
-                               /*
-                                * move forward only if the new size
-                                * was non-zero
-                                */
-                               if (new_bios[new_bios_entry].size != 0)
-                                       /*
-                                        * no more space left for new
-                                        * bios entries ?
-                                        */
-                                       if (++new_bios_entry >= max_nr_map)
+                               new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr;
+                               /* Move forward only if the new size was non-zero: */
+                               if (new_entries[new_nr_entries].size != 0)
+                                       /* No more space left for new entries? */
+                                       if (++new_nr_entries >= max_nr_entries)
                                                break;
                        }
                        if (current_type != 0)  {
-                               new_bios[new_bios_entry].addr =
-                                       change_point[chgidx]->addr;
-                               new_bios[new_bios_entry].type = current_type;
-                               last_addr = change_point[chgidx]->addr;
+                               new_entries[new_nr_entries].addr = change_point[chg_idx]->addr;
+                               new_entries[new_nr_entries].type = current_type;
+                               last_addr = change_point[chg_idx]->addr;
                        }
                        last_type = current_type;
                }
        }
-       /* retain count for new bios entries */
-       new_nr = new_bios_entry;
 
-       /* copy new bios mapping into original location */
-       memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
-       *pnr_map = new_nr;
+       /* Copy the new entries into the original location: */
+       memcpy(entries, new_entries, new_nr_entries*sizeof(*entries));
+       table->nr_entries = new_nr_entries;
 
        return 0;
 }
 
-static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
+static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
 {
-       while (nr_map) {
-               u64 start = biosmap->addr;
-               u64 size = biosmap->size;
+       struct boot_e820_entry *entry = entries;
+
+       while (nr_entries) {
+               u64 start = entry->addr;
+               u64 size = entry->size;
                u64 end = start + size - 1;
-               u32 type = biosmap->type;
+               u32 type = entry->type;
 
-               /* Overflow in 64 bits? Ignore the memory map. */
+               /* Stop and return failure on 64-bit overflow: */
                if (start > end && likely(size))
                        return -1;
 
-               e820_add_region(start, size, type);
+               e820__range_add(start, size, type);
 
-               biosmap++;
-               nr_map--;
+               entry++;
+               nr_entries--;
        }
        return 0;
 }
 
 /*
- * Copy the BIOS e820 map into a safe place.
+ * Copy the BIOS E820 map into a safe place.
  *
  * Sanity-check it while we're at it..
  *
@@ -414,18 +389,17 @@ static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
  * will have given us a memory map that we can use to properly
  * set up memory.  If we aren't, we'll fake a memory map.
  */
-static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
+static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
 {
        /* Only one memory region (or negative)? Ignore it */
-       if (nr_map < 2)
+       if (nr_entries < 2)
                return -1;
 
-       return __append_e820_map(biosmap, nr_map);
+       return __append_e820_table(entries, nr_entries);
 }
 
-static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
-                                       u64 size, unsigned old_type,
-                                       unsigned new_type)
+static u64 __init
+__e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
 {
        u64 end;
        unsigned int i;
@@ -437,77 +411,73 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
                size = ULLONG_MAX - start;
 
        end = start + size;
-       printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ",
-              (unsigned long long) start, (unsigned long long) (end - 1));
+       printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1);
        e820_print_type(old_type);
-       printk(KERN_CONT " ==> ");
+       pr_cont(" ==> ");
        e820_print_type(new_type);
-       printk(KERN_CONT "\n");
+       pr_cont("\n");
 
-       for (i = 0; i < e820x->nr_map; i++) {
-               struct e820entry *ei = &e820x->map[i];
+       for (i = 0; i < table->nr_entries; i++) {
+               struct e820_entry *entry = &table->entries[i];
                u64 final_start, final_end;
-               u64 ei_end;
+               u64 entry_end;
 
-               if (ei->type != old_type)
+               if (entry->type != old_type)
                        continue;
 
-               ei_end = ei->addr + ei->size;
-               /* totally covered by new range? */
-               if (ei->addr >= start && ei_end <= end) {
-                       ei->type = new_type;
-                       real_updated_size += ei->size;
+               entry_end = entry->addr + entry->size;
+
+               /* Completely covered by new range? */
+               if (entry->addr >= start && entry_end <= end) {
+                       entry->type = new_type;
+                       real_updated_size += entry->size;
                        continue;
                }
 
-               /* new range is totally covered? */
-               if (ei->addr < start && ei_end > end) {
-                       __e820_add_region(e820x, start, size, new_type);
-                       __e820_add_region(e820x, end, ei_end - end, ei->type);
-                       ei->size = start - ei->addr;
+               /* New range is completely covered? */
+               if (entry->addr < start && entry_end > end) {
+                       __e820__range_add(table, start, size, new_type);
+                       __e820__range_add(table, end, entry_end - end, entry->type);
+                       entry->size = start - entry->addr;
                        real_updated_size += size;
                        continue;
                }
 
-               /* partially covered */
-               final_start = max(start, ei->addr);
-               final_end = min(end, ei_end);
+               /* Partially covered: */
+               final_start = max(start, entry->addr);
+               final_end = min(end, entry_end);
                if (final_start >= final_end)
                        continue;
 
-               __e820_add_region(e820x, final_start, final_end - final_start,
-                                 new_type);
+               __e820__range_add(table, final_start, final_end - final_start, new_type);
 
                real_updated_size += final_end - final_start;
 
                /*
-                * left range could be head or tail, so need to update
-                * size at first.
+                * Left range could be head or tail, so need to update
+                * its size first:
                 */
-               ei->size -= final_end - final_start;
-               if (ei->addr < final_start)
+               entry->size -= final_end - final_start;
+               if (entry->addr < final_start)
                        continue;
-               ei->addr = final_end;
+
+               entry->addr = final_end;
        }
        return real_updated_size;
 }
 
-u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
-                            unsigned new_type)
+u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
 {
-       return __e820_update_range(e820, start, size, old_type, new_type);
+       return __e820__range_update(e820_table, start, size, old_type, new_type);
 }
 
-static u64 __init e820_update_range_saved(u64 start, u64 size,
-                                         unsigned old_type, unsigned new_type)
+static u64 __init e820__range_update_firmware(u64 start, u64 size, enum e820_type old_type, enum e820_type  new_type)
 {
-       return __e820_update_range(e820_saved, start, size, old_type,
-                                    new_type);
+       return __e820__range_update(e820_table_firmware, start, size, old_type, new_type);
 }
 
-/* make e820 not cover the range */
-u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
-                            int checktype)
+/* Remove a range of memory from the E820 table: */
+u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type)
 {
        int i;
        u64 end;
@@ -517,85 +487,89 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
                size = ULLONG_MAX - start;
 
        end = start + size;
-       printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ",
-              (unsigned long long) start, (unsigned long long) (end - 1));
-       if (checktype)
+       printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1);
+       if (check_type)
                e820_print_type(old_type);
-       printk(KERN_CONT "\n");
+       pr_cont("\n");
 
-       for (i = 0; i < e820->nr_map; i++) {
-               struct e820entry *ei = &e820->map[i];
+       for (i = 0; i < e820_table->nr_entries; i++) {
+               struct e820_entry *entry = &e820_table->entries[i];
                u64 final_start, final_end;
-               u64 ei_end;
+               u64 entry_end;
 
-               if (checktype && ei->type != old_type)
+               if (check_type && entry->type != old_type)
                        continue;
 
-               ei_end = ei->addr + ei->size;
-               /* totally covered? */
-               if (ei->addr >= start && ei_end <= end) {
-                       real_removed_size += ei->size;
-                       memset(ei, 0, sizeof(struct e820entry));
+               entry_end = entry->addr + entry->size;
+
+               /* Completely covered? */
+               if (entry->addr >= start && entry_end <= end) {
+                       real_removed_size += entry->size;
+                       memset(entry, 0, sizeof(*entry));
                        continue;
                }
 
-               /* new range is totally covered? */
-               if (ei->addr < start && ei_end > end) {
-                       e820_add_region(end, ei_end - end, ei->type);
-                       ei->size = start - ei->addr;
+               /* Is the new range completely covered? */
+               if (entry->addr < start && entry_end > end) {
+                       e820__range_add(end, entry_end - end, entry->type);
+                       entry->size = start - entry->addr;
                        real_removed_size += size;
                        continue;
                }
 
-               /* partially covered */
-               final_start = max(start, ei->addr);
-               final_end = min(end, ei_end);
+               /* Partially covered: */
+               final_start = max(start, entry->addr);
+               final_end = min(end, entry_end);
                if (final_start >= final_end)
                        continue;
+
                real_removed_size += final_end - final_start;
 
                /*
-                * left range could be head or tail, so need to update
-                * size at first.
+                * Left range could be head or tail, so need to update
+                * the size first:
                 */
-               ei->size -= final_end - final_start;
-               if (ei->addr < final_start)
+               entry->size -= final_end - final_start;
+               if (entry->addr < final_start)
                        continue;
-               ei->addr = final_end;
+
+               entry->addr = final_end;
        }
        return real_removed_size;
 }
 
-void __init update_e820(void)
+void __init e820__update_table_print(void)
 {
-       if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map))
+       if (e820__update_table(e820_table))
                return;
-       printk(KERN_INFO "e820: modified physical RAM map:\n");
-       e820_print_map("modified");
+
+       pr_info("e820: modified physical RAM map:\n");
+       e820__print_table("modified");
 }
-static void __init update_e820_saved(void)
+
+static void __init e820__update_table_firmware(void)
 {
-       sanitize_e820_map(e820_saved->map, ARRAY_SIZE(e820_saved->map),
-                               &e820_saved->nr_map);
+       e820__update_table(e820_table_firmware);
 }
+
 #define MAX_GAP_END 0x100000000ull
+
 /*
- * Search for a gap in the e820 memory space from 0 to MAX_GAP_END.
+ * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB).
  */
-static int __init e820_search_gap(unsigned long *gapstart,
-               unsigned long *gapsize)
+static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize)
 {
        unsigned long long last = MAX_GAP_END;
-       int i = e820->nr_map;
+       int i = e820_table->nr_entries;
        int found = 0;
 
        while (--i >= 0) {
-               unsigned long long start = e820->map[i].addr;
-               unsigned long long end = start + e820->map[i].size;
+               unsigned long long start = e820_table->entries[i].addr;
+               unsigned long long end = start + e820_table->entries[i].size;
 
                /*
                 * Since "last" is at most 4GB, we know we'll
-                * fit in 32 bits if this condition is true
+                * fit in 32 bits if this condition is true:
                 */
                if (last > end) {
                        unsigned long gap = last - end;
@@ -613,12 +587,14 @@ static int __init e820_search_gap(unsigned long *gapstart,
 }
 
 /*
- * Search for the biggest gap in the low 32 bits of the e820
- * memory space.  We pass this space to PCI to assign MMIO resources
- * for hotplug or unconfigured devices in.
+ * Search for the biggest gap in the low 32 bits of the E820
+ * memory space. We pass this space to the PCI subsystem, so
+ * that it can assign MMIO resources in it for hotplug or
+ * unconfigured devices.
+ *
  * Hopefully the BIOS let enough space left.
  */
-__init void e820_setup_gap(void)
+__init void e820__setup_pci_gap(void)
 {
        unsigned long gapstart, gapsize;
        int found;
@@ -629,138 +605,143 @@ __init void e820_setup_gap(void)
        if (!found) {
 #ifdef CONFIG_X86_64
                gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
-               printk(KERN_ERR
-       "e820: cannot find a gap in the 32bit address range\n"
-       "e820: PCI devices with unassigned 32bit BARs may break!\n");
+               pr_err(
+                       "e820: Cannot find an available gap in the 32-bit address range\n"
+                       "e820: PCI devices with unassigned 32-bit BARs may not work!\n");
 #else
                gapstart = 0x10000000;
 #endif
        }
 
        /*
-        * e820_reserve_resources_late protect stolen RAM already
+        * e820__reserve_resources_late() protects stolen RAM already:
         */
        pci_mem_start = gapstart;
 
-       printk(KERN_INFO
-              "e820: [mem %#010lx-%#010lx] available for PCI devices\n",
-              gapstart, gapstart + gapsize - 1);
+       pr_info("e820: [mem %#010lx-%#010lx] available for PCI devices\n", gapstart, gapstart + gapsize - 1);
 }
 
 /*
  * Called late during init, in free_initmem().
  *
- * Initial e820 and e820_saved are largish __initdata arrays.
- * Copy them to (usually much smaller) dynamically allocated area.
- * This is done after all tweaks we ever do to them:
- * all functions which modify them are __init functions,
- * they won't exist after this point.
+ * Initial e820_table and e820_table_firmware are largish __initdata arrays.
+ *
+ * Copy them to a (usually much smaller) dynamically allocated area that is
+ * sized precisely after the number of e820 entries.
+ *
+ * This is done after we've performed all the fixes and tweaks to the tables.
+ * All functions which modify them are __init functions, which won't exist
+ * after free_initmem().
  */
-__init void e820_reallocate_tables(void)
+__init void e820__reallocate_tables(void)
 {
-       struct e820map *n;
+       struct e820_table *n;
        int size;
 
-       size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820->nr_map;
+       size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries;
        n = kmalloc(size, GFP_KERNEL);
        BUG_ON(!n);
-       memcpy(n, e820, size);
-       e820 = n;
+       memcpy(n, e820_table, size);
+       e820_table = n;
 
-       size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820_saved->nr_map;
+       size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries;
        n = kmalloc(size, GFP_KERNEL);
        BUG_ON(!n);
-       memcpy(n, e820_saved, size);
-       e820_saved = n;
+       memcpy(n, e820_table_firmware, size);
+       e820_table_firmware = n;
 }
 
-/**
- * Because of the size limitation of struct boot_params, only first
- * 128 E820 memory entries are passed to kernel via
- * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
- * linked list of struct setup_data, which is parsed here.
+/*
+ * Because of the small fixed size of struct boot_params, only the first
+ * 128 E820 memory entries are passed to the kernel via boot_params.e820_table,
+ * the remaining (if any) entries are passed via the SETUP_E820_EXT node of
+ * struct setup_data, which is parsed here.
  */
-void __init parse_e820_ext(u64 phys_addr, u32 data_len)
+void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
 {
        int entries;
-       struct e820entry *extmap;
+       struct boot_e820_entry *extmap;
        struct setup_data *sdata;
 
        sdata = early_memremap(phys_addr, data_len);
-       entries = sdata->len / sizeof(struct e820entry);
-       extmap = (struct e820entry *)(sdata->data);
-       __append_e820_map(extmap, entries);
-       sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
+       entries = sdata->len / sizeof(*extmap);
+       extmap = (struct boot_e820_entry *)(sdata->data);
+
+       __append_e820_table(extmap, entries);
+       e820__update_table(e820_table);
+
        early_memunmap(sdata, data_len);
-       printk(KERN_INFO "e820: extended physical RAM map:\n");
-       e820_print_map("extended");
+       pr_info("e820: extended physical RAM map:\n");
+       e820__print_table("extended");
 }
 
-#if defined(CONFIG_X86_64) || \
-       (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
-/**
+/*
  * Find the ranges of physical addresses that do not correspond to
- * e820 RAM areas and mark the corresponding pages as nosave for
- * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
+ * E820 RAM areas and register the corresponding pages as 'nosave' for
+ * hibernation (32-bit) or software suspend and suspend to RAM (64-bit).
  *
- * This function requires the e820 map to be sorted and without any
+ * This function requires the E820 map to be sorted and without any
  * overlapping entries.
  */
-void __init e820_mark_nosave_regions(unsigned long limit_pfn)
+void __init e820__register_nosave_regions(unsigned long limit_pfn)
 {
        int i;
        unsigned long pfn = 0;
 
-       for (i = 0; i < e820->nr_map; i++) {
-               struct e820entry *ei = &e820->map[i];
+       for (i = 0; i < e820_table->nr_entries; i++) {
+               struct e820_entry *entry = &e820_table->entries[i];
 
-               if (pfn < PFN_UP(ei->addr))
-                       register_nosave_region(pfn, PFN_UP(ei->addr));
+               if (pfn < PFN_UP(entry->addr))
+                       register_nosave_region(pfn, PFN_UP(entry->addr));
 
-               pfn = PFN_DOWN(ei->addr + ei->size);
+               pfn = PFN_DOWN(entry->addr + entry->size);
 
-               if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
-                       register_nosave_region(PFN_UP(ei->addr), pfn);
+               if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
+                       register_nosave_region(PFN_UP(entry->addr), pfn);
 
                if (pfn >= limit_pfn)
                        break;
        }
 }
-#endif
 
 #ifdef CONFIG_ACPI
-/**
- * Mark ACPI NVS memory region, so that we can save/restore it during
- * hibernation and the subsequent resume.
+/*
+ * Register ACPI NVS memory regions, so that we can save/restore them during
+ * hibernation and the subsequent resume:
  */
-static int __init e820_mark_nvs_memory(void)
+static int __init e820__register_nvs_regions(void)
 {
        int i;
 
-       for (i = 0; i < e820->nr_map; i++) {
-               struct e820entry *ei = &e820->map[i];
+       for (i = 0; i < e820_table->nr_entries; i++) {
+               struct e820_entry *entry = &e820_table->entries[i];
 
-               if (ei->type == E820_NVS)
-                       acpi_nvs_register(ei->addr, ei->size);
+               if (entry->type == E820_TYPE_NVS)
+                       acpi_nvs_register(entry->addr, entry->size);
        }
 
        return 0;
 }
-core_initcall(e820_mark_nvs_memory);
+core_initcall(e820__register_nvs_regions);
 #endif
 
 /*
- * pre allocated 4k and reserved it in memblock and e820_saved
+ * Allocate the requested number of bytes with the requested alignment
+ * and return (the physical address) to the caller. Also register this
+ * range in the 'firmware' E820 table as a reserved range.
+ *
+ * This allows kexec to fake a new mptable, as if it came from the real
+ * system.
  */
-u64 __init early_reserve_e820(u64 size, u64 align)
+u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
 {
        u64 addr;
 
        addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
        if (addr) {
-               e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED);
-               printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n");
-               update_e820_saved();
+               e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
+               pr_info("e820: update e820_table_firmware for e820__memblock_alloc_reserved()\n");
+               e820__update_table_firmware();
        }
 
        return addr;
@@ -779,22 +760,22 @@ u64 __init early_reserve_e820(u64 size, u64 align)
 /*
  * Find the highest page frame number we have available
  */
-static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type)
 {
        int i;
        unsigned long last_pfn = 0;
        unsigned long max_arch_pfn = MAX_ARCH_PFN;
 
-       for (i = 0; i < e820->nr_map; i++) {
-               struct e820entry *ei = &e820->map[i];
+       for (i = 0; i < e820_table->nr_entries; i++) {
+               struct e820_entry *entry = &e820_table->entries[i];
                unsigned long start_pfn;
                unsigned long end_pfn;
 
-               if (ei->type != type)
+               if (entry->type != type)
                        continue;
 
-               start_pfn = ei->addr >> PAGE_SHIFT;
-               end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
+               start_pfn = entry->addr >> PAGE_SHIFT;
+               end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT;
 
                if (start_pfn >= limit_pfn)
                        continue;
@@ -809,18 +790,19 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
        if (last_pfn > max_arch_pfn)
                last_pfn = max_arch_pfn;
 
-       printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
+       pr_info("e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
                         last_pfn, max_arch_pfn);
        return last_pfn;
 }
-unsigned long __init e820_end_of_ram_pfn(void)
+
+unsigned long __init e820__end_of_ram_pfn(void)
 {
-       return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
+       return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM);
 }
 
-unsigned long __init e820_end_of_low_ram_pfn(void)
+unsigned long __init e820__end_of_low_ram_pfn(void)
 {
-       return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_RAM);
+       return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM);
 }
 
 static void __init early_panic(char *msg)
@@ -831,7 +813,7 @@ static void __init early_panic(char *msg)
 
 static int userdef __initdata;
 
-/* "mem=nopentium" disables the 4MB page tables. */
+/* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */
 static int __init parse_memopt(char *p)
 {
        u64 mem_size;
@@ -844,17 +826,19 @@ static int __init parse_memopt(char *p)
                setup_clear_cpu_cap(X86_FEATURE_PSE);
                return 0;
 #else
-               printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n");
+               pr_warn("mem=nopentium ignored! (only supported on x86_32)\n");
                return -EINVAL;
 #endif
        }
 
        userdef = 1;
        mem_size = memparse(p, &p);
-       /* don't remove all of memory when handling "mem={invalid}" param */
+
+       /* Don't remove all memory when getting "mem={invalid}" parameter: */
        if (mem_size == 0)
                return -EINVAL;
-       e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
+
+       e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
 
        return 0;
 }
@@ -872,12 +856,12 @@ static int __init parse_memmap_one(char *p)
 #ifdef CONFIG_CRASH_DUMP
                /*
                 * If we are doing a crash dump, we still need to know
-                * the real mem size before original memory map is
+                * the real memory size before the original memory map is
                 * reset.
                 */
-               saved_max_pfn = e820_end_of_ram_pfn();
+               saved_max_pfn = e820__end_of_ram_pfn();
 #endif
-               e820->nr_map = 0;
+               e820_table->nr_entries = 0;
                userdef = 1;
                return 0;
        }
@@ -890,21 +874,23 @@ static int __init parse_memmap_one(char *p)
        userdef = 1;
        if (*p == '@') {
                start_at = memparse(p+1, &p);
-               e820_add_region(start_at, mem_size, E820_RAM);
+               e820__range_add(start_at, mem_size, E820_TYPE_RAM);
        } else if (*p == '#') {
                start_at = memparse(p+1, &p);
-               e820_add_region(start_at, mem_size, E820_ACPI);
+               e820__range_add(start_at, mem_size, E820_TYPE_ACPI);
        } else if (*p == '$') {
                start_at = memparse(p+1, &p);
-               e820_add_region(start_at, mem_size, E820_RESERVED);
+               e820__range_add(start_at, mem_size, E820_TYPE_RESERVED);
        } else if (*p == '!') {
                start_at = memparse(p+1, &p);
-               e820_add_region(start_at, mem_size, E820_PRAM);
-       } else
-               e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
+               e820__range_add(start_at, mem_size, E820_TYPE_PRAM);
+       } else {
+               e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
+       }
 
        return *p == '\0' ? 0 : -EINVAL;
 }
+
 static int __init parse_memmap_opt(char *str)
 {
        while (str) {
@@ -921,68 +907,97 @@ static int __init parse_memmap_opt(char *str)
 }
 early_param("memmap", parse_memmap_opt);
 
-void __init finish_e820_parsing(void)
+/*
+ * Reserve all entries from the bootloader's extensible data nodes list,
+ * because if present we are going to use it later on to fetch e820
+ * entries from it:
+ */
+void __init e820__reserve_setup_data(void)
+{
+       struct setup_data *data;
+       u64 pa_data;
+
+       pa_data = boot_params.hdr.setup_data;
+       if (!pa_data)
+               return;
+
+       while (pa_data) {
+               data = early_memremap(pa_data, sizeof(*data));
+               e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
+               pa_data = data->next;
+               early_memunmap(data, sizeof(*data));
+       }
+
+       e820__update_table(e820_table);
+
+       memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
+
+       pr_info("extended physical RAM map:\n");
+       e820__print_table("reserve setup_data");
+}
+
+/*
+ * Called after parse_early_param(), after early parameters (such as mem=)
+ * have been processed, in which case we already have an E820 table filled in
+ * via the parameter callback function(s), but it's not sorted and printed yet:
+ */
+void __init e820__finish_early_params(void)
 {
        if (userdef) {
-               if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map),
-                                       &e820->nr_map) < 0)
+               if (e820__update_table(e820_table) < 0)
                        early_panic("Invalid user supplied memory map");
 
-               printk(KERN_INFO "e820: user-defined physical RAM map:\n");
-               e820_print_map("user");
+               pr_info("e820: user-defined physical RAM map:\n");
+               e820__print_table("user");
        }
 }
 
-static const char *__init e820_type_to_string(int e820_type)
+static const char *__init e820_type_to_string(struct e820_entry *entry)
 {
-       switch (e820_type) {
-       case E820_RESERVED_KERN:
-       case E820_RAM:  return "System RAM";
-       case E820_ACPI: return "ACPI Tables";
-       case E820_NVS:  return "ACPI Non-volatile Storage";
-       case E820_UNUSABLE:     return "Unusable memory";
-       case E820_PRAM: return "Persistent Memory (legacy)";
-       case E820_PMEM: return "Persistent Memory";
-       default:        return "reserved";
+       switch (entry->type) {
+       case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
+       case E820_TYPE_RAM:             return "System RAM";
+       case E820_TYPE_ACPI:            return "ACPI Tables";
+       case E820_TYPE_NVS:             return "ACPI Non-volatile Storage";
+       case E820_TYPE_UNUSABLE:        return "Unusable memory";
+       case E820_TYPE_PRAM:            return "Persistent Memory (legacy)";
+       case E820_TYPE_PMEM:            return "Persistent Memory";
+       case E820_TYPE_RESERVED:        return "Reserved";
+       default:                        return "Unknown E820 type";
        }
 }
 
-static unsigned long __init e820_type_to_iomem_type(int e820_type)
+static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
 {
-       switch (e820_type) {
-       case E820_RESERVED_KERN:
-       case E820_RAM:
-               return IORESOURCE_SYSTEM_RAM;
-       case E820_ACPI:
-       case E820_NVS:
-       case E820_UNUSABLE:
-       case E820_PRAM:
-       case E820_PMEM:
-       default:
-               return IORESOURCE_MEM;
+       switch (entry->type) {
+       case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
+       case E820_TYPE_RAM:             return IORESOURCE_SYSTEM_RAM;
+       case E820_TYPE_ACPI:            /* Fall-through: */
+       case E820_TYPE_NVS:             /* Fall-through: */
+       case E820_TYPE_UNUSABLE:        /* Fall-through: */
+       case E820_TYPE_PRAM:            /* Fall-through: */
+       case E820_TYPE_PMEM:            /* Fall-through: */
+       case E820_TYPE_RESERVED:        /* Fall-through: */
+       default:                        return IORESOURCE_MEM;
        }
 }
 
-static unsigned long __init e820_type_to_iores_desc(int e820_type)
+static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
 {
-       switch (e820_type) {
-       case E820_ACPI:
-               return IORES_DESC_ACPI_TABLES;
-       case E820_NVS:
-               return IORES_DESC_ACPI_NV_STORAGE;
-       case E820_PMEM:
-               return IORES_DESC_PERSISTENT_MEMORY;
-       case E820_PRAM:
-               return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
-       case E820_RESERVED_KERN:
-       case E820_RAM:
-       case E820_UNUSABLE:
-       default:
-               return IORES_DESC_NONE;
+       switch (entry->type) {
+       case E820_TYPE_ACPI:            return IORES_DESC_ACPI_TABLES;
+       case E820_TYPE_NVS:             return IORES_DESC_ACPI_NV_STORAGE;
+       case E820_TYPE_PMEM:            return IORES_DESC_PERSISTENT_MEMORY;
+       case E820_TYPE_PRAM:            return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
+       case E820_TYPE_RESERVED_KERN:   /* Fall-through: */
+       case E820_TYPE_RAM:             /* Fall-through: */
+       case E820_TYPE_UNUSABLE:        /* Fall-through: */
+       case E820_TYPE_RESERVED:        /* Fall-through: */
+       default:                        return IORES_DESC_NONE;
        }
 }
 
-static bool __init do_mark_busy(u32 type, struct resource *res)
+static bool __init do_mark_busy(enum e820_type type, struct resource *res)
 {
        /* this is the legacy bios/dos rom-shadow + mmio region */
        if (res->start < (1ULL<<20))
@@ -993,61 +1008,71 @@ static bool __init do_mark_busy(u32 type, struct resource *res)
         * for exclusive use of a driver
         */
        switch (type) {
-       case E820_RESERVED:
-       case E820_PRAM:
-       case E820_PMEM:
+       case E820_TYPE_RESERVED:
+       case E820_TYPE_PRAM:
+       case E820_TYPE_PMEM:
                return false;
+       case E820_TYPE_RESERVED_KERN:
+       case E820_TYPE_RAM:
+       case E820_TYPE_ACPI:
+       case E820_TYPE_NVS:
+       case E820_TYPE_UNUSABLE:
        default:
                return true;
        }
 }
 
 /*
- * Mark e820 reserved areas as busy for the resource manager.
+ * Mark E820 reserved areas as busy for the resource manager:
  */
+
 static struct resource __initdata *e820_res;
-void __init e820_reserve_resources(void)
+
+void __init e820__reserve_resources(void)
 {
        int i;
        struct resource *res;
        u64 end;
 
-       res = alloc_bootmem(sizeof(struct resource) * e820->nr_map);
+       res = alloc_bootmem(sizeof(*res) * e820_table->nr_entries);
        e820_res = res;
-       for (i = 0; i < e820->nr_map; i++) {
-               end = e820->map[i].addr + e820->map[i].size - 1;
+
+       for (i = 0; i < e820_table->nr_entries; i++) {
+               struct e820_entry *entry = e820_table->entries + i;
+
+               end = entry->addr + entry->size - 1;
                if (end != (resource_size_t)end) {
                        res++;
                        continue;
                }
-               res->name = e820_type_to_string(e820->map[i].type);
-               res->start = e820->map[i].addr;
-               res->end = end;
-
-               res->flags = e820_type_to_iomem_type(e820->map[i].type);
-               res->desc = e820_type_to_iores_desc(e820->map[i].type);
+               res->start = entry->addr;
+               res->end   = end;
+               res->name  = e820_type_to_string(entry);
+               res->flags = e820_type_to_iomem_type(entry);
+               res->desc  = e820_type_to_iores_desc(entry);
 
                /*
-                * don't register the region that could be conflicted with
-                * pci device BAR resource and insert them later in
-                * pcibios_resource_survey()
+                * Don't register the region that could be conflicted with
+                * PCI device BAR resources and insert them later in
+                * pcibios_resource_survey():
                 */
-               if (do_mark_busy(e820->map[i].type, res)) {
+               if (do_mark_busy(entry->type, res)) {
                        res->flags |= IORESOURCE_BUSY;
                        insert_resource(&iomem_resource, res);
                }
                res++;
        }
 
-       for (i = 0; i < e820_saved->nr_map; i++) {
-               struct e820entry *entry = &e820_saved->map[i];
-               firmware_map_add_early(entry->addr,
-                       entry->addr + entry->size,
-                       e820_type_to_string(entry->type));
+       for (i = 0; i < e820_table_firmware->nr_entries; i++) {
+               struct e820_entry *entry = e820_table_firmware->entries + i;
+
+               firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry));
        }
 }
 
-/* How much should we pad RAM ending depending on where it is? */
+/*
+ * How much should we pad the end of RAM, depending on where it is?
+ */
 static unsigned long __init ram_alignment(resource_size_t pos)
 {
        unsigned long mb = pos >> 20;
@@ -1066,64 +1091,59 @@ static unsigned long __init ram_alignment(resource_size_t pos)
 
 #define MAX_RESOURCE_SIZE ((resource_size_t)-1)
 
-void __init e820_reserve_resources_late(void)
+void __init e820__reserve_resources_late(void)
 {
        int i;
        struct resource *res;
 
        res = e820_res;
-       for (i = 0; i < e820->nr_map; i++) {
+       for (i = 0; i < e820_table->nr_entries; i++) {
                if (!res->parent && res->end)
                        insert_resource_expand_to_fit(&iomem_resource, res);
                res++;
        }
 
        /*
-        * Try to bump up RAM regions to reasonable boundaries to
+        * Try to bump up RAM regions to reasonable boundaries, to
         * avoid stolen RAM:
         */
-       for (i = 0; i < e820->nr_map; i++) {
-               struct e820entry *entry = &e820->map[i];
+       for (i = 0; i < e820_table->nr_entries; i++) {
+               struct e820_entry *entry = &e820_table->entries[i];
                u64 start, end;
 
-               if (entry->type != E820_RAM)
+               if (entry->type != E820_TYPE_RAM)
                        continue;
+
                start = entry->addr + entry->size;
                end = round_up(start, ram_alignment(start)) - 1;
                if (end > MAX_RESOURCE_SIZE)
                        end = MAX_RESOURCE_SIZE;
                if (start >= end)
                        continue;
-               printk(KERN_DEBUG
-                      "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n",
-                      start, end);
-               reserve_region_with_split(&iomem_resource, start, end,
-                                         "RAM buffer");
+
+               printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end);
+               reserve_region_with_split(&iomem_resource, start, end, "RAM buffer");
        }
 }
 
-char *__init default_machine_specific_memory_setup(void)
+/*
+ * Pass the firmware (bootloader) E820 map to the kernel and process it:
+ */
+char *__init e820__memory_setup_default(void)
 {
        char *who = "BIOS-e820";
-       u32 new_nr;
+
        /*
         * Try to copy the BIOS-supplied E820-map.
         *
         * Otherwise fake a memory map; one section from 0k->640k,
         * the next section from 1mb->appropriate_mem_k
         */
-       new_nr = boot_params.e820_entries;
-       sanitize_e820_map(boot_params.e820_map,
-                       ARRAY_SIZE(boot_params.e820_map),
-                       &new_nr);
-       boot_params.e820_entries = new_nr;
-       if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
-         < 0) {
+       if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) < 0) {
                u64 mem_size;
 
-               /* compare results from other methods and take the greater */
-               if (boot_params.alt_mem_k
-                   < boot_params.screen_info.ext_mem_k) {
+               /* Compare results from other methods and take the one that gives more RAM: */
+               if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
                        mem_size = boot_params.screen_info.ext_mem_k;
                        who = "BIOS-88";
                } else {
@@ -1131,84 +1151,68 @@ char *__init default_machine_specific_memory_setup(void)
                        who = "BIOS-e801";
                }
 
-               e820->nr_map = 0;
-               e820_add_region(0, LOWMEMSIZE(), E820_RAM);
-               e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
+               e820_table->nr_entries = 0;
+               e820__range_add(0, LOWMEMSIZE(), E820_TYPE_RAM);
+               e820__range_add(HIGH_MEMORY, mem_size << 10, E820_TYPE_RAM);
        }
 
-       /* In case someone cares... */
+       /* We just appended a lot of ranges, sanitize the table: */
+       e820__update_table(e820_table);
+
        return who;
 }
 
-void __init setup_memory_map(void)
+/*
+ * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader
+ * E820 map - with an optional platform quirk available for virtual platforms
+ * to override this method of boot environment processing:
+ */
+void __init e820__memory_setup(void)
 {
        char *who;
 
+       /* This is a firmware interface ABI - make sure we don't break it: */
+       BUILD_BUG_ON(sizeof(struct boot_e820_entry) != 20);
+
        who = x86_init.resources.memory_setup();
-       memcpy(e820_saved, e820, sizeof(struct e820map));
-       printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
-       e820_print_map(who);
+
+       memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
+
+       pr_info("e820: BIOS-provided physical RAM map:\n");
+       e820__print_table(who);
 }
 
-void __init memblock_x86_fill(void)
+void __init e820__memblock_setup(void)
 {
        int i;
        u64 end;
 
        /*
-        * EFI may have more than 128 entries
-        * We are safe to enable resizing, beause memblock_x86_fill()
-        * is rather later for x86
+        * The bootstrap memblock region count maximum is 128 entries
+        * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
+        * than that - so allow memblock resizing.
+        *
+        * This is safe, because this call happens pretty late during x86 setup,
+        * so we know about reserved memory regions already. (This is important
+        * so that memblock resizing does not stomp over reserved areas.)
         */
        memblock_allow_resize();
 
-       for (i = 0; i < e820->nr_map; i++) {
-               struct e820entry *ei = &e820->map[i];
+       for (i = 0; i < e820_table->nr_entries; i++) {
+               struct e820_entry *entry = &e820_table->entries[i];
 
-               end = ei->addr + ei->size;
+               end = entry->addr + entry->size;
                if (end != (resource_size_t)end)
                        continue;
 
-               if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
+               if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
                        continue;
 
-               memblock_add(ei->addr, ei->size);
+               memblock_add(entry->addr, entry->size);
        }
 
-       /* throw away partial pages */
+       /* Throw away partial pages: */
        memblock_trim_memory(PAGE_SIZE);
 
        memblock_dump_all();
 }
-
-void __init memblock_find_dma_reserve(void)
-{
-#ifdef CONFIG_X86_64
-       u64 nr_pages = 0, nr_free_pages = 0;
-       unsigned long start_pfn, end_pfn;
-       phys_addr_t start, end;
-       int i;
-       u64 u;
-
-       /*
-        * need to find out used area below MAX_DMA_PFN
-        * need to use memblock to get free size in [0, MAX_DMA_PFN]
-        * at first, and assume boot_mem will not take below MAX_DMA_PFN
-        */
-       for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
-               start_pfn = min(start_pfn, MAX_DMA_PFN);
-               end_pfn = min(end_pfn, MAX_DMA_PFN);
-               nr_pages += end_pfn - start_pfn;
-       }
-
-       for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
-                               NULL) {
-               start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
-               end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
-               if (start_pfn < end_pfn)
-                       nr_free_pages += end_pfn - start_pfn;
-       }
-
-       set_dma_reserve(nr_pages - nr_free_pages);
-#endif
-}
index 6a08e25..ff7e4b3 100644 (file)
@@ -546,8 +546,8 @@ intel_graphics_stolen(int num, int slot, int func,
               &base, &end);
 
        /* Mark this space as reserved */
-       e820_add_region(base, size, E820_RESERVED);
-       sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
+       e820__range_add(base, size, E820_TYPE_RESERVED);
+       e820__update_table(e820_table);
 }
 
 static void __init intel_graphics_quirks(int num, int slot, int func)
index cbd73eb..5b71535 100644 (file)
 #include <asm/ftrace.h>
 #include <asm/nops.h>
 
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) && \
-       !defined(CC_USING_FENTRY) && \
-       !defined(CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE)
-# error The following combination is not supported: ((compiler missing -mfentry) || (CONFIG_X86_32 and !CONFIG_DYNAMIC_FTRACE)) && CONFIG_FUNCTION_GRAPH_TRACER && CONFIG_CC_OPTIMIZE_FOR_SIZE
-#endif
-
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 int ftrace_arch_code_modify_prepare(void)
@@ -989,6 +983,18 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
        unsigned long return_hooker = (unsigned long)
                                &return_to_handler;
 
+       /*
+        * When resuming from suspend-to-ram, this function can be indirectly
+        * called from early CPU startup code while the CPU is in real mode,
+        * which would fail miserably.  Make sure the stack pointer is a
+        * virtual address.
+        *
+        * This check isn't as accurate as virt_addr_valid(), but it should be
+        * good enough for this purpose, and it's fast.
+        */
+       if (unlikely((long)__builtin_frame_address(0) >= 0))
+               return;
+
        if (unlikely(ftrace_graph_is_dead()))
                return;
 
index e5fb436..538ec01 100644 (file)
@@ -12,7 +12,7 @@
 
 #include <asm/setup.h>
 #include <asm/sections.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/page.h>
 #include <asm/apic.h>
 #include <asm/io_apic.h>
index b5785c1..43b7002 100644 (file)
@@ -24,7 +24,7 @@
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
 #include <asm/kdebug.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/bios_ebda.h>
 #include <asm/bootparam_utils.h>
 #include <asm/microcode.h>
index b467b14..ac9d327 100644 (file)
@@ -269,10 +269,8 @@ ENTRY(secondary_startup_64)
        /* rsi is pointer to real mode structure with interesting info.
           pass it to C */
        movq    %rsi, %rdi
-       jmp     start_cpu
-ENDPROC(secondary_startup_64)
 
-ENTRY(start_cpu)
+.Ljump_to_C_code:
        /*
         * Jump to run C code and to be on a real kernel address.
         * Since we are running on identity-mapped space we have to jump
@@ -305,7 +303,7 @@ ENTRY(start_cpu)
        pushq   %rax            # target address in negative space
        lretq
 .Lafter_lret:
-ENDPROC(start_cpu)
+ENDPROC(secondary_startup_64)
 
 #include "verify_cpu.S"
 
@@ -313,11 +311,11 @@ ENDPROC(start_cpu)
 /*
  * Boot CPU0 entry point. It's called from play_dead(). Everything has been set
  * up already except stack. We just set up stack here. Then call
- * start_secondary() via start_cpu().
+ * start_secondary() via .Ljump_to_C_code.
  */
 ENTRY(start_cpu0)
        movq    initial_stack(%rip), %rsp
-       jmp     start_cpu
+       jmp     .Ljump_to_C_code
 ENDPROC(start_cpu0)
 #endif
 
index d0a814a..9d7fd5e 100644 (file)
@@ -25,6 +25,7 @@
 #include <asm/setup.h>
 #include <asm/crash.h>
 #include <asm/efi.h>
+#include <asm/e820/api.h>
 #include <asm/kexec-bzimage64.h>
 
 #define MAX_ELFCOREHDR_STR_LEN 30      /* elfcorehdr=0x<64bit-value> */
@@ -99,15 +100,14 @@ static int setup_e820_entries(struct boot_params *params)
 {
        unsigned int nr_e820_entries;
 
-       nr_e820_entries = e820_saved->nr_map;
+       nr_e820_entries = e820_table_firmware->nr_entries;
 
-       /* TODO: Pass entries more than E820MAX in bootparams setup data */
-       if (nr_e820_entries > E820MAX)
-               nr_e820_entries = E820MAX;
+       /* TODO: Pass entries more than E820_MAX_ENTRIES_ZEROPAGE in bootparams setup data */
+       if (nr_e820_entries > E820_MAX_ENTRIES_ZEROPAGE)
+               nr_e820_entries = E820_MAX_ENTRIES_ZEROPAGE;
 
        params->e820_entries = nr_e820_entries;
-       memcpy(&params->e820_map, &e820_saved->map,
-              nr_e820_entries * sizeof(struct e820entry));
+       memcpy(&params->e820_table, &e820_table_firmware->entries, nr_e820_entries*sizeof(struct e820_entry));
 
        return 0;
 }
@@ -232,10 +232,10 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
        nr_e820_entries = params->e820_entries;
 
        for (i = 0; i < nr_e820_entries; i++) {
-               if (params->e820_map[i].type != E820_RAM)
+               if (params->e820_table[i].type != E820_TYPE_RAM)
                        continue;
-               start = params->e820_map[i].addr;
-               end = params->e820_map[i].addr + params->e820_map[i].size - 1;
+               start = params->e820_table[i].addr;
+               end = params->e820_table[i].addr + params->e820_table[i].size - 1;
 
                if ((start <= 0x100000) && end > 0x100000) {
                        mem_k = (end >> 10) - (0x100000 >> 10);
index d688826..db2182d 100644 (file)
@@ -67,7 +67,7 @@
 #endif
 
 /* Ensure if the instruction can be boostable */
-extern int can_boost(kprobe_opcode_t *instruction, void *addr);
+extern int can_boost(struct insn *insn, void *orig_addr);
 /* Recover instruction if given address is probed */
 extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
                                         unsigned long addr);
@@ -75,7 +75,7 @@ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
  * Copy an instruction and adjust the displacement if the instruction
  * uses the %rip-relative addressing mode.
  */
-extern int __copy_instruction(u8 *dest, u8 *src);
+extern int __copy_instruction(u8 *dest, u8 *src, struct insn *insn);
 
 /* Generate a relative-jump/call instruction */
 extern void synthesize_reljump(void *from, void *to);
index 993fa4f..19e1f2a 100644 (file)
@@ -164,42 +164,38 @@ static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn)
 NOKPROBE_SYMBOL(skip_prefixes);
 
 /*
- * Returns non-zero if opcode is boostable.
+ * Returns non-zero if INSN is boostable.
  * RIP relative instructions are adjusted at copying time in 64 bits mode
  */
-int can_boost(kprobe_opcode_t *opcodes, void *addr)
+int can_boost(struct insn *insn, void *addr)
 {
        kprobe_opcode_t opcode;
-       kprobe_opcode_t *orig_opcodes = opcodes;
 
        if (search_exception_tables((unsigned long)addr))
                return 0;       /* Page fault may occur on this address. */
 
-retry:
-       if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
-               return 0;
-       opcode = *(opcodes++);
-
        /* 2nd-byte opcode */
-       if (opcode == 0x0f) {
-               if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
-                       return 0;
-               return test_bit(*opcodes,
+       if (insn->opcode.nbytes == 2)
+               return test_bit(insn->opcode.bytes[1],
                                (unsigned long *)twobyte_is_boostable);
-       }
+
+       if (insn->opcode.nbytes != 1)
+               return 0;
+
+       /* Can't boost Address-size override prefix */
+       if (unlikely(inat_is_address_size_prefix(insn->attr)))
+               return 0;
+
+       opcode = insn->opcode.bytes[0];
 
        switch (opcode & 0xf0) {
-#ifdef CONFIG_X86_64
-       case 0x40:
-               goto retry; /* REX prefix is boostable */
-#endif
        case 0x60:
-               if (0x63 < opcode && opcode < 0x67)
-                       goto retry; /* prefixes */
-               /* can't boost Address-size override and bound */
-               return (opcode != 0x62 && opcode != 0x67);
+               /* can't boost "bound" */
+               return (opcode != 0x62);
        case 0x70:
                return 0; /* can't boost conditional jump */
+       case 0x90:
+               return opcode != 0x9a;  /* can't boost call far */
        case 0xc0:
                /* can't boost software-interruptions */
                return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
@@ -210,14 +206,9 @@ retry:
                /* can boost in/out and absolute jmps */
                return ((opcode & 0x04) || opcode == 0xea);
        case 0xf0:
-               if ((opcode & 0x0c) == 0 && opcode != 0xf1)
-                       goto retry; /* lock/rep(ne) prefix */
                /* clear and set flags are boostable */
                return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
        default:
-               /* segment override prefixes are boostable */
-               if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
-                       goto retry; /* prefixes */
                /* CS override prefix and call are not boostable */
                return (opcode != 0x2e && opcode != 0x9a);
        }
@@ -264,7 +255,10 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
         * Fortunately, we know that the original code is the ideal 5-byte
         * long NOP.
         */
-       memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+       if (probe_kernel_read(buf, (void *)addr,
+               MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
+               return 0UL;
+
        if (faddr)
                memcpy(buf, ideal_nops[NOP_ATOMIC5], 5);
        else
@@ -276,7 +270,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
  * Recover the probed instruction at addr for further analysis.
  * Caller must lock kprobes by kprobe_mutex, or disable preemption
  * for preventing to release referencing kprobes.
- * Returns zero if the instruction can not get recovered.
+ * Returns zero if the instruction cannot be recovered (or the access failed).
  */
 unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
 {
@@ -348,37 +342,36 @@ static int is_IF_modifier(kprobe_opcode_t *insn)
 }
 
 /*
- * Copy an instruction and adjust the displacement if the instruction
- * uses the %rip-relative addressing mode.
- * If it does, Return the address of the 32-bit displacement word.
- * If not, return null.
- * Only applicable to 64-bit x86.
+ * Copy an instruction, recovering it first if kprobes modified it, and
+ * adjust the displacement if the instruction uses the %rip-relative
+ * addressing mode.
+ * Returns the length of the copied instruction, or 0 on error.
  */
-int __copy_instruction(u8 *dest, u8 *src)
+int __copy_instruction(u8 *dest, u8 *src, struct insn *insn)
 {
-       struct insn insn;
        kprobe_opcode_t buf[MAX_INSN_SIZE];
-       int length;
        unsigned long recovered_insn =
                recover_probed_instruction(buf, (unsigned long)src);
 
-       if (!recovered_insn)
+       if (!recovered_insn || !insn)
+               return 0;
+
+       /* This can access kernel text if given address is not recovered */
+       if (probe_kernel_read(dest, (void *)recovered_insn, MAX_INSN_SIZE))
                return 0;
-       kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
-       insn_get_length(&insn);
-       length = insn.length;
+
+       kernel_insn_init(insn, dest, MAX_INSN_SIZE);
+       insn_get_length(insn);
 
        /* Another subsystem puts a breakpoint, failed to recover */
-       if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+       if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
                return 0;
-       memcpy(dest, insn.kaddr, length);
 
 #ifdef CONFIG_X86_64
-       if (insn_rip_relative(&insn)) {
+       /* Only x86_64 has RIP relative instructions */
+       if (insn_rip_relative(insn)) {
                s64 newdisp;
                u8 *disp;
-               kernel_insn_init(&insn, dest, length);
-               insn_get_displacement(&insn);
                /*
                 * The copied instruction uses the %rip-relative addressing
                 * mode.  Adjust the displacement for the difference between
@@ -391,36 +384,57 @@ int __copy_instruction(u8 *dest, u8 *src)
                 * extension of the original signed 32-bit displacement would
                 * have given.
                 */
-               newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest;
+               newdisp = (u8 *) src + (s64) insn->displacement.value
+                         - (u8 *) dest;
                if ((s64) (s32) newdisp != newdisp) {
                        pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
-                       pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", src, dest, insn.displacement.value);
+                       pr_err("\tSrc: %p, Dest: %p, old disp: %x\n",
+                               src, dest, insn->displacement.value);
                        return 0;
                }
-               disp = (u8 *) dest + insn_offset_displacement(&insn);
+               disp = (u8 *) dest + insn_offset_displacement(insn);
                *(s32 *) disp = (s32) newdisp;
        }
 #endif
-       return length;
+       return insn->length;
+}
+
+/* Prepare reljump right after instruction to boost */
+static void prepare_boost(struct kprobe *p, struct insn *insn)
+{
+       if (can_boost(insn, p->addr) &&
+           MAX_INSN_SIZE - insn->length >= RELATIVEJUMP_SIZE) {
+               /*
+                * These instructions can be executed directly as long as
+                * control then jumps back to the correct address.
+                */
+               synthesize_reljump(p->ainsn.insn + insn->length,
+                                  p->addr + insn->length);
+               p->ainsn.boostable = true;
+       } else {
+               p->ainsn.boostable = false;
+       }
 }
 
 static int arch_copy_kprobe(struct kprobe *p)
 {
-       int ret;
+       struct insn insn;
+       int len;
+
+       set_memory_rw((unsigned long)p->ainsn.insn & PAGE_MASK, 1);
 
        /* Copy an instruction with recovering if other optprobe modifies it.*/
-       ret = __copy_instruction(p->ainsn.insn, p->addr);
-       if (!ret)
+       len = __copy_instruction(p->ainsn.insn, p->addr, &insn);
+       if (!len)
                return -EINVAL;
 
        /*
         * __copy_instruction can modify the displacement of the instruction,
         * but it doesn't affect boostable check.
         */
-       if (can_boost(p->ainsn.insn, p->addr))
-               p->ainsn.boostable = 0;
-       else
-               p->ainsn.boostable = -1;
+       prepare_boost(p, &insn);
+
+       set_memory_ro((unsigned long)p->ainsn.insn & PAGE_MASK, 1);
 
        /* Check whether the instruction modifies Interrupt Flag or not */
        p->ainsn.if_modifier = is_IF_modifier(p->ainsn.insn);
@@ -459,7 +473,7 @@ void arch_disarm_kprobe(struct kprobe *p)
 void arch_remove_kprobe(struct kprobe *p)
 {
        if (p->ainsn.insn) {
-               free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
+               free_insn_slot(p->ainsn.insn, p->ainsn.boostable);
                p->ainsn.insn = NULL;
        }
 }
@@ -531,7 +545,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
                return;
 
 #if !defined(CONFIG_PREEMPT)
-       if (p->ainsn.boostable == 1 && !p->post_handler) {
+       if (p->ainsn.boostable && !p->post_handler) {
                /* Boost up -- we can execute copied instructions directly */
                if (!reenter)
                        reset_current_kprobe();
@@ -851,7 +865,7 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs,
        case 0xcf:
        case 0xea:      /* jmp absolute -- ip is correct */
                /* ip is already adjusted, no more changes required */
-               p->ainsn.boostable = 1;
+               p->ainsn.boostable = true;
                goto no_change;
        case 0xe8:      /* call relative - Fix return addr */
                *tos = orig_ip + (*tos - copy_ip);
@@ -876,28 +890,13 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs,
                         * jmp near and far, absolute indirect
                         * ip is correct. And this is boostable
                         */
-                       p->ainsn.boostable = 1;
+                       p->ainsn.boostable = true;
                        goto no_change;
                }
        default:
                break;
        }
 
-       if (p->ainsn.boostable == 0) {
-               if ((regs->ip > copy_ip) &&
-                   (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) {
-                       /*
-                        * These instructions can be executed directly if it
-                        * jumps back to correct address.
-                        */
-                       synthesize_reljump((void *)regs->ip,
-                               (void *)orig_ip + (regs->ip - copy_ip));
-                       p->ainsn.boostable = 1;
-               } else {
-                       p->ainsn.boostable = -1;
-               }
-       }
-
        regs->ip += orig_ip - copy_ip;
 
 no_change:
index 5f8f0b3..041f7b6 100644 (file)
@@ -94,6 +94,6 @@ NOKPROBE_SYMBOL(kprobe_ftrace_handler);
 int arch_prepare_kprobe_ftrace(struct kprobe *p)
 {
        p->ainsn.insn = NULL;
-       p->ainsn.boostable = -1;
+       p->ainsn.boostable = false;
        return 0;
 }
index 3e7c6e5..9aadff3 100644 (file)
@@ -65,7 +65,10 @@ found:
         * overwritten by jump destination address. In this case, original
         * bytes must be recovered from op->optinsn.copied_insn buffer.
         */
-       memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+       if (probe_kernel_read(buf, (void *)addr,
+               MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
+               return 0UL;
+
        if (addr == (unsigned long)kp->addr) {
                buf[0] = kp->opcode;
                memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
@@ -174,11 +177,12 @@ NOKPROBE_SYMBOL(optimized_callback);
 
 static int copy_optimized_instructions(u8 *dest, u8 *src)
 {
+       struct insn insn;
        int len = 0, ret;
 
        while (len < RELATIVEJUMP_SIZE) {
-               ret = __copy_instruction(dest + len, src + len);
-               if (!ret || !can_boost(dest + len, src + len))
+               ret = __copy_instruction(dest + len, src + len, &insn);
+               if (!ret || !can_boost(&insn, src + len))
                        return -EINVAL;
                len += ret;
        }
@@ -350,6 +354,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
        }
 
        buf = (u8 *)op->optinsn.insn;
+       set_memory_rw((unsigned long)buf & PAGE_MASK, 1);
 
        /* Copy instructions into the out-of-line buffer */
        ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
@@ -372,6 +377,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
        synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
                           (u8 *)op->kp.addr + op->optinsn.size);
 
+       set_memory_ro((unsigned long)buf & PAGE_MASK, 1);
+
        flush_icache_range((unsigned long) buf,
                           (unsigned long) buf + TMPL_END_IDX +
                           op->optinsn.size + RELATIVEJUMP_SIZE);
index 0f8d204..0d904d7 100644 (file)
@@ -26,7 +26,7 @@
 #include <asm/io_apic.h>
 #include <asm/proto.h>
 #include <asm/bios_ebda.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/setup.h>
 #include <asm/smp.h>
 
@@ -826,10 +826,10 @@ static int __init parse_alloc_mptable_opt(char *p)
 }
 early_param("alloc_mptable", parse_alloc_mptable_opt);
 
-void __init early_reserve_e820_mpc_new(void)
+void __init e820__memblock_alloc_reserved_mpc_new(void)
 {
        if (enable_update_mptable && alloc_mptable)
-               mpc_new_phys = early_reserve_e820(mpc_new_length, 4);
+               mpc_new_phys = e820__memblock_alloc_reserved(mpc_new_length, 4);
 }
 
 static int __init update_mp_table(void)
index a723ae9..446c8aa 100644 (file)
@@ -222,17 +222,6 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
        pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
                 reason, smp_processor_id());
 
-       /*
-        * On some machines, PCI SERR line is used to report memory
-        * errors. EDAC makes use of it.
-        */
-#if defined(CONFIG_EDAC)
-       if (edac_handler_set()) {
-               edac_atomic_assert_error();
-               return;
-       }
-#endif
-
        if (panic_on_unrecovered_nmi)
                nmi_panic(regs, "NMI: Not continuing");
 
index d5f15c3..963e3fb 100644 (file)
@@ -14,7 +14,7 @@
 
 #include <asm/probe_roms.h>
 #include <asm/pci-direct.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/mmzone.h>
 #include <asm/setup.h>
 #include <asm/sections.h>
index f675915..0bb8842 100644 (file)
@@ -37,6 +37,7 @@
 #include <asm/vm86.h>
 #include <asm/switch_to.h>
 #include <asm/desc.h>
+#include <asm/prctl.h>
 
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -124,11 +125,6 @@ void flush_thread(void)
        fpu__clear(&tsk->thread.fpu);
 }
 
-static void hard_disable_TSC(void)
-{
-       cr4_set_bits(X86_CR4_TSD);
-}
-
 void disable_TSC(void)
 {
        preempt_disable();
@@ -137,15 +133,10 @@ void disable_TSC(void)
                 * Must flip the CPU state synchronously with
                 * TIF_NOTSC in the current running context.
                 */
-               hard_disable_TSC();
+               cr4_set_bits(X86_CR4_TSD);
        preempt_enable();
 }
 
-static void hard_enable_TSC(void)
-{
-       cr4_clear_bits(X86_CR4_TSD);
-}
-
 static void enable_TSC(void)
 {
        preempt_disable();
@@ -154,7 +145,7 @@ static void enable_TSC(void)
                 * Must flip the CPU state synchronously with
                 * TIF_NOTSC in the current running context.
                 */
-               hard_enable_TSC();
+               cr4_clear_bits(X86_CR4_TSD);
        preempt_enable();
 }
 
@@ -182,54 +173,129 @@ int set_tsc_mode(unsigned int val)
        return 0;
 }
 
-void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
-                     struct tss_struct *tss)
-{
-       struct thread_struct *prev, *next;
-
-       prev = &prev_p->thread;
-       next = &next_p->thread;
+DEFINE_PER_CPU(u64, msr_misc_features_shadow);
 
-       if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
-           test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
-               unsigned long debugctl = get_debugctlmsr();
+static void set_cpuid_faulting(bool on)
+{
+       u64 msrval;
 
-               debugctl &= ~DEBUGCTLMSR_BTF;
-               if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
-                       debugctl |= DEBUGCTLMSR_BTF;
+       msrval = this_cpu_read(msr_misc_features_shadow);
+       msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
+       msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
+       this_cpu_write(msr_misc_features_shadow, msrval);
+       wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval);
+}
 
-               update_debugctlmsr(debugctl);
+static void disable_cpuid(void)
+{
+       preempt_disable();
+       if (!test_and_set_thread_flag(TIF_NOCPUID)) {
+               /*
+                * Must flip the CPU state synchronously with
+                * TIF_NOCPUID in the current running context.
+                */
+               set_cpuid_faulting(true);
        }
+       preempt_enable();
+}
 
-       if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
-           test_tsk_thread_flag(next_p, TIF_NOTSC)) {
-               /* prev and next are different */
-               if (test_tsk_thread_flag(next_p, TIF_NOTSC))
-                       hard_disable_TSC();
-               else
-                       hard_enable_TSC();
+static void enable_cpuid(void)
+{
+       preempt_disable();
+       if (test_and_clear_thread_flag(TIF_NOCPUID)) {
+               /*
+                * Must flip the CPU state synchronously with
+                * TIF_NOCPUID in the current running context.
+                */
+               set_cpuid_faulting(false);
        }
+       preempt_enable();
+}
+
+static int get_cpuid_mode(void)
+{
+       return !test_thread_flag(TIF_NOCPUID);
+}
+
+static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled)
+{
+       if (!static_cpu_has(X86_FEATURE_CPUID_FAULT))
+               return -ENODEV;
+
+       if (cpuid_enabled)
+               enable_cpuid();
+       else
+               disable_cpuid();
+
+       return 0;
+}
+
+/*
+ * Called immediately after a successful exec.
+ */
+void arch_setup_new_exec(void)
+{
+       /* If cpuid was previously disabled for this task, re-enable it. */
+       if (test_thread_flag(TIF_NOCPUID))
+               enable_cpuid();
+}
 
-       if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
+static inline void switch_to_bitmap(struct tss_struct *tss,
+                                   struct thread_struct *prev,
+                                   struct thread_struct *next,
+                                   unsigned long tifp, unsigned long tifn)
+{
+       if (tifn & _TIF_IO_BITMAP) {
                /*
                 * Copy the relevant range of the IO bitmap.
                 * Normally this is 128 bytes or less:
                 */
                memcpy(tss->io_bitmap, next->io_bitmap_ptr,
                       max(prev->io_bitmap_max, next->io_bitmap_max));
-
                /*
                 * Make sure that the TSS limit is correct for the CPU
                 * to notice the IO bitmap.
                 */
                refresh_tss_limit();
-       } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
+       } else if (tifp & _TIF_IO_BITMAP) {
                /*
                 * Clear any possible leftover bits:
                 */
                memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
        }
+}
+
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
+                     struct tss_struct *tss)
+{
+       struct thread_struct *prev, *next;
+       unsigned long tifp, tifn;
+
+       prev = &prev_p->thread;
+       next = &next_p->thread;
+
+       tifn = READ_ONCE(task_thread_info(next_p)->flags);
+       tifp = READ_ONCE(task_thread_info(prev_p)->flags);
+       switch_to_bitmap(tss, prev, next, tifp, tifn);
+
        propagate_user_return_notify(prev_p, next_p);
+
+       if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) &&
+           arch_has_block_step()) {
+               unsigned long debugctl, msk;
+
+               rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+               debugctl &= ~DEBUGCTLMSR_BTF;
+               msk = tifn & _TIF_BLOCKSTEP;
+               debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT;
+               wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+       }
+
+       if ((tifp ^ tifn) & _TIF_NOTSC)
+               cr4_toggle_bits(X86_CR4_TSD);
+
+       if ((tifp ^ tifn) & _TIF_NOCPUID)
+               set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
 }
 
 /*
@@ -550,3 +616,16 @@ out:
        put_task_stack(p);
        return ret;
 }
+
+long do_arch_prctl_common(struct task_struct *task, int option,
+                         unsigned long cpuid_enabled)
+{
+       switch (option) {
+       case ARCH_GET_CPUID:
+               return get_cpuid_mode();
+       case ARCH_SET_CPUID:
+               return set_cpuid_mode(task, cpuid_enabled);
+       }
+
+       return -EINVAL;
+}
index 4c818f8..ff40e74 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/kdebug.h>
+#include <linux/syscalls.h>
 
 #include <asm/pgtable.h>
 #include <asm/ldt.h>
@@ -56,6 +57,7 @@
 #include <asm/switch_to.h>
 #include <asm/vm86.h>
 #include <asm/intel_rdt.h>
+#include <asm/proto.h>
 
 void __show_regs(struct pt_regs *regs, int all)
 {
@@ -304,3 +306,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
        return prev_p;
 }
+
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+       return do_arch_prctl_common(current, option, arg2);
+}
index d6b784a..ea1a618 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/ftrace.h>
+#include <linux/syscalls.h>
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
@@ -204,7 +205,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
                                (struct user_desc __user *)tls, 0);
                else
 #endif
-                       err = do_arch_prctl(p, ARCH_SET_FS, tls);
+                       err = do_arch_prctl_64(p, ARCH_SET_FS, tls);
                if (err)
                        goto out;
        }
@@ -547,70 +548,72 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
 }
 #endif
 
-long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
+long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
 {
        int ret = 0;
        int doit = task == current;
        int cpu;
 
-       switch (code) {
+       switch (option) {
        case ARCH_SET_GS:
-               if (addr >= TASK_SIZE_MAX)
+               if (arg2 >= TASK_SIZE_MAX)
                        return -EPERM;
                cpu = get_cpu();
                task->thread.gsindex = 0;
-               task->thread.gsbase = addr;
+               task->thread.gsbase = arg2;
                if (doit) {
                        load_gs_index(0);
-                       ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
+                       ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2);
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
-               if (addr >= TASK_SIZE_MAX)
+               if (arg2 >= TASK_SIZE_MAX)
                        return -EPERM;
                cpu = get_cpu();
                task->thread.fsindex = 0;
-               task->thread.fsbase = addr;
+               task->thread.fsbase = arg2;
                if (doit) {
                        /* set the selector to 0 to not confuse __switch_to */
                        loadsegment(fs, 0);
-                       ret = wrmsrl_safe(MSR_FS_BASE, addr);
+                       ret = wrmsrl_safe(MSR_FS_BASE, arg2);
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
+
                if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fsbase;
-               ret = put_user(base, (unsigned long __user *)addr);
+               ret = put_user(base, (unsigned long __user *)arg2);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
+
                if (doit)
                        rdmsrl(MSR_KERNEL_GS_BASE, base);
                else
                        base = task->thread.gsbase;
-               ret = put_user(base, (unsigned long __user *)addr);
+               ret = put_user(base, (unsigned long __user *)arg2);
                break;
        }
 
 #ifdef CONFIG_CHECKPOINT_RESTORE
 # ifdef CONFIG_X86_X32_ABI
        case ARCH_MAP_VDSO_X32:
-               return prctl_map_vdso(&vdso_image_x32, addr);
+               return prctl_map_vdso(&vdso_image_x32, arg2);
 # endif
 # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
        case ARCH_MAP_VDSO_32:
-               return prctl_map_vdso(&vdso_image_32, addr);
+               return prctl_map_vdso(&vdso_image_32, arg2);
 # endif
        case ARCH_MAP_VDSO_64:
-               return prctl_map_vdso(&vdso_image_64, addr);
+               return prctl_map_vdso(&vdso_image_64, arg2);
 #endif
 
        default:
@@ -621,10 +624,23 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
        return ret;
 }
 
-long sys_arch_prctl(int code, unsigned long addr)
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+       long ret;
+
+       ret = do_arch_prctl_64(current, option, arg2);
+       if (ret == -EINVAL)
+               ret = do_arch_prctl_common(current, option, arg2);
+
+       return ret;
+}
+
+#ifdef CONFIG_IA32_EMULATION
+COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
 {
-       return do_arch_prctl(current, code, addr);
+       return do_arch_prctl_common(current, option, arg2);
 }
+#endif
 
 unsigned long KSTK_ESP(struct task_struct *task)
 {
index 2364b23..f37d181 100644 (file)
@@ -396,12 +396,12 @@ static int putreg(struct task_struct *child,
                if (value >= TASK_SIZE_MAX)
                        return -EIO;
                /*
-                * When changing the segment base, use do_arch_prctl
+                * When changing the segment base, use do_arch_prctl_64
                 * to set either thread.fs or thread.fsindex and the
                 * corresponding GDT slot.
                 */
                if (child->thread.fsbase != value)
-                       return do_arch_prctl(child, ARCH_SET_FS, value);
+                       return do_arch_prctl_64(child, ARCH_SET_FS, value);
                return 0;
        case offsetof(struct user_regs_struct,gs_base):
                /*
@@ -410,7 +410,7 @@ static int putreg(struct task_struct *child,
                if (value >= TASK_SIZE_MAX)
                        return -EIO;
                if (child->thread.gsbase != value)
-                       return do_arch_prctl(child, ARCH_SET_GS, value);
+                       return do_arch_prctl_64(child, ARCH_SET_GS, value);
                return 0;
 #endif
        }
@@ -869,7 +869,7 @@ long arch_ptrace(struct task_struct *child, long request,
                   Works just like arch_prctl, except that the arguments
                   are reversed. */
        case PTRACE_ARCH_PRCTL:
-               ret = do_arch_prctl(child, data, addr);
+               ret = do_arch_prctl_64(child, data, addr);
                break;
 #endif
 
index 067f981..2544700 100644 (file)
@@ -765,10 +765,11 @@ void machine_crash_shutdown(struct pt_regs *regs)
 #endif
 
 
+/* This is the CPU performing the emergency shutdown work. */
+int crashing_cpu = -1;
+
 #if defined(CONFIG_SMP)
 
-/* This keeps a track of which one is crashing cpu. */
-static int crashing_cpu;
 static nmi_shootdown_cb shootdown_callback;
 
 static atomic_t waiting_for_crash_ipi;
index 2408c16..5ab3895 100644 (file)
@@ -1,5 +1,5 @@
 #include <linux/ioport.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 
 static void resource_clip(struct resource *res, resource_size_t start,
                          resource_size_t end)
@@ -25,10 +25,10 @@ static void resource_clip(struct resource *res, resource_size_t start,
 static void remove_e820_regions(struct resource *avail)
 {
        int i;
-       struct e820entry *entry;
+       struct e820_entry *entry;
 
-       for (i = 0; i < e820->nr_map; i++) {
-               entry = &e820->map[i];
+       for (i = 0; i < e820_table->nr_entries; i++) {
+               entry = &e820_table->entries[i];
 
                resource_clip(avail, entry->addr,
                              entry->addr + entry->size - 1);
index 7cd7bbe..62a1c74 100644 (file)
@@ -75,7 +75,7 @@
 #include <asm/mtrr.h>
 #include <asm/apic.h>
 #include <asm/realmode.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/mpspec.h>
 #include <asm/setup.h>
 #include <asm/efi.h>
  * max_low_pfn_mapped: highest direct mapped pfn under 4GB
  * max_pfn_mapped:     highest direct mapped pfn over 4GB
  *
- * The direct mapping only covers E820_RAM regions, so the ranges and gaps are
+ * The direct mapping only covers E820_TYPE_RAM regions, so the ranges and gaps are
  * represented by pfn_mapped
  */
 unsigned long max_low_pfn_mapped;
@@ -423,7 +423,7 @@ static void __init parse_setup_data(void)
 
                switch (data_type) {
                case SETUP_E820_EXT:
-                       parse_e820_ext(pa_data, data_len);
+                       e820__memory_setup_extended(pa_data, data_len);
                        break;
                case SETUP_DTB:
                        add_dtb(pa_data);
@@ -438,29 +438,6 @@ static void __init parse_setup_data(void)
        }
 }
 
-static void __init e820_reserve_setup_data(void)
-{
-       struct setup_data *data;
-       u64 pa_data;
-
-       pa_data = boot_params.hdr.setup_data;
-       if (!pa_data)
-               return;
-
-       while (pa_data) {
-               data = early_memremap(pa_data, sizeof(*data));
-               e820_update_range(pa_data, sizeof(*data)+data->len,
-                        E820_RAM, E820_RESERVED_KERN);
-               pa_data = data->next;
-               early_memunmap(data, sizeof(*data));
-       }
-
-       sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
-       memcpy(e820_saved, e820, sizeof(struct e820map));
-       printk(KERN_INFO "extended physical RAM map:\n");
-       e820_print_map("reserve setup_data");
-}
-
 static void __init memblock_x86_reserve_range_setup_data(void)
 {
        struct setup_data *data;
@@ -753,16 +730,16 @@ static void __init trim_bios_range(void)
         * since some BIOSes are known to corrupt low memory.  See the
         * Kconfig help text for X86_RESERVE_LOW.
         */
-       e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
+       e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
 
        /*
         * special case: Some BIOSen report the PC BIOS
         * area (640->1Mb) as ram even though it is not.
         * take them out.
         */
-       e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
+       e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1);
 
-       sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
+       e820__update_table(e820_table);
 }
 
 /* called before trim_bios_range() to spare extra sanitize */
@@ -772,18 +749,18 @@ static void __init e820_add_kernel_range(void)
        u64 size = __pa_symbol(_end) - start;
 
        /*
-        * Complain if .text .data and .bss are not marked as E820_RAM and
+        * Complain if .text .data and .bss are not marked as E820_TYPE_RAM and
         * attempt to fix it by adding the range. We may have a confused BIOS,
         * or the user may have used memmap=exactmap or memmap=xxM$yyM to
         * exclude kernel range. If we really are running on top non-RAM,
         * we will crash later anyways.
         */
-       if (e820_all_mapped(start, start + size, E820_RAM))
+       if (e820__mapped_all(start, start + size, E820_TYPE_RAM))
                return;
 
-       pr_warn(".text .data .bss are not marked as E820_RAM!\n");
-       e820_remove_range(start, size, E820_RAM, 0);
-       e820_add_region(start, size, E820_RAM);
+       pr_warn(".text .data .bss are not marked as E820_TYPE_RAM!\n");
+       e820__range_remove(start, size, E820_TYPE_RAM, 0);
+       e820__range_add(start, size, E820_TYPE_RAM);
 }
 
 static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
@@ -936,7 +913,7 @@ void __init setup_arch(char **cmdline_p)
        x86_init.oem.arch_setup();
 
        iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
-       setup_memory_map();
+       e820__memory_setup();
        parse_setup_data();
 
        copy_edd();
@@ -1025,9 +1002,8 @@ void __init setup_arch(char **cmdline_p)
                early_dump_pci_devices();
 #endif
 
-       /* update the e820_saved too */
-       e820_reserve_setup_data();
-       finish_e820_parsing();
+       e820__reserve_setup_data();
+       e820__finish_early_params();
 
        if (efi_enabled(EFI_BOOT))
                efi_init();
@@ -1053,11 +1029,11 @@ void __init setup_arch(char **cmdline_p)
        trim_bios_range();
 #ifdef CONFIG_X86_32
        if (ppro_with_ram_bug()) {
-               e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
-                                 E820_RESERVED);
-               sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
+               e820__range_update(0x70000000ULL, 0x40000ULL, E820_TYPE_RAM,
+                                 E820_TYPE_RESERVED);
+               e820__update_table(e820_table);
                printk(KERN_INFO "fixed physical RAM map:\n");
-               e820_print_map("bad_ppro");
+               e820__print_table("bad_ppro");
        }
 #else
        early_gart_iommu_check();
@@ -1067,12 +1043,12 @@ void __init setup_arch(char **cmdline_p)
         * partially used pages are not usable - thus
         * we are rounding upwards:
         */
-       max_pfn = e820_end_of_ram_pfn();
+       max_pfn = e820__end_of_ram_pfn();
 
        /* update e820 for memory not covered by WB MTRRs */
        mtrr_bp_init();
        if (mtrr_trim_uncached_memory(max_pfn))
-               max_pfn = e820_end_of_ram_pfn();
+               max_pfn = e820__end_of_ram_pfn();
 
        max_possible_pfn = max_pfn;
 
@@ -1091,7 +1067,7 @@ void __init setup_arch(char **cmdline_p)
        /* How many end-of-memory variables you have, grandma! */
        /* need this before calling reserve_initrd */
        if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
-               max_low_pfn = e820_end_of_low_ram_pfn();
+               max_low_pfn = e820__end_of_low_ram_pfn();
        else
                max_low_pfn = max_pfn;
 
@@ -1108,7 +1084,7 @@ void __init setup_arch(char **cmdline_p)
        early_alloc_pgt_buf();
 
        /*
-        * Need to conclude brk, before memblock_x86_fill()
+        * Need to conclude brk, before e820__memblock_setup()
         *  it could use memblock_find_in_range, could overlap with
         *  brk area.
         */
@@ -1117,7 +1093,7 @@ void __init setup_arch(char **cmdline_p)
        cleanup_highmap();
 
        memblock_set_current_limit(ISA_END_ADDRESS);
-       memblock_x86_fill();
+       e820__memblock_setup();
 
        reserve_bios_regions();
 
@@ -1134,7 +1110,7 @@ void __init setup_arch(char **cmdline_p)
        }
 
        /* preallocate 4k for mptable mpc */
-       early_reserve_e820_mpc_new();
+       e820__memblock_alloc_reserved_mpc_new();
 
 #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
        setup_bios_corruption_check();
@@ -1272,12 +1248,12 @@ void __init setup_arch(char **cmdline_p)
 
        kvm_guest_init();
 
-       e820_reserve_resources();
-       e820_mark_nosave_regions(max_low_pfn);
+       e820__reserve_resources();
+       e820__register_nosave_regions(max_low_pfn);
 
        x86_init.resources.reserve_resources();
 
-       e820_setup_gap();
+       e820__setup_pci_gap();
 
 #ifdef CONFIG_VT
 #if defined(CONFIG_VGA_CONSOLE)
index d3c66a1..d798c0d 100644 (file)
@@ -33,6 +33,7 @@
 #include <asm/mce.h>
 #include <asm/trace/irq_vectors.h>
 #include <asm/kexec.h>
+#include <asm/virtext.h>
 
 /*
  *     Some notes on x86 processor bugs affecting SMP operation:
@@ -124,7 +125,7 @@ static bool smp_no_nmi_ipi = false;
 static void native_smp_send_reschedule(int cpu)
 {
        if (unlikely(cpu_is_offline(cpu))) {
-               WARN_ON(1);
+               WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu);
                return;
        }
        apic->send_IPI(cpu, RESCHEDULE_VECTOR);
@@ -162,6 +163,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
        if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
                return NMI_HANDLED;
 
+       cpu_emergency_vmxoff();
        stop_this_cpu(NULL);
 
        return NMI_HANDLED;
@@ -174,6 +176,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
 asmlinkage __visible void smp_reboot_interrupt(void)
 {
        ipi_entering_ack_irq();
+       cpu_emergency_vmxoff();
        stop_this_cpu(NULL);
        irq_exit();
 }
index b868fa1..ccccd33 100644 (file)
@@ -42,7 +42,7 @@
 #include <asm/fixmap.h>
 #include <asm/proto.h>
 #include <asm/setup.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/io.h>
 
 #include "../realmode/rm/wakeup.h"
@@ -68,9 +68,9 @@ void __init tboot_probe(void)
         * also verify that it is mapped as we expect it before calling
         * set_fixmap(), to reduce chance of garbage value causing crash
         */
-       if (!e820_any_mapped(boot_params.tboot_addr,
-                            boot_params.tboot_addr, E820_RESERVED)) {
-               pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n");
+       if (!e820__mapped_any(boot_params.tboot_addr,
+                            boot_params.tboot_addr, E820_TYPE_RESERVED)) {
+               pr_warning("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n");
                return;
        }
 
@@ -188,12 +188,12 @@ static int tboot_setup_sleep(void)
 
        tboot->num_mac_regions = 0;
 
-       for (i = 0; i < e820->nr_map; i++) {
-               if ((e820->map[i].type != E820_RAM)
-                && (e820->map[i].type != E820_RESERVED_KERN))
+       for (i = 0; i < e820_table->nr_entries; i++) {
+               if ((e820_table->entries[i].type != E820_TYPE_RAM)
+                && (e820_table->entries[i].type != E820_TYPE_RESERVED_KERN))
                        continue;
 
-               add_mac_region(e820->map[i].addr, e820->map[i].size);
+               add_mac_region(e820_table->entries[i].addr, e820_table->entries[i].size);
        }
 
        tboot->acpi_sinfo.kernel_s3_resume_vector =
index 11a93f0..a088b2c 100644 (file)
@@ -14,7 +14,7 @@
 #include <asm/mpspec.h>
 #include <asm/setup.h>
 #include <asm/apic.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/time.h>
 #include <asm/irq.h>
 #include <asm/io_apic.h>
@@ -38,7 +38,7 @@ struct x86_init_ops x86_init __initdata = {
        .resources = {
                .probe_roms             = probe_roms,
                .reserve_resources      = reserve_standard_io_resources,
-               .memory_setup           = default_machine_specific_memory_setup,
+               .memory_setup           = e820__memory_setup_default,
        },
 
        .mpparse = {
index 2ee00db..259e9b2 100644 (file)
@@ -8198,6 +8198,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
        case EXIT_REASON_PREEMPTION_TIMER:
                return false;
+       case EXIT_REASON_PML_FULL:
+               /* We don't expose PML support to L1. */
+               return false;
        default:
                return true;
        }
@@ -10267,6 +10270,18 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 
        }
 
+       if (enable_pml) {
+               /*
+                * Conceptually we want to copy the PML address and index from
+                * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
+                * since we always flush the log on each vmexit, this happens
+                * to be equivalent to simply resetting the fields in vmcs02.
+                */
+               ASSERT(vmx->pml_pg);
+               vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
+               vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+       }
+
        if (nested_cpu_has_ept(vmcs12)) {
                kvm_mmu_unload(vcpu);
                nested_ept_init_mmu_context(vcpu);
index d3289d7..9947269 100644 (file)
@@ -67,7 +67,7 @@
 #include <asm/pgtable.h>
 #include <asm/desc.h>
 #include <asm/setup.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/mce.h>
 #include <asm/io.h>
 #include <asm/fpu/api.h>
@@ -994,7 +994,9 @@ static struct clock_event_device lguest_clockevent = {
        .mult                   = 1,
        .shift                  = 0,
        .min_delta_ns           = LG_CLOCK_MIN_DELTA,
+       .min_delta_ticks        = LG_CLOCK_MIN_DELTA,
        .max_delta_ns           = LG_CLOCK_MAX_DELTA,
+       .max_delta_ticks        = LG_CLOCK_MAX_DELTA,
 };
 
 /*
@@ -1178,9 +1180,9 @@ static __init char *lguest_memory_setup(void)
         * The Linux bootloader header contains an "e820" memory map: the
         * Launcher populated the first entry with our memory limit.
         */
-       e820_add_region(boot_params.e820_map[0].addr,
-                         boot_params.e820_map[0].size,
-                         boot_params.e820_map[0].type);
+       e820__range_add(boot_params.e820_table[0].addr,
+                         boot_params.e820_table[0].size,
+                         boot_params.e820_table[0].type);
 
        /* This string is for the boot messages. */
        return "LGUEST";
index a8e91ae..29df077 100644 (file)
@@ -93,6 +93,13 @@ static void delay_mwaitx(unsigned long __loops)
 {
        u64 start, end, delay, loops = __loops;
 
+       /*
+        * Timer value of 0 causes MWAITX to wait indefinitely, unless there
+        * is a store on the memory monitored by MONITORX.
+        */
+       if (loops == 0)
+               return;
+
        start = rdtsc_ordered();
 
        for (;;) {
index 121f59c..5761a4f 100644 (file)
@@ -8,7 +8,7 @@
 #include <asm/kaslr.h>
 #include <asm/msr.h>
 #include <asm/archrandom.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/io.h>
 
 /*
index c074799..c8c6ad0 100644 (file)
@@ -4,12 +4,9 @@
  *  For licencing details see kernel-base/COPYING
  */
 
-#include <linux/highmem.h>
+#include <linux/uaccess.h>
 #include <linux/export.h>
 
-#include <asm/word-at-a-time.h>
-#include <linux/sched.h>
-
 /*
  * We rely on the nested NMI work to allow atomic faults from the NMI path; the
  * nested NMI paths are careful to preserve CR2.
@@ -34,52 +31,3 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
        return ret;
 }
 EXPORT_SYMBOL_GPL(copy_from_user_nmi);
-
-/**
- * copy_to_user: - Copy a block of data into user space.
- * @to:   Destination address, in user space.
- * @from: Source address, in kernel space.
- * @n:    Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Copy data from kernel space to user space.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- */
-unsigned long _copy_to_user(void __user *to, const void *from, unsigned n)
-{
-       if (access_ok(VERIFY_WRITE, to, n))
-               n = __copy_to_user(to, from, n);
-       return n;
-}
-EXPORT_SYMBOL(_copy_to_user);
-
-/**
- * copy_from_user: - Copy a block of data from user space.
- * @to:   Destination address, in kernel space.
- * @from: Source address, in user space.
- * @n:    Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- *          enabled.
- *
- * Copy data from user space to kernel space.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- *
- * If some data could not be copied, this function will pad the copied
- * data to the requested size using zero bytes.
- */
-unsigned long _copy_from_user(void *to, const void __user *from, unsigned n)
-{
-       if (access_ok(VERIFY_READ, from, n))
-               n = __copy_from_user(to, from, n);
-       else
-               memset(to, 0, n);
-       return n;
-}
-EXPORT_SYMBOL(_copy_from_user);
index 1f65ff6..bd057a4 100644 (file)
@@ -5,12 +5,7 @@
  * Copyright 1997 Andi Kleen <ak@muc.de>
  * Copyright 1997 Linus Torvalds
  */
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/blkdev.h>
 #include <linux/export.h>
-#include <linux/backing-dev.h>
-#include <linux/interrupt.h>
 #include <linux/uaccess.h>
 #include <asm/mmx.h>
 #include <asm/asm.h>
@@ -201,197 +196,6 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size)
        return size;
 }
 
-static unsigned long
-__copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
-{
-       int d0, d1;
-       __asm__ __volatile__(
-                      "        .align 2,0x90\n"
-                      "0:      movl 32(%4), %%eax\n"
-                      "        cmpl $67, %0\n"
-                      "        jbe 2f\n"
-                      "1:      movl 64(%4), %%eax\n"
-                      "        .align 2,0x90\n"
-                      "2:      movl 0(%4), %%eax\n"
-                      "21:     movl 4(%4), %%edx\n"
-                      "        movl %%eax, 0(%3)\n"
-                      "        movl %%edx, 4(%3)\n"
-                      "3:      movl 8(%4), %%eax\n"
-                      "31:     movl 12(%4),%%edx\n"
-                      "        movl %%eax, 8(%3)\n"
-                      "        movl %%edx, 12(%3)\n"
-                      "4:      movl 16(%4), %%eax\n"
-                      "41:     movl 20(%4), %%edx\n"
-                      "        movl %%eax, 16(%3)\n"
-                      "        movl %%edx, 20(%3)\n"
-                      "10:     movl 24(%4), %%eax\n"
-                      "51:     movl 28(%4), %%edx\n"
-                      "        movl %%eax, 24(%3)\n"
-                      "        movl %%edx, 28(%3)\n"
-                      "11:     movl 32(%4), %%eax\n"
-                      "61:     movl 36(%4), %%edx\n"
-                      "        movl %%eax, 32(%3)\n"
-                      "        movl %%edx, 36(%3)\n"
-                      "12:     movl 40(%4), %%eax\n"
-                      "71:     movl 44(%4), %%edx\n"
-                      "        movl %%eax, 40(%3)\n"
-                      "        movl %%edx, 44(%3)\n"
-                      "13:     movl 48(%4), %%eax\n"
-                      "81:     movl 52(%4), %%edx\n"
-                      "        movl %%eax, 48(%3)\n"
-                      "        movl %%edx, 52(%3)\n"
-                      "14:     movl 56(%4), %%eax\n"
-                      "91:     movl 60(%4), %%edx\n"
-                      "        movl %%eax, 56(%3)\n"
-                      "        movl %%edx, 60(%3)\n"
-                      "        addl $-64, %0\n"
-                      "        addl $64, %4\n"
-                      "        addl $64, %3\n"
-                      "        cmpl $63, %0\n"
-                      "        ja  0b\n"
-                      "5:      movl  %0, %%eax\n"
-                      "        shrl  $2, %0\n"
-                      "        andl $3, %%eax\n"
-                      "        cld\n"
-                      "6:      rep; movsl\n"
-                      "        movl %%eax,%0\n"
-                      "7:      rep; movsb\n"
-                      "8:\n"
-                      ".section .fixup,\"ax\"\n"
-                      "9:      lea 0(%%eax,%0,4),%0\n"
-                      "16:     pushl %0\n"
-                      "        pushl %%eax\n"
-                      "        xorl %%eax,%%eax\n"
-                      "        rep; stosb\n"
-                      "        popl %%eax\n"
-                      "        popl %0\n"
-                      "        jmp 8b\n"
-                      ".previous\n"
-                      _ASM_EXTABLE(0b,16b)
-                      _ASM_EXTABLE(1b,16b)
-                      _ASM_EXTABLE(2b,16b)
-                      _ASM_EXTABLE(21b,16b)
-                      _ASM_EXTABLE(3b,16b)
-                      _ASM_EXTABLE(31b,16b)
-                      _ASM_EXTABLE(4b,16b)
-                      _ASM_EXTABLE(41b,16b)
-                      _ASM_EXTABLE(10b,16b)
-                      _ASM_EXTABLE(51b,16b)
-                      _ASM_EXTABLE(11b,16b)
-                      _ASM_EXTABLE(61b,16b)
-                      _ASM_EXTABLE(12b,16b)
-                      _ASM_EXTABLE(71b,16b)
-                      _ASM_EXTABLE(13b,16b)
-                      _ASM_EXTABLE(81b,16b)
-                      _ASM_EXTABLE(14b,16b)
-                      _ASM_EXTABLE(91b,16b)
-                      _ASM_EXTABLE(6b,9b)
-                      _ASM_EXTABLE(7b,16b)
-                      : "=&c"(size), "=&D" (d0), "=&S" (d1)
-                      :  "1"(to), "2"(from), "0"(size)
-                      : "eax", "edx", "memory");
-       return size;
-}
-
-/*
- * Non Temporal Hint version of __copy_user_zeroing_intel.  It is cache aware.
- * hyoshiok@miraclelinux.com
- */
-
-static unsigned long __copy_user_zeroing_intel_nocache(void *to,
-                               const void __user *from, unsigned long size)
-{
-       int d0, d1;
-
-       __asm__ __volatile__(
-              "        .align 2,0x90\n"
-              "0:      movl 32(%4), %%eax\n"
-              "        cmpl $67, %0\n"
-              "        jbe 2f\n"
-              "1:      movl 64(%4), %%eax\n"
-              "        .align 2,0x90\n"
-              "2:      movl 0(%4), %%eax\n"
-              "21:     movl 4(%4), %%edx\n"
-              "        movnti %%eax, 0(%3)\n"
-              "        movnti %%edx, 4(%3)\n"
-              "3:      movl 8(%4), %%eax\n"
-              "31:     movl 12(%4),%%edx\n"
-              "        movnti %%eax, 8(%3)\n"
-              "        movnti %%edx, 12(%3)\n"
-              "4:      movl 16(%4), %%eax\n"
-              "41:     movl 20(%4), %%edx\n"
-              "        movnti %%eax, 16(%3)\n"
-              "        movnti %%edx, 20(%3)\n"
-              "10:     movl 24(%4), %%eax\n"
-              "51:     movl 28(%4), %%edx\n"
-              "        movnti %%eax, 24(%3)\n"
-              "        movnti %%edx, 28(%3)\n"
-              "11:     movl 32(%4), %%eax\n"
-              "61:     movl 36(%4), %%edx\n"
-              "        movnti %%eax, 32(%3)\n"
-              "        movnti %%edx, 36(%3)\n"
-              "12:     movl 40(%4), %%eax\n"
-              "71:     movl 44(%4), %%edx\n"
-              "        movnti %%eax, 40(%3)\n"
-              "        movnti %%edx, 44(%3)\n"
-              "13:     movl 48(%4), %%eax\n"
-              "81:     movl 52(%4), %%edx\n"
-              "        movnti %%eax, 48(%3)\n"
-              "        movnti %%edx, 52(%3)\n"
-              "14:     movl 56(%4), %%eax\n"
-              "91:     movl 60(%4), %%edx\n"
-              "        movnti %%eax, 56(%3)\n"
-              "        movnti %%edx, 60(%3)\n"
-              "        addl $-64, %0\n"
-              "        addl $64, %4\n"
-              "        addl $64, %3\n"
-              "        cmpl $63, %0\n"
-              "        ja  0b\n"
-              "        sfence \n"
-              "5:      movl  %0, %%eax\n"
-              "        shrl  $2, %0\n"
-              "        andl $3, %%eax\n"
-              "        cld\n"
-              "6:      rep; movsl\n"
-              "        movl %%eax,%0\n"
-              "7:      rep; movsb\n"
-              "8:\n"
-              ".section .fixup,\"ax\"\n"
-              "9:      lea 0(%%eax,%0,4),%0\n"
-              "16:     pushl %0\n"
-              "        pushl %%eax\n"
-              "        xorl %%eax,%%eax\n"
-              "        rep; stosb\n"
-              "        popl %%eax\n"
-              "        popl %0\n"
-              "        jmp 8b\n"
-              ".previous\n"
-              _ASM_EXTABLE(0b,16b)
-              _ASM_EXTABLE(1b,16b)
-              _ASM_EXTABLE(2b,16b)
-              _ASM_EXTABLE(21b,16b)
-              _ASM_EXTABLE(3b,16b)
-              _ASM_EXTABLE(31b,16b)
-              _ASM_EXTABLE(4b,16b)
-              _ASM_EXTABLE(41b,16b)
-              _ASM_EXTABLE(10b,16b)
-              _ASM_EXTABLE(51b,16b)
-              _ASM_EXTABLE(11b,16b)
-              _ASM_EXTABLE(61b,16b)
-              _ASM_EXTABLE(12b,16b)
-              _ASM_EXTABLE(71b,16b)
-              _ASM_EXTABLE(13b,16b)
-              _ASM_EXTABLE(81b,16b)
-              _ASM_EXTABLE(14b,16b)
-              _ASM_EXTABLE(91b,16b)
-              _ASM_EXTABLE(6b,9b)
-              _ASM_EXTABLE(7b,16b)
-              : "=&c"(size), "=&D" (d0), "=&S" (d1)
-              :  "1"(to), "2"(from), "0"(size)
-              : "eax", "edx", "memory");
-       return size;
-}
-
 static unsigned long __copy_user_intel_nocache(void *to,
                                const void __user *from, unsigned long size)
 {
@@ -486,12 +290,8 @@ static unsigned long __copy_user_intel_nocache(void *to,
  * Leave these declared but undefined.  They should not be any references to
  * them
  */
-unsigned long __copy_user_zeroing_intel(void *to, const void __user *from,
-                                       unsigned long size);
 unsigned long __copy_user_intel(void __user *to, const void *from,
                                        unsigned long size);
-unsigned long __copy_user_zeroing_intel_nocache(void *to,
-                               const void __user *from, unsigned long size);
 #endif /* CONFIG_X86_INTEL_USERCOPY */
 
 /* Generic arbitrary sized copy.  */
@@ -528,47 +328,7 @@ do {                                                                       \
                : "memory");                                            \
 } while (0)
 
-#define __copy_user_zeroing(to, from, size)                            \
-do {                                                                   \
-       int __d0, __d1, __d2;                                           \
-       __asm__ __volatile__(                                           \
-               "       cmp  $7,%0\n"                                   \
-               "       jbe  1f\n"                                      \
-               "       movl %1,%0\n"                                   \
-               "       negl %0\n"                                      \
-               "       andl $7,%0\n"                                   \
-               "       subl %0,%3\n"                                   \
-               "4:     rep; movsb\n"                                   \
-               "       movl %3,%0\n"                                   \
-               "       shrl $2,%0\n"                                   \
-               "       andl $3,%3\n"                                   \
-               "       .align 2,0x90\n"                                \
-               "0:     rep; movsl\n"                                   \
-               "       movl %3,%0\n"                                   \
-               "1:     rep; movsb\n"                                   \
-               "2:\n"                                                  \
-               ".section .fixup,\"ax\"\n"                              \
-               "5:     addl %3,%0\n"                                   \
-               "       jmp 6f\n"                                       \
-               "3:     lea 0(%3,%0,4),%0\n"                            \
-               "6:     pushl %0\n"                                     \
-               "       pushl %%eax\n"                                  \
-               "       xorl %%eax,%%eax\n"                             \
-               "       rep; stosb\n"                                   \
-               "       popl %%eax\n"                                   \
-               "       popl %0\n"                                      \
-               "       jmp 2b\n"                                       \
-               ".previous\n"                                           \
-               _ASM_EXTABLE(4b,5b)                                     \
-               _ASM_EXTABLE(0b,3b)                                     \
-               _ASM_EXTABLE(1b,6b)                                     \
-               : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2)   \
-               : "3"(size), "0"(size), "1"(to), "2"(from)              \
-               : "memory");                                            \
-} while (0)
-
-unsigned long __copy_to_user_ll(void __user *to, const void *from,
-                               unsigned long n)
+unsigned long __copy_user_ll(void *to, const void *from, unsigned long n)
 {
        stac();
        if (movsl_is_ok(to, from, n))
@@ -578,51 +338,7 @@ unsigned long __copy_to_user_ll(void __user *to, const void *from,
        clac();
        return n;
 }
-EXPORT_SYMBOL(__copy_to_user_ll);
-
-unsigned long __copy_from_user_ll(void *to, const void __user *from,
-                                       unsigned long n)
-{
-       stac();
-       if (movsl_is_ok(to, from, n))
-               __copy_user_zeroing(to, from, n);
-       else
-               n = __copy_user_zeroing_intel(to, from, n);
-       clac();
-       return n;
-}
-EXPORT_SYMBOL(__copy_from_user_ll);
-
-unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from,
-                                        unsigned long n)
-{
-       stac();
-       if (movsl_is_ok(to, from, n))
-               __copy_user(to, from, n);
-       else
-               n = __copy_user_intel((void __user *)to,
-                                     (const void *)from, n);
-       clac();
-       return n;
-}
-EXPORT_SYMBOL(__copy_from_user_ll_nozero);
-
-unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
-                                       unsigned long n)
-{
-       stac();
-#ifdef CONFIG_X86_INTEL_USERCOPY
-       if (n > 64 && static_cpu_has(X86_FEATURE_XMM2))
-               n = __copy_user_zeroing_intel_nocache(to, from, n);
-       else
-               __copy_user_zeroing(to, from, n);
-#else
-       __copy_user_zeroing(to, from, n);
-#endif
-       clac();
-       return n;
-}
-EXPORT_SYMBOL(__copy_from_user_ll_nocache);
+EXPORT_SYMBOL(__copy_user_ll);
 
 unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
                                        unsigned long n)
index 6987358..3b7c40a 100644 (file)
@@ -54,15 +54,6 @@ unsigned long clear_user(void __user *to, unsigned long n)
 }
 EXPORT_SYMBOL(clear_user);
 
-unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
-{
-       if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) { 
-               return copy_user_generic((__force void *)to, (__force void *)from, len);
-       } 
-       return len;             
-}
-EXPORT_SYMBOL(copy_in_user);
-
 /*
  * Try to copy last bytes and clear the rest if needed.
  * Since protection fault in copy_from/to_user is not a normal situation,
@@ -80,9 +71,5 @@ copy_user_handle_tail(char *to, char *from, unsigned len)
                        break;
        }
        clac();
-
-       /* If the destination is a kernel buffer, we always clear the end */
-       if (!__addr_ok(to))
-               memset(to, 0, len);
        return len;
 }
index d1c7de0..91f501b 100644 (file)
@@ -19,7 +19,7 @@
 #include <asm/types.h>
 #include <asm/mmzone.h>
 #include <asm/proto.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/pci-direct.h>
 #include <asm/numa.h>
 #include <asm/mpspec.h>
index 22af912..138bad2 100644 (file)
@@ -6,7 +6,7 @@
 #include <linux/bootmem.h>     /* for max_low_pfn */
 
 #include <asm/cacheflush.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/init.h>
 #include <asm/page.h>
 #include <asm/page_types.h>
@@ -373,14 +373,14 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range,
        return nr_range;
 }
 
-struct range pfn_mapped[E820_X_MAX];
+struct range pfn_mapped[E820_MAX_ENTRIES];
 int nr_pfn_mapped;
 
 static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
 {
-       nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
+       nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_MAX_ENTRIES,
                                             nr_pfn_mapped, start_pfn, end_pfn);
-       nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);
+       nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_MAX_ENTRIES);
 
        max_pfn_mapped = max(max_pfn_mapped, end_pfn);
 
@@ -430,7 +430,7 @@ unsigned long __ref init_memory_mapping(unsigned long start,
 
 /*
  * We need to iterate through the E820 memory map and create direct mappings
- * for only E820_RAM and E820_KERN_RESERVED regions. We cannot simply
+ * for only E820_TYPE_RAM and E820_KERN_RESERVED regions. We cannot simply
  * create direct mappings for all pfns from [0 to max_low_pfn) and
  * [4GB to max_pfn) because of possible memory holes in high addresses
  * that cannot be marked as UC by fixed/variable range MTRRs.
@@ -643,21 +643,40 @@ void __init init_mem_mapping(void)
  * devmem_is_allowed() checks to see if /dev/mem access to a certain address
  * is valid. The argument is a physical page number.
  *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains BIOS code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
+ * On x86, access has to be given to the first megabyte of RAM because that
+ * area traditionally contains BIOS code and data regions used by X, dosemu,
+ * and similar apps. Since they map the entire memory range, the whole range
+ * must be allowed (for mapping), but any areas that would otherwise be
+ * disallowed are flagged as being "zero filled" instead of rejected.
+ * Access has to be given to non-kernel-ram areas as well, these contain the
+ * PCI mmio resources as well as potential bios/acpi data regions.
  */
 int devmem_is_allowed(unsigned long pagenr)
 {
-       if (pagenr < 256)
-               return 1;
-       if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
+       if (page_is_ram(pagenr)) {
+               /*
+                * For disallowed memory regions in the low 1MB range,
+                * request that the page be shown as all zeros.
+                *
+                * The low-1MB test is pagenr < 256; a RAM page in that
+                * range yields the distinct return code 2, while every
+                * other RAM page is refused with 0.
+                */
+               if (pagenr < 256)
+                       return 2;
+
                return 0;
-       if (!page_is_ram(pagenr))
-               return 1;
-       return 0;
+       }
+
+       /*
+        * This must follow RAM test, since System RAM is considered a
+        * restricted resource under CONFIG_STRICT_IOMEM.
+        */
+       if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) {
+               /* Low 1MB bypasses iomem restrictions. */
+               if (pagenr < 256)
+                       return 1;
+
+               return 0;
+       }
+
+       return 1;
 }
 
 void free_init_pages(char *what, unsigned long begin, unsigned long end)
@@ -701,7 +720,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 
 void __ref free_initmem(void)
 {
-       e820_reallocate_tables();
+       e820__reallocate_tables();
 
        free_init_pages("unused kernel",
                        (unsigned long)(&__init_begin),
@@ -724,6 +743,53 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
 }
 #endif
 
+/*
+ * Calculate the precise size of the DMA zone (first 16 MB of RAM),
+ * and pass it to the MM layer - to help it set zone watermarks more
+ * accurately.
+ *
+ * Done on 64-bit systems only for the time being, although 32-bit systems
+ * might benefit from this as well.
+ *
+ * The value handed to set_dma_reserve() is the number of pages below
+ * MAX_DMA_PFN that are covered by a memory range but are not free, i.e.
+ * the total page count minus the free page count computed by the two
+ * loops in the body. Without CONFIG_X86_64 the function body is empty.
+ */
+void __init memblock_find_dma_reserve(void)
+{
+#ifdef CONFIG_X86_64
+       u64 nr_pages = 0, nr_free_pages = 0;
+       unsigned long start_pfn, end_pfn;
+       phys_addr_t start_addr, end_addr;
+       int i;
+       u64 u;
+
+       /*
+        * Iterate over all memory ranges (free and reserved ones alike),
+        * to calculate the total number of pages in the first 16 MB of RAM.
+        * Both ends of each range are clamped to MAX_DMA_PFN, so a range
+        * lying entirely above that limit contributes zero pages:
+        */
+       nr_pages = 0;
+       for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+               start_pfn = min(start_pfn, MAX_DMA_PFN);
+               end_pfn   = min(end_pfn,   MAX_DMA_PFN);
+
+               nr_pages += end_pfn - start_pfn;
+       }
+
+       /*
+        * Iterate over free memory ranges to calculate the number of free
+        * pages in the DMA zone, while not counting potential partial
+        * pages at the beginning or the end of the range. The
+        * start_pfn < end_pfn test skips ranges left empty or inverted
+        * by the rounding and clamping, which would otherwise wrap the
+        * unsigned subtraction:
+        */
+       nr_free_pages = 0;
+       for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
+               start_pfn = min_t(unsigned long, PFN_UP(start_addr), MAX_DMA_PFN);
+               end_pfn   = min_t(unsigned long, PFN_DOWN(end_addr), MAX_DMA_PFN);
+
+               if (start_pfn < end_pfn)
+                       nr_free_pages += end_pfn - start_pfn;
+       }
+
+       set_dma_reserve(nr_pages - nr_free_pages);
+#endif
+}
+
 void __init zone_sizes_init(void)
 {
        unsigned long max_zone_pfns[MAX_NR_ZONES];
index 4dddfaf..030bfed 100644 (file)
@@ -38,7 +38,7 @@
 #include <asm/pgtable.h>
 #include <asm/dma.h>
 #include <asm/fixmap.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/apic.h>
 #include <asm/bugs.h>
 #include <asm/tlb.h>
index 15173d3..f6da869 100644 (file)
@@ -41,7 +41,7 @@
 #include <asm/pgalloc.h>
 #include <asm/dma.h>
 #include <asm/fixmap.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/apic.h>
 #include <asm/tlb.h>
 #include <asm/mmu_context.h>
@@ -337,10 +337,10 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
                paddr_next = (paddr & PAGE_MASK) + PAGE_SIZE;
                if (paddr >= paddr_end) {
                        if (!after_bootmem &&
-                           !e820_any_mapped(paddr & PAGE_MASK, paddr_next,
-                                            E820_RAM) &&
-                           !e820_any_mapped(paddr & PAGE_MASK, paddr_next,
-                                            E820_RESERVED_KERN))
+                           !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
+                                            E820_TYPE_RAM) &&
+                           !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
+                                            E820_TYPE_RESERVED_KERN))
                                set_pte(pte, __pte(0));
                        continue;
                }
@@ -392,10 +392,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
                paddr_next = (paddr & PMD_MASK) + PMD_SIZE;
                if (paddr >= paddr_end) {
                        if (!after_bootmem &&
-                           !e820_any_mapped(paddr & PMD_MASK, paddr_next,
-                                            E820_RAM) &&
-                           !e820_any_mapped(paddr & PMD_MASK, paddr_next,
-                                            E820_RESERVED_KERN))
+                           !e820__mapped_any(paddr & PMD_MASK, paddr_next,
+                                            E820_TYPE_RAM) &&
+                           !e820__mapped_any(paddr & PMD_MASK, paddr_next,
+                                            E820_TYPE_RESERVED_KERN))
                                set_pmd(pmd, __pmd(0));
                        continue;
                }
@@ -478,10 +478,10 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
 
                if (paddr >= paddr_end) {
                        if (!after_bootmem &&
-                           !e820_any_mapped(paddr & PUD_MASK, paddr_next,
-                                            E820_RAM) &&
-                           !e820_any_mapped(paddr & PUD_MASK, paddr_next,
-                                            E820_RESERVED_KERN))
+                           !e820__mapped_any(paddr & PUD_MASK, paddr_next,
+                                            E820_TYPE_RAM) &&
+                           !e820__mapped_any(paddr & PUD_MASK, paddr_next,
+                                            E820_TYPE_RESERVED_KERN))
                                set_pud(pud, __pud(0));
                        continue;
                }
index 7aaa263..c43b6b3 100644 (file)
@@ -9,12 +9,13 @@
 #include <linux/bootmem.h>
 #include <linux/init.h>
 #include <linux/io.h>
+#include <linux/ioport.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mmiotrace.h>
 
 #include <asm/cacheflush.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/fixmap.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
index 4c90cfd..da92df3 100644 (file)
@@ -8,11 +8,12 @@
 #include <linux/sched/task.h>
 #include <linux/vmalloc.h>
 
+#include <asm/e820/types.h>
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
 
 extern pgd_t early_level4_pgt[PTRS_PER_PGD];
-extern struct range pfn_mapped[E820_X_MAX];
+extern struct range pfn_mapped[E820_MAX_ENTRIES];
 
 static int __init map_range(struct range *range)
 {
@@ -104,7 +105,7 @@ void __init kasan_init(void)
        kasan_populate_zero_shadow((void *)KASAN_SHADOW_START,
                        kasan_mem_to_shadow((void *)PAGE_OFFSET));
 
-       for (i = 0; i < E820_X_MAX; i++) {
+       for (i = 0; i < E820_MAX_ENTRIES; i++) {
                if (pfn_mapped[i].end == 0)
                        break;
 
index bef3662..4d434dd 100644 (file)
@@ -32,7 +32,7 @@
 #include <linux/kallsyms.h>
 #include <asm/pgtable.h>
 #include <linux/mmiotrace.h>
-#include <asm/e820.h> /* for ISA_START_ADDRESS */
+#include <asm/e820/api.h> /* for ISA_START_ADDRESS */
 #include <linux/atomic.h>
 #include <linux/percpu.h>
 #include <linux/cpu.h>
index 12dcad7..f9d9953 100644 (file)
@@ -12,7 +12,7 @@
 #include <linux/sched.h>
 #include <linux/topology.h>
 
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/proto.h>
 #include <asm/dma.h>
 #include <asm/amd_nb.h>
index 28d4213..a57e8e0 100644 (file)
@@ -15,7 +15,7 @@
 #include <linux/pci.h>
 #include <linux/vmalloc.h>
 
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/processor.h>
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
index efc32bc..9b78685 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/seq_file.h>
 #include <linux/bootmem.h>
 #include <linux/debugfs.h>
+#include <linux/ioport.h>
 #include <linux/kernel.h>
 #include <linux/pfn_t.h>
 #include <linux/slab.h>
@@ -23,7 +24,7 @@
 #include <asm/x86_init.h>
 #include <asm/pgtable.h>
 #include <asm/fcntl.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/mtrr.h>
 #include <asm/page.h>
 #include <asm/msr.h>
index 9adce77..de53c52 100644 (file)
@@ -12,7 +12,7 @@
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/fixmap.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/io.h>
index 35fe695..3ea20d6 100644 (file)
@@ -18,7 +18,7 @@
 #include <linux/mm.h>
 #include <asm/proto.h>
 #include <asm/numa.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/apic.h>
 #include <asm/uv/uv.h>
 
index 0a9f2ca..6fa84d5 100644 (file)
@@ -34,7 +34,7 @@
 #include <linux/bootmem.h>
 
 #include <asm/pat.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/pci_x86.h>
 #include <asm/io_apic.h>
 
@@ -398,7 +398,7 @@ void __init pcibios_resource_survey(void)
        list_for_each_entry(bus, &pci_root_buses, node)
                pcibios_allocate_resources(bus, 1);
 
-       e820_reserve_resources_late();
+       e820__reserve_resources_late();
        /*
         * Insert the IO APIC resources after PCI initialization has
         * occurred to handle IO APICS that are mapped in on a BAR in
index dd30b7e..d1b47d5 100644 (file)
@@ -18,7 +18,7 @@
 #include <linux/slab.h>
 #include <linux/mutex.h>
 #include <linux/rculist.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/pci_x86.h>
 #include <asm/acpi.h>
 
@@ -423,7 +423,7 @@ static acpi_status find_mboard_resource(acpi_handle handle, u32 lvl,
        return AE_OK;
 }
 
-static int is_acpi_reserved(u64 start, u64 end, unsigned not_used)
+static bool is_acpi_reserved(u64 start, u64 end, unsigned not_used)
 {
        struct resource mcfg_res;
 
@@ -440,11 +440,11 @@ static int is_acpi_reserved(u64 start, u64 end, unsigned not_used)
        return mcfg_res.flags;
 }
 
-typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type);
+typedef bool (*check_reserved_t)(u64 start, u64 end, unsigned type);
 
-static int __ref is_mmconf_reserved(check_reserved_t is_reserved,
-                                   struct pci_mmcfg_region *cfg,
-                                   struct device *dev, int with_e820)
+static bool __ref is_mmconf_reserved(check_reserved_t is_reserved,
+                                    struct pci_mmcfg_region *cfg,
+                                    struct device *dev, int with_e820)
 {
        u64 addr = cfg->res.start;
        u64 size = resource_size(&cfg->res);
@@ -452,7 +452,7 @@ static int __ref is_mmconf_reserved(check_reserved_t is_reserved,
        int num_buses;
        char *method = with_e820 ? "E820" : "ACPI motherboard resources";
 
-       while (!is_reserved(addr, addr + size, E820_RESERVED)) {
+       while (!is_reserved(addr, addr + size, E820_TYPE_RESERVED)) {
                size >>= 1;
                if (size < (16UL<<20))
                        break;
@@ -494,8 +494,8 @@ static int __ref is_mmconf_reserved(check_reserved_t is_reserved,
        return 1;
 }
 
-static int __ref pci_mmcfg_check_reserved(struct device *dev,
-                 struct pci_mmcfg_region *cfg, int early)
+static bool __ref
+pci_mmcfg_check_reserved(struct device *dev, struct pci_mmcfg_region *cfg, int early)
 {
        if (!early && !acpi_disabled) {
                if (is_mmconf_reserved(is_acpi_reserved, cfg, dev, 0))
@@ -514,7 +514,7 @@ static int __ref pci_mmcfg_check_reserved(struct device *dev,
        }
 
        /*
-        * e820_all_mapped() is marked as __init.
+        * e820__mapped_all() is marked as __init.
         * All entries from ACPI MCFG table have been checked at boot time.
         * For MCFG information constructed from hotpluggable host bridge's
         * _CBA method, just assume it's reserved.
@@ -525,7 +525,7 @@ static int __ref pci_mmcfg_check_reserved(struct device *dev,
        /* Don't try to do this check unless configuration
           type 1 is available. how about type 2 ?*/
        if (raw_pci_ops)
-               return is_mmconf_reserved(e820_all_mapped, cfg, dev, 1);
+               return is_mmconf_reserved(e820__mapped_all, cfg, dev, 1);
 
        return 0;
 }
index 43984bc..3e9e166 100644 (file)
@@ -12,7 +12,7 @@
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <linux/rcupdate.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/pci_x86.h>
 
 /* Assume systems with more busses have correct MCFG */
index bea5249..f1c1aa0 100644 (file)
@@ -10,7 +10,7 @@
 #include <linux/acpi.h>
 #include <linux/bitmap.h>
 #include <linux/rcupdate.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/pci_x86.h>
 
 #define PREFIX "PCI: "
index 1d97cea..29e9ba6 100644 (file)
@@ -7,7 +7,9 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/uaccess.h>
+
 #include <asm/pci_x86.h>
+#include <asm/e820/types.h>
 #include <asm/pci-functions.h>
 #include <asm/cacheflush.h>
 
index 066619b..f1d83b3 100644 (file)
@@ -1,6 +1,5 @@
 OBJECT_FILES_NON_STANDARD_efi_thunk_$(BITS).o := y
 
 obj-$(CONFIG_EFI)              += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o
-obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o
 obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o
 obj-$(CONFIG_EFI_MIXED)                += efi_thunk_$(BITS).o
index 565dff3..a15cf81 100644 (file)
@@ -47,6 +47,7 @@
 
 #include <asm/setup.h>
 #include <asm/efi.h>
+#include <asm/e820/api.h>
 #include <asm/time.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
@@ -139,21 +140,21 @@ static void __init do_add_efi_memmap(void)
                case EFI_BOOT_SERVICES_DATA:
                case EFI_CONVENTIONAL_MEMORY:
                        if (md->attribute & EFI_MEMORY_WB)
-                               e820_type = E820_RAM;
+                               e820_type = E820_TYPE_RAM;
                        else
-                               e820_type = E820_RESERVED;
+                               e820_type = E820_TYPE_RESERVED;
                        break;
                case EFI_ACPI_RECLAIM_MEMORY:
-                       e820_type = E820_ACPI;
+                       e820_type = E820_TYPE_ACPI;
                        break;
                case EFI_ACPI_MEMORY_NVS:
-                       e820_type = E820_NVS;
+                       e820_type = E820_TYPE_NVS;
                        break;
                case EFI_UNUSABLE_MEMORY:
-                       e820_type = E820_UNUSABLE;
+                       e820_type = E820_TYPE_UNUSABLE;
                        break;
                case EFI_PERSISTENT_MEMORY:
-                       e820_type = E820_PMEM;
+                       e820_type = E820_TYPE_PMEM;
                        break;
                default:
                        /*
@@ -161,12 +162,12 @@ static void __init do_add_efi_memmap(void)
                         * EFI_RUNTIME_SERVICES_DATA EFI_MEMORY_MAPPED_IO
                         * EFI_MEMORY_MAPPED_IO_PORT_SPACE EFI_PAL_CODE
                         */
-                       e820_type = E820_RESERVED;
+                       e820_type = E820_TYPE_RESERVED;
                        break;
                }
-               e820_add_region(start, size, e820_type);
+               e820__range_add(start, size, e820_type);
        }
-       sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
+       e820__update_table(e820_table);
 }
 
 int __init efi_memblock_x86_reserve_range(void)
index a4695da..642a869 100644 (file)
@@ -35,7 +35,7 @@
 
 #include <asm/setup.h>
 #include <asm/page.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/proto.h>
@@ -47,7 +47,7 @@
 #include <asm/pgalloc.h>
 
 /*
- * We allocate runtime services regions bottom-up, starting from -4G, i.e.
+ * We allocate runtime services regions top-down, starting from -4G, i.e.
  * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
  */
 static u64 efi_va = EFI_VA_START;
index 30031d5..2661599 100644 (file)
@@ -11,6 +11,8 @@
 #include <linux/bootmem.h>
 #include <linux/acpi.h>
 #include <linux/dmi.h>
+
+#include <asm/e820/api.h>
 #include <asm/efi.h>
 #include <asm/uv/uv.h>
 
@@ -201,6 +203,10 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
                return;
        }
 
+       /* No need to reserve regions that will never be freed. */
+       if (md.attribute & EFI_MEMORY_RUNTIME)
+               return;
+
        size += addr % EFI_PAGE_SIZE;
        size = round_up(size, EFI_PAGE_SIZE);
        addr = round_down(addr, EFI_PAGE_SIZE);
@@ -240,14 +246,14 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
  * else. We must only reserve (and then free) regions:
  *
  * - Not within any part of the kernel
- * - Not the BIOS reserved area (E820_RESERVED, E820_NVS, etc)
+ * - Not the BIOS reserved area (E820_TYPE_RESERVED, E820_TYPE_NVS, etc)
  */
 static bool can_free_region(u64 start, u64 size)
 {
        if (start + size > __pa_symbol(_text) && start <= __pa_symbol(_end))
                return false;
 
-       if (!e820_all_mapped(start, start+size, E820_RAM))
+       if (!e820__mapped_all(start, start+size, E820_TYPE_RAM))
                return false;
 
        return true;
@@ -280,7 +286,7 @@ void __init efi_reserve_boot_services(void)
                 * A good example of a critical region that must not be
                 * freed is page zero (first 4Kb of memory), which may
                 * contain boot services code/data but is marked
-                * E820_RESERVED by trim_bios_range().
+                * E820_TYPE_RESERVED by trim_bios_range().
                 */
                if (!already_reserved) {
                        memblock_reserve(start, size);
index 2ee7632..b082d71 100644 (file)
@@ -390,9 +390,11 @@ static __init int uv_rtc_setup_clock(void)
 
        clock_event_device_uv.min_delta_ns = NSEC_PER_SEC /
                                                sn_rtc_cycles_per_second;
+       clock_event_device_uv.min_delta_ticks = 1;
 
        clock_event_device_uv.max_delta_ns = clocksource_uv.mask *
                                (NSEC_PER_SEC / sn_rtc_cycles_per_second);
+       clock_event_device_uv.max_delta_ticks = clocksource_uv.mask;
 
        rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
        if (rc) {
index ded2e82..053801b 100644 (file)
@@ -16,6 +16,7 @@
 
 #include <crypto/hash.h>
 
+#include <asm/e820/api.h>
 #include <asm/init.h>
 #include <asm/proto.h>
 #include <asm/page.h>
@@ -195,12 +196,12 @@ struct restore_data_record {
 
 #if IS_BUILTIN(CONFIG_CRYPTO_MD5)
 /**
- * get_e820_md5 - calculate md5 according to given e820 map
+ * get_e820_md5 - calculate md5 according to given e820 table
  *
- * @map: the e820 map to be calculated
+ * @table: the e820 table to be calculated
  * @buf: the md5 result to be stored to
  */
-static int get_e820_md5(struct e820map *map, void *buf)
+static int get_e820_md5(struct e820_table *table, void *buf)
 {
        struct scatterlist sg;
        struct crypto_ahash *tfm;
@@ -213,10 +214,9 @@ static int get_e820_md5(struct e820map *map, void *buf)
 
        {
                AHASH_REQUEST_ON_STACK(req, tfm);
-               size = offsetof(struct e820map, map)
-                       + sizeof(struct e820entry) * map->nr_map;
+               size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry) * table->nr_entries;
                ahash_request_set_tfm(req, tfm);
-               sg_init_one(&sg, (u8 *)map, size);
+               sg_init_one(&sg, (u8 *)table, size);
                ahash_request_set_callback(req, 0, NULL, NULL);
                ahash_request_set_crypt(req, &sg, buf, size);
 
@@ -231,7 +231,7 @@ static int get_e820_md5(struct e820map *map, void *buf)
 
 static void hibernation_e820_save(void *buf)
 {
-       get_e820_md5(e820_saved, buf);
+       get_e820_md5(e820_table_firmware, buf);
 }
 
 static bool hibernation_e820_mismatch(void *buf)
@@ -244,7 +244,7 @@ static bool hibernation_e820_mismatch(void *buf)
        if (!memcmp(result, buf, MD5_DIGEST_SIZE))
                return false;
 
-       ret = get_e820_md5(e820_saved, result);
+       ret = get_e820_md5(e820_table_firmware, result);
        if (ret)
                return true;
 
index 0bc60a3..2a2d89d 100644 (file)
@@ -7,3 +7,17 @@ config MCE_AMD_INJ
          aspects of the MCE handling code.
 
          WARNING: Do not even assume this interface is staying stable!
+
+config RAS_CEC
+       bool "Correctable Errors Collector"
+       depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS
+       ---help---
+         This is a small cache which collects correctable memory errors per 4K
+         page PFN and counts their repeated occurrence. Once the counter for a
+         PFN overflows, we try to soft-offline that page as we take it to mean
+         that it has reached a relatively high error count and would probably
+         be best if we don't use it anymore.
+
+         Bear in mind that this is absolutely useless if your platform doesn't
+         have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.
+
index e7e7055..69f0827 100644 (file)
@@ -16,7 +16,7 @@ obj-y = bug.o bugs_$(BITS).o delay.o fault.o ldt.o \
 
 ifeq ($(CONFIG_X86_32),y)
 
-obj-y += checksum_32.o
+obj-y += checksum_32.o syscalls_32.o
 obj-$(CONFIG_ELF_CORE) += elfcore.o
 
 subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
index e59eef2..b291ca5 100644 (file)
@@ -78,7 +78,7 @@ static inline int ptrace_set_thread_area(struct task_struct *child, int idx,
         return -ENOSYS;
 }
 
-extern long arch_prctl(struct task_struct *task, int code,
+extern long arch_prctl(struct task_struct *task, int option,
                       unsigned long __user *addr);
 
 #endif
index 96eb2bd..8431e87 100644 (file)
@@ -6,7 +6,7 @@
 #include <sys/ptrace.h>
 #include <asm/ptrace.h>
 
-int os_arch_prctl(int pid, int code, unsigned long *addr)
+int os_arch_prctl(int pid, int option, unsigned long *arg2)
 {
-        return ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long) addr, code);
+       return ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long) arg2, option);
 }
diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c
new file mode 100644 (file)
index 0000000..627d688
--- /dev/null
@@ -0,0 +1,7 @@
+#include <linux/syscalls.h>
+#include <os.h>
+
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+       return -EINVAL;
+}
index 10d9070..58f5166 100644 (file)
@@ -7,13 +7,15 @@
 
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/syscalls.h>
 #include <linux/uaccess.h>
 #include <asm/prctl.h> /* XXX This should get the constants from libc */
 #include <os.h>
 
-long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr)
+long arch_prctl(struct task_struct *task, int option,
+               unsigned long __user *arg2)
 {
-       unsigned long *ptr = addr, tmp;
+       unsigned long *ptr = arg2, tmp;
        long ret;
        int pid = task->mm->context.id.u.pid;
 
@@ -30,7 +32,7 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr)
         * arch_prctl is run on the host, then the registers are read
         * back.
         */
-       switch (code) {
+       switch (option) {
        case ARCH_SET_FS:
        case ARCH_SET_GS:
                ret = restore_registers(pid, &current->thread.regs.regs);
@@ -50,11 +52,11 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr)
                ptr = &tmp;
        }
 
-       ret = os_arch_prctl(pid, code, ptr);
+       ret = os_arch_prctl(pid, option, ptr);
        if (ret)
                return ret;
 
-       switch (code) {
+       switch (option) {
        case ARCH_SET_FS:
                current->thread.arch.fs = (unsigned long) ptr;
                ret = save_registers(pid, &current->thread.regs.regs);
@@ -63,19 +65,19 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr)
                ret = save_registers(pid, &current->thread.regs.regs);
                break;
        case ARCH_GET_FS:
-               ret = put_user(tmp, addr);
+               ret = put_user(tmp, arg2);
                break;
        case ARCH_GET_GS:
-               ret = put_user(tmp, addr);
+               ret = put_user(tmp, arg2);
                break;
        }
 
        return ret;
 }
 
-long sys_arch_prctl(int code, unsigned long addr)
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
 {
-       return arch_prctl(current, code, (unsigned long __user *) addr);
+       return arch_prctl(current, option, (unsigned long __user *) arg2);
 }
 
 void arch_switch_to(struct task_struct *to)
index bc3dab5..504ec74 100644 (file)
@@ -76,6 +76,7 @@
 #include <asm/mwait.h>
 #include <asm/pci_x86.h>
 #include <asm/cpu.h>
+#include <asm/e820/api.h> 
 
 #ifdef CONFIG_ACPI
 #include <linux/acpi.h>
@@ -1689,34 +1690,32 @@ static void __init init_pvh_bootparams(void)
 
        memset(&pvh_bootparams, 0, sizeof(pvh_bootparams));
 
-       memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_map);
-       set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_map);
+       memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
+       set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
        rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
        if (rc) {
                xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
                BUG();
        }
 
-       if (memmap.nr_entries < E820MAX - 1) {
-               pvh_bootparams.e820_map[memmap.nr_entries].addr =
+       if (memmap.nr_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) {
+               pvh_bootparams.e820_table[memmap.nr_entries].addr =
                        ISA_START_ADDRESS;
-               pvh_bootparams.e820_map[memmap.nr_entries].size =
+               pvh_bootparams.e820_table[memmap.nr_entries].size =
                        ISA_END_ADDRESS - ISA_START_ADDRESS;
-               pvh_bootparams.e820_map[memmap.nr_entries].type =
-                       E820_RESERVED;
+               pvh_bootparams.e820_table[memmap.nr_entries].type =
+                       E820_TYPE_RESERVED;
                memmap.nr_entries++;
        } else
                xen_raw_printk("Warning: Can fit ISA range into e820\n");
 
-       sanitize_e820_map(pvh_bootparams.e820_map,
-                         ARRAY_SIZE(pvh_bootparams.e820_map),
-                         &memmap.nr_entries);
-
        pvh_bootparams.e820_entries = memmap.nr_entries;
        for (i = 0; i < pvh_bootparams.e820_entries; i++)
-               e820_add_region(pvh_bootparams.e820_map[i].addr,
-                               pvh_bootparams.e820_map[i].size,
-                               pvh_bootparams.e820_map[i].type);
+               e820__range_add(pvh_bootparams.e820_table[i].addr,
+                               pvh_bootparams.e820_table[i].size,
+                               pvh_bootparams.e820_table[i].type);
+
+       e820__update_table(e820_table);
 
        pvh_bootparams.hdr.cmd_line_ptr =
                pvh_start_info.cmdline_paddr;
index 37cb5aa..1d68be6 100644 (file)
@@ -58,7 +58,7 @@
 #include <asm/mmu_context.h>
 #include <asm/setup.h>
 #include <asm/paravirt.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/linkage.h>
 #include <asm/page.h>
 #include <asm/init.h>
index a8c306c..a5bf7c4 100644 (file)
@@ -14,7 +14,7 @@
 
 #include <asm/elf.h>
 #include <asm/vdso.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/setup.h>
 #include <asm/acpi.h>
 #include <asm/numa.h>
@@ -41,8 +41,7 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
 unsigned long xen_released_pages;
 
 /* E820 map used during setting up memory. */
-static struct e820entry xen_e820_map[E820_X_MAX] __initdata;
-static u32 xen_e820_map_entries __initdata;
+static struct e820_table xen_e820_table __initdata;
 
 /*
  * Buffer used to remap identity mapped pages. We only need the virtual space.
@@ -198,15 +197,15 @@ void __init xen_inv_extra_mem(void)
  */
 static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
 {
-       const struct e820entry *entry = xen_e820_map;
+       const struct e820_entry *entry = xen_e820_table.entries;
        unsigned int i;
        unsigned long done = 0;
 
-       for (i = 0; i < xen_e820_map_entries; i++, entry++) {
+       for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
                unsigned long s_pfn;
                unsigned long e_pfn;
 
-               if (entry->type != E820_RAM)
+               if (entry->type != E820_TYPE_RAM)
                        continue;
 
                e_pfn = PFN_DOWN(entry->addr + entry->size);
@@ -457,7 +456,7 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages,
 {
        phys_addr_t start = 0;
        unsigned long ret_val = 0;
-       const struct e820entry *entry = xen_e820_map;
+       const struct e820_entry *entry = xen_e820_table.entries;
        int i;
 
        /*
@@ -471,13 +470,13 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages,
         * example) the DMI tables in a reserved region that begins on
         * a non-page boundary.
         */
-       for (i = 0; i < xen_e820_map_entries; i++, entry++) {
+       for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
                phys_addr_t end = entry->addr + entry->size;
-               if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) {
+               if (entry->type == E820_TYPE_RAM || i == xen_e820_table.nr_entries - 1) {
                        unsigned long start_pfn = PFN_DOWN(start);
                        unsigned long end_pfn = PFN_UP(end);
 
-                       if (entry->type == E820_RAM)
+                       if (entry->type == E820_TYPE_RAM)
                                end_pfn = PFN_UP(entry->addr);
 
                        if (start_pfn < end_pfn)
@@ -591,28 +590,28 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start,
        phys_addr_t end = start + size;
 
        /* Align RAM regions to page boundaries. */
-       if (type == E820_RAM) {
+       if (type == E820_TYPE_RAM) {
                start = PAGE_ALIGN(start);
                end &= ~((phys_addr_t)PAGE_SIZE - 1);
        }
 
-       e820_add_region(start, end - start, type);
+       e820__range_add(start, end - start, type);
 }
 
 static void __init xen_ignore_unusable(void)
 {
-       struct e820entry *entry = xen_e820_map;
+       struct e820_entry *entry = xen_e820_table.entries;
        unsigned int i;
 
-       for (i = 0; i < xen_e820_map_entries; i++, entry++) {
-               if (entry->type == E820_UNUSABLE)
-                       entry->type = E820_RAM;
+       for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
+               if (entry->type == E820_TYPE_UNUSABLE)
+                       entry->type = E820_TYPE_RAM;
        }
 }
 
 bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
 {
-       struct e820entry *entry;
+       struct e820_entry *entry;
        unsigned mapcnt;
        phys_addr_t end;
 
@@ -620,10 +619,10 @@ bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
                return false;
 
        end = start + size;
-       entry = xen_e820_map;
+       entry = xen_e820_table.entries;
 
-       for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) {
-               if (entry->type == E820_RAM && entry->addr <= start &&
+       for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) {
+               if (entry->type == E820_TYPE_RAM && entry->addr <= start &&
                    (entry->addr + entry->size) >= end)
                        return false;
 
@@ -645,10 +644,10 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size)
 {
        unsigned mapcnt;
        phys_addr_t addr, start;
-       struct e820entry *entry = xen_e820_map;
+       struct e820_entry *entry = xen_e820_table.entries;
 
-       for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) {
-               if (entry->type != E820_RAM || entry->size < size)
+       for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++, entry++) {
+               if (entry->type != E820_TYPE_RAM || entry->size < size)
                        continue;
                start = entry->addr;
                for (addr = start; addr < start + size; addr += PAGE_SIZE) {
@@ -750,8 +749,8 @@ char * __init xen_memory_setup(void)
        max_pfn = min(max_pfn, xen_start_info->nr_pages);
        mem_end = PFN_PHYS(max_pfn);
 
-       memmap.nr_entries = ARRAY_SIZE(xen_e820_map);
-       set_xen_guest_handle(memmap.buffer, xen_e820_map);
+       memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries);
+       set_xen_guest_handle(memmap.buffer, xen_e820_table.entries);
 
        op = xen_initial_domain() ?
                XENMEM_machine_memory_map :
@@ -760,16 +759,16 @@ char * __init xen_memory_setup(void)
        if (rc == -ENOSYS) {
                BUG_ON(xen_initial_domain());
                memmap.nr_entries = 1;
-               xen_e820_map[0].addr = 0ULL;
-               xen_e820_map[0].size = mem_end;
+               xen_e820_table.entries[0].addr = 0ULL;
+               xen_e820_table.entries[0].size = mem_end;
                /* 8MB slack (to balance backend allocations). */
-               xen_e820_map[0].size += 8ULL << 20;
-               xen_e820_map[0].type = E820_RAM;
+               xen_e820_table.entries[0].size += 8ULL << 20;
+               xen_e820_table.entries[0].type = E820_TYPE_RAM;
                rc = 0;
        }
        BUG_ON(rc);
        BUG_ON(memmap.nr_entries == 0);
-       xen_e820_map_entries = memmap.nr_entries;
+       xen_e820_table.nr_entries = memmap.nr_entries;
 
        /*
         * Xen won't allow a 1:1 mapping to be created to UNUSABLE
@@ -783,8 +782,7 @@ char * __init xen_memory_setup(void)
                xen_ignore_unusable();
 
        /* Make sure the Xen-supplied memory map is well-ordered. */
-       sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map),
-                         &xen_e820_map_entries);
+       e820__update_table(&xen_e820_table);
 
        max_pages = xen_get_max_pages();
 
@@ -811,15 +809,15 @@ char * __init xen_memory_setup(void)
        extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
                           extra_pages, max_pages - max_pfn);
        i = 0;
-       addr = xen_e820_map[0].addr;
-       size = xen_e820_map[0].size;
-       while (i < xen_e820_map_entries) {
+       addr = xen_e820_table.entries[0].addr;
+       size = xen_e820_table.entries[0].size;
+       while (i < xen_e820_table.nr_entries) {
                bool discard = false;
 
                chunk_size = size;
-               type = xen_e820_map[i].type;
+               type = xen_e820_table.entries[i].type;
 
-               if (type == E820_RAM) {
+               if (type == E820_TYPE_RAM) {
                        if (addr < mem_end) {
                                chunk_size = min(size, mem_end - addr);
                        } else if (extra_pages) {
@@ -840,9 +838,9 @@ char * __init xen_memory_setup(void)
                size -= chunk_size;
                if (size == 0) {
                        i++;
-                       if (i < xen_e820_map_entries) {
-                               addr = xen_e820_map[i].addr;
-                               size = xen_e820_map[i].size;
+                       if (i < xen_e820_table.nr_entries) {
+                               addr = xen_e820_table.entries[i].addr;
+                               size = xen_e820_table.entries[i].size;
                        }
                }
        }
@@ -858,10 +856,9 @@ char * __init xen_memory_setup(void)
         * reserve ISA memory anyway because too many things poke
         * about in there.
         */
-       e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
-                       E820_RESERVED);
+       e820__range_add(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_TYPE_RESERVED);
 
-       sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
+       e820__update_table(e820_table);
 
        /*
         * Check whether the kernel itself conflicts with the target E820 map.
@@ -915,6 +912,37 @@ char * __init xen_memory_setup(void)
 }
 
 /*
+ * Machine specific memory setup for auto-translated guests.
+ */
+char * __init xen_auto_xlated_memory_setup(void)
+{
+       struct xen_memory_map memmap;
+       const struct e820_entry *entry;
+       int idx;
+       int rc;
+
+       /* Hand xen_e820_table's entry array to the hypervisor as the buffer. */
+       memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries);
+       set_xen_guest_handle(memmap.buffer, xen_e820_table.entries);
+
+       rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+       if (rc < 0)
+               panic("No memory map (%d)\n", rc);
+
+       /* The hypercall reports back how many entries it filled in. */
+       xen_e820_table.nr_entries = memmap.nr_entries;
+
+       e820__update_table(&xen_e820_table);
+
+       /* Pass every entry of the Xen-provided table to e820__range_add(). */
+       entry = xen_e820_table.entries;
+       for (idx = 0; idx < xen_e820_table.nr_entries; idx++, entry++)
+               e820__range_add(entry->addr, entry->size, entry->type);
+
+       /* Remove p2m info, it is not needed. */
+       xen_start_info->mfn_list = 0;
+       xen_start_info->first_p2m_pfn = 0;
+       xen_start_info->nr_p2m_frames = 0;
+
+       return "Xen";
+}
+
+/*
  * Set the bit indicating "nosegneg" library variants should be used.
  * We only need to bother in pure 32-bit mode; compat 32-bit processes
  * can have un-truncated segments, so wrapping around is allowed.
@@ -999,8 +1027,8 @@ void __init xen_pvmmu_arch_setup(void)
 void __init xen_arch_setup(void)
 {
        xen_panic_handler_init();
-
-       xen_pvmmu_arch_setup();
+       if (!xen_feature(XENFEAT_auto_translated_physmap))
+               xen_pvmmu_arch_setup();
 
 #ifdef CONFIG_ACPI
        if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
index 1e69956..7a30892 100644 (file)
@@ -209,7 +209,9 @@ static const struct clock_event_device xen_timerop_clockevent = {
        .features               = CLOCK_EVT_FEAT_ONESHOT,
 
        .max_delta_ns           = 0xffffffff,
+       .max_delta_ticks        = 0xffffffff,
        .min_delta_ns           = TIMER_SLOP,
+       .min_delta_ticks        = TIMER_SLOP,
 
        .mult                   = 1,
        .shift                  = 0,
@@ -268,7 +270,9 @@ static const struct clock_event_device xen_vcpuop_clockevent = {
        .features = CLOCK_EVT_FEAT_ONESHOT,
 
        .max_delta_ns = 0xffffffff,
+       .max_delta_ticks = 0xffffffff,
        .min_delta_ns = TIMER_SLOP,
+       .min_delta_ticks = TIMER_SLOP,
 
        .mult = 1,
        .shift = 0,
index f41408c..cc23e9e 100644 (file)
@@ -6,6 +6,7 @@ generic-y += dma-contiguous.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
+generic-y += extable.h
 generic-y += fcntl.h
 generic-y += hardirq.h
 generic-y += ioctl.h
index a7a1100..dfdf9fa 100644 (file)
@@ -19,9 +19,6 @@
 #include <linux/errno.h>
 #include <asm/types.h>
 
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
 #include <asm/current.h>
 #include <asm/asm-offsets.h>
 #include <asm/processor.h>
index 848a3d7..2e7bac0 100644 (file)
 #ifndef _XTENSA_UACCESS_H
 #define _XTENSA_UACCESS_H
 
-#include <linux/errno.h>
 #include <linux/prefetch.h>
 #include <asm/types.h>
-
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
-#include <linux/sched.h>
+#include <asm/extable.h>
 
 /*
  * The fs value determines whether argument validity checking should
@@ -43,7 +38,7 @@
 
 #define segment_eq(a, b)       ((a).seg == (b).seg)
 
-#define __kernel_ok (segment_eq(get_fs(), KERNEL_DS))
+#define __kernel_ok (uaccess_kernel())
 #define __user_ok(addr, size) \
                (((size) <= TASK_SIZE)&&((addr) <= TASK_SIZE-(size)))
 #define __access_ok(addr, size) (__kernel_ok || __user_ok((addr), (size)))
@@ -239,60 +234,22 @@ __asm__ __volatile__(                     \
  * Copy to/from user space
  */
 
-/*
- * We use a generic, arbitrary-sized copy subroutine.  The Xtensa
- * architecture would cause heavy code bloat if we tried to inline
- * these functions and provide __constant_copy_* equivalents like the
- * i386 versions.  __xtensa_copy_user is quite efficient.  See the
- * .fixup section of __xtensa_copy_user for a discussion on the
- * X_zeroing equivalents for Xtensa.
- */
-
 extern unsigned __xtensa_copy_user(void *to, const void *from, unsigned n);
-#define __copy_user(to, from, size) __xtensa_copy_user(to, from, size)
-
 
 static inline unsigned long
-__generic_copy_from_user_nocheck(void *to, const void *from, unsigned long n)
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-       return __copy_user(to, from, n);
-}
-
-static inline unsigned long
-__generic_copy_to_user_nocheck(void *to, const void *from, unsigned long n)
-{
-       return __copy_user(to, from, n);
+       prefetchw(to);
+       return __xtensa_copy_user(to, (__force const void *)from, n);
 }
-
 static inline unsigned long
-__generic_copy_to_user(void *to, const void *from, unsigned long n)
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
        prefetch(from);
-       if (access_ok(VERIFY_WRITE, to, n))
-               return __copy_user(to, from, n);
-       return n;
-}
-
-static inline unsigned long
-__generic_copy_from_user(void *to, const void *from, unsigned long n)
-{
-       prefetchw(to);
-       if (access_ok(VERIFY_READ, from, n))
-               return __copy_user(to, from, n);
-       else
-               memset(to, 0, n);
-       return n;
+       return __xtensa_copy_user((__force void *)to, from, n);
 }
-
-#define copy_to_user(to, from, n) __generic_copy_to_user((to), (from), (n))
-#define copy_from_user(to, from, n) __generic_copy_from_user((to), (from), (n))
-#define __copy_to_user(to, from, n) \
-       __generic_copy_to_user_nocheck((to), (from), (n))
-#define __copy_from_user(to, from, n) \
-       __generic_copy_from_user_nocheck((to), (from), (n))
-#define __copy_to_user_inatomic __copy_to_user
-#define __copy_from_user_inatomic __copy_from_user
-
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
 
 /*
  * We need to return the number of bytes not cleared.  Our memset()
@@ -348,10 +305,4 @@ static inline long strnlen_user(const char *str, long len)
        return __strnlen_user(str, len);
 }
 
-
-struct exception_table_entry
-{
-       unsigned long insn, fixup;
-};
-
 #endif /* _XTENSA_UACCESS_H */
index 7ea4dd6..d9cd766 100644 (file)
@@ -102,9 +102,9 @@ __xtensa_copy_user:
        bltui   a4, 7, .Lbytecopy       # do short copies byte by byte
 
        # copy 1 byte
-       EX(l8ui, a6, a3, 0, l_fixup)
+       EX(l8ui, a6, a3, 0, fixup)
        addi    a3, a3,  1
-       EX(s8i, a6, a5,  0, s_fixup)
+       EX(s8i, a6, a5,  0, fixup)
        addi    a5, a5,  1
        addi    a4, a4, -1
        bbci.l  a5, 1, .Ldstaligned     # if dst is now aligned, then
@@ -112,11 +112,11 @@ __xtensa_copy_user:
 .Ldst2mod4:    # dst 16-bit aligned
        # copy 2 bytes
        bltui   a4, 6, .Lbytecopy       # do short copies byte by byte
-       EX(l8ui, a6, a3, 0, l_fixup)
-       EX(l8ui, a7, a3, 1, l_fixup)
+       EX(l8ui, a6, a3, 0, fixup)
+       EX(l8ui, a7, a3, 1, fixup)
        addi    a3, a3,  2
-       EX(s8i, a6, a5,  0, s_fixup)
-       EX(s8i, a7, a5,  1, s_fixup)
+       EX(s8i, a6, a5,  0, fixup)
+       EX(s8i, a7, a5,  1, fixup)
        addi    a5, a5,  2
        addi    a4, a4, -2
        j       .Ldstaligned    # dst is now aligned, return to main algorithm
@@ -135,9 +135,9 @@ __xtensa_copy_user:
        add     a7, a3, a4      # a7 = end address for source
 #endif /* !XCHAL_HAVE_LOOPS */
 .Lnextbyte:
-       EX(l8ui, a6, a3, 0, l_fixup)
+       EX(l8ui, a6, a3, 0, fixup)
        addi    a3, a3, 1
-       EX(s8i, a6, a5, 0, s_fixup)
+       EX(s8i, a6, a5, 0, fixup)
        addi    a5, a5, 1
 #if !XCHAL_HAVE_LOOPS
        blt     a3, a7, .Lnextbyte
@@ -161,15 +161,15 @@ __xtensa_copy_user:
        add     a8, a8, a3      # a8 = end of last 16B source chunk
 #endif /* !XCHAL_HAVE_LOOPS */
 .Loop1:
-       EX(l32i, a6, a3,  0, l_fixup)
-       EX(l32i, a7, a3,  4, l_fixup)
-       EX(s32i, a6, a5,  0, s_fixup)
-       EX(l32i, a6, a3,  8, l_fixup)
-       EX(s32i, a7, a5,  4, s_fixup)
-       EX(l32i, a7, a3, 12, l_fixup)
-       EX(s32i, a6, a5,  8, s_fixup)
+       EX(l32i, a6, a3,  0, fixup)
+       EX(l32i, a7, a3,  4, fixup)
+       EX(s32i, a6, a5,  0, fixup)
+       EX(l32i, a6, a3,  8, fixup)
+       EX(s32i, a7, a5,  4, fixup)
+       EX(l32i, a7, a3, 12, fixup)
+       EX(s32i, a6, a5,  8, fixup)
        addi    a3, a3, 16
-       EX(s32i, a7, a5, 12, s_fixup)
+       EX(s32i, a7, a5, 12, fixup)
        addi    a5, a5, 16
 #if !XCHAL_HAVE_LOOPS
        blt     a3, a8, .Loop1
@@ -177,31 +177,31 @@ __xtensa_copy_user:
 .Loop1done:
        bbci.l  a4, 3, .L2
        # copy 8 bytes
-       EX(l32i, a6, a3,  0, l_fixup)
-       EX(l32i, a7, a3,  4, l_fixup)
+       EX(l32i, a6, a3,  0, fixup)
+       EX(l32i, a7, a3,  4, fixup)
        addi    a3, a3,  8
-       EX(s32i, a6, a5,  0, s_fixup)
-       EX(s32i, a7, a5,  4, s_fixup)
+       EX(s32i, a6, a5,  0, fixup)
+       EX(s32i, a7, a5,  4, fixup)
        addi    a5, a5,  8
 .L2:
        bbci.l  a4, 2, .L3
        # copy 4 bytes
-       EX(l32i, a6, a3,  0, l_fixup)
+       EX(l32i, a6, a3,  0, fixup)
        addi    a3, a3,  4
-       EX(s32i, a6, a5,  0, s_fixup)
+       EX(s32i, a6, a5,  0, fixup)
        addi    a5, a5,  4
 .L3:
        bbci.l  a4, 1, .L4
        # copy 2 bytes
-       EX(l16ui, a6, a3,  0, l_fixup)
+       EX(l16ui, a6, a3,  0, fixup)
        addi    a3, a3,  2
-       EX(s16i,  a6, a5,  0, s_fixup)
+       EX(s16i,  a6, a5,  0, fixup)
        addi    a5, a5,  2
 .L4:
        bbci.l  a4, 0, .L5
        # copy 1 byte
-       EX(l8ui, a6, a3,  0, l_fixup)
-       EX(s8i,  a6, a5,  0, s_fixup)
+       EX(l8ui, a6, a3,  0, fixup)
+       EX(s8i,  a6, a5,  0, fixup)
 .L5:
        movi    a2, 0           # return success for len bytes copied
        retw
@@ -217,7 +217,7 @@ __xtensa_copy_user:
        # copy 16 bytes per iteration for word-aligned dst and unaligned src
        and     a10, a3, a8     # save unalignment offset for below
        sub     a3, a3, a10     # align a3 (to avoid sim warnings only; not needed for hardware)
-       EX(l32i, a6, a3, 0, l_fixup)    # load first word
+       EX(l32i, a6, a3, 0, fixup)      # load first word
 #if XCHAL_HAVE_LOOPS
        loopnez a7, .Loop2done
 #else /* !XCHAL_HAVE_LOOPS */
@@ -226,19 +226,19 @@ __xtensa_copy_user:
        add     a12, a12, a3    # a12 = end of last 16B source chunk
 #endif /* !XCHAL_HAVE_LOOPS */
 .Loop2:
-       EX(l32i, a7, a3,  4, l_fixup)
-       EX(l32i, a8, a3,  8, l_fixup)
+       EX(l32i, a7, a3,  4, fixup)
+       EX(l32i, a8, a3,  8, fixup)
        ALIGN(  a6, a6, a7)
-       EX(s32i, a6, a5,  0, s_fixup)
-       EX(l32i, a9, a3, 12, l_fixup)
+       EX(s32i, a6, a5,  0, fixup)
+       EX(l32i, a9, a3, 12, fixup)
        ALIGN(  a7, a7, a8)
-       EX(s32i, a7, a5,  4, s_fixup)
-       EX(l32i, a6, a3, 16, l_fixup)
+       EX(s32i, a7, a5,  4, fixup)
+       EX(l32i, a6, a3, 16, fixup)
        ALIGN(  a8, a8, a9)
-       EX(s32i, a8, a5,  8, s_fixup)
+       EX(s32i, a8, a5,  8, fixup)
        addi    a3, a3, 16
        ALIGN(  a9, a9, a6)
-       EX(s32i, a9, a5, 12, s_fixup)
+       EX(s32i, a9, a5, 12, fixup)
        addi    a5, a5, 16
 #if !XCHAL_HAVE_LOOPS
        blt     a3, a12, .Loop2
@@ -246,39 +246,39 @@ __xtensa_copy_user:
 .Loop2done:
        bbci.l  a4, 3, .L12
        # copy 8 bytes
-       EX(l32i, a7, a3,  4, l_fixup)
-       EX(l32i, a8, a3,  8, l_fixup)
+       EX(l32i, a7, a3,  4, fixup)
+       EX(l32i, a8, a3,  8, fixup)
        ALIGN(  a6, a6, a7)
-       EX(s32i, a6, a5,  0, s_fixup)
+       EX(s32i, a6, a5,  0, fixup)
        addi    a3, a3,  8
        ALIGN(  a7, a7, a8)
-       EX(s32i, a7, a5,  4, s_fixup)
+       EX(s32i, a7, a5,  4, fixup)
        addi    a5, a5,  8
        mov     a6, a8
 .L12:
        bbci.l  a4, 2, .L13
        # copy 4 bytes
-       EX(l32i, a7, a3,  4, l_fixup)
+       EX(l32i, a7, a3,  4, fixup)
        addi    a3, a3,  4
        ALIGN(  a6, a6, a7)
-       EX(s32i, a6, a5,  0, s_fixup)
+       EX(s32i, a6, a5,  0, fixup)
        addi    a5, a5,  4
        mov     a6, a7
 .L13:
        add     a3, a3, a10     # readjust a3 with correct misalignment
        bbci.l  a4, 1, .L14
        # copy 2 bytes
-       EX(l8ui, a6, a3,  0, l_fixup)
-       EX(l8ui, a7, a3,  1, l_fixup)
+       EX(l8ui, a6, a3,  0, fixup)
+       EX(l8ui, a7, a3,  1, fixup)
        addi    a3, a3,  2
-       EX(s8i, a6, a5,  0, s_fixup)
-       EX(s8i, a7, a5,  1, s_fixup)
+       EX(s8i, a6, a5,  0, fixup)
+       EX(s8i, a7, a5,  1, fixup)
        addi    a5, a5,  2
 .L14:
        bbci.l  a4, 0, .L15
        # copy 1 byte
-       EX(l8ui, a6, a3,  0, l_fixup)
-       EX(s8i,  a6, a5,  0, s_fixup)
+       EX(l8ui, a6, a3,  0, fixup)
+       EX(s8i,  a6, a5,  0, fixup)
 .L15:
        movi    a2, 0           # return success for len bytes copied
        retw
@@ -291,30 +291,10 @@ __xtensa_copy_user:
  * bytes_copied = a5 - a2
  * retval = bytes_not_copied = original len - bytes_copied
  * retval = a11 - (a5 - a2)
- *
- * Clearing the remaining pieces of kernel memory plugs security
- * holes.  This functionality is the equivalent of the *_zeroing
- * functions that some architectures provide.
  */
 
-.Lmemset:
-       .word   memset
 
-s_fixup:
+fixup:
        sub     a2, a5, a2      /* a2 <-- bytes copied */
        sub     a2, a11, a2     /* a2 <-- bytes not copied */
        retw
-
-l_fixup:
-       sub     a2, a5, a2      /* a2 <-- bytes copied */
-       sub     a2, a11, a2     /* a2 <-- bytes not copied == return value */
-
-       /* void *memset(void *s, int c, size_t n); */
-       mov     a6, a5          /* s */
-       movi    a7, 0           /* c */
-       mov     a8, a2          /* n */
-       l32r    a4, .Lmemset
-       callx4  a4
-       /* Ignore memset return value in a6. */
-       /* a2 still contains bytes not copied. */
-       retw
index e9f780f..89cd28f 100644 (file)
@@ -115,6 +115,18 @@ config BLK_DEV_THROTTLING
 
        See Documentation/cgroups/blkio-controller.txt for more information.
 
+config BLK_DEV_THROTTLING_LOW
+       bool "Block throttling .low limit interface support (EXPERIMENTAL)"
+       depends on BLK_DEV_THROTTLING
+       default n
+       ---help---
+       Add .low limit interface for block throttling. The low limit is a best
+       effort limit to prioritize cgroups. Depending on the setting, the limit
+       can be used to protect cgroups in terms of bandwidth/iops and better
+       utilize disk resource.
+
+       Note, this is an experimental interface and could be changed someday.
+
 config BLK_CMDLINE_PARSER
        bool "Block device command line partition parser"
        default n
index 58fc868..fd2cefa 100644 (file)
@@ -40,6 +40,7 @@ config CFQ_GROUP_IOSCHED
          Enable group IO scheduling in CFQ.
 
 choice
+
        prompt "Default I/O scheduler"
        default DEFAULT_CFQ
        help
@@ -69,6 +70,35 @@ config MQ_IOSCHED_DEADLINE
        ---help---
          MQ version of the deadline IO scheduler.
 
+config MQ_IOSCHED_KYBER
+       tristate "Kyber I/O scheduler"
+       default y
+       ---help---
+         The Kyber I/O scheduler is a low-overhead scheduler suitable for
+         multiqueue and other fast devices. Given target latencies for reads and
+         synchronous writes, it will self-tune queue depths to achieve that
+         goal.
+
+config IOSCHED_BFQ
+       tristate "BFQ I/O scheduler"
+       default n
+       ---help---
+       BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of
+       the device among all processes according to their weights,
+       regardless of the device parameters and with any workload. It
+       also guarantees a low latency to interactive and soft
+       real-time applications.  Details in
+       Documentation/block/bfq-iosched.txt
+
+config BFQ_GROUP_IOSCHED
+       bool "BFQ hierarchical scheduling support"
+       depends on IOSCHED_BFQ && BLK_CGROUP
+       default n
+       ---help---
+
+       Enable hierarchical scheduling in BFQ, using the blkio
+       (cgroups-v1) or io (cgroups-v2) controller.
+
 endmenu
 
 endif
index 081bb68..2b281cf 100644 (file)
@@ -20,6 +20,9 @@ obj-$(CONFIG_IOSCHED_NOOP)    += noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)      += cfq-iosched.o
 obj-$(CONFIG_MQ_IOSCHED_DEADLINE)      += mq-deadline.o
+obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
+bfq-y                          := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
+obj-$(CONFIG_IOSCHED_BFQ)      += bfq.o
 
 obj-$(CONFIG_BLOCK_COMPAT)     += compat_ioctl.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)       += cmdline-parser.o
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
new file mode 100644 (file)
index 0000000..c8a32fb
--- /dev/null
@@ -0,0 +1,1139 @@
+/*
+ * cgroups support for the BFQ I/O scheduler.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation; either version 2 of the
+ *  License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/cgroup.h>
+#include <linux/elevator.h>
+#include <linux/ktime.h>
+#include <linux/rbtree.h>
+#include <linux/ioprio.h>
+#include <linux/sbitmap.h>
+#include <linux/delay.h>
+
+#include "bfq-iosched.h"
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+/*
+ * bfqg stats flags.
+ *
+ * Each enumerator is a bit position inside bfqg_stats.flags; the helpers
+ * generated by BFQG_FLAG_FNS() shift 1 by this value to set, clear and
+ * test the corresponding bit.
+ */
+enum bfqg_stats_flags {
+       BFQG_stats_waiting = 0, /* a group-wait interval is being timed */
+       BFQG_stats_idling,      /* an idle interval is being timed */
+       BFQG_stats_empty,       /* an empty interval is being timed */
+};
+
+/*
+ * BFQG_FLAG_FNS(name) - generate the three accessors for one stats flag.
+ *
+ * For a flag BFQG_stats_<name> this expands to:
+ *   bfqg_stats_mark_<name>()  - set the flag's bit in stats->flags
+ *   bfqg_stats_clear_<name>() - clear the flag's bit in stats->flags
+ *   bfqg_stats_<name>()       - return 1 if the bit is set, 0 otherwise
+ *
+ * The macro is instantiated once per flag below and then undefined, so
+ * it is not available to the rest of the file.
+ */
+#define BFQG_FLAG_FNS(name)                                            \
+static void bfqg_stats_mark_##name(struct bfqg_stats *stats)   \
+{                                                                      \
+       stats->flags |= (1 << BFQG_stats_##name);                       \
+}                                                                      \
+static void bfqg_stats_clear_##name(struct bfqg_stats *stats)  \
+{                                                                      \
+       stats->flags &= ~(1 << BFQG_stats_##name);                      \
+}                                                                      \
+static int bfqg_stats_##name(struct bfqg_stats *stats)         \
+{                                                                      \
+       return (stats->flags & (1 << BFQG_stats_##name)) != 0;          \
+}                                                                      \
+
+BFQG_FLAG_FNS(waiting)
+BFQG_FLAG_FNS(idling)
+BFQG_FLAG_FNS(empty)
+#undef BFQG_FLAG_FNS
+
+/*
+ * Close a pending group-wait interval, if any.
+ *
+ * When @stats is flagged as waiting, the time elapsed since
+ * start_group_wait_time is added to group_wait_time (only if the current
+ * clock reading is after the recorded start) and the flag is cleared.
+ *
+ * This should be called with the queue_lock held.
+ */
+static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
+{
+       unsigned long long cur_time;
+
+       if (bfqg_stats_waiting(stats)) {
+               cur_time = sched_clock();
+               if (time_after64(cur_time, stats->start_group_wait_time))
+                       blkg_stat_add(&stats->group_wait_time,
+                                     cur_time - stats->start_group_wait_time);
+               bfqg_stats_clear_waiting(stats);
+       }
+}
+
+/*
+ * Start timing a group-wait interval for @bfqg.
+ *
+ * Nothing is done if @bfqg is already flagged as waiting or if it is the
+ * same group as @curr_bfqg; otherwise the current clock is recorded and
+ * the waiting flag is set.
+ *
+ * This should be called with the queue_lock held.
+ */
+static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
+                                                struct bfq_group *curr_bfqg)
+{
+       struct bfqg_stats *st = &bfqg->stats;
+
+       if (bfqg_stats_waiting(st) || bfqg == curr_bfqg)
+               return;
+
+       st->start_group_wait_time = sched_clock();
+       bfqg_stats_mark_waiting(st);
+}
+
+/*
+ * Close a pending empty interval, if any.
+ *
+ * When @stats is flagged as empty, the time elapsed since
+ * start_empty_time is added to empty_time (only if the current clock
+ * reading is after the recorded start) and the flag is cleared.
+ *
+ * This should be called with the queue_lock held.
+ */
+static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
+{
+       unsigned long long cur_time;
+
+       if (bfqg_stats_empty(stats)) {
+               cur_time = sched_clock();
+               if (time_after64(cur_time, stats->start_empty_time))
+                       blkg_stat_add(&stats->empty_time,
+                                     cur_time - stats->start_empty_time);
+               bfqg_stats_clear_empty(stats);
+       }
+}
+
+/* Add one to the dequeue counter of @bfqg. */
+void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
+{
+       struct bfqg_stats *st = &bfqg->stats;
+
+       blkg_stat_add(&st->dequeue, 1);
+}
+
+/*
+ * Start timing an empty interval for @bfqg, unless its queued counter
+ * total is non-zero or an empty interval is already being timed.
+ */
+void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
+{
+       struct bfqg_stats *st = &bfqg->stats;
+
+       /*
+        * The second check covers a group that is already marked empty.
+        * This can happen if bfqq got new request in parent group and
+        * moved to this group while being added to service tree. Just
+        * ignore the event and move on.
+        */
+       if (!blkg_rwstat_total(&st->queued) && !bfqg_stats_empty(st)) {
+               st->start_empty_time = sched_clock();
+               bfqg_stats_mark_empty(st);
+       }
+}
+
+/*
+ * Close a pending idle interval for @bfqg, if any: account the time
+ * elapsed since start_idle_time into idle_time (only if the current
+ * clock reading is after the recorded start) and clear the idling flag.
+ */
+void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
+{
+       struct bfqg_stats *st = &bfqg->stats;
+       unsigned long long cur_time;
+
+       if (!bfqg_stats_idling(st))
+               return;
+
+       cur_time = sched_clock();
+       if (time_after64(cur_time, st->start_idle_time))
+               blkg_stat_add(&st->idle_time,
+                             cur_time - st->start_idle_time);
+       bfqg_stats_clear_idling(st);
+}
+
+/* Record the current clock as the idle start of @bfqg and flag it idling. */
+void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
+{
+       bfqg->stats.start_idle_time = sched_clock();
+       bfqg_stats_mark_idling(&bfqg->stats);
+}
+
+/*
+ * Take one average-queue-size sample for @bfqg: add the current queued
+ * total to avg_queue_size_sum, count the sample, and then close any
+ * pending group-wait interval.
+ */
+void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
+{
+       blkg_stat_add(&bfqg->stats.avg_queue_size_sum,
+                     blkg_rwstat_total(&bfqg->stats.queued));
+       blkg_stat_add(&bfqg->stats.avg_queue_size_samples, 1);
+
+       bfqg_stats_update_group_wait_time(&bfqg->stats);
+}
+
+/*
+ * blk-cgroup policy-related handlers
+ * The following functions help in converting between blk-cgroup
+ * internal structures and BFQ-specific structures.
+ */
+
+/*
+ * Map a blkg policy data pointer to the bfq_group that embeds it.
+ * Returns NULL when @pd is NULL.
+ */
+static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
+{
+       if (!pd)
+               return NULL;
+
+       return container_of(pd, struct bfq_group, pd);
+}
+
+/* Return the blkcg_gq obtained from the policy data embedded in @bfqg. */
+struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
+{
+       struct blkg_policy_data *policy_data = &bfqg->pd;
+
+       return pd_to_blkg(policy_data);
+}
+
+/*
+ * Return the bfq_group attached to @blkg for the BFQ policy, or NULL if
+ * blkg_to_pd() yields no policy data.
+ */
+static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
+{
+       struct blkg_policy_data *policy_data;
+
+       policy_data = blkg_to_pd(blkg, &blkcg_policy_bfq);
+
+       return pd_to_bfqg(policy_data);
+}
+
+/*
+ * bfq_group handlers
+ * The following functions help in navigating the bfq_group hierarchy
+ * by allowing to find the parent of a bfq_group or the bfq_group
+ * associated to a bfq_queue.
+ */
+
+/*
+ * Return the bfq_group of the parent blkg of @bfqg, or NULL when the
+ * underlying blkg has no parent.
+ */
+static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
+{
+       struct blkcg_gq *parent_blkg = bfqg_to_blkg(bfqg)->parent;
+
+       if (!parent_blkg)
+               return NULL;
+
+       return blkg_to_bfqg(parent_blkg);
+}
+
+/*
+ * Return the bfq_group that @bfqq belongs to: the group embedding the
+ * queue's parent entity, or the root group of the queue's bfq_data when
+ * the queue entity has no parent.
+ */
+struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
+{
+       struct bfq_entity *parent_entity = bfqq->entity.parent;
+
+       if (!parent_entity)
+               return bfqq->bfqd->root_group;
+
+       return container_of(parent_entity, struct bfq_group, entity);
+}
+
+/*
+ * The following two functions handle get and put of a bfq_group by
+ * wrapping the related blk-cgroup hooks.
+ */
+
+/* Take a reference on @bfqg by calling blkg_get() on its blkg. */
+static void bfqg_get(struct bfq_group *bfqg)
+{
+       blkg_get(bfqg_to_blkg(bfqg));
+}
+
+/* Drop a reference on @bfqg by calling blkg_put() on its blkg. */
+void bfqg_put(struct bfq_group *bfqg)
+{
+       blkg_put(bfqg_to_blkg(bfqg));
+}
+
+/*
+ * Account a newly added request of type @op in @bfqg's stats: bump the
+ * queued counter, end any empty-time interval and, unless @bfqq is the
+ * queue currently in service, start a group-wait-time interval.
+ */
+void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
+                             unsigned int op)
+{
+       struct bfq_data *bfqd = (struct bfq_data *)bfqg->bfqd;
+
+       blkg_rwstat_add(&bfqg->stats.queued, op, 1);
+       bfqg_stats_end_empty_time(&bfqg->stats);
+
+       if (bfqq != bfqd->in_service_queue)
+               bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
+}
+
+/* Subtract one request of type @op from @bfqg's queued counter. */
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
+{
+       struct bfqg_stats *st = &bfqg->stats;
+
+       blkg_rwstat_add(&st->queued, op, -1);
+}
+
+/* Add one request of type @op to @bfqg's merged counter. */
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
+{
+       struct bfqg_stats *st = &bfqg->stats;
+
+       blkg_rwstat_add(&st->merged, op, 1);
+}
+
+/*
+ * Account the completion of a request of type @op in @bfqg's stats.
+ * Service time is measured from @io_start_time to now, wait time from
+ * @start_time to @io_start_time; an interval is recorded only if its
+ * end lies after its start.
+ */
+void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
+                                 uint64_t io_start_time, unsigned int op)
+{
+       struct bfqg_stats *st = &bfqg->stats;
+       unsigned long long now = sched_clock();
+
+       if (time_after64(now, io_start_time))
+               blkg_rwstat_add(&st->service_time, op, now - io_start_time);
+
+       if (time_after64(io_start_time, start_time))
+               blkg_rwstat_add(&st->wait_time, op, io_start_time - start_time);
+}
+
+/*
+ * @stats = 0
+ *
+ * Reset the merged, service_time, wait_time, time, avg_queue_size_sum,
+ * avg_queue_size_samples, dequeue, group_wait_time, idle_time and
+ * empty_time counters of @stats.  The queued counter is left untouched.
+ */
+static void bfqg_stats_reset(struct bfqg_stats *stats)
+{
+       /* queued stats shouldn't be cleared */
+       blkg_rwstat_reset(&stats->merged);
+       blkg_rwstat_reset(&stats->service_time);
+       blkg_rwstat_reset(&stats->wait_time);
+       blkg_stat_reset(&stats->time);
+       blkg_stat_reset(&stats->avg_queue_size_sum);
+       blkg_stat_reset(&stats->avg_queue_size_samples);
+       blkg_stat_reset(&stats->dequeue);
+       blkg_stat_reset(&stats->group_wait_time);
+       blkg_stat_reset(&stats->idle_time);
+       blkg_stat_reset(&stats->empty_time);
+}
+
+/*
+ * @to += @from
+ *
+ * Fold each counter of @from into the aux count of the matching counter
+ * of @to.  The queued counter is not transferred.  Does nothing if
+ * either pointer is NULL.
+ */
+static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
+{
+       if (!to || !from)
+               return;
+
+       /* queued stats are not transferred */
+       blkg_rwstat_add_aux(&to->merged, &from->merged);
+       blkg_rwstat_add_aux(&to->service_time, &from->service_time);
+       blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
+       /* destination must be @to: adding @from to itself would lose time */
+       blkg_stat_add_aux(&to->time, &from->time);
+       blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+       blkg_stat_add_aux(&to->avg_queue_size_samples,
+                         &from->avg_queue_size_samples);
+       blkg_stat_add_aux(&to->dequeue, &from->dequeue);
+       blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+       blkg_stat_add_aux(&to->idle_time, &from->idle_time);
+       blkg_stat_add_aux(&to->empty_time, &from->empty_time);
+}
+
+/*
+ * Hand the stats of a dying @bfqg over to its parent's aux counts, so
+ * that the recursive stats of its ancestors keep accounting for what
+ * this group used after it is gone.  @bfqg's own counters are reset
+ * afterwards.  Nothing is done for a NULL @bfqg or a parentless group.
+ */
+static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
+{
+       struct bfq_group *pbfqg;
+
+       if (!bfqg) /* root_group */
+               return;
+
+       pbfqg = bfqg_parent(bfqg);
+
+       lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
+
+       if (likely(pbfqg)) {
+               bfqg_stats_add_aux(&pbfqg->stats, &bfqg->stats);
+               bfqg_stats_reset(&bfqg->stats);
+       }
+}
+
+/*
+ * Initialize @entity as a child of @bfqg: copy new_weight into weight
+ * and orig_weight, and point the entity at the group's entity and
+ * sched_data.  If @entity belongs to a bfq_queue, also latch the
+ * queue's new ioprio and ioprio class and take a reference on @bfqg.
+ */
+void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
+{
+       struct bfq_queue *queue = bfq_entity_to_bfqq(entity);
+
+       entity->orig_weight = entity->new_weight;
+       entity->weight = entity->new_weight;
+
+       if (queue) {
+               queue->ioprio_class = queue->new_ioprio_class;
+               queue->ioprio = queue->new_ioprio;
+               bfqg_get(bfqg);
+       }
+
+       entity->sched_data = &bfqg->sched_data;
+       entity->parent = bfqg->my_entity; /* NULL for root group */
+}
+
+/*
+ * Run the blk-cgroup exit helper on every counter of @stats, including
+ * queued.  Used by bfqg_stats_init() when initialization fails and by
+ * bfq_pd_free().
+ */
+static void bfqg_stats_exit(struct bfqg_stats *stats)
+{
+       blkg_rwstat_exit(&stats->merged);
+       blkg_rwstat_exit(&stats->service_time);
+       blkg_rwstat_exit(&stats->wait_time);
+       blkg_rwstat_exit(&stats->queued);
+       blkg_stat_exit(&stats->time);
+       blkg_stat_exit(&stats->avg_queue_size_sum);
+       blkg_stat_exit(&stats->avg_queue_size_samples);
+       blkg_stat_exit(&stats->dequeue);
+       blkg_stat_exit(&stats->group_wait_time);
+       blkg_stat_exit(&stats->idle_time);
+       blkg_stat_exit(&stats->empty_time);
+}
+
+/*
+ * Initialize every counter of @stats with allocation flags @gfp.
+ * Initialization stops at the first counter that fails; in that case
+ * bfqg_stats_exit() is run on @stats and -ENOMEM is returned.
+ * Returns 0 on success.
+ */
+static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
+{
+       bool failed;
+
+       failed = blkg_rwstat_init(&stats->merged, gfp) ||
+                blkg_rwstat_init(&stats->service_time, gfp) ||
+                blkg_rwstat_init(&stats->wait_time, gfp) ||
+                blkg_rwstat_init(&stats->queued, gfp) ||
+                blkg_stat_init(&stats->time, gfp) ||
+                blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
+                blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
+                blkg_stat_init(&stats->dequeue, gfp) ||
+                blkg_stat_init(&stats->group_wait_time, gfp) ||
+                blkg_stat_init(&stats->idle_time, gfp) ||
+                blkg_stat_init(&stats->empty_time, gfp);
+       if (!failed)
+               return 0;
+
+       bfqg_stats_exit(stats);
+       return -ENOMEM;
+}
+
+/* Map blkcg policy data to the bfq_group_data embedding it; NULL-safe. */
+static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
+{
+       if (!cpd)
+               return NULL;
+
+       return container_of(cpd, struct bfq_group_data, pd);
+}
+
+/* Return the BFQ per-blkcg data of @blkcg (NULL if it has none). */
+static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
+{
+       struct blkcg_policy_data *cpd = blkcg_to_cpd(blkcg, &blkcg_policy_bfq);
+
+       return cpd_to_bfqgd(cpd);
+}
+
+/*
+ * cpd_alloc_fn hook: allocate a zeroed bfq_group_data with flags @gfp
+ * and return its embedded policy data, or NULL if allocation fails.
+ */
+struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
+{
+       struct bfq_group_data *bgd = kzalloc(sizeof(*bgd), gfp);
+
+       return bgd ? &bgd->pd : NULL;
+}
+
+/*
+ * cpd_init_fn / cpd_bind_fn hook: set the default weight of the BFQ
+ * per-blkcg data, chosen by whether the io controller is on the
+ * default hierarchy.
+ */
+void bfq_cpd_init(struct blkcg_policy_data *cpd)
+{
+       struct bfq_group_data *bgd = cpd_to_bfqgd(cpd);
+
+       if (cgroup_subsys_on_dfl(io_cgrp_subsys))
+               bgd->weight = CGROUP_WEIGHT_DFL;
+       else
+               bgd->weight = BFQ_WEIGHT_LEGACY_DFL;
+}
+
+/* cpd_free_fn hook: free the bfq_group_data that embeds @cpd. */
+void bfq_cpd_free(struct blkcg_policy_data *cpd)
+{
+       struct bfq_group_data *bgd = cpd_to_bfqgd(cpd);
+
+       kfree(bgd);
+}
+
+/*
+ * pd_alloc_fn hook: allocate a zeroed bfq_group on @node and initialize
+ * its stats.  Returns the embedded policy data, or NULL if either the
+ * allocation or the stats initialization fails (the group is freed in
+ * the latter case).
+ */
+struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
+{
+       struct bfq_group *grp = kzalloc_node(sizeof(*grp), gfp, node);
+
+       if (!grp)
+               return NULL;
+
+       if (bfqg_stats_init(&grp->stats, gfp) != 0) {
+               kfree(grp);
+               return NULL;
+       }
+
+       return &grp->pd;
+}
+
+/*
+ * pd_init_fn hook: initialize the bfq_group behind @pd.  Its entity
+ * gets the weight of the owning blkcg, and the group is bound to the
+ * elevator data of the blkg's request queue.
+ */
+void bfq_pd_init(struct blkg_policy_data *pd)
+{
+       struct blkcg_gq *blkg = pd_to_blkg(pd);
+       struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+       struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
+       struct bfq_group_data *bgd = blkcg_to_bfqgd(blkg->blkcg);
+       struct bfq_entity *ent = &bfqg->entity;
+
+       /* seed new_weight, weight and orig_weight from the blkcg weight */
+       ent->new_weight = bgd->weight;
+       ent->weight = ent->new_weight;
+       ent->orig_weight = ent->weight;
+
+       ent->my_sched_data = &bfqg->sched_data;
+       /*
+        * the root_group's will be set to NULL
+        * in bfq_init_queue()
+        */
+       bfqg->my_entity = ent;
+       bfqg->bfqd = bfqd;
+       bfqg->active_entities = 0;
+       bfqg->rq_pos_tree = RB_ROOT;
+}
+
+/*
+ * pd_free_fn hook: run bfqg_stats_exit() on the stats of the bfq_group
+ * that embeds @pd, then free the group.
+ */
+void bfq_pd_free(struct blkg_policy_data *pd)
+{
+       struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+       bfqg_stats_exit(&bfqg->stats);
+       kfree(bfqg);
+}
+
+/* pd_reset_stats_fn hook: reset the stats of the group embedding @pd. */
+void bfq_pd_reset_stats(struct blkg_policy_data *pd)
+{
+       bfqg_stats_reset(&pd_to_bfqg(pd)->stats);
+}
+
+/*
+ * Hook @bfqg's entity under @parent: its parent entity becomes
+ * @parent's my_entity and its sched_data becomes @parent's sched_data.
+ */
+static void bfq_group_set_parent(struct bfq_group *bfqg,
+                                       struct bfq_group *parent)
+{
+       struct bfq_entity *child = &bfqg->entity;
+
+       child->sched_data = &parent->sched_data;
+       child->parent = parent->my_entity;
+}
+
+/*
+ * Return the bfq_group of @blkcg on @bfqd's request queue, or NULL if
+ * blkg_lookup() finds no blkg for that pair.
+ */
+static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
+                                        struct blkcg *blkcg)
+{
+       struct blkcg_gq *blkg = blkg_lookup(blkcg, bfqd->queue);
+
+       if (unlikely(!blkg))
+               return NULL;
+
+       return blkg_to_bfqg(blkg);
+}
+
+/*
+ * Look up the bfq_group of @blkcg on @bfqd's queue and hook it, and the
+ * groups reached from it by for_each_entity(), into BFQ's private
+ * hierarchy.
+ *
+ * Returns the group looked up for @blkcg, or NULL if the lookup fails.
+ */
+struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
+                                    struct blkcg *blkcg)
+{
+       struct bfq_group *bfqg, *parent;
+       struct bfq_entity *entity;
+
+       bfqg = bfq_lookup_bfqg(bfqd, blkcg);
+
+       if (unlikely(!bfqg))
+               return NULL;
+
+       /*
+        * Update chain of bfq_groups as we might be handling a leaf group
+        * which, along with some of its relatives, has not been hooked yet
+        * to the private hierarchy of BFQ.
+        *
+        * The walk uses its own cursor, so that bfqg still designates
+        * the group found for @blkcg when it is returned below.
+        */
+       entity = &bfqg->entity;
+       for_each_entity(entity) {
+               struct bfq_group *curr_bfqg = container_of(entity,
+                                               struct bfq_group, entity);
+
+               if (curr_bfqg != bfqd->root_group) {
+                       parent = bfqg_parent(curr_bfqg);
+                       if (!parent)
+                               parent = bfqd->root_group;
+                       bfq_group_set_parent(curr_bfqg, parent);
+               }
+       }
+
+       return bfqg;
+}
+
+/**
+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
+ * @bfqd: queue descriptor.
+ * @bfqq: the queue to move.
+ * @bfqg: the group to move to.
+ *
+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
+ * it on the new one.  Avoid putting the entity on the old group idle tree.
+ *
+ * Must be called under the queue lock; the cgroup owning @bfqg must
+ * not disappear (by now this just means that we are called under
+ * rcu_read_lock()).
+ */
+void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+                  struct bfq_group *bfqg)
+{
+       struct bfq_entity *entity = &bfqq->entity;
+
+       /* If bfqq is empty, then bfq_bfqq_expire also invokes
+        * bfq_del_bfqq_busy, thereby removing bfqq and its entity
+        * from data structures related to current group. Otherwise we
+        * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
+        * we do below.
+        */
+       if (bfqq == bfqd->in_service_queue)
+               bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
+                               false, BFQQE_PREEMPTED);
+
+       if (bfq_bfqq_busy(bfqq))
+               bfq_deactivate_bfqq(bfqd, bfqq, false, false);
+       else if (entity->on_st)
+               bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
+       /* put the old group; it is looked up before entity is re-parented */
+       bfqg_put(bfqq_group(bfqq));
+
+       /*
+        * Here we use a reference to bfqg.  We don't need a refcounter
+        * as the cgroup reference will not be dropped, so that its
+        * destroy() callback will not be invoked.
+        *
+        * NOTE(review): a reference is nevertheless taken with
+        * bfqg_get() right below -- confirm whether the paragraph
+        * above is still accurate.
+        */
+       entity->parent = bfqg->my_entity;
+       entity->sched_data = &bfqg->sched_data;
+       bfqg_get(bfqg);
+
+       /* a queue that is still busy is re-activated under its new group */
+       if (bfq_bfqq_busy(bfqq)) {
+               bfq_pos_tree_add_move(bfqd, bfqq);
+               bfq_activate_bfqq(bfqd, bfqq);
+       }
+
+       /* no queue in service and no request in the driver: schedule a dispatch */
+       if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
+               bfq_schedule_dispatch(bfqd);
+}
+
+/**
+ * __bfq_bic_change_cgroup - move @bic to @blkcg.
+ * @bfqd: the queue descriptor.
+ * @bic: the bic to move.
+ * @blkcg: the blk-cgroup to move to.
+ *
+ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
+ * has to make sure that the reference to cgroup is valid across the call.
+ *
+ * NOTE: an alternative approach might have been to store the current
+ * cgroup in bfqq and getting a reference to it, reducing the lookup
+ * time here, at the price of slightly more complex code.
+ *
+ * Return: the group the bic was moved to; this is @bfqd's root group
+ * when bfq_find_set_group() finds no group for @blkcg.
+ */
+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
+                                               struct bfq_io_cq *bic,
+                                               struct blkcg *blkcg)
+{
+       struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
+       struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
+       struct bfq_group *bfqg;
+       struct bfq_entity *entity;
+
+       bfqg = bfq_find_set_group(bfqd, blkcg);
+
+       if (unlikely(!bfqg))
+               bfqg = bfqd->root_group;
+
+       /*
+        * An async queue scheduled in another group is not moved: it is
+        * detached from the bic and put.
+        */
+       if (async_bfqq) {
+               entity = &async_bfqq->entity;
+
+               if (entity->sched_data != &bfqg->sched_data) {
+                       bic_set_bfqq(bic, NULL, 0);
+                       bfq_log_bfqq(bfqd, async_bfqq,
+                                    "bic_change_group: %p %d",
+                                    async_bfqq, async_bfqq->ref);
+                       bfq_put_queue(async_bfqq);
+               }
+       }
+
+       /* a sync queue scheduled in another group is moved to bfqg */
+       if (sync_bfqq) {
+               entity = &sync_bfqq->entity;
+               if (entity->sched_data != &bfqg->sched_data)
+                       bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
+       }
+
+       return bfqg;
+}
+
+/*
+ * Make @bic follow the blkcg of @bio.  The serial number of the bio's
+ * blkcg is compared with the one cached in @bic; only when they differ
+ * (and @bic has a bfq_data) is the bic moved to the new blkcg and the
+ * cached serial number refreshed.  The whole check runs under
+ * rcu_read_lock().
+ */
+void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
+{
+       struct bfq_data *bfqd = bic_to_bfqd(bic);
+       uint64_t serial_nr;
+
+       rcu_read_lock();
+       serial_nr = bio_blkcg(bio)->css.serial_nr;
+
+       /*
+        * Check whether blkcg has changed.  The condition may trigger
+        * spuriously on a newly created bic but there's no harm.
+        */
+       if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
+               goto out;
+
+       /* the destination group returned by the helper is not needed here */
+       __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
+       bic->blkcg_serial_nr = serial_nr;
+out:
+       rcu_read_unlock();
+}
+
+/**
+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
+ * @st: the service tree being flushed.
+ *
+ * Keeps deactivating @st->first_idle until that field reads NULL.
+ */
+static void bfq_flush_idle_tree(struct bfq_service_tree *st)
+{
+       struct bfq_entity *idle;
+
+       while ((idle = st->first_idle) != NULL)
+               __bfq_deactivate_entity(idle, false);
+}
+
+/**
+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
+ * @bfqd: the device data structure with the root group.
+ * @entity: the entity to move.
+ *
+ * @entity is converted to its bfq_queue, which is then handed to
+ * bfq_bfqq_move() with @bfqd's root group as destination.
+ */
+static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
+                                    struct bfq_entity *entity)
+{
+       bfq_bfqq_move(bfqd, bfq_entity_to_bfqq(entity), bfqd->root_group);
+}
+
+/**
+ * bfq_reparent_active_entities - move to the root group all active
+ *                                entities.
+ * @bfqd: the device data structure with the root group.
+ * @bfqg: the group to move from.
+ * @st: the service tree with the entities.
+ *
+ * Needs queue_lock to be taken and reference to be valid over the call.
+ */
+static void bfq_reparent_active_entities(struct bfq_data *bfqd,
+                                        struct bfq_group *bfqg,
+                                        struct bfq_service_tree *st)
+{
+       struct rb_root *active = &st->active;
+       struct bfq_entity *entity = NULL;
+
+       if (!RB_EMPTY_ROOT(&st->active))
+               entity = bfq_entity_of(rb_first(active));
+
+       /*
+        * Always re-read the first node of the active tree after a move.
+        * The loop ends when bfq_entity_of() returns NULL, which
+        * presumably happens once the tree is empty -- this relies on
+        * each move removing the entity from this tree (TODO confirm).
+        */
+       for (; entity ; entity = bfq_entity_of(rb_first(active)))
+               bfq_reparent_leaf_entity(bfqd, entity);
+
+       /* the in-service entity, if any, is moved as well */
+       if (bfqg->sched_data.in_service_entity)
+               bfq_reparent_leaf_entity(bfqd,
+                       bfqg->sched_data.in_service_entity);
+}
+
+/**
+ * bfq_pd_offline - deactivate the entity associated with @pd,
+ *                 and reparent its children entities.
+ * @pd: descriptor of the policy going offline.
+ *
+ * blkio already grabs the queue_lock for us, so no need to use
+ * RCU-based magic
+ *
+ * NOTE(review): the body additionally takes bfqd->lock around the
+ * scheduler updates; confirm how this relates to the queue_lock
+ * statement above.
+ */
+void bfq_pd_offline(struct blkg_policy_data *pd)
+{
+       struct bfq_service_tree *st;
+       struct bfq_group *bfqg = pd_to_bfqg(pd);
+       struct bfq_data *bfqd = bfqg->bfqd;
+       struct bfq_entity *entity = bfqg->my_entity;
+       unsigned long flags;
+       int i;
+
+       if (!entity) /* root group */
+               return;
+
+       spin_lock_irqsave(&bfqd->lock, flags);
+       /*
+        * Empty all service_trees belonging to this group before
+        * deactivating the group itself.
+        */
+       for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
+               st = bfqg->sched_data.service_tree + i;
+
+               /*
+                * The idle tree may still contain bfq_queues belonging
+                * to exited tasks because they never migrated to a different
+                * cgroup from the one being destroyed now.  No one else
+                * can access them so it's safe to act without any lock.
+                */
+               bfq_flush_idle_tree(st);
+
+               /*
+                * It may happen that some queues are still active
+                * (busy) upon group destruction (if the corresponding
+                * processes have been forced to terminate). We move
+                * all the leaf entities corresponding to these queues
+                * to the root_group.
+                * Also, it may happen that the group has an entity
+                * in service, which is disconnected from the active
+                * tree: it must be moved, too.
+                * There is no need to put the sync queues, as the
+                * scheduler has taken no reference.
+                */
+               bfq_reparent_active_entities(bfqd, bfqg, st);
+       }
+
+       /* the group's own entity goes last, then its async queues are put */
+       __bfq_deactivate_entity(entity, false);
+       bfq_put_async_queues(bfqd, bfqg);
+
+       spin_unlock_irqrestore(&bfqd->lock, flags);
+       /*
+        * @blkg is going offline and will be ignored by
+        * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
+        * that they don't get lost.  If IOs complete after this point, the
+        * stats for them will be lost.  Oh well...
+        */
+       bfqg_stats_xfer_dead(bfqg);
+}
+
+/*
+ * Run bfq_end_wr_async_queues() on the group of every blkg listed on
+ * @bfqd's request queue, and finally on the root group.
+ */
+void bfq_end_wr_async(struct bfq_data *bfqd)
+{
+       struct blkcg_gq *blkg;
+
+       list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node)
+               bfq_end_wr_async_queues(bfqd, blkg_to_bfqg(blkg));
+
+       bfq_end_wr_async_queues(bfqd, bfqd->root_group);
+}
+
+/*
+ * seq_show handler of the "bfq.weight" files: print the weight stored
+ * in the BFQ data of the file's blkcg, or 0 if the blkcg has none.
+ */
+static int bfq_io_show_weight(struct seq_file *sf, void *v)
+{
+       struct bfq_group_data *bfqgd;
+
+       bfqgd = blkcg_to_bfqgd(css_to_blkcg(seq_css(sf)));
+       seq_printf(sf, "%u\n", bfqgd ? (unsigned int)bfqgd->weight : 0U);
+
+       return 0;
+}
+
+/*
+ * write_u64 handler of the legacy "bfq.weight" file, also used by
+ * bfq_io_set_weight().  Stores @val as the weight of the blkcg behind
+ * @css and propagates it, under blkcg->lock, to every bfq_group of
+ * that blkcg whose new_weight differs.
+ *
+ * Returns -ERANGE if @val lies outside [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT],
+ * 0 otherwise.  @cftype is not used.
+ */
+static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
+                                   struct cftype *cftype,
+                                   u64 val)
+{
+       struct blkcg *blkcg = css_to_blkcg(css);
+       struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+       struct blkcg_gq *blkg;
+       int ret = -ERANGE;
+
+       if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
+               return ret;
+
+       ret = 0;
+       spin_lock_irq(&blkcg->lock);
+       bfqgd->weight = (unsigned short)val;
+       hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+               struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+
+               /* skip blkgs that have no BFQ group */
+               if (!bfqg)
+                       continue;
+               /*
+                * Setting the prio_changed flag of the entity
+                * to 1 with new_weight == weight would re-set
+                * the value of the weight to its ioprio mapping.
+                * Set the flag only if necessary.
+                */
+               if ((unsigned short)val != bfqg->entity.new_weight) {
+                       bfqg->entity.new_weight = (unsigned short)val;
+                       /*
+                        * Make sure that the above new value has been
+                        * stored in bfqg->entity.new_weight before
+                        * setting the prio_changed flag. In fact,
+                        * this flag may be read asynchronously (in
+                        * critical sections protected by a different
+                        * lock than that held here), and finding this
+                        * flag set may cause the execution of the code
+                        * for updating parameters whose value may
+                        * depend also on bfqg->entity.new_weight (in
+                        * __bfq_entity_update_weight_prio).
+                        * This barrier makes sure that the new value
+                        * of bfqg->entity.new_weight is correctly
+                        * seen in that code.
+                        */
+                       smp_wmb();
+                       bfqg->entity.prio_changed = 1;
+               }
+       }
+       spin_unlock_irq(&blkcg->lock);
+
+       return ret;
+}
+
+/*
+ * write handler of the default-hierarchy "bfq.weight" file.
+ *
+ * The buffer, stripped of surrounding whitespace by strim(), is parsed
+ * as a single unsigned 64-bit integer and handed to
+ * bfq_io_set_weight_legacy().
+ *
+ * Returns @nbytes on success, so that the whole write is reported as
+ * consumed, or the negative error code of the parser or of
+ * bfq_io_set_weight_legacy() on failure.
+ */
+static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
+                                char *buf, size_t nbytes,
+                                loff_t off)
+{
+       u64 weight;
+       int ret = kstrtoull(strim(buf), 0, &weight);
+
+       if (ret)
+               return ret;
+
+       ret = bfq_io_set_weight_legacy(of_css(of), NULL, weight);
+       if (ret)
+               return ret;
+
+       /* returning 0 here would tell the writer that nothing was consumed */
+       return nbytes;
+}
+
+/*
+ * seq_show handler: print, for each blkg of the file's blkcg, the
+ * blkg_stat selected by the cftype's private value.  No total is shown.
+ */
+static int bfqg_print_stat(struct seq_file *sf, void *v)
+{
+       struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+
+       blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_bfq,
+                         seq_cft(sf)->private, false);
+       return 0;
+}
+
+/*
+ * seq_show handler: print, for each blkg of the file's blkcg, the
+ * blkg_rwstat selected by the cftype's private value, with a total.
+ */
+static int bfqg_print_rwstat(struct seq_file *sf, void *v)
+{
+       struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+
+       blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_bfq,
+                         seq_cft(sf)->private, true);
+       return 0;
+}
+
+/*
+ * prfill callback: print the recursive sum of the blkg_stat found at
+ * offset @off, computed over @pd's blkg for the BFQ policy.
+ */
+static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
+                                     struct blkg_policy_data *pd, int off)
+{
+       struct blkcg_gq *blkg = pd_to_blkg(pd);
+       u64 total;
+
+       total = blkg_stat_recursive_sum(blkg, &blkcg_policy_bfq, off);
+       return __blkg_prfill_u64(sf, pd, total);
+}
+
+/*
+ * prfill callback: print the recursive sum of the blkg_rwstat found at
+ * offset @off, computed over @pd's blkg for the BFQ policy.
+ */
+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
+                                       struct blkg_policy_data *pd, int off)
+{
+       struct blkcg_gq *blkg = pd_to_blkg(pd);
+       struct blkg_rwstat total;
+
+       total = blkg_rwstat_recursive_sum(blkg, &blkcg_policy_bfq, off);
+       return __blkg_prfill_rwstat(sf, pd, &total);
+}
+
+/*
+ * seq_show handler: like bfqg_print_stat(), but each blkg line carries
+ * the recursive sum produced by bfqg_prfill_stat_recursive().
+ */
+static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
+{
+       struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+
+       blkcg_print_blkgs(sf, blkcg, bfqg_prfill_stat_recursive,
+                         &blkcg_policy_bfq, seq_cft(sf)->private, false);
+       return 0;
+}
+
+/*
+ * seq_show handler: like bfqg_print_rwstat(), but each blkg line carries
+ * the recursive sum produced by bfqg_prfill_rwstat_recursive().
+ */
+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
+{
+       struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+
+       blkcg_print_blkgs(sf, blkcg, bfqg_prfill_rwstat_recursive,
+                         &blkcg_policy_bfq, seq_cft(sf)->private, true);
+       return 0;
+}
+
+/*
+ * prfill callback: print the total of the blkg's stat_bytes shifted
+ * right by 9, i.e. expressed in 512-byte units.  @off is not used.
+ */
+static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
+                              int off)
+{
+       u64 bytes = blkg_rwstat_total(&pd->blkg->stat_bytes);
+       u64 sectors = bytes >> 9;
+
+       return __blkg_prfill_u64(sf, pd, sectors);
+}
+
+/* seq_show handler of "bfq.sectors": one bfqg_prfill_sectors() line per blkg. */
+static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
+{
+       struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+
+       blkcg_print_blkgs(sf, blkcg, bfqg_prfill_sectors, &blkcg_policy_bfq,
+                         0, false);
+       return 0;
+}
+
+/*
+ * prfill callback: recursively sum the blkg's stat_bytes, add the read
+ * and write aux counts of the result, and print that sum shifted right
+ * by 9, i.e. expressed in 512-byte units.  @off is not used.
+ */
+static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
+                                        struct blkg_policy_data *pd, int off)
+{
+       struct blkg_rwstat rec;
+       u64 bytes;
+
+       rec = blkg_rwstat_recursive_sum(pd->blkg, NULL,
+                                       offsetof(struct blkcg_gq, stat_bytes));
+       bytes = atomic64_read(&rec.aux_cnt[BLKG_RWSTAT_READ]) +
+               atomic64_read(&rec.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+       return __blkg_prfill_u64(sf, pd, bytes >> 9);
+}
+
+/*
+ * seq_show handler of "bfq.sectors_recursive": one
+ * bfqg_prfill_sectors_recursive() line per blkg.
+ */
+static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
+{
+       struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+
+       blkcg_print_blkgs(sf, blkcg, bfqg_prfill_sectors_recursive,
+                         &blkcg_policy_bfq, 0, false);
+       return 0;
+}
+
+/*
+ * prfill callback: print the average queue size of @pd's group, i.e.
+ * avg_queue_size_sum divided by avg_queue_size_samples, or 0 when no
+ * sample has been taken.  Always returns 0; @off is not used.
+ */
+static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
+                                     struct blkg_policy_data *pd, int off)
+{
+       struct bfqg_stats *st = &pd_to_bfqg(pd)->stats;
+       u64 samples = blkg_stat_read(&st->avg_queue_size_samples);
+       u64 avg = 0;
+
+       if (samples)
+               avg = div64_u64(blkg_stat_read(&st->avg_queue_size_sum),
+                               samples);
+
+       __blkg_prfill_u64(sf, pd, avg);
+       return 0;
+}
+
+/*
+ * seq_show handler of "bfq.avg_queue_size": one
+ * bfqg_prfill_avg_queue_size() line per blkg.
+ */
+static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
+{
+       struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+
+       blkcg_print_blkgs(sf, blkcg, bfqg_prfill_avg_queue_size,
+                         &blkcg_policy_bfq, 0, false);
+       return 0;
+}
+
+/*
+ * Activate the BFQ blkcg policy on @bfqd's request queue and return the
+ * bfq_group of the queue's root blkg, or NULL if activation fails.
+ * @node is not used in this configuration.
+ */
+struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+{
+       if (blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq))
+               return NULL;
+
+       return blkg_to_bfqg(bfqd->queue->root_blkg);
+}
+
+/*
+ * blk-cgroup policy descriptor of BFQ: the cgroup files it exposes and
+ * the hooks for per-blkcg (cpd_*) and per-blkg (pd_*) policy data.
+ */
+struct blkcg_policy blkcg_policy_bfq = {
+       .dfl_cftypes            = bfq_blkg_files,
+       .legacy_cftypes         = bfq_blkcg_legacy_files,
+
+       .cpd_alloc_fn           = bfq_cpd_alloc,
+       .cpd_init_fn            = bfq_cpd_init,
+       .cpd_bind_fn            = bfq_cpd_init, /* same handler as cpd_init_fn */
+       .cpd_free_fn            = bfq_cpd_free,
+
+       .pd_alloc_fn            = bfq_pd_alloc,
+       .pd_init_fn             = bfq_pd_init,
+       .pd_offline_fn          = bfq_pd_offline,
+       .pd_free_fn             = bfq_pd_free,
+       .pd_reset_stats_fn      = bfq_pd_reset_stats,
+};
+
+/*
+ * Files registered as the policy's legacy_cftypes.  Where .private is an
+ * offsetof() into struct bfq_group, the show handler passes it on to
+ * blkcg_print_blkgs() to select the counter to print.
+ */
+struct cftype bfq_blkcg_legacy_files[] = {
+       {
+               .name = "bfq.weight",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = bfq_io_show_weight,
+               .write_u64 = bfq_io_set_weight_legacy,
+       },
+
+       /* statistics, covers only the tasks in the bfqg */
+       {
+               .name = "bfq.time",
+               .private = offsetof(struct bfq_group, stats.time),
+               .seq_show = bfqg_print_stat,
+       },
+       {
+               .name = "bfq.sectors",
+               .seq_show = bfqg_print_stat_sectors,
+       },
+       {
+               .name = "bfq.io_service_bytes",
+               .private = (unsigned long)&blkcg_policy_bfq,
+               .seq_show = blkg_print_stat_bytes,
+       },
+       {
+               .name = "bfq.io_serviced",
+               .private = (unsigned long)&blkcg_policy_bfq,
+               .seq_show = blkg_print_stat_ios,
+       },
+       {
+               .name = "bfq.io_service_time",
+               .private = offsetof(struct bfq_group, stats.service_time),
+               .seq_show = bfqg_print_rwstat,
+       },
+       {
+               .name = "bfq.io_wait_time",
+               .private = offsetof(struct bfq_group, stats.wait_time),
+               .seq_show = bfqg_print_rwstat,
+       },
+       {
+               .name = "bfq.io_merged",
+               .private = offsetof(struct bfq_group, stats.merged),
+               .seq_show = bfqg_print_rwstat,
+       },
+       {
+               .name = "bfq.io_queued",
+               .private = offsetof(struct bfq_group, stats.queued),
+               .seq_show = bfqg_print_rwstat,
+       },
+
+       /* the same statistics which cover the bfqg and its descendants */
+       {
+               .name = "bfq.time_recursive",
+               .private = offsetof(struct bfq_group, stats.time),
+               .seq_show = bfqg_print_stat_recursive,
+       },
+       {
+               .name = "bfq.sectors_recursive",
+               .seq_show = bfqg_print_stat_sectors_recursive,
+       },
+       {
+               .name = "bfq.io_service_bytes_recursive",
+               .private = (unsigned long)&blkcg_policy_bfq,
+               .seq_show = blkg_print_stat_bytes_recursive,
+       },
+       {
+               .name = "bfq.io_serviced_recursive",
+               .private = (unsigned long)&blkcg_policy_bfq,
+               .seq_show = blkg_print_stat_ios_recursive,
+       },
+       {
+               .name = "bfq.io_service_time_recursive",
+               .private = offsetof(struct bfq_group, stats.service_time),
+               .seq_show = bfqg_print_rwstat_recursive,
+       },
+       {
+               .name = "bfq.io_wait_time_recursive",
+               .private = offsetof(struct bfq_group, stats.wait_time),
+               .seq_show = bfqg_print_rwstat_recursive,
+       },
+       {
+               .name = "bfq.io_merged_recursive",
+               .private = offsetof(struct bfq_group, stats.merged),
+               .seq_show = bfqg_print_rwstat_recursive,
+       },
+       {
+               .name = "bfq.io_queued_recursive",
+               .private = offsetof(struct bfq_group, stats.queued),
+               .seq_show = bfqg_print_rwstat_recursive,
+       },
+       {
+               .name = "bfq.avg_queue_size",
+               .seq_show = bfqg_print_avg_queue_size,
+       },
+       {
+               .name = "bfq.group_wait_time",
+               .private = offsetof(struct bfq_group, stats.group_wait_time),
+               .seq_show = bfqg_print_stat,
+       },
+       {
+               .name = "bfq.idle_time",
+               .private = offsetof(struct bfq_group, stats.idle_time),
+               .seq_show = bfqg_print_stat,
+       },
+       {
+               .name = "bfq.empty_time",
+               .private = offsetof(struct bfq_group, stats.empty_time),
+               .seq_show = bfqg_print_stat,
+       },
+       {
+               .name = "bfq.dequeue",
+               .private = offsetof(struct bfq_group, stats.dequeue),
+               .seq_show = bfqg_print_stat,
+       },
+       { }     /* terminate */
+};
+
+/*
+ * Files registered as the policy's dfl_cftypes: only the weight, which
+ * is written through the buffer-based .write handler.
+ */
+struct cftype bfq_blkg_files[] = {
+       {
+               .name = "bfq.weight",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = bfq_io_show_weight,
+               .write = bfq_io_set_weight,
+       },
+       {} /* terminate */
+};
+
+#else  /* CONFIG_BFQ_GROUP_IOSCHED */
+
+/* Without CONFIG_BFQ_GROUP_IOSCHED the per-group stats hooks are no-ops. */
+void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
+                             unsigned int op) { }
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
+void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
+                                 uint64_t io_start_time, unsigned int op) { }
+void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
+void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
+void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
+void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
+void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
+
+/* Without CONFIG_BFQ_GROUP_IOSCHED, moving a queue is a no-op. */
+void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+                  struct bfq_group *bfqg) {}
+
+/*
+ * Initialize @entity for scheduling in @bfqg: copy new_weight into
+ * weight and orig_weight and point the entity at the group's
+ * sched_data.  If @entity belongs to a bfq_queue, also latch the
+ * queue's new ioprio and ioprio class.
+ */
+void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
+{
+       struct bfq_queue *queue = bfq_entity_to_bfqq(entity);
+
+       entity->orig_weight = entity->new_weight;
+       entity->weight = entity->new_weight;
+
+       if (queue) {
+               queue->ioprio_class = queue->new_ioprio_class;
+               queue->ioprio = queue->new_ioprio;
+       }
+
+       entity->sched_data = &bfqg->sched_data;
+}
+
+/* Without CONFIG_BFQ_GROUP_IOSCHED there is no cgroup to follow: no-op. */
+void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}
+
+/* Run bfq_end_wr_async_queues() on the root group, the only group handled here. */
+void bfq_end_wr_async(struct bfq_data *bfqd)
+{
+       bfq_end_wr_async_queues(bfqd, bfqd->root_group);
+}
+
+/* Always return @bfqd's root group; @blkcg is ignored. */
+struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, struct blkcg *blkcg)
+{
+       return bfqd->root_group;
+}
+
+/* Always return the root group of the device @bfqq belongs to. */
+struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
+{
+       return bfqq->bfqd->root_group;
+}
+
+/*
+ * Allocate a zeroed bfq_group on @node with GFP_KERNEL and set each of
+ * its BFQ_IOPRIO_CLASSES service trees to BFQ_SERVICE_TREE_INIT.
+ * Returns the group, or NULL if the allocation fails.  @bfqd is not
+ * used in this configuration.
+ */
+struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+{
+       struct bfq_group *root = kzalloc_node(sizeof(*root), GFP_KERNEL, node);
+       int cls;
+
+       if (!root)
+               return NULL;
+
+       for (cls = 0; cls < BFQ_IOPRIO_CLASSES; cls++)
+               root->sched_data.service_tree[cls] = BFQ_SERVICE_TREE_INIT;
+
+       return root;
+}
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
new file mode 100644 (file)
index 0000000..bd8499e
--- /dev/null
@@ -0,0 +1,5047 @@
+/*
+ * Budget Fair Queueing (BFQ) I/O scheduler.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *                   Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ *                    Arianna Avanzini <avanzini@google.com>
+ *
+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation; either version 2 of the
+ *  License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ * BFQ is a proportional-share I/O scheduler, with some extra
+ * low-latency capabilities. BFQ also supports full hierarchical
+ * scheduling through cgroups. Next paragraphs provide an introduction
+ * on BFQ inner workings. Details on BFQ benefits, usage and
+ * limitations can be found in Documentation/block/bfq-iosched.txt.
+ *
+ * BFQ is a proportional-share storage-I/O scheduling algorithm based
+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns
+ * budgets, measured in number of sectors, to processes instead of
+ * time slices. The device is not granted to the in-service process
+ * for a given time slice, but until it has exhausted its assigned
+ * budget. This change from the time to the service domain enables BFQ
+ * to distribute the device throughput among processes as desired,
+ * without any distortion due to throughput fluctuations, or to device
+ * internal queueing. BFQ uses an ad hoc internal scheduler, called
+ * B-WF2Q+, to schedule processes according to their budgets. More
+ * precisely, BFQ schedules queues associated with processes. Each
+ * process/queue is assigned a user-configurable weight, and B-WF2Q+
+ * guarantees that each queue receives a fraction of the throughput
+ * proportional to its weight. Thanks to the accurate policy of
+ * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
+ * processes issuing sequential requests (to boost the throughput),
+ * and yet guarantee a low latency to interactive and soft real-time
+ * applications.
+ *
+ * In particular, to provide these low-latency guarantees, BFQ
+ * explicitly privileges the I/O of two classes of time-sensitive
+ * applications: interactive and soft real-time. This feature enables
+ * BFQ to provide applications in these classes with a very low
+ * latency. Finally, BFQ also features additional heuristics for
+ * preserving both a low latency and a high throughput on NCQ-capable,
+ * rotational or flash-based devices, and to get the job done quickly
+ * for applications consisting of many I/O-bound processes.
+ *
+ * BFQ is described in [1], where also a reference to the initial, more
+ * theoretical paper on BFQ can be found. The interested reader can find
+ * in the latter paper full details on the main algorithm, as well as
+ * formulas of the guarantees and formal proofs of all the properties.
+ * With respect to the version of BFQ presented in these papers, this
+ * implementation adds a few more heuristics, such as the one that
+ * guarantees a low latency to soft real-time applications, and a
+ * hierarchical extension based on H-WF2Q+.
+ *
+ * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
+ * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
+ * with O(log N) complexity derives from the one introduced with EEVDF
+ * in [3].
+ *
+ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
+ *     Scheduler", Proceedings of the First Workshop on Mobile System
+ *     Technologies (MST-2015), May 2015.
+ *     http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
+ *
+ * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
+ *     Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
+ *     Oct 1997.
+ *
+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
+ *
+ * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
+ *     First: A Flexible and Accurate Mechanism for Proportional Share
+ *     Resource Allocation", technical report.
+ *
+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/cgroup.h>
+#include <linux/elevator.h>
+#include <linux/ktime.h>
+#include <linux/rbtree.h>
+#include <linux/ioprio.h>
+#include <linux/sbitmap.h>
+#include <linux/delay.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
+#include "bfq-iosched.h"
+
+/*
+ * Generate the three flag helpers for the queue flag BFQQF_<name>:
+ *   bfq_mark_bfqq_<name>()  - set the flag in bfqq->flags,
+ *   bfq_clear_bfqq_<name>() - clear the flag in bfqq->flags,
+ *   bfq_bfqq_<name>()       - return the result of test_bit() on the flag.
+ * Setting and clearing use the double-underscore (__set_bit/__clear_bit)
+ * bit operations.
+ */
+#define BFQ_BFQQ_FNS(name)                                             \
+void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)                      \
+{                                                                      \
+       __set_bit(BFQQF_##name, &(bfqq)->flags);                        \
+}                                                                      \
+void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)                     \
+{                                                                      \
+       __clear_bit(BFQQF_##name, &(bfqq)->flags);                      \
+}                                                                      \
+int bfq_bfqq_##name(const struct bfq_queue *bfqq)                      \
+{                                                                      \
+       return test_bit(BFQQF_##name, &(bfqq)->flags);                  \
+}
+
+BFQ_BFQQ_FNS(just_created);
+BFQ_BFQQ_FNS(busy);
+BFQ_BFQQ_FNS(wait_request);
+BFQ_BFQQ_FNS(non_blocking_wait_rq);
+BFQ_BFQQ_FNS(fifo_expire);
+BFQ_BFQQ_FNS(idle_window);
+BFQ_BFQQ_FNS(sync);
+BFQ_BFQQ_FNS(IO_bound);
+BFQ_BFQQ_FNS(in_large_burst);
+BFQ_BFQQ_FNS(coop);
+BFQ_BFQQ_FNS(split_coop);
+BFQ_BFQQ_FNS(softrt_update);
+/* no line continuation here: the directive must end on its own line */
+#undef BFQ_BFQQ_FNS
+
+/*
+ * Expiration time of sync (0) and async (1) requests, in ns
+ * (entry 0 is 250 ms, entry 1 is 125 ms).
+ * NOTE(review): if this array is indexed with rq_is_sync(), entry 0 would
+ * be the async value and entry 1 the sync one -- confirm against the users
+ * of this array and fix the sentence above if so.
+ */
+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
+
+/*
+ * Maximum backwards seek (magic number lifted from CFQ), in KiB
+ * (16 * 1024 KiB, i.e., 16 MiB).
+ */
+static const int bfq_back_max = 16 * 1024;
+
+/* Penalty of a backwards seek, in number of sectors. */
+static const int bfq_back_penalty = 2;
+
+/* Idling period duration, in ns (NSEC_PER_SEC / 125, i.e., 8 ms). */
+static u64 bfq_slice_idle = NSEC_PER_SEC / 125;
+
+/* Minimum number of assigned budgets for which stats are safe to compute. */
+static const int bfq_stats_min_budgets = 194;
+
+/* Default maximum budget values, in sectors and number of requests. */
+static const int bfq_default_max_budget = 16 * 1024;
+
+/*
+ * Async to sync throughput distribution is controlled as follows:
+ * when an async request is served, the entity is charged the number
+ * of sectors of the request, multiplied by the factor below
+ */
+static const int bfq_async_charge_factor = 10;
+
+/*
+ * Default timeout value, in jiffies (HZ / 8, i.e., one eighth of a
+ * second), approximating CFQ defaults.
+ */
+const int bfq_timeout = HZ / 8;
+
+static struct kmem_cache *bfq_pool;
+
+/* Below this threshold (in ns), we consider thinktime immediate. */
+#define BFQ_MIN_TT             (2 * NSEC_PER_MSEC)
+
+/* hw_tag detection: parallel requests threshold and min samples needed. */
+#define BFQ_HW_QUEUE_THRESHOLD 4
+#define BFQ_HW_QUEUE_SAMPLES   32
+
+/*
+ * Sector-distance thresholds. Each expansion is wrapped in parentheses so
+ * that the cast cannot combine with operators at the point of use.
+ */
+#define BFQQ_SEEK_THR          ((sector_t)(8 * 100))
+#define BFQQ_SECT_THR_NONROT   ((sector_t)(2 * 32))
+#define BFQQ_CLOSE_THR         ((sector_t)(8 * 1024))
+/* True if more than 32/8 = 4 bits are set in the seek history of bfqq. */
+#define BFQQ_SEEKY(bfqq)       (hweight32((bfqq)->seek_history) > 32/8)
+
+/* Min number of samples required to perform peak-rate update */
+#define BFQ_RATE_MIN_SAMPLES   32
+/* Min observation time interval required to perform a peak-rate update (ns) */
+#define BFQ_RATE_MIN_INTERVAL  (300*NSEC_PER_MSEC)
+/* Target observation time interval for a peak-rate update (ns) */
+#define BFQ_RATE_REF_INTERVAL  NSEC_PER_SEC
+
+/* Shift used for peak rate fixed precision calculations. */
+#define BFQ_RATE_SHIFT         16
+
+/*
+ * By default, BFQ computes the duration of the weight raising for
+ * interactive applications automatically, using the following formula:
+ * duration = (R / r) * T, where r is the peak rate of the device, and
+ * R and T are two reference parameters.
+ * In particular, R is the peak rate of the reference device (see below),
+ * and T is a reference time: given the systems that are likely to be
+ * installed on the reference device according to its speed class, T is
+ * about the maximum time needed, under BFQ and while reading two files in
+ * parallel, to load typical large applications on these systems.
+ * In practice, the slower/faster the device at hand is, the more/less it
+ * takes to load applications with respect to the reference device.
+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive
+ * applications.
+ *
+ * BFQ uses four different reference pairs (R, T), depending on:
+ * . whether the device is rotational or non-rotational;
+ * . whether the device is slow, such as old or portable HDDs, as well as
+ *   SD cards, or fast, such as newer HDDs and SSDs.
+ *
+ * The device's speed class is dynamically (re)detected in
+ * bfq_update_peak_rate() every time the estimated peak rate is updated.
+ *
+ * In the following definitions, R_slow[0]/R_fast[0] and
+ * T_slow[0]/T_fast[0] are the reference values for a slow/fast
+ * rotational device, whereas R_slow[1]/R_fast[1] and
+ * T_slow[1]/T_fast[1] are the reference values for a slow/fast
+ * non-rotational device. Finally, device_speed_thresh are the
+ * thresholds used to switch between speed classes. The reference
+ * rates are not the actual peak rates of the devices used as a
+ * reference, but slightly lower values. The reason for using these
+ * slightly lower values is that the peak-rate estimator tends to
+ * yield slightly lower values than the actual peak rate (it can yield
+ * the actual peak rate only if there is only one process doing I/O,
+ * and the process does sequential I/O).
+ *
+ * Both the reference peak rates and the thresholds are measured in
+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
+ */
+/* Reference peak rates: entry 0 is rotational, entry 1 is non-rotational. */
+static int R_slow[2] = {1000, 10700};
+static int R_fast[2] = {14000, 33000};
+/*
+ * To improve readability, a conversion function is used to initialize the
+ * following arrays, which entails that they can be initialized only in a
+ * function. Until that happens they hold the zeroes of static storage.
+ */
+static int T_slow[2];
+static int T_fast[2];
+static int device_speed_thresh[2];
+
+/*
+ * Accessors for the elevator-private slots of a request: slot 0 is read as
+ * a struct bfq_io_cq pointer; slot 1 is returned as stored, without a cast
+ * (presumably it holds the bfq_queue of the request -- confirm at the
+ * place where the slots are filled).
+ */
+#define RQ_BIC(rq)             ((struct bfq_io_cq *) (rq)->elv.priv[0])
+#define RQ_BFQQ(rq)            ((rq)->elv.priv[1])
+
+/* Return the queue stored in @bic for sync (true) or async (false) I/O. */
+struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
+{
+       return bic->bfqq[is_sync];
+}
+
+/* Store @bfqq in @bic as the queue for sync (true) or async (false) I/O. */
+void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync)
+{
+       bic->bfqq[is_sync] = bfqq;
+}
+
+/*
+ * Return the elevator data of the request queue that the icq embedded in
+ * @bic refers to.
+ */
+struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
+{
+       return bic->icq.q->elevator->elevator_data;
+}
+
+/**
+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
+ * @icq: the iocontext queue.
+ *
+ * Return: the bfq_io_cq that embeds @icq, computed with container_of().
+ */
+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
+{
+       /* bic->icq is the first member, %NULL will convert to %NULL */
+       return container_of(icq, struct bfq_io_cq, icq);
+}
+
+/**
+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
+ * @bfqd: the lookup key (not dereferenced here: the search goes through @q).
+ * @ioc: the io_context of the process doing I/O.
+ * @q: the request queue.
+ *
+ * Return: the bic obtained by converting the result of ioc_lookup_icq(),
+ * or %NULL if @ioc is %NULL.
+ */
+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
+                                       struct io_context *ioc,
+                                       struct request_queue *q)
+{
+       struct bfq_io_cq *bic;
+       unsigned long irq_flags;
+
+       if (!ioc)
+               return NULL;
+
+       /* the icq lookup is performed with the queue lock held */
+       spin_lock_irqsave(q->queue_lock, irq_flags);
+       bic = icq_to_bic(ioc_lookup_icq(ioc, q));
+       spin_unlock_irqrestore(q->queue_lock, irq_flags);
+
+       return bic;
+}
+
+/*
+ * Schedule an asynchronous run of the hardware queues, but only if BFQ
+ * still has queued requests; with nothing queued this is a no-op.
+ */
+void bfq_schedule_dispatch(struct bfq_data *bfqd)
+{
+       if (bfqd->queued == 0)
+               return;
+
+       bfq_log(bfqd, "schedule dispatch");
+       blk_mq_run_hw_queues(bfqd->queue, true);
+}
+
+/* Tell whether the queue is in the idle or in the real-time ioprio class. */
+#define bfq_class_idle(bfqq)   ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
+#define bfq_class_rt(bfqq)     ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
+
+/* A sample count is deemed valid only if it exceeds 80. */
+#define bfq_sample_valid(samples)      ((samples) > 80)
+
+/*
+ * Lifted from AS - choose which of rq1 and rq2 that is best served now.
+ * We choose the request that is closer to the head right now.  Distance
+ * behind the head is penalized and only allowed to a certain extent.
+ *
+ * @last is the sector the distances are measured from. The checks are
+ * applied in this order: a NULL request loses to the other one (and rq2
+ * is returned if rq1 == rq2); a sync request wins over an async one; a
+ * REQ_META request wins over a non-REQ_META one; finally the distance
+ * from @last decides, with the wrap handling described below.
+ */
+static struct request *bfq_choose_req(struct bfq_data *bfqd,
+                                     struct request *rq1,
+                                     struct request *rq2,
+                                     sector_t last)
+{
+       sector_t s1, s2, d1 = 0, d2 = 0;
+       unsigned long back_max;
+#define BFQ_RQ1_WRAP   0x01 /* request 1 wraps */
+#define BFQ_RQ2_WRAP   0x02 /* request 2 wraps */
+       unsigned int wrap = 0; /* bit mask: requests behind the disk head? */
+
+       if (!rq1 || rq1 == rq2)
+               return rq2;
+       if (!rq2)
+               return rq1;
+
+       if (rq_is_sync(rq1) && !rq_is_sync(rq2))
+               return rq1;
+       else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
+               return rq2;
+       if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
+               return rq1;
+       else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
+               return rq2;
+
+       s1 = blk_rq_pos(rq1);
+       s2 = blk_rq_pos(rq2);
+
+       /*
+        * By definition, 1KiB is 2 sectors.
+        */
+       back_max = bfqd->bfq_back_max * 2;
+
+       /*
+        * Strict one way elevator _except_ in the case where we allow
+        * short backward seeks which are biased as twice the cost of a
+        * similar forward seek.
+        */
+       if (s1 >= last)
+               d1 = s1 - last;
+       else if (s1 + back_max >= last)
+               d1 = (last - s1) * bfqd->bfq_back_penalty;
+       else
+               wrap |= BFQ_RQ1_WRAP;
+
+       if (s2 >= last)
+               d2 = s2 - last;
+       else if (s2 + back_max >= last)
+               d2 = (last - s2) * bfqd->bfq_back_penalty;
+       else
+               wrap |= BFQ_RQ2_WRAP;
+
+       /* Found required data */
+
+       /*
+        * By doing switch() on the bit mask "wrap" we avoid having to
+        * check two variables for all permutations: --> faster!
+        */
+       switch (wrap) {
+       case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
+               if (d1 < d2)
+                       return rq1;
+               else if (d2 < d1)
+                       return rq2;
+
+               if (s1 >= s2)
+                       return rq1;
+               else
+                       return rq2;
+
+       case BFQ_RQ2_WRAP:
+               return rq1;
+       case BFQ_RQ1_WRAP:
+               return rq2;
+       case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */
+       default:
+               /*
+                * Since both rqs are wrapped,
+                * start with the one that's further behind head
+                * (--> only *one* back seek required),
+                * since back seek takes more time than forward.
+                */
+               if (s1 <= s2)
+                       return rq1;
+               else
+                       return rq2;
+       }
+}
+
+/*
+ * Search @root for the queue whose next request starts exactly at @sector.
+ *
+ * Returns that queue, or NULL if there is none. In either case
+ * *@ret_parent is set to the last node visited (NULL for an empty tree)
+ * and, if @rb_link is not NULL, *@rb_link is set to the link slot where
+ * the walk stopped, so that the caller can insert a new node there.
+ */
+static struct bfq_queue *
+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
+                    sector_t sector, struct rb_node **ret_parent,
+                    struct rb_node ***rb_link)
+{
+       struct rb_node **link = &root->rb_node;
+       struct rb_node *parent = NULL;
+       struct bfq_queue *found = NULL;
+
+       while (*link) {
+               struct bfq_queue *cur;
+               sector_t cur_pos;
+
+               parent = *link;
+               cur = rb_entry(parent, struct bfq_queue, pos_node);
+               cur_pos = blk_rq_pos(cur->next_rq);
+
+               if (sector == cur_pos) {
+                       found = cur;
+                       break;
+               }
+
+               /*
+                * Sort strictly based on sector. Smallest to the left,
+                * largest to the right.
+                */
+               if (sector > cur_pos)
+                       link = &parent->rb_right;
+               else
+                       link = &parent->rb_left;
+       }
+
+       *ret_parent = parent;
+       if (rb_link)
+               *rb_link = link;
+
+       bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
+               (unsigned long long)sector,
+               found ? found->pid : 0);
+
+       return found;
+}
+
+/*
+ * (Re)position bfqq in the rq_pos_tree of its group, keyed by the sector
+ * of bfqq->next_rq. The queue is first removed from the tree it currently
+ * sits in, if any. It is then left out of the tree if it belongs to the
+ * idle class, if it has no next request, or if another queue already
+ * occupies the very same position.
+ */
+void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+       struct rb_node **link, *parent;
+       struct bfq_queue *occupant;
+
+       if (bfqq->pos_root) {
+               rb_erase(&bfqq->pos_node, bfqq->pos_root);
+               bfqq->pos_root = NULL;
+       }
+
+       if (bfq_class_idle(bfqq) || !bfqq->next_rq)
+               return;
+
+       bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+       occupant = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
+                                         blk_rq_pos(bfqq->next_rq),
+                                         &parent, &link);
+       if (occupant) {
+               /* position already taken: stay out of the tree */
+               bfqq->pos_root = NULL;
+               return;
+       }
+
+       rb_link_node(&bfqq->pos_node, parent, link);
+       rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
+}
+
+/*
+ * Tell whether there are active queues or groups with differentiated weights.
+ *
+ * The group weights tree is taken into account only if
+ * CONFIG_BFQ_GROUP_IOSCHED is defined.
+ */
+static bool bfq_differentiated_weights(struct bfq_data *bfqd)
+{
+       const struct rb_node *top;
+
+       /*
+        * For weights to differ, at least one of the trees must contain
+        * at least two nodes, i.e., its root node must have a child.
+        */
+       if (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree)) {
+               top = bfqd->queue_weights_tree.rb_node;
+               if (top->rb_left || top->rb_right)
+                       return true;
+       }
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+       if (!RB_EMPTY_ROOT(&bfqd->group_weights_tree)) {
+               top = bfqd->group_weights_tree.rb_node;
+               if (top->rb_left || top->rb_right)
+                       return true;
+       }
+#endif
+
+       return false;
+}
+
+/*
+ * The following function returns true if every queue must receive the
+ * same share of the throughput (this condition is used when deciding
+ * whether idling may be disabled, see the comments in the function
+ * bfq_bfqq_may_idle()).
+ *
+ * Such a scenario occurs when:
+ * 1) all active queues have the same weight,
+ * 2) all active groups at the same level in the groups tree have the same
+ *    weight,
+ * 3) all active groups at the same level in the groups tree have the same
+ *    number of children.
+ *
+ * Unfortunately, keeping the necessary state for evaluating exactly the
+ * above symmetry conditions would be quite complex and time-consuming.
+ * Therefore this function evaluates, instead, the following stronger
+ * sub-conditions, for which it is much easier to maintain the needed
+ * state:
+ * 1) all active queues have the same weight,
+ * 2) all active groups have the same weight,
+ * 3) all active groups have at most one active child each.
+ * In particular, the last two conditions are always true if hierarchical
+ * support and the cgroups interface are not enabled, thus no state needs
+ * to be maintained in this case.
+ */
+static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
+{
+       /* the scenario is symmetric iff no differentiated weight is found */
+       if (bfq_differentiated_weights(bfqd))
+               return false;
+
+       return true;
+}
+
+/*
+ * Make @entity counted in the weight-counter tree @root: if the tree
+ * already holds a counter for the weight of the entity, that counter is
+ * incremented; otherwise a new counter for that weight is allocated,
+ * inserted in the tree and set to one. @bfqd is not used here.
+ *
+ * Note that weight-counter trees contain few nodes in mostly symmetric
+ * scenarios. For example, if all queues have the same weight, then the
+ * weight-counter tree for the queues may contain at most one node.
+ * This holds even if low_latency is on, because weight-raised queues
+ * are not inserted in the tree.
+ * In most scenarios, the rate at which nodes are created/destroyed
+ * should be low too.
+ */
+void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
+                         struct rb_root *root)
+{
+       struct rb_node **link = &root->rb_node, *parent = NULL;
+       struct bfq_weight_counter *counter;
+
+       /*
+        * Nothing to do if the entity is already associated with a
+        * counter. This happens if:
+        *   1) the entity is associated with a queue,
+        *   2) a request arrival has caused the queue to become both
+        *      non-weight-raised, and hence change its weight, and
+        *      backlogged; each of these two events causes an
+        *      invocation of this function,
+        *   3) this is the invocation caused by the second event.
+        * The second invocation is useless, so it is handled by
+        * returning immediately. More efficient or clearer solutions
+        * might possibly be adopted.
+        */
+       if (entity->weight_counter)
+               return;
+
+       while (*link) {
+               parent = *link;
+               counter = container_of(parent, struct bfq_weight_counter,
+                                      weights_node);
+
+               if (entity->weight == counter->weight) {
+                       /* a counter for this weight exists: share it */
+                       entity->weight_counter = counter;
+                       counter->num_active++;
+                       return;
+               }
+
+               if (entity->weight < counter->weight)
+                       link = &parent->rb_left;
+               else
+                       link = &parent->rb_right;
+       }
+
+       counter = kzalloc(sizeof(*counter), GFP_ATOMIC);
+       entity->weight_counter = counter;
+
+       /*
+        * In the unlucky event of an allocation failure, we just
+        * exit. The weight of entity is then not considered in
+        * bfq_differentiated_weights, which, in its turn, causes the
+        * scenario to be deemed wrongly symmetric in case entity's
+        * weight would have been the only weight making the scenario
+        * asymmetric. On the bright side, no unbalance will however
+        * occur when entity becomes inactive again (the invocation of
+        * this function is triggered by an activation of entity). In
+        * fact, bfq_weights_tree_remove does nothing if
+        * !entity->weight_counter.
+        */
+       if (unlikely(!counter))
+               return;
+
+       counter->weight = entity->weight;
+       rb_link_node(&counter->weights_node, parent, link);
+       rb_insert_color(&counter->weights_node, root);
+
+       /* the freshly zeroed counter now accounts for this entity */
+       counter->num_active++;
+}
+
+/*
+ * Stop counting @entity in the weight-counter tree @root: the counter
+ * associated with the entity is decremented and, if no active entity is
+ * left for it, the counter is removed from the tree and freed. The entity
+ * is detached from the counter in any case. Nothing is done if the entity
+ * has no counter. @bfqd is not used here.
+ * See the comments to the function bfq_weights_tree_add() for considerations
+ * about overhead.
+ */
+void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
+                            struct rb_root *root)
+{
+       struct bfq_weight_counter *counter = entity->weight_counter;
+
+       if (!counter)
+               return;
+
+       counter->num_active--;
+       if (counter->num_active > 0) {
+               /* other entities still share this counter: keep it */
+               entity->weight_counter = NULL;
+               return;
+       }
+
+       rb_erase(&counter->weights_node, root);
+       kfree(counter);
+       entity->weight_counter = NULL;
+}
+
+/*
+ * Return expired entry, or NULL to just start from scratch in rbtree.
+ *
+ * The fifo is inspected only if the fifo_expire flag of the queue is not
+ * set yet, and the flag is set as part of the inspection. The first
+ * request in the fifo is returned only if it differs from @last and its
+ * fifo_time has been reached.
+ */
+static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
+                                     struct request *last)
+{
+       struct request *head;
+
+       if (bfq_bfqq_fifo_expire(bfqq))
+               return NULL;
+
+       bfq_mark_bfqq_fifo_expire(bfqq);
+
+       head = rq_entry_fifo(bfqq->fifo.next);
+
+       if (head != last && ktime_get_ns() >= head->fifo_time) {
+               bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", head);
+               return head;
+       }
+
+       return NULL;
+}
+
+/*
+ * Pick the request to serve after @last. An expired request returned by
+ * bfq_check_fifo() takes precedence. Otherwise the choice is made by
+ * bfq_choose_req() between the successor of @last in the sort tree
+ * (falling back to the first request of the tree, unless that is @last
+ * itself) and the predecessor of @last.
+ */
+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
+                                       struct bfq_queue *bfqq,
+                                       struct request *last)
+{
+       struct request *after = NULL, *before = NULL;
+       struct rb_node *node;
+
+       /* Follow expired path, else get first next available. */
+       after = bfq_check_fifo(bfqq, last);
+       if (after)
+               return after;
+
+       node = rb_prev(&last->rb_node);
+       if (node)
+               before = rb_entry_rq(node);
+
+       node = rb_next(&last->rb_node);
+       if (!node) {
+               /* no successor: restart from the lowest request, if not last */
+               node = rb_first(&bfqq->sort_list);
+               if (node == &last->rb_node)
+                       node = NULL;
+       }
+       if (node)
+               after = rb_entry_rq(node);
+
+       return bfq_choose_req(bfqd, after, before, blk_rq_pos(last));
+}
+
+/*
+ * Return the amount of service to charge for @rq: just the size of the
+ * request in sectors for sync or weight-raised queues, an amplified size
+ * otherwise (see the definition of bfq_async_charge_factor for details).
+ */
+static unsigned long bfq_serv_to_charge(struct request *rq,
+                                       struct bfq_queue *bfqq)
+{
+       int factor = bfq_async_charge_factor;
+
+       if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
+               return blk_rq_sectors(rq);
+
+       /*
+        * With no weight-raised queue busy, service is amplified by
+        * just the async charge factor; otherwise it is amplified by
+        * twice that factor, to further reduce latency for
+        * weight-raised queues.
+        */
+       if (bfqq->bfqd->wr_busy_queues != 0)
+               factor = 2 * bfq_async_charge_factor;
+
+       return blk_rq_sectors(rq) * factor;
+}
+
+/**
+ * bfq_updated_next_req - update the queue after a new next_rq selection.
+ * @bfqd: the device data the queue belongs to.
+ * @bfqq: the queue to update.
+ *
+ * When the first request of a queue changes, the budget of the queue is
+ * recomputed as the maximum between bfqq->max_budget and the service to
+ * charge for the new first request, so that the queue has enough budget
+ * to serve at least that request (if the request has grown). Without
+ * enough budget for its first request, the queue would have to go
+ * through two dispatch rounds to actually get it dispatched. The queue
+ * is requeued only if its budget actually changes.
+ */
+static void bfq_updated_next_req(struct bfq_data *bfqd,
+                                struct bfq_queue *bfqq)
+{
+       struct request *first_rq = bfqq->next_rq;
+       struct bfq_entity *ent = &bfqq->entity;
+       unsigned long budget;
+
+       /*
+        * Nothing to do without a next request. And, in order not to
+        * break guarantees, budgets cannot be changed after an entity
+        * has been selected, hence the in-service queue is skipped too.
+        */
+       if (!first_rq || bfqq == bfqd->in_service_queue)
+               return;
+
+       budget = max_t(unsigned long, bfqq->max_budget,
+                      bfq_serv_to_charge(first_rq, bfqq));
+       if (ent->budget == budget)
+               return;
+
+       ent->budget = budget;
+       bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
+                                budget);
+       bfq_requeue_bfqq(bfqd, bfqq);
+}
+
+/*
+ * Restore into @bfqq the state saved in @bic: the idle_window and
+ * IO_bound flags, the think-time data and the weight-raising parameters.
+ * Weight raising is then switched off if the queue belongs to a large
+ * burst or if the restored weight-raising period is already over.
+ */
+static void
+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
+{
+       bool stop_wr;
+
+       if (bic->saved_idle_window)
+               bfq_mark_bfqq_idle_window(bfqq);
+       else
+               bfq_clear_bfqq_idle_window(bfqq);
+
+       if (bic->saved_IO_bound)
+               bfq_mark_bfqq_IO_bound(bfqq);
+       else
+               bfq_clear_bfqq_IO_bound(bfqq);
+
+       bfqq->ttime = bic->saved_ttime;
+
+       bfqq->wr_coeff = bic->saved_wr_coeff;
+       bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
+       bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
+       bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
+
+       if (bfqq->wr_coeff > 1) {
+               stop_wr = bfq_bfqq_in_large_burst(bfqq) ||
+                         time_is_before_jiffies(bfqq->last_wr_start_finish +
+                                                bfqq->wr_cur_max_time);
+               if (stop_wr) {
+                       bfq_log_bfqq(bfqq->bfqd, bfqq,
+                           "resume state: switching off wr");
+
+                       bfqq->wr_coeff = 1;
+               }
+       }
+
+       /* make sure weight will be updated, however we got here */
+       bfqq->entity.prio_changed = 1;
+}
+
+/*
+ * Return the reference count of @bfqq minus its allocated counter and
+ * minus the on_st value of its entity.
+ * NOTE(review): going by the name, what is left is presumably the number
+ * of references held by processes -- confirm against the users of ref.
+ */
+static int bfqq_process_refs(struct bfq_queue *bfqq)
+{
+       int process_refs = bfqq->ref - bfqq->allocated - bfqq->entity.on_st;
+
+       return process_refs;
+}
+
+/*
+ * Empty the burst list and restart it with just bfqq: the burst size goes
+ * back to one and the parent entity of bfqq becomes the parent entity of
+ * the burst (see comments on bfq_handle_burst).
+ */
+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+       struct hlist_node *tmp;
+       struct bfq_queue *cur;
+
+       /* the _safe iterator is needed, as entries are unlinked on the go */
+       hlist_for_each_entry_safe(cur, tmp, &bfqd->burst_list, burst_list_node)
+               hlist_del_init(&cur->burst_list_node);
+
+       bfqd->burst_parent_entity = bfqq->entity.parent;
+       bfqd->burst_size = 1;
+       hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+}
+
+/*
+ * Add bfqq to the list of queues in current burst (see bfq_handle_burst).
+ *
+ * The burst size is incremented first. If it thereby reaches
+ * bfqd->bfq_large_burst_thresh, the burst is flagged as large, every
+ * queue already in the burst list, as well as bfqq itself, is marked as
+ * belonging to a large burst, and the burst list is emptied. Otherwise
+ * bfqq is just added to the head of the burst list.
+ */
+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+       /* Increment burst size to take into account also bfqq */
+       bfqd->burst_size++;
+
+       if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
+               struct bfq_queue *pos, *bfqq_item;
+               struct hlist_node *n;
+
+               /*
+                * Enough queues have been activated shortly after each
+                * other to consider this burst as large.
+                */
+               bfqd->large_burst = true;
+
+               /*
+                * We can now mark all queues in the burst list as
+                * belonging to a large burst.
+                */
+               hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
+                                    burst_list_node)
+                       bfq_mark_bfqq_in_large_burst(bfqq_item);
+               bfq_mark_bfqq_in_large_burst(bfqq);
+
+               /*
+                * From now on, and until the current burst finishes, any
+                * new queue being activated shortly after the last queue
+                * was inserted in the burst can be immediately marked as
+                * belonging to a large burst. So the burst list is not
+                * needed any more. Remove it.
+                */
+               hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
+                                         burst_list_node)
+                       hlist_del_init(&pos->burst_list_node);
+       } else /*
+               * Burst not yet large: add bfqq to the burst list. Do
+               * not increment the ref counter for bfqq, because bfqq
+               * is removed from the burst list before freeing bfqq
+               * in put_queue.
+               */
+               hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+}
+
+/*
+ * If many queues belonging to the same group happen to be created
+ * shortly after each other, then the processes associated with these
+ * queues have typically a common goal. In particular, bursts of queue
+ * creations are usually caused by services or applications that spawn
+ * many parallel threads/processes. Examples are systemd during boot,
+ * or git grep. To help these processes get their job done as soon as
+ * possible, it is usually better to not grant either weight-raising
+ * or device idling to their queues.
+ *
+ * In this comment we describe, firstly, the reasons why this fact
+ * holds, and, secondly, the next function, which implements the main
+ * steps needed to properly mark these queues so that they can then be
+ * treated in a different way.
+ *
+ * The above services or applications benefit mostly from a high
+ * throughput: the quicker the requests of the activated queues are
+ * cumulatively served, the sooner the target job of these queues gets
+ * completed. As a consequence, weight-raising any of these queues,
+ * which also implies idling the device for it, is almost always
+ * counterproductive. In most cases it just lowers throughput.
+ *
+ * On the other hand, a burst of queue creations may be caused also by
+ * the start of an application that does not consist of a lot of
+ * parallel I/O-bound threads. In fact, with a complex application,
+ * several short processes may need to be executed to start-up the
+ * application. In this respect, to start an application as quickly as
+ * possible, the best thing to do is in any case to privilege the I/O
+ * related to the application with respect to all other
+ * I/O. Therefore, the best strategy to start as quickly as possible
+ * an application that causes a burst of queue creations is to
+ * weight-raise all the queues created during the burst. This is the
+ * exact opposite of the best strategy for the other type of bursts.
+ *
+ * In the end, to take the best action for each of the two cases, the
+ * two types of bursts need to be distinguished. Fortunately, this
+ * seems relatively easy, by looking at the sizes of the bursts. In
+ * particular, we found a threshold such that only bursts with a
+ * larger size than that threshold are apparently caused by
+ * services or commands such as systemd or git grep. For brevity,
+ * hereafter we simply call these bursts 'large'. BFQ *does not*
+ * weight-raise queues whose creation occurs in a large burst. In
+ * addition, for each of these queues BFQ performs or does not perform
+ * idling depending on which choice boosts the throughput more. The
+ * exact choice depends on the device and request pattern at
+ * hand.
+ *
+ * Unfortunately, false positives may occur while an interactive task
+ * is starting (e.g., an application is being started). The
+ * consequence is that the queues associated with the task do not
+ * enjoy weight raising as expected. Fortunately these false positives
+ * are very rare. They typically occur if some service happens to
+ * start doing I/O exactly when the interactive task starts.
+ *
+ * Turning back to the next function, it implements all the steps
+ * needed to detect the occurrence of a large burst and to properly
+ * mark all the queues belonging to it (so that they can then be
+ * treated in a different way). This goal is achieved by maintaining a
+ * "burst list" that holds, temporarily, the queues that belong to the
+ * burst in progress. The list is then used to mark these queues as
+ * belonging to a large burst if the burst does become large. The main
+ * steps are the following.
+ *
+ * . when the very first queue is created, the queue is inserted into the
+ *   list (as it could be the first queue in a possible burst)
+ *
+ * . if the current burst has not yet become large, and a queue Q that does
+ *   not yet belong to the burst is activated shortly after the last time
+ *   at which a new queue entered the burst list, then the function appends
+ *   Q to the burst list
+ *
+ * . if, as a consequence of the previous step, the burst size reaches
+ *   the large-burst threshold, then
+ *
+ *     . all the queues in the burst list are marked as belonging to a
+ *       large burst
+ *
+ *     . the burst list is deleted; in fact, the burst list already served
+ *       its purpose (keeping temporarily track of the queues in a burst,
+ *       so as to be able to mark them as belonging to a large burst in the
+ *       previous sub-step), and now is not needed any more
+ *
+ *     . the device enters a large-burst mode
+ *
+ * . if a queue Q that does not belong to the burst is created while
+ *   the device is in large-burst mode and shortly after the last time
+ *   at which a queue either entered the burst list or was marked as
+ *   belonging to the current large burst, then Q is immediately marked
+ *   as belonging to a large burst.
+ *
+ * . if a queue Q that does not belong to the burst is created a while
+ *   after, i.e., not shortly after, the last time at which a queue
+ *   either entered the burst list or was marked as belonging to the
+ *   current large burst, then the current burst is deemed as finished and:
+ *
+ *        . the large-burst mode is reset if set
+ *
+ *        . the burst list is emptied
+ *
+ *        . Q is inserted in the burst list, as Q may be the first queue
+ *          in a possible new burst (then the burst list contains just Q
+ *          after this step).
+ */
+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+       /*
+        * If bfqq is already in the burst list or is part of a large
+        * burst, or finally has just been split (i.e., no more than
+        * 10 ms have elapsed since bfqq->split_time), then there is
+        * nothing else to do. Note that this is the only path that
+        * leaves last_ins_in_burst untouched.
+        */
+       if (!hlist_unhashed(&bfqq->burst_list_node) ||
+           bfq_bfqq_in_large_burst(bfqq) ||
+           time_is_after_eq_jiffies(bfqq->split_time +
+                                    msecs_to_jiffies(10)))
+               return;
+
+       /*
+        * If bfqq's creation happens late enough (more than
+        * bfq_burst_interval after last_ins_in_burst), or bfqq
+        * belongs to a different group than the burst group, then
+        * the current burst is finished, and related data structures
+        * must be reset.
+        *
+        * In this respect, consider the special case where bfqq is
+        * the very first queue created after BFQ is selected for this
+        * device. In this case, last_ins_in_burst and
+        * burst_parent_entity are not yet significant when we get
+        * here. But it is easy to verify that, whether or not the
+        * following condition is true, bfqq will end up being
+        * inserted into the burst list. In particular the list will
+        * happen to contain only bfqq. And this is exactly what has
+        * to happen, as bfqq may be the first queue of the first
+        * burst.
+        */
+       if (time_is_before_jiffies(bfqd->last_ins_in_burst +
+           bfqd->bfq_burst_interval) ||
+           bfqq->entity.parent != bfqd->burst_parent_entity) {
+               /* Leave large-burst mode and restart the burst from bfqq. */
+               bfqd->large_burst = false;
+               bfq_reset_burst_list(bfqd, bfqq);
+               goto end;
+       }
+
+       /*
+        * If we get here, then bfqq is being activated shortly after the
+        * last queue. So, if the current burst is also large, we can mark
+        * bfqq as belonging to this large burst immediately.
+        */
+       if (bfqd->large_burst) {
+               bfq_mark_bfqq_in_large_burst(bfqq);
+               goto end;
+       }
+
+       /*
+        * If we get here, then a large-burst state has not yet been
+        * reached, but bfqq is being activated shortly after the last
+        * queue. Then we add bfqq to the burst.
+        */
+       bfq_add_to_burst(bfqd, bfqq);
+end:
+       /*
+        * At this point, bfqq either has been added to the current
+        * burst or has caused the current burst to terminate and a
+        * possible new burst to start. In particular, in the second
+        * case, bfqq has become the first queue in the possible new
+        * burst.  In both cases last_ins_in_burst needs to be moved
+        * forward.
+        */
+       bfqd->last_ins_in_burst = jiffies;
+}
+
+/*
+ * Return what is left of bfqq's budget, i.e., the difference between
+ * the budget and the service stored in bfqq's entity.
+ */
+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
+{
+       return bfqq->entity.budget - bfqq->entity.service;
+}
+
+/*
+ * Return the max budget currently in force. Once at least
+ * bfq_stats_min_budgets budgets have been assigned, this is the value
+ * stored in bfqd, which is dynamically updated according to the
+ * estimated disk peak rate; before that, not enough samples are
+ * available and the default max budget is returned instead.
+ */
+static int bfq_max_budget(struct bfq_data *bfqd)
+{
+       if (bfqd->budgets_assigned >= bfq_stats_min_budgets)
+               return bfqd->bfq_max_budget;
+
+       return bfq_default_max_budget;
+}
+
+/*
+ * Return the min budget, defined as 1/32 of the max budget that
+ * bfq_max_budget would return: the value stored in bfqd once at least
+ * bfq_stats_min_budgets budgets have been assigned, the default max
+ * budget otherwise.
+ */
+static int bfq_min_budget(struct bfq_data *bfqd)
+{
+       if (bfqd->budgets_assigned >= bfq_stats_min_budgets)
+               return bfqd->bfq_max_budget / 32;
+
+       return bfq_default_max_budget / 32;
+}
+
+/*
+ * The next function, invoked after the input queue bfqq switches from
+ * idle to busy, updates the budget of bfqq. The function also tells
+ * whether the in-service queue should be expired, by returning
+ * true. The purpose of expiring the in-service queue is to give bfqq
+ * the chance to possibly preempt the in-service queue, and the reason
+ * for preempting the in-service queue is to achieve one of the two
+ * goals below.
+ *
+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
+ * expired because it has remained idle. In particular, bfqq may have
+ * expired for one of the following two reasons:
+ *
+ * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
+ *   and did not manage to issue a new request before its last
+ *   request was served;
+ *
+ * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
+ *   a new request before the expiration of the idling-time.
+ *
+ * Even if bfqq has expired for one of the above reasons, the process
+ * associated with the queue may be however issuing requests greedily,
+ * and thus be sensitive to the bandwidth it receives (bfqq may have
+ * remained idle for other reasons: CPU high load, bfqq not enjoying
+ * idling, I/O throttling somewhere in the path from the process to
+ * the I/O scheduler, ...). But if, after every expiration for one of
+ * the above two reasons, bfqq has to wait for the service of at least
+ * one full budget of another queue before being served again, then
+ * bfqq is likely to get a much lower bandwidth or resource time than
+ * its reserved ones. To address this issue, two countermeasures need
+ * to be taken.
+ *
+ * First, the budget and the timestamps of bfqq need to be updated in
+ * a special way on bfqq reactivation: they need to be updated as if
+ * bfqq did not remain idle and did not expire. In fact, if they are
+ * computed as if bfqq expired and remained idle until reactivation,
+ * then the process associated with bfqq is treated as if, instead of
+ * being greedy, it stopped issuing requests when bfqq remained idle,
+ * and restarts issuing requests only on this reactivation. In other
+ * words, the scheduler does not help the process recover the "service
+ * hole" between bfqq expiration and reactivation. As a consequence,
+ * the process receives a lower bandwidth than its reserved one. In
+ * contrast, to recover this hole, the budget must be updated as if
+ * bfqq was not expired at all before this reactivation, i.e., it must
+ * be set to the value of the remaining budget when bfqq was
+ * expired. Along the same line, timestamps need to be assigned the
+ * value they had the last time bfqq was selected for service, i.e.,
+ * before last expiration. Thus timestamps need to be back-shifted
+ * with respect to their normal computation (see [1] for more details
+ * on this tricky aspect).
+ *
+ * Secondly, to allow the process to recover the hole, the in-service
+ * queue must be expired too, to give bfqq the chance to preempt it
+ * immediately. In fact, if bfqq has to wait for a full budget of the
+ * in-service queue to be completed, then it may become impossible to
+ * let the process recover the hole, even if the back-shifted
+ * timestamps of bfqq are lower than those of the in-service queue. If
+ * this happens for most or all of the holes, then the process may not
+ * receive its reserved bandwidth. In this respect, it is worth noting
+ * that, since the service of outstanding requests is unpreemptible, a
+ * small fraction of the holes may nevertheless be unrecoverable,
+ * thereby causing a small loss of bandwidth.
+ *
+ * The last important point is detecting whether bfqq does need this
+ * bandwidth recovery. In this respect, the next function deems the
+ * process associated with bfqq greedy, and thus allows it to recover
+ * the hole, if: 1) the process is waiting for the arrival of a new
+ * request (which implies that bfqq expired for one of the above two
+ * reasons), and 2) such a request has arrived soon. The first
+ * condition is controlled through the flag non_blocking_wait_rq,
+ * while the second through the flag arrived_in_time. If both
+ * conditions hold, then the function computes the budget in the
+ * above-described special way, and signals that the in-service queue
+ * should be expired. Timestamp back-shifting is done later in
+ * __bfq_activate_entity.
+ *
+ * 2. Reduce latency. Even if timestamps are not backshifted to let
+ * the process associated with bfqq recover a service hole, bfqq may
+ * however happen to have, after being (re)activated, a lower finish
+ * timestamp than the in-service queue. That is, the next budget of
+ * bfqq may have to be completed before the one of the in-service
+ * queue. If this is the case, then preempting the in-service queue
+ * allows this goal to be achieved, apart from the unpreemptible,
+ * outstanding requests mentioned above.
+ *
+ * Unfortunately, regardless of which of the above two goals one wants
+ * to achieve, service trees need first to be updated to know whether
+ * the in-service queue must be preempted. To have service trees
+ * correctly updated, the in-service queue must be expired and
+ * rescheduled, and bfqq must be scheduled too. This is one of the
+ * most costly operations (in future versions, the scheduling
+ * mechanism may be re-designed in such a way to make it possible to
+ * know whether preemption is needed without needing to update service
+ * trees). In addition, queue preemptions almost always cause random
+ * I/O, and thus loss of throughput. Because of these facts, the next
+ * function adopts the following simple scheme to avoid both costly
+ * operations and too frequent preemptions: it requests the expiration
+ * of the in-service queue (unconditionally) only for queues that need
+ * to recover a hole, or that either are weight-raised or deserve to
+ * be weight-raised.
+ */
+static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
+                                               struct bfq_queue *bfqq,
+                                               bool arrived_in_time,
+                                               bool wr_or_deserves_wr)
+{
+       /* Note: bfqd is not used in the body of this function. */
+       struct bfq_entity *entity = &bfqq->entity;
+
+       /*
+        * Bandwidth-recovery case: bfqq was waiting for a new request
+        * and the request arrived soon enough.
+        */
+       if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
+               /*
+                * We do not clear the flag non_blocking_wait_rq here, as
+                * the latter is used in bfq_activate_bfqq to signal
+                * that timestamps need to be back-shifted (and is
+                * cleared right after).
+                */
+
+               /*
+                * In the next assignment we rely on the fact that
+                * neither entity->service nor entity->budget is updated
+                * on expiration if bfqq is empty (see
+                * __bfq_bfqq_recalc_budget). Thus both quantities
+                * remain unchanged after such an expiration, and the
+                * following statement therefore assigns to
+                * entity->budget the remaining budget on such an
+                * expiration. For clarity, entity->service is not
+                * updated on expiration in any case, and, in normal
+                * operation, is reset only when bfqq is selected for
+                * service (see bfq_get_next_queue).
+                *
+                * The comparison is made on unsigned long values,
+                * while bfq_bfqq_budget_left returns an int: a
+                * negative remaining budget would therefore compare
+                * as a very large value, and bfqq->max_budget would
+                * be chosen.
+                */
+               entity->budget = min_t(unsigned long,
+                                      bfq_bfqq_budget_left(bfqq),
+                                      bfqq->max_budget);
+
+               /* Always ask for the expiration of the in-service queue. */
+               return true;
+       }
+
+       /*
+        * Ordinary activation: the budget is the larger of
+        * bfqq->max_budget and the value returned by
+        * bfq_serv_to_charge for bfqq's next request. The flag
+        * non_blocking_wait_rq is cleared, and expiration of the
+        * in-service queue is requested only if bfqq is or deserves
+        * to be weight-raised.
+        */
+       entity->budget = max_t(unsigned long, bfqq->max_budget,
+                              bfq_serv_to_charge(bfqq->next_rq, bfqq));
+       bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
+       return wr_or_deserves_wr;
+}
+
+/*
+ * Return the duration to use for a weight-raising period.
+ *
+ * If bfqd->bfq_wr_max_time is non-zero, that value is returned as is.
+ * Otherwise the duration is computed as bfqd->RT_prod divided by
+ * bfqd->peak_rate, and then limited to the jiffies equivalent of the
+ * range [3, 13] seconds.
+ *
+ * NOTE(review): bfqd->peak_rate is used as a divisor with no check in
+ * this function; confirm that it can never be 0 when we get here.
+ */
+static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
+{
+       unsigned long shortest, longest;
+       u64 dur;
+
+       if (bfqd->bfq_wr_max_time > 0)
+               return bfqd->bfq_wr_max_time;
+
+       dur = bfqd->RT_prod;
+       do_div(dur, bfqd->peak_rate);
+
+       /*
+        * Keep the duration between 3 and 13 seconds. According to
+        * tests, values above 13 seconds often backfire, i.e., they
+        * worsen responsiveness, because non-interactive and
+        * non-soft-real-time applications get to preserve weight
+        * raising for a too long time interval.
+        *
+        * On the other hand, with values below 3 seconds most
+        * interactive tasks find it difficult to complete their jobs
+        * before weight-raising finishes.
+        */
+       shortest = msecs_to_jiffies(3000);
+       longest = msecs_to_jiffies(13000);
+
+       if (dur < shortest)
+               return shortest;
+       if (dur > longest)
+               return longest;
+
+       return dur;
+}
+
+/*
+ * Update the weight-raising state of bfqq on the arrival of a request;
+ * in this file the function is invoked from
+ * bfq_bfqq_handle_idle_busy_switch.
+ *
+ * @bfqd: per-device scheduler data, source of the weight-raising
+ *     parameters read here (bfq_wr_coeff, bfq_wr_rt_max_time).
+ * @bfqq: queue whose wr_coeff, wr_cur_max_time and related timestamps
+ *     are updated.
+ * @old_wr_coeff: weight-raising coefficient of bfqq before the arrival;
+ *     a value of 1 is handled as "not weight-raised".
+ * @wr_or_deserves_wr: true if bfqq is weight-raised or deserves to be.
+ * @interactive: true if bfqq has been deemed interactive.
+ * @in_burst: true if bfqq belongs to a large burst.
+ * @soft_rt: true if bfqq has been deemed soft real-time.
+ *
+ * Nothing is done if bfqq was not weight-raised (old_wr_coeff == 1) and
+ * does not deserve to be.
+ */
+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
+                                            struct bfq_queue *bfqq,
+                                            unsigned int old_wr_coeff,
+                                            bool wr_or_deserves_wr,
+                                            bool interactive,
+                                            bool in_burst,
+                                            bool soft_rt)
+{
+       if (old_wr_coeff == 1 && wr_or_deserves_wr) {
+               /* start a weight-raising period */
+               if (interactive) {
+                       bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+                       bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+               } else {
+                       /*
+                        * Not interactive: use the soft real-time
+                        * coefficient and duration, and record when
+                        * the switch to soft rt weight-raising
+                        * happened. (Given how the caller in this
+                        * file computes wr_or_deserves_wr, bfqq is
+                        * presumably soft real-time if we get here.)
+                        */
+                       bfqq->wr_start_at_switch_to_srt = jiffies;
+                       bfqq->wr_coeff = bfqd->bfq_wr_coeff *
+                               BFQ_SOFTRT_WEIGHT_FACTOR;
+                       bfqq->wr_cur_max_time =
+                               bfqd->bfq_wr_rt_max_time;
+               }
+
+               /*
+                * If needed, further reduce budget to make sure it is
+                * close to bfqq's backlog, so as to reduce the
+                * scheduling-error component due to a too large
+                * budget. Do not care about throughput consequences,
+                * but only about latency. Finally, do not assign a
+                * too small budget either, to avoid increasing
+                * latency by causing too frequent expirations.
+                */
+               bfqq->entity.budget = min_t(unsigned long,
+                                           bfqq->entity.budget,
+                                           2 * bfq_min_budget(bfqd));
+       } else if (old_wr_coeff > 1) {
+               /* bfqq was already weight-raised when the request arrived */
+               if (interactive) { /* update wr coeff and duration */
+                       bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+                       bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+               } else if (in_burst)
+                       /* bfqq is in a large burst: stop weight-raising it */
+                       bfqq->wr_coeff = 1;
+               else if (soft_rt) {
+                       /*
+                        * The application is now or still meeting the
+                        * requirements for being deemed soft rt.  We
+                        * can then correctly and safely (re)charge
+                        * the weight-raising duration for the
+                        * application with the weight-raising
+                        * duration for soft rt applications.
+                        *
+                        * In particular, doing this recharge now, i.e.,
+                        * before the weight-raising period for the
+                        * application finishes, reduces the probability
+                        * of the following negative scenario:
+                        * 1) the weight of a soft rt application is
+                        *    raised at startup (as for any newly
+                        *    created application),
+                        * 2) since the application is not interactive,
+                        *    at a certain time weight-raising is
+                        *    stopped for the application,
+                        * 3) at that time the application happens to
+                        *    still have pending requests, and hence
+                        *    is destined to not have a chance to be
+                        *    deemed soft rt before these requests are
+                        *    completed (see the comments to the
+                        *    function bfq_bfqq_softrt_next_start()
+                        *    for details on soft rt detection),
+                        * 4) these pending requests experience a high
+                        *    latency because the application is not
+                        *    weight-raised while they are pending.
+                        */
+                       /*
+                        * A wr_cur_max_time different from
+                        * bfq_wr_rt_max_time is used as the sign that
+                        * bfqq is not yet in a soft rt weight-raising
+                        * period: in that case remember when the
+                        * current period started before switching.
+                        */
+                       if (bfqq->wr_cur_max_time !=
+                               bfqd->bfq_wr_rt_max_time) {
+                               bfqq->wr_start_at_switch_to_srt =
+                                       bfqq->last_wr_start_finish;
+
+                               bfqq->wr_cur_max_time =
+                                       bfqd->bfq_wr_rt_max_time;
+                               bfqq->wr_coeff = bfqd->bfq_wr_coeff *
+                                       BFQ_SOFTRT_WEIGHT_FACTOR;
+                       }
+                       /* (re)start the soft rt weight-raising period now */
+                       bfqq->last_wr_start_finish = jiffies;
+               }
+       }
+}
+
+/*
+ * Tell whether bfqq can be considered idle for a long time: this holds
+ * if bfqq->dispatched is zero and more than bfq_wr_min_idle_time has
+ * elapsed since bfqq->budget_timeout.
+ */
+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
+                                       struct bfq_queue *bfqq)
+{
+       if (bfqq->dispatched != 0)
+               return false;
+
+       return time_is_before_jiffies(bfqq->budget_timeout +
+                                     bfqd->bfq_wr_min_idle_time);
+}
+
+/*
+ * Handle the switch of bfqq from idle to busy caused by the arrival of
+ * rq.
+ *
+ * In order, the function: computes whether bfqq is in a large burst,
+ * soft real-time and/or interactive, and whether it is or deserves to
+ * be weight-raised; updates bfqq's budget; possibly detaches bfqq from
+ * the burst it was created in; updates the state used to mark bfqq as
+ * IO_bound; if low_latency is set, updates the weight-raising state;
+ * adds bfqq to the busy queues; and finally expires the in-service
+ * queue if bfqq may need to preempt it.
+ *
+ * @bfqd: per-device scheduler data.
+ * @bfqq: queue switching from idle to busy.
+ * @old_wr_coeff: weight-raising coefficient of bfqq, sampled by the
+ *     caller before invoking this function.
+ * @rq: the request whose arrival triggers the switch.
+ * @interactive: output parameter, set to true if bfqq is not in a large
+ *     burst and has been idle for a long time, to false otherwise.
+ *
+ * NOTE(review): old_wr_coeff is an int here, while bfq_add_request
+ * passes an unsigned int, and the value is then forwarded to an
+ * unsigned int parameter of bfq_update_bfqq_wr_on_rq_arrival and
+ * compared with bfqq->wr_coeff; confirm that the signedness mismatch is
+ * harmless.
+ */
+static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
+                                            struct bfq_queue *bfqq,
+                                            int old_wr_coeff,
+                                            struct request *rq,
+                                            bool *interactive)
+{
+       bool soft_rt, in_burst, wr_or_deserves_wr,
+               bfqq_wants_to_preempt,
+               idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
+               /*
+                * See the comments on
+                * bfq_bfqq_update_budg_for_activation for
+                * details on the usage of the next variable.
+                * The comparison is made against ktime_get_ns(),
+                * i.e., in nanoseconds.
+                */
+               arrived_in_time =  ktime_get_ns() <=
+                       bfqq->ttime.last_end_request +
+                       bfqd->bfq_slice_idle * 3;
+
+       bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags);
+
+       /*
+        * bfqq deserves to be weight-raised if:
+        * - it is sync,
+        * - it does not belong to a large burst,
+        * - it has been idle for enough time or is soft real-time,
+        * - is linked to a bfq_io_cq (it is not shared in any sense).
+        *
+        * In addition, wr_or_deserves_wr is true also if bfqq is
+        * already weight-raised (wr_coeff > 1), and is always false
+        * if low_latency is not set.
+        */
+       in_burst = bfq_bfqq_in_large_burst(bfqq);
+       soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
+               !in_burst &&
+               time_is_before_jiffies(bfqq->soft_rt_next_start);
+       *interactive = !in_burst && idle_for_long_time;
+       wr_or_deserves_wr = bfqd->low_latency &&
+               (bfqq->wr_coeff > 1 ||
+                (bfq_bfqq_sync(bfqq) &&
+                 bfqq->bic && (*interactive || soft_rt)));
+
+       /*
+        * Using the last flag, update budget and check whether bfqq
+        * may want to preempt the in-service queue.
+        */
+       bfqq_wants_to_preempt =
+               bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
+                                                   arrived_in_time,
+                                                   wr_or_deserves_wr);
+
+       /*
+        * If bfqq happened to be activated in a burst, but has been
+        * idle for much more than an interactive queue, then we
+        * assume that, in the overall I/O initiated in the burst, the
+        * I/O associated with bfqq is finished. So bfqq does not need
+        * to be treated as a queue belonging to a burst
+        * anymore. Accordingly, we reset bfqq's in_large_burst flag
+        * if set, and remove bfqq from the burst list if it's
+        * there. We do not decrement burst_size, because the fact
+        * that bfqq does not need to belong to the burst list any
+        * more does not invalidate the fact that bfqq was created in
+        * a burst.
+        *
+        * "Much more" means here that more than 10 seconds have
+        * elapsed since bfqq->budget_timeout. Queues that have just
+        * been created are left alone.
+        */
+       if (likely(!bfq_bfqq_just_created(bfqq)) &&
+           idle_for_long_time &&
+           time_is_before_jiffies(
+                   bfqq->budget_timeout +
+                   msecs_to_jiffies(10000))) {
+               hlist_del_init(&bfqq->burst_list_node);
+               bfq_clear_bfqq_in_large_burst(bfqq);
+       }
+
+       bfq_clear_bfqq_just_created(bfqq);
+
+
+       /*
+        * If bfqq is not yet marked as IO_bound, count the consecutive
+        * activations for which the request arrived in time, and mark
+        * bfqq as IO_bound when the count reaches
+        * bfq_requests_within_timer. A request that does not arrive
+        * in time resets the count.
+        */
+       if (!bfq_bfqq_IO_bound(bfqq)) {
+               if (arrived_in_time) {
+                       bfqq->requests_within_timer++;
+                       if (bfqq->requests_within_timer >=
+                           bfqd->bfq_requests_within_timer)
+                               bfq_mark_bfqq_IO_bound(bfqq);
+               } else
+                       bfqq->requests_within_timer = 0;
+       }
+
+       if (bfqd->low_latency) {
+               if (unlikely(time_is_after_jiffies(bfqq->split_time)))
+                       /* wraparound */
+                       bfqq->split_time =
+                               jiffies - bfqd->bfq_wr_min_idle_time - 1;
+
+               /*
+                * Update the weight-raising state only if more than
+                * bfq_wr_min_idle_time has elapsed since split_time.
+                */
+               if (time_is_before_jiffies(bfqq->split_time +
+                                          bfqd->bfq_wr_min_idle_time)) {
+                       bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
+                                                        old_wr_coeff,
+                                                        wr_or_deserves_wr,
+                                                        *interactive,
+                                                        in_burst,
+                                                        soft_rt);
+
+                       /* flag the change of the weight-raising coefficient */
+                       if (old_wr_coeff != bfqq->wr_coeff)
+                               bfqq->entity.prio_changed = 1;
+               }
+       }
+
+       bfqq->last_idle_bklogged = jiffies;
+       bfqq->service_from_backlogged = 0;
+       bfq_clear_bfqq_softrt_update(bfqq);
+
+       bfq_add_bfqq_busy(bfqd, bfqq);
+
+       /*
+        * Expire in-service queue only if preemption may be needed
+        * for guarantees. In this respect, the function
+        * next_queue_may_preempt just checks a simple, necessary
+        * condition, and not a sufficient condition based on
+        * timestamps. In fact, for the latter condition to be
+        * evaluated, timestamps would need first to be updated, and
+        * this operation is quite costly (see the comments on the
+        * function bfq_bfqq_update_budg_for_activation).
+        *
+        * Note that, in addition, the in-service queue is expired
+        * only if its wr_coeff is strictly lower than that of bfqq.
+        */
+       if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
+           bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
+           next_queue_may_preempt(bfqd))
+               bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
+                               false, BFQQE_PREEMPTED);
+}
+
+/*
+ * Insert rq into the bfq_queue it is associated with (RQ_BFQQ(rq)).
+ *
+ * The function updates the counters of queued requests, adds rq to the
+ * sort list of the queue, re-evaluates the queue's next_rq and, if the
+ * latter changes, invokes bfq_pos_tree_add_move. Then, if the queue was
+ * not busy, the idle-to-busy switch is handled; otherwise weight-raising
+ * may be started for the queue (see the condition below). Finally,
+ * last_wr_start_finish is refreshed where needed.
+ */
+static void bfq_add_request(struct request *rq)
+{
+       struct bfq_queue *bfqq = RQ_BFQQ(rq);
+       struct bfq_data *bfqd = bfqq->bfqd;
+       struct request *next_rq, *prev;
+       /* sampled now, before any update to the weight-raising state */
+       unsigned int old_wr_coeff = bfqq->wr_coeff;
+       bool interactive = false;
+
+       bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
+       bfqq->queued[rq_is_sync(rq)]++;
+       bfqd->queued++;
+
+       elv_rb_add(&bfqq->sort_list, rq);
+
+       /*
+        * Check if this request is a better next-serve candidate.
+        */
+       prev = bfqq->next_rq;
+       next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
+       bfqq->next_rq = next_rq;
+
+       /*
+        * Adjust priority tree position, if next_rq changes.
+        */
+       if (prev != bfqq->next_rq)
+               bfq_pos_tree_add_move(bfqd, bfqq);
+
+       if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
+               bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
+                                                rq, &interactive);
+       else {
+               /*
+                * bfqq is already busy. With low_latency set, start
+                * weight-raising bfqq if it was not weight-raised, rq
+                * is not sync, and more than
+                * bfq_wr_min_inter_arr_async has elapsed since
+                * last_wr_start_finish (which, for non weight-raised
+                * queues, stores the arrival time of the last
+                * request, as explained below).
+                */
+               if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
+                   time_is_before_jiffies(
+                               bfqq->last_wr_start_finish +
+                               bfqd->bfq_wr_min_inter_arr_async)) {
+                       bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+                       bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+
+                       bfqd->wr_busy_queues++;
+                       bfqq->entity.prio_changed = 1;
+               }
+               if (prev != bfqq->next_rq)
+                       bfq_updated_next_req(bfqd, bfqq);
+       }
+
+       /*
+        * Assign jiffies to last_wr_start_finish in the following
+        * cases:
+        *
+        * . if bfqq is not going to be weight-raised (its wr_coeff
+        *   is 1 at this point), because, for non weight-raised
+        *   queues, last_wr_start_finish stores the arrival time of
+        *   the last request; as of now, this piece of information
+        *   is used only for deciding whether to weight-raise async
+        *   queues
+        *
+        * . if bfqq was not weight-raised (old_wr_coeff is 1),
+        *   because, if bfqq is now switching to weight-raised, then
+        *   last_wr_start_finish stores the time when weight-raising
+        *   starts
+        *
+        * . if bfqq is interactive, because, regardless of whether
+        *   bfqq is currently weight-raised, the weight-raising
+        *   period must start or restart (this case is considered
+        *   separately because it is not detected by the above
+        *   conditions, if bfqq is already weight-raised)
+        *
+        * last_wr_start_finish has to be updated also if bfqq is soft
+        * real-time, because the weight-raising period is constantly
+        * restarted on idle-to-busy transitions for these queues, but
+        * this is already done in bfq_bfqq_handle_idle_busy_switch if
+        * needed.
+        */
+       if (bfqd->low_latency &&
+               (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
+               bfqq->last_wr_start_finish = jiffies;
+}
+
+/*
+ * Search the sort list of the queue stored in bfqd->bio_bfqq, using
+ * the end sector of bio as the lookup key, and return what elv_rb_find
+ * yields. Return NULL if bfqd->bio_bfqq is not set. The parameter q is
+ * not used.
+ */
+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
+                                         struct bio *bio,
+                                         struct request_queue *q)
+{
+       struct bfq_queue *bio_queue = bfqd->bio_bfqq;
+
+       if (!bio_queue)
+               return NULL;
+
+       return elv_rb_find(&bio_queue->sort_list, bio_end_sector(bio));
+}
+
+/*
+ * Return the absolute distance between the position of rq and
+ * last_pos, or 0 if last_pos is 0 (presumably meaning that no previous
+ * position is available).
+ */
+static sector_t get_sdist(sector_t last_pos, struct request *rq)
+{
+       if (!last_pos)
+               return 0;
+
+       return abs(blk_rq_pos(rq) - last_pos);
+}
+
+#if 0 /* Still not clear if we can do without next two functions */
+/*
+ * Compiled out. Increments the rq_in_driver counter of the bfq_data
+ * attached to the elevator of q; rq itself is not used.
+ */
+static void bfq_activate_request(struct request_queue *q, struct request *rq)
+{
+       struct bfq_data *bfqd = q->elevator->elevator_data;
+
+       bfqd->rq_in_driver++;
+}
+
+/*
+ * Compiled out. Counterpart of bfq_activate_request: decrements the
+ * rq_in_driver counter of the bfq_data attached to the elevator of q;
+ * rq itself is not used.
+ */
+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
+{
+       struct bfq_data *bfqd = q->elevator->elevator_data;
+
+       bfqd->rq_in_driver--;
+}
+#endif
+
+/*
+ * Remove rq from the bfq_queue it belongs to (RQ_BFQQ(rq)).
+ *
+ * If rq was the next request to serve for the queue, a new next_rq is
+ * chosen first. Then rq is unlinked from the fifo (rq->queuelist) and
+ * from the sort_list, the per-queue and per-device counters of queued
+ * requests are decremented, and rq is dropped from the request hash
+ * and from q->last_merge. If the sort_list becomes empty, the queue is
+ * also taken out of the busy queues (unless it is in service) and out
+ * of the request-position tree. Finally, the meta_pending counter and
+ * the group statistics are updated.
+ */
+static void bfq_remove_request(struct request_queue *q,
+                              struct request *rq)
+{
+       struct bfq_queue *bfqq = RQ_BFQQ(rq);
+       struct bfq_data *bfqd = bfqq->bfqd;
+       const int sync = rq_is_sync(rq);
+
+       if (bfqq->next_rq == rq) {
+               bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
+               bfq_updated_next_req(bfqd, bfqq);
+       }
+
+       /* Unlink rq only if it is actually linked in a list */
+       if (rq->queuelist.prev != &rq->queuelist)
+               list_del_init(&rq->queuelist);
+       bfqq->queued[sync]--;
+       bfqd->queued--;
+       elv_rb_del(&bfqq->sort_list, rq);
+
+       elv_rqhash_del(q, rq);
+       /* Do not leave a dangling pointer to rq in q->last_merge */
+       if (q->last_merge == rq)
+               q->last_merge = NULL;
+
+       if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
+               bfqq->next_rq = NULL;
+
+               if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
+                       bfq_del_bfqq_busy(bfqd, bfqq, false);
+                       /*
+                        * bfqq emptied. In normal operation, when
+                        * bfqq is empty, bfqq->entity.service and
+                        * bfqq->entity.budget must contain,
+                        * respectively, the service received and the
+                        * budget used last time bfqq emptied. These
+                        * facts do not hold in this case, as at least
+                        * this last removal occurred while bfqq is
+                        * not in service. To avoid inconsistencies,
+                        * reset both bfqq->entity.service and
+                        * bfqq->entity.budget, if bfqq has still a
+                        * process that may issue I/O requests to it.
+                        */
+                       bfqq->entity.budget = bfqq->entity.service = 0;
+               }
+
+               /*
+                * Remove queue from request-position tree as it is empty.
+                */
+               if (bfqq->pos_root) {
+                       rb_erase(&bfqq->pos_node, bfqq->pos_root);
+                       bfqq->pos_root = NULL;
+               }
+       }
+
+       if (rq->cmd_flags & REQ_META)
+               bfqq->meta_pending--;
+
+       bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
+}
+
+/*
+ * Try to merge bio with an already queued request.
+ *
+ * The bic looked up from the io_context of the current task, and the
+ * bfq_queue that bic provides for the sync/async class of bio, are
+ * stored in bfqd->bio_bic and bfqd->bio_bfqq under bfqd->lock, so that
+ * the hooks run by blk_mq_sched_try_merge() can use them. Returns the
+ * value returned by blk_mq_sched_try_merge().
+ */
+static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+{
+       struct request_queue *q = hctx->queue;
+       struct bfq_data *bfqd = q->elevator->elevator_data;
+       struct request *free = NULL;
+       /*
+        * bfq_bic_lookup grabs the queue_lock: invoke it now and
+        * store its return value for later use, to avoid nesting
+        * queue_lock inside the bfqd->lock. We assume that the bic
+        * returned by bfq_bic_lookup does not go away before
+        * bfqd->lock is taken.
+        */
+       struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
+       bool ret;
+
+       spin_lock_irq(&bfqd->lock);
+
+       if (bic)
+               bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
+       else
+               bfqd->bio_bfqq = NULL;
+       bfqd->bio_bic = bic;
+
+       ret = blk_mq_sched_try_merge(q, bio, &free);
+
+       /* A request handed back through 'free' is released before unlocking */
+       if (free)
+               blk_mq_free_request(free);
+       spin_unlock_irq(&bfqd->lock);
+
+       return ret;
+}
+
+/*
+ * Tell whether bio can be front-merged with a queued request. On
+ * success, store the request in *req and return ELEVATOR_FRONT_MERGE;
+ * otherwise return ELEVATOR_NO_MERGE and leave *req untouched.
+ */
+static int bfq_request_merge(struct request_queue *q, struct request **req,
+                            struct bio *bio)
+{
+       struct bfq_data *bfqd = q->elevator->elevator_data;
+       struct request *candidate = bfq_find_rq_fmerge(bfqd, bio, q);
+
+       if (!candidate || !elv_bio_merge_ok(candidate, bio))
+               return ELEVATOR_NO_MERGE;
+
+       *req = candidate;
+       return ELEVATOR_FRONT_MERGE;
+}
+
+/*
+ * Invoked after a bio has been merged into req. Work is needed only
+ * for a front merge that leaves req starting before its predecessor
+ * in the sort_list: in that case req is re-inserted in the sort_list
+ * and the next request to serve for its queue is chosen again.
+ */
+static void bfq_request_merged(struct request_queue *q, struct request *req,
+                              enum elv_merge type)
+{
+       struct request *old_next, *new_next;
+       struct rb_node *prev_node;
+       struct bfq_queue *bfqq;
+       struct bfq_data *bfqd;
+
+       if (type != ELEVATOR_FRONT_MERGE)
+               return;
+
+       prev_node = rb_prev(&req->rb_node);
+       if (!prev_node ||
+           blk_rq_pos(req) >=
+           blk_rq_pos(container_of(prev_node, struct request, rb_node)))
+               return;
+
+       bfqq = RQ_BFQQ(req);
+       bfqd = bfqq->bfqd;
+
+       /* Move req to the slot matching its new start sector */
+       elv_rb_del(&bfqq->sort_list, req);
+       elv_rb_add(&bfqq->sort_list, req);
+
+       /* Pick again the next request to be served for bfqq */
+       old_next = bfqq->next_rq;
+       new_next = bfq_choose_req(bfqd, old_next, req, bfqd->last_position);
+       bfqq->next_rq = new_next;
+
+       /*
+        * A different next_rq requires refreshing both the budget of
+        * the queue, to fit the new request, and the position of the
+        * queue in its rq_pos_tree.
+        */
+       if (new_next != old_next) {
+               bfq_updated_next_req(bfqd, bfqq);
+               bfq_pos_tree_add_move(bfqd, bfqq);
+       }
+}
+
+/*
+ * Invoked after request next has been merged into rq: possibly let rq
+ * inherit next's position in the fifo and its role of next request to
+ * serve, then remove next from its bfq_queue and update the merge
+ * statistics of the group of rq's queue.
+ *
+ * bfqd->lock is expected to be already held by the caller: merges are
+ * attempted with that lock taken (see, e.g., bfq_bio_merge(), which
+ * grabs it around blk_mq_sched_try_merge()). So the lock must not be
+ * grabbed again here, or the caller would deadlock on itself.
+ *
+ * No bail-out on !RB_EMPTY_NODE(&rq->rb_node) either: rq is a queued
+ * request, hence linked in a sort_list, and such a check would turn
+ * the repositioning of rq and the removal of next into dead code.
+ */
+static void bfq_requests_merged(struct request_queue *q, struct request *rq,
+                               struct request *next)
+{
+       struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
+
+       /*
+        * If next and rq belong to the same bfq_queue and next is older
+        * than rq, then reposition rq in the fifo (by substituting next
+        * with rq). Otherwise, if next and rq belong to different
+        * bfq_queues, never reposition rq: in fact, we would have to
+        * reposition it with respect to next's position in its own fifo,
+        * which would most certainly be too expensive with respect to
+        * the benefits.
+        */
+       if (bfqq == next_bfqq &&
+           !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
+           next->fifo_time < rq->fifo_time) {
+               list_del_init(&rq->queuelist);
+               list_replace_init(&next->queuelist, &rq->queuelist);
+               rq->fifo_time = next->fifo_time;
+       }
+
+       /* next is going away: do not leave it as next request to serve */
+       if (bfqq->next_rq == next)
+               bfqq->next_rq = rq;
+
+       bfq_remove_request(q, next);
+
+       bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
+}
+
+/*
+ * Terminate weight raising for bfqq, which must not be NULL: the
+ * raising coefficient goes back to 1, the raising duration is zeroed
+ * and the current time is recorded in last_wr_start_finish. A busy
+ * queue is also discounted from bfqd->wr_busy_queues.
+ */
+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
+{
+       if (bfq_bfqq_busy(bfqq))
+               bfqq->bfqd->wr_busy_queues--;
+
+       bfqq->last_wr_start_finish = jiffies;
+       bfqq->wr_cur_max_time = 0;
+       bfqq->wr_coeff = 1;
+
+       /*
+        * Flag the entity, so that the weight is recomputed on the
+        * next invocation of __bfq_entity_update_weight_prio.
+        */
+       bfqq->entity.prio_changed = 1;
+}
+
+/*
+ * End weight raising for every async queue allocated in bfqg: the
+ * 2 x IOPRIO_BE_NR entries of async_bfqq and async_idle_bfqq. Slots
+ * holding no queue are skipped.
+ */
+void bfq_end_wr_async_queues(struct bfq_data *bfqd,
+                            struct bfq_group *bfqg)
+{
+       int row, col;
+
+       for (row = 0; row < 2; row++) {
+               for (col = 0; col < IOPRIO_BE_NR; col++) {
+                       if (!bfqg->async_bfqq[row][col])
+                               continue;
+                       bfq_bfqq_end_wr(bfqg->async_bfqq[row][col]);
+               }
+       }
+
+       if (bfqg->async_idle_bfqq)
+               bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
+}
+
+/*
+ * With bfqd->lock held, end weight raising for all the queues in the
+ * active and idle lists of bfqd, then invoke bfq_end_wr_async().
+ */
+static void bfq_end_wr(struct bfq_data *bfqd)
+{
+       struct bfq_queue *cur;
+
+       spin_lock_irq(&bfqd->lock);
+
+       list_for_each_entry(cur, &bfqd->active_list, bfqq_list) {
+               bfq_bfqq_end_wr(cur);
+       }
+
+       list_for_each_entry(cur, &bfqd->idle_list, bfqq_list) {
+               bfq_bfqq_end_wr(cur);
+       }
+
+       bfq_end_wr_async(bfqd);
+
+       spin_unlock_irq(&bfqd->lock);
+}
+
+/*
+ * Start sector of io_struct, which is interpreted as a struct request
+ * if 'request' is true, and as a struct bio otherwise.
+ */
+static sector_t bfq_io_struct_pos(void *io_struct, bool request)
+{
+       if (!request)
+               return ((struct bio *)io_struct)->bi_iter.bi_sector;
+
+       return blk_rq_pos(io_struct);
+}
+
+/*
+ * Non-zero if the position of io_struct (a request or a bio, depending
+ * on 'request') is at most BFQQ_CLOSE_THR away from sector.
+ */
+static int bfq_rq_close_to_sector(void *io_struct, bool request,
+                                 sector_t sector)
+{
+       sector_t pos = bfq_io_struct_pos(io_struct, request);
+
+       return abs(pos - sector) <= BFQQ_CLOSE_THR;
+}
+
+/*
+ * Search the rq_pos_tree of the group of bfqq for a queue whose next
+ * request is located at, or close to, sector. Return that queue, or
+ * NULL if the tree is empty or no queue is close enough.
+ */
+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
+                                        struct bfq_queue *bfqq,
+                                        sector_t sector)
+{
+       struct rb_root *pos_tree = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+       struct rb_node *parent, *neighbour;
+       struct bfq_queue *candidate;
+
+       if (RB_EMPTY_ROOT(pos_tree))
+               return NULL;
+
+       /*
+        * Best case: a queue whose next request starts exactly at
+        * sector, i.e., at the end of the last request.
+        */
+       candidate = bfq_rq_pos_tree_lookup(bfqd, pos_tree, sector,
+                                          &parent, NULL);
+       if (candidate)
+               return candidate;
+
+       /*
+        * No exact match: the parent of the NULL leaf reached by the
+        * lookup holds the closest sector (rq_pos_tree is sorted by
+        * next_request position).
+        */
+       candidate = rb_entry(parent, struct bfq_queue, pos_node);
+       if (bfq_rq_close_to_sector(candidate->next_rq, true, sector))
+               return candidate;
+
+       /* Last chance: the neighbour lying on the side of sector */
+       neighbour = blk_rq_pos(candidate->next_rq) < sector ?
+               rb_next(&candidate->pos_node) :
+               rb_prev(&candidate->pos_node);
+       if (!neighbour)
+               return NULL;
+
+       candidate = rb_entry(neighbour, struct bfq_queue, pos_node);
+       if (!bfq_rq_close_to_sector(candidate->next_rq, true, sector))
+               return NULL;
+
+       return candidate;
+}
+
+/*
+ * Return a queue, different from cur_bfqq, whose next request is close
+ * to sector, or NULL if there is none.
+ *
+ * Queues working closely on the same area of the device are said to be
+ * cooperating. Grouping them together allows us to: 1) not waste time
+ * idling, and 2) serve the union of their requests in the best
+ * possible order for throughput.
+ */
+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd,
+                                                  struct bfq_queue *cur_bfqq,
+                                                  sector_t sector)
+{
+       struct bfq_queue *close = bfqq_find_close(bfqd, cur_bfqq, sector);
+
+       /* A queue cannot cooperate with itself */
+       if (close == cur_bfqq)
+               return NULL;
+
+       return close;
+}
+
+/*
+ * Schedule the merge of bfqq into new_bfqq or, if new_bfqq is itself
+ * already scheduled to be merged, into the last queue of the
+ * ->new_bfqq chain starting from new_bfqq.
+ *
+ * Return the queue bfqq is redirected to, or NULL if no merge has been
+ * scheduled: either one of the queues has no process reference, or the
+ * chain leads back to bfqq. On success, the target queue gets as many
+ * additional references as the process references of bfqq.
+ */
+static struct bfq_queue *
+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
+{
+       int process_refs, new_process_refs;
+       struct bfq_queue *__bfqq;
+
+       /*
+        * If there are no process references on the new_bfqq, then it is
+        * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
+        * may have dropped their last reference (not just their last process
+        * reference).
+        */
+       if (!bfqq_process_refs(new_bfqq))
+               return NULL;
+
+       /* Avoid a circular list and skip interim queue merges. */
+       while ((__bfqq = new_bfqq->new_bfqq)) {
+               if (__bfqq == bfqq)
+                       return NULL;
+               new_bfqq = __bfqq;
+       }
+
+       process_refs = bfqq_process_refs(bfqq);
+       new_process_refs = bfqq_process_refs(new_bfqq);
+       /*
+        * If the process for the bfqq has gone away, there is no
+        * sense in merging the queues.
+        */
+       if (process_refs == 0 || new_process_refs == 0)
+               return NULL;
+
+       bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
+               new_bfqq->pid);
+
+       /*
+        * Merging is just a redirection: the requests of the process
+        * owning one of the two queues are redirected to the other queue.
+        * The latter queue, in its turn, is set as shared if this is the
+        * first time that the requests of some process are redirected to
+        * it.
+        *
+        * We redirect bfqq to new_bfqq and not the opposite, because
+        * we are in the context of the process owning bfqq, thus we
+        * have the io_cq of this process. So we can immediately
+        * configure this io_cq to redirect the requests of the
+        * process to new_bfqq. In contrast, the io_cq of new_bfqq is
+        * not available any more (new_bfqq->bic == NULL).
+        *
+        * Anyway, even in case new_bfqq coincides with the in-service
+        * queue, redirecting requests to the in-service queue is the
+        * best option, as we feed the in-service queue with new
+        * requests close to the last request served and, by doing so,
+        * are likely to increase the throughput.
+        */
+       bfqq->new_bfqq = new_bfqq;
+       new_bfqq->ref += process_refs;
+       return new_bfqq;
+}
+
+/*
+ * Tell whether bfqq and new_bfqq are eligible for being merged, on the
+ * basis of their I/O-priority class, seekiness and sync/async nature.
+ */
+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
+                                       struct bfq_queue *new_bfqq)
+{
+       /* Idle-class queues, and queues of different classes, never merge */
+       if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq))
+               return false;
+       if (bfqq->ioprio_class != new_bfqq->ioprio_class)
+               return false;
+
+       /*
+        * Merging a queue already detected as seeky with another queue
+        * is unlikely to produce sequential I/O.
+        */
+       if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq))
+               return false;
+
+       /*
+        * Interleaved I/O is known to be done by (some) applications
+        * only for reads, so merging makes sense only if both queues
+        * are sync.
+        */
+       return bfq_bfqq_sync(bfqq) && bfq_bfqq_sync(new_bfqq);
+}
+
+/*
+ * Return true if bfqq is weight-raised and more than 100 ms have
+ * elapsed since last_wr_start_finish; in that case bfqq cannot be
+ * merged. The idea is that true cooperation happens very early after
+ * processes start to do I/O, while late cooperations are usually just
+ * accidental false positives, which, for a weight-raised bfqq, would
+ * evidently degrade latency guarantees.
+ */
+static bool wr_from_too_long(struct bfq_queue *bfqq)
+{
+       if (bfqq->wr_coeff <= 1)
+               return false;
+
+       return time_is_before_jiffies(bfqq->last_wr_start_finish +
+                                     msecs_to_jiffies(100));
+}
+
+/*
+ * Attempt to schedule a merge of bfqq with the currently in-service
+ * queue or with a close queue among the scheduled queues.  Return
+ * NULL if no merge was scheduled, a pointer to the shared bfq_queue
+ * structure otherwise.
+ *
+ * io_struct is the request (if 'request' is true) or the bio (if
+ * 'request' is false) whose position is used to look for a close
+ * queue; no merge is attempted if io_struct is NULL. If a merge has
+ * already been scheduled for bfqq, the already chosen queue
+ * (bfqq->new_bfqq) is returned right away.
+ *
+ * The OOM queue is not allowed to participate in cooperation: in fact, since
+ * the requests temporarily redirected to the OOM queue could be redirected
+ * again to dedicated queues at any time, the state needed to correctly
+ * handle merging with the OOM queue would be quite complex and expensive
+ * to maintain. Besides, in such a critical condition as an out of memory,
+ * the benefits of queue merging may be little relevant, or even negligible.
+ *
+ * Weight-raised queues can be merged only if their weight-raising
+ * period has just started. In fact cooperating processes are usually
+ * started together. Thus, with this filter we avoid false positives
+ * that would jeopardize low-latency guarantees.
+ *
+ * WARNING: queue merging may impair fairness among non-weight raised
+ * queues, for at least two reasons: 1) the original weight of a
+ * merged queue may change during the merged state, 2) even being the
+ * weight the same, a merged queue may be bloated with many more
+ * requests than the ones produced by its originally-associated
+ * process.
+ */
+static struct bfq_queue *
+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+                    void *io_struct, bool request)
+{
+       struct bfq_queue *in_service_bfqq, *new_bfqq;
+
+       if (bfqq->new_bfqq)
+               return bfqq->new_bfqq;
+
+       if (!io_struct ||
+           wr_from_too_long(bfqq) ||
+           unlikely(bfqq == &bfqd->oom_bfqq))
+               return NULL;
+
+       /* If there is only one backlogged queue, don't search. */
+       if (bfqd->busy_queues == 1)
+               return NULL;
+
+       in_service_bfqq = bfqd->in_service_queue;
+
+       /* Skip the in-service queue if it is not a valid merge target */
+       if (!in_service_bfqq || in_service_bfqq == bfqq
+           || wr_from_too_long(in_service_bfqq) ||
+           unlikely(in_service_bfqq == &bfqd->oom_bfqq))
+               goto check_scheduled;
+
+       /*
+        * Try the in-service queue first: it is a candidate if
+        * io_struct is close to the last position served and the two
+        * queues have the same parent entity.
+        */
+       if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
+           bfqq->entity.parent == in_service_bfqq->entity.parent &&
+           bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
+               new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
+               if (new_bfqq)
+                       return new_bfqq;
+       }
+       /*
+        * Check whether there is a cooperator among currently scheduled
+        * queues. The only thing we need is that the bio/request is not
+        * NULL, as we need it to establish whether a cooperator exists.
+        */
+check_scheduled:
+       new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
+                       bfq_io_struct_pos(io_struct, request));
+
+       if (new_bfqq && !wr_from_too_long(new_bfqq) &&
+           likely(new_bfqq != &bfqd->oom_bfqq) &&
+           bfq_may_be_close_cooperator(bfqq, new_bfqq))
+               return bfq_setup_merge(bfqq, new_bfqq);
+
+       return NULL;
+}
+
+/*
+ * Copy into the bic of bfqq, if any, the state of bfqq that has to
+ * survive a merge: think time, idle window, I/O-bound and burst
+ * flags, and weight-raising parameters.
+ */
+static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
+{
+       struct bfq_io_cq *bic = bfqq->bic;
+
+       /*
+        * A NULL bfqq->bic means that the queue is already shared, or
+        * that its requests have already been redirected to a shared
+        * queue. In both cases idle window and weight-raising state
+        * have already been saved, so there is nothing to do.
+        */
+       if (!bic)
+               return;
+
+       /* Weight-raising state */
+       bic->saved_wr_coeff = bfqq->wr_coeff;
+       bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
+       bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
+       bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
+
+       /* Think time, flags and burst-list membership */
+       bic->saved_ttime = bfqq->ttime;
+       bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
+       bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
+       bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
+       bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
+}
+
+/*
+ * Actually merge bfqq into new_bfqq: save the state of both queues in
+ * their bics, hand the I/O-bound flag and, if applicable, the weight
+ * raising of bfqq over to new_bfqq, redirect bic to new_bfqq, mark
+ * new_bfqq as a cooperating (shared) queue and finally drop the
+ * reference to bfqq.
+ */
+static void
+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
+               struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
+{
+       bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
+               (unsigned long)new_bfqq->pid);
+       /* Save weight raising and idle window of the merged queues */
+       bfq_bfqq_save_state(bfqq);
+       bfq_bfqq_save_state(new_bfqq);
+       /* The I/O-bound flag moves from bfqq to new_bfqq */
+       if (bfq_bfqq_IO_bound(bfqq))
+               bfq_mark_bfqq_IO_bound(new_bfqq);
+       bfq_clear_bfqq_IO_bound(bfqq);
+
+       /*
+        * If bfqq is weight-raised, then let new_bfqq inherit
+        * weight-raising. To reduce false positives, neglect the case
+        * where bfqq has just been created, but has not yet made it
+        * to be weight-raised (which may happen because EQM may merge
+        * bfqq even before bfq_add_request is executed for the first
+        * time for bfqq). Handling this case would however be very
+        * easy, thanks to the flag just_created.
+        */
+       if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) {
+               new_bfqq->wr_coeff = bfqq->wr_coeff;
+               new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time;
+               new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish;
+               new_bfqq->wr_start_at_switch_to_srt =
+                       bfqq->wr_start_at_switch_to_srt;
+               if (bfq_bfqq_busy(new_bfqq))
+                       bfqd->wr_busy_queues++;
+               new_bfqq->entity.prio_changed = 1;
+       }
+
+       if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */
+               bfqq->wr_coeff = 1;
+               bfqq->entity.prio_changed = 1;
+               if (bfq_bfqq_busy(bfqq))
+                       bfqd->wr_busy_queues--;
+       }
+
+       bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d",
+                    bfqd->wr_busy_queues);
+
+       /*
+        * Merge queues (that is, let bic redirect its requests to new_bfqq)
+        */
+       bic_set_bfqq(bic, new_bfqq, 1);
+       bfq_mark_bfqq_coop(new_bfqq);
+       /*
+        * new_bfqq now belongs to at least two bics (it is a shared queue):
+        * set new_bfqq->bic to NULL. bfqq either:
+        * - does not belong to any bic any more, and hence bfqq->bic must
+        *   be set to NULL, or
+        * - is a queue whose owning bics have already been redirected to a
+        *   different queue, hence the queue is destined to not belong to
+        *   any bic soon and bfqq->bic is already NULL (therefore the next
+        *   assignment causes no harm).
+        */
+       new_bfqq->bic = NULL;
+       bfqq->bic = NULL;
+       /* release process reference to bfqq */
+       bfq_put_queue(bfqq);
+}
+
+/*
+ * Tell whether bio may be merged into rq. The merge is allowed only
+ * if rq is queued in the bfq_queue that bio is destined to, i.e., in
+ * bfqd->bio_bfqq or in the queue the latter gets merged with here.
+ * Merging a sync bio into an async request is never allowed.
+ */
+static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
+                               struct bio *bio)
+{
+       struct bfq_data *bfqd = q->elevator->elevator_data;
+       bool is_sync = op_is_sync(bio->bi_opf);
+       struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq;
+
+       /*
+        * Disallow merge of a sync bio into an async request.
+        */
+       if (is_sync && !rq_is_sync(rq))
+               return false;
+
+       /*
+        * Lookup the bfqq that this bio will be queued with. Allow
+        * merge only if rq is queued there.
+        */
+       if (!bfqq)
+               return false;
+
+       /*
+        * We take advantage of this function to perform an early merge
+        * of the queues of possible cooperating processes.
+        */
+       new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
+       if (new_bfqq) {
+               /*
+                * bic still points to bfqq, then it has not yet been
+                * redirected to some other bfq_queue, and a queue
+                * merge between bfqq and new_bfqq can be safely
+                * fulfilled, i.e., bic can be redirected to new_bfqq
+                * and bfqq can be put.
+                */
+               bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq,
+                               new_bfqq);
+               /*
+                * If we get here, bio will be queued into new_bfqq,
+                * so use new_bfqq to decide whether bio and rq can be
+                * merged.
+                */
+               bfqq = new_bfqq;
+
+               /*
+                * Change also bfqd->bio_bfqq, as
+                * bfqd->bio_bic now points to new_bfqq, and
+                * this function may be invoked again (and then may
+                * use again bfqd->bio_bfqq).
+                */
+               bfqd->bio_bfqq = bfqq;
+       }
+
+       return bfqq == RQ_BFQQ(rq);
+}
+
+/*
+ * Record the start of a new budget (bfqd->last_budget_start) and set
+ * the maximum time for the in-service queue to consume its budget.
+ * This prevents seeky processes from lowering the throughput. In
+ * practice, a time-slice service scheme is used with seeky processes.
+ *
+ * The timeout is bfqd->bfq_timeout multiplied by the ratio between
+ * the current and the original weight of the queue; the ratio is
+ * taken as 1 if wr_cur_max_time equals bfqd->bfq_wr_rt_max_time.
+ */
+static void bfq_set_budget_timeout(struct bfq_data *bfqd,
+                                  struct bfq_queue *bfqq)
+{
+       unsigned int timeout_coeff = 1;
+
+       if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time)
+               timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
+
+       bfqd->last_budget_start = ktime_get();
+
+       bfqq->budget_timeout = jiffies + bfqd->bfq_timeout * timeout_coeff;
+}
+
+/*
+ * Make bfqq the in-service queue of bfqd; bfqq may be NULL, in which
+ * case bfqd->in_service_queue is just reset to NULL.
+ *
+ * For a non-NULL bfqq, before the switch: group statistics are
+ * updated, the fifo_expire flag is cleared, bfqd->budgets_assigned is
+ * updated, the start of the weight-raising period is possibly moved
+ * forward (see the comment in the body) and the budget timeout is set.
+ */
+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
+                                      struct bfq_queue *bfqq)
+{
+       if (bfqq) {
+               bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
+               bfq_clear_bfqq_fifo_expire(bfqq);
+
+               bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8;
+
+               if (time_is_before_jiffies(bfqq->last_wr_start_finish) &&
+                   bfqq->wr_coeff > 1 &&
+                   bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
+                   time_is_before_jiffies(bfqq->budget_timeout)) {
+                       /*
+                        * For soft real-time queues, move the start
+                        * of the weight-raising period forward by the
+                        * time the queue has not received any
+                        * service. Otherwise, a relatively long
+                        * service delay is likely to cause the
+                        * weight-raising period of the queue to end,
+                        * because of the short duration of the
+                        * weight-raising period of a soft real-time
+                        * queue.  It is worth noting that this move
+                        * is not so dangerous for the other queues,
+                        * because soft real-time queues are not
+                        * greedy.
+                        *
+                        * To not add a further variable, we use the
+                        * overloaded field budget_timeout to
+                        * determine for how long the queue has not
+                        * received service, i.e., how much time has
+                        * elapsed since the queue expired. However,
+                        * this is a little imprecise, because
+                        * budget_timeout is set to jiffies if bfqq
+                        * not only expires, but also remains with no
+                        * request.
+                        */
+                       if (time_after(bfqq->budget_timeout,
+                                      bfqq->last_wr_start_finish))
+                               bfqq->last_wr_start_finish +=
+                                       jiffies - bfqq->budget_timeout;
+                       else
+                               bfqq->last_wr_start_finish = jiffies;
+               }
+
+               /* budget_timeout is overwritten only after being used above */
+               bfq_set_budget_timeout(bfqd, bfqq);
+               bfq_log_bfqq(bfqd, bfqq,
+                            "set_in_service_queue, cur-budget = %d",
+                            bfqq->entity.budget);
+       }
+
+       bfqd->in_service_queue = bfqq;
+}
+
+/*
+ * Choose the next queue to serve through bfq_get_next_queue(), make
+ * it the in-service queue and return it.
+ */
+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
+{
+       struct bfq_queue *next_queue;
+
+       next_queue = bfq_get_next_queue(bfqd);
+       __bfq_set_in_service_queue(bfqd, next_queue);
+
+       return next_queue;
+}
+
+/*
+ * Flag the in-service queue as waiting for a request and start the
+ * idle-slice timer of bfqd, recording when idling starts.
+ */
+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
+{
+       struct bfq_queue *in_serv = bfqd->in_service_queue;
+       u32 idle_ns;
+
+       bfq_mark_bfqq_wait_request(in_serv);
+
+       /*
+        * We don't want to idle for seeks, but we do want to allow
+        * fair distribution of slice time for a process doing
+        * back-to-back seeks. So allow a little bit of time for it to
+        * submit a new rq.
+        */
+       idle_ns = bfqd->bfq_slice_idle;
+
+       /*
+        * A seeky queue gets only the minimum idle time, unless the
+        * queue is being weight-raised or the scenario is asymmetric.
+        * In the latter two cases a long idling is preserved, because
+        * it is needed for guaranteeing to the queue its reserved
+        * share of the throughput (in particular, it is needed if the
+        * queue has a higher weight than some other queue).
+        */
+       if (BFQQ_SEEKY(in_serv) && in_serv->wr_coeff == 1 &&
+           bfq_symmetric_scenario(bfqd))
+               idle_ns = min_t(u64, idle_ns, BFQ_MIN_TT);
+
+       bfqd->last_idling_start = ktime_get();
+       hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(idle_ns),
+                     HRTIMER_MODE_REL);
+       bfqg_stats_set_start_idle_time(bfqq_group(in_serv));
+}
+
+/*
+ * In autotuning mode, max_budget is dynamically recomputed as the
+ * amount of sectors transferred in timeout at the estimated peak
+ * rate. This lets BFQ use a full timeslice with a full budget, even
+ * if the in-service queue is served at peak rate, which maximises
+ * throughput with sequential workloads.
+ *
+ * The product peak_rate * USEC_PER_MSEC * timeout-in-ms is computed
+ * in 64 bits and then scaled down by BFQ_RATE_SHIFT bits.
+ */
+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
+{
+       u64 scaled_budget = (u64)bfqd->peak_rate * USEC_PER_MSEC;
+
+       scaled_budget *= jiffies_to_msecs(bfqd->bfq_timeout);
+
+       return scaled_budget >> BFQ_RATE_SHIFT;
+}
+
+/*
+ * Update parameters related to throughput and responsiveness, as a
+ * function of the estimated peak rate. See comments on
+ * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
+ *
+ * In particular:
+ * - bfq_max_budget is recomputed, unless the user has set a maximum
+ *   budget (bfq_user_max_budget != 0);
+ * - the device is reclassified as slow or fast, and RT_prod updated
+ *   accordingly, when peak_rate crosses the threshold defined for the
+ *   device type (rotational or non-rotational) in the direction
+ *   opposite to the current classification.
+ */
+static void update_thr_responsiveness_params(struct bfq_data *bfqd)
+{
+       /* 0 for rotational devices, non-zero otherwise: index for tables */
+       int dev_type = blk_queue_nonrot(bfqd->queue);
+
+       if (bfqd->bfq_user_max_budget == 0)
+               bfqd->bfq_max_budget =
+                       bfq_calc_max_budget(bfqd);
+
+       if (bfqd->device_speed == BFQ_BFQD_FAST &&
+           bfqd->peak_rate < device_speed_thresh[dev_type]) {
+               bfqd->device_speed = BFQ_BFQD_SLOW;
+               bfqd->RT_prod = R_slow[dev_type] *
+                       T_slow[dev_type];
+       } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
+                  bfqd->peak_rate > device_speed_thresh[dev_type]) {
+               bfqd->device_speed = BFQ_BFQD_FAST;
+               bfqd->RT_prod = R_fast[dev_type] *
+                       T_fast[dev_type];
+       }
+
+       /* Rates are converted to sectors/sec for logging purposes */
+       bfq_log(bfqd,
+"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu sects/sec",
+               dev_type == 0 ? "ROT" : "NONROT",
+               bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
+               bfqd->device_speed == BFQ_BFQD_FAST ?
+               (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
+               (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
+               (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
+               BFQ_RATE_SHIFT);
+}
+
+/*
+ * Restart the sampling used for estimating the peak rate. With a
+ * non-NULL rq, rq is accounted as the first dispatch of the new
+ * observation interval; with a NULL rq, only the number of samples is
+ * zeroed, and the full re-initialization is left to the next dispatch.
+ */
+static void bfq_reset_rate_computation(struct bfq_data *bfqd,
+                                      struct request *rq)
+{
+       if (!rq) {
+               /* no new rq dispatched: full re-init on next dispatch */
+               bfqd->peak_rate_samples = 0;
+       } else {
+               /* new rq dispatched now: rq is the first sample */
+               bfqd->first_dispatch = ktime_get_ns();
+               bfqd->last_dispatch = bfqd->first_dispatch;
+               bfqd->peak_rate_samples = 1;
+               bfqd->sequential_samples = 0;
+               bfqd->last_rq_max_size = blk_rq_sectors(rq);
+               bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size;
+       }
+
+       bfq_log(bfqd,
+               "reset_rate_computation at end, sample %u/%u tot_sects %llu",
+               bfqd->peak_rate_samples, bfqd->sequential_samples,
+               bfqd->tot_sectors_dispatched);
+}
+
+/*
+ * Close the current observation interval. If the interval contains
+ * enough samples and has lasted long enough, compute the rate
+ * measured over it and fold that rate into bfqd->peak_rate through a
+ * low-pass filter. In any case, restart the sampling state by
+ * invoking bfq_reset_rate_computation() with rq.
+ */
+static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
+{
+       u32 rate, weight, divisor;
+
+       /*
+        * For the convergence property to hold (see comments on
+        * bfq_update_peak_rate()) and for the assessment to be
+        * reliable, a minimum number of samples must be present, and
+        * a minimum amount of time must have elapsed. If not so, do
+        * not compute new rate. Just reset parameters, to get ready
+        * for a new evaluation attempt.
+        */
+       if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
+           bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL)
+               goto reset_computation;
+
+       /*
+        * If a new request completion has occurred after last
+        * dispatch, then, to approximate the rate at which requests
+        * have been served by the device, it is more precise to
+        * extend the observation interval to the last completion.
+        *
+        * Do it only if the last completion falls after the start of
+        * the interval: otherwise the unsigned difference below would
+        * wrap around to a huge value, which max_t() would then pick
+        * as the duration of the interval.
+        * NOTE(review): assumes both fields are unsigned ns
+        * timestamps taken from the same clock - confirm.
+        */
+       if (bfqd->last_completion > bfqd->first_dispatch)
+               bfqd->delta_from_first =
+                       max_t(u64, bfqd->delta_from_first,
+                             bfqd->last_completion - bfqd->first_dispatch);
+
+       /*
+        * Rate computed in sects/usec, and not sects/nsec, for
+        * precision issues.
+        */
+       rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
+                       div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
+
+       /*
+        * Peak rate not updated if:
+        * - the percentage of sequential dispatches is below 3/4 of the
+        *   total, and rate is below the current estimated peak rate
+        * - rate is unreasonably high (> 20M sectors/sec)
+        */
+       if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
+            rate <= bfqd->peak_rate) ||
+               rate > 20<<BFQ_RATE_SHIFT)
+               goto reset_computation;
+
+       /*
+        * We have to update the peak rate, at last! To this purpose,
+        * we use a low-pass filter. We compute the smoothing constant
+        * of the filter as a function of the 'weight' of the new
+        * measured rate.
+        *
+        * As can be seen in next formulas, we define this weight as a
+        * quantity proportional to how sequential the workload is,
+        * and to how long the observation time interval is.
+        *
+        * The weight runs from 0 to 8. The maximum value of the
+        * weight, 8, yields the minimum value for the smoothing
+        * constant. At this minimum value for the smoothing constant,
+        * the measured rate contributes for half of the next value of
+        * the estimated peak rate.
+        *
+        * So, the first step is to compute the weight as a function
+        * of how sequential the workload is. Note that the weight
+        * cannot reach 9, because bfqd->sequential_samples cannot
+        * become equal to bfqd->peak_rate_samples, which, in its
+        * turn, holds true because bfqd->sequential_samples is not
+        * incremented for the first sample.
+        */
+       weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
+
+       /*
+        * Second step: further refine the weight as a function of the
+        * duration of the observation interval.
+        */
+       weight = min_t(u32, 8,
+                      div_u64(weight * bfqd->delta_from_first,
+                              BFQ_RATE_REF_INTERVAL));
+
+       /*
+        * Divisor ranging from 10, for minimum weight, to 2, for
+        * maximum weight.
+        */
+       divisor = 10 - weight;
+
+       /*
+        * Finally, update peak rate:
+        *
+        * peak_rate = peak_rate * (divisor-1) / divisor  +  rate / divisor
+        */
+       bfqd->peak_rate *= divisor-1;
+       bfqd->peak_rate /= divisor;
+       rate /= divisor; /* smoothing constant alpha = 1/divisor */
+
+       bfqd->peak_rate += rate;
+
+       /*
+        * Because of the truncation in the two integer divisions
+        * above, very low measured rates can bring peak_rate down to
+        * 0. Never let it drop below 1, so that a division by
+        * peak_rate cannot turn into a division by zero.
+        * NOTE(review): the divisions by peak_rate are outside this
+        * function (weight-raising duration computation) - confirm.
+        */
+       bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate);
+
+       update_thr_responsiveness_params(bfqd);
+
+reset_computation:
+       bfq_reset_rate_computation(bfqd, rq);
+}
+
+/*
+ * Update the read/write peak rate (the main quantity used for
+ * auto-tuning, see update_thr_responsiveness_params()).
+ *
+ * It is not trivial to estimate the peak rate (correctly): because of
+ * the presence of sw and hw queues between the scheduler and the
+ * device components that finally serve I/O requests, it is hard to
+ * say exactly when a given dispatched request is served inside the
+ * device, and for how long. As a consequence, it is hard to know
+ * precisely at what rate a given set of requests is actually served
+ * by the device.
+ *
+ * On the opposite end, the dispatch time of any request is trivially
+ * available, and, from this piece of information, the "dispatch rate"
+ * of requests can be immediately computed. So, the idea in the next
+ * function is to use what is known, namely request dispatch times
+ * (plus, when useful, request completion times), to estimate what is
+ * unknown, namely in-device request service rate.
+ *
+ * The main issue is that, because of the above facts, the rate at
+ * which a certain set of requests is dispatched over a certain time
+ * interval can vary greatly with respect to the rate at which the
+ * same requests are then served. But, since the size of any
+ * intermediate queue is limited, and the service scheme is lossless
+ * (no request is silently dropped), the following obvious convergence
+ * property holds: the number of requests dispatched MUST become
+ * closer and closer to the number of requests completed as the
+ * observation interval grows. This is the key property used in
+ * the next function to estimate the peak service rate as a function
+ * of the observed dispatch rate. The function assumes to be invoked
+ * on every request dispatch.
+ *
+ * In more detail, on each invocation one of the following happens:
+ * - no sample is present yet: the sampling state is re-initialized
+ *   with rq as first sample;
+ * - the last dispatch is more than 100 ms old and no request is in
+ *   the driver: the current interval is closed, without counting rq
+ *   in it, through bfq_update_rate_reset(), which also restarts the
+ *   sampling with rq;
+ * - otherwise rq is added to the current samples, and the interval
+ *   is closed only once it has lasted at least
+ *   BFQ_RATE_REF_INTERVAL.
+ * In all cases, the end position of rq and the time of this dispatch
+ * (sampled once, on entry) are finally recorded in bfqd.
+ */
+static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
+{
+       u64 now_ns = ktime_get_ns();
+
+       if (bfqd->peak_rate_samples == 0) { /* first dispatch */
+               bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
+                       bfqd->peak_rate_samples);
+               bfq_reset_rate_computation(bfqd, rq);
+               goto update_last_values; /* will add one sample */
+       }
+
+       /*
+        * Device idle for very long: the observation interval lasting
+        * up to this dispatch cannot be a valid observation interval
+        * for computing a new peak rate (similarly to the late-
+        * completion event in bfq_completed_request()). Go to
+        * update_rate_and_reset to have the following three steps
+        * taken:
+        * - close the observation interval at the last (previous)
+        *   request dispatch or completion
+        * - compute rate, if possible, for that observation interval
+        * - start a new observation interval with this dispatch
+        */
+       if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
+           bfqd->rq_in_driver == 0)
+               goto update_rate_and_reset;
+
+       /*
+        * Update sampling information. The dispatch is counted as
+        * sequential if the value returned by get_sdist() for the
+        * end position of the previous dispatch and rq is below
+        * BFQQ_SEEK_THR and, in addition, either some request is
+        * still in the driver or the last completion is less than
+        * BFQ_MIN_TT old.
+        */
+       bfqd->peak_rate_samples++;
+
+       if ((bfqd->rq_in_driver > 0 ||
+               now_ns - bfqd->last_completion < BFQ_MIN_TT)
+            && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
+               bfqd->sequential_samples++;
+
+       bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
+
+       /* Reset max observed rq size every 32 dispatches */
+       if (likely(bfqd->peak_rate_samples % 32))
+               bfqd->last_rq_max_size =
+                       max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
+       else
+               bfqd->last_rq_max_size = blk_rq_sectors(rq);
+
+       bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
+
+       /* Target observation interval not yet reached, go on sampling */
+       if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
+               goto update_last_values;
+
+update_rate_and_reset:
+       bfq_update_rate_reset(bfqd, rq);
+update_last_values:
+       bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
+       bfqd->last_dispatch = now_ns;
+}
+
+/*
+ * Account rq as dispatched, feed it to the peak-rate estimator and
+ * remove it from the internal lists.
+ */
+static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
+{
+       struct bfq_data *bfqd = q->elevator->elevator_data;
+       struct bfq_queue *bfqq = RQ_BFQQ(rq);
+
+       /*
+        * Strictly speaking, the counter of dispatched requests
+        * should be bumped only after rq has been removed from the
+        * queue and dispatched. Bumping it ahead of
+        * bfq_remove_request() creates a temporary inconsistency,
+        * but pays off in efficiency: if this dispatch happens for a
+        * bfqq that is not in service, the anticipated increment
+        * avoids that two counters related to bfqq->dispatched are
+        * first uselessly decremented, and then incremented again
+        * as soon as the (new) value of bfqq->dispatched is taken
+        * into account.
+        */
+       bfqq->dispatched += 1;
+       bfq_update_peak_rate(bfqd, rq);
+
+       bfq_remove_request(q, rq);
+}
+
+/*
+ * Take bfqq out of service: requeue it if it still has requests in
+ * its sort_list, remove it from the busy queues otherwise, then
+ * reset the in-service state of bfqd.
+ */
+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+       /*
+        * A bfqq shared among several processes stays merged only as
+        * long as those processes keep issuing I/O within the mean
+        * seek distance. If the queue has become seeky, flag it, as
+        * it may be time to break the queues apart again.
+        */
+       if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
+               bfq_mark_bfqq_split_coop(bfqq);
+
+       if (!RB_EMPTY_ROOT(&bfqq->sort_list)) {
+               bfq_requeue_bfqq(bfqd, bfqq);
+               /*
+                * Re-sort the priority tree of potential close
+                * cooperators.
+                */
+               bfq_pos_tree_add_move(bfqd, bfqq);
+       } else {
+               /*
+                * No backlog. If there is no outstanding request
+                * either, overload the budget_timeout field to
+                * remember the time at which this happened; the
+                * weight-raising mechanism uses this value.
+                */
+               if (!bfqq->dispatched)
+                       bfqq->budget_timeout = jiffies;
+
+               bfq_del_bfqq_busy(bfqd, bfqq, true);
+       }
+
+       /*
+        * The next function resets all in-service entities as no
+        * more in service, so every in-service entity must already
+        * have been properly deactivated or requeued at this point.
+        */
+       __bfq_bfqd_reset_in_service(bfqd);
+}
+
+/**
+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
+ * @bfqd: device data.
+ * @bfqq: queue to update.
+ * @reason: reason for expiration.
+ *
+ * Handle the feedback on @bfqq budget at queue expiration.
+ * See the body for detailed comments.
+ *
+ * The starting point is the current max_budget of @bfqq, or twice the
+ * minimum budget if @bfqq is weight-raised. The resulting value is
+ * stored in @bfqq->max_budget; it is further capped to
+ * @bfqd->bfq_max_budget if at least bfq_stats_min_budgets budgets
+ * have been assigned and @bfqd->bfq_user_max_budget is zero. If
+ * @bfqq still has a next request, @bfqq->entity.budget is updated
+ * too.
+ *
+ * For a sync, non-weight-raised queue, a @reason not handled by the
+ * switch in the body makes the function return with no update at
+ * all.
+ */
+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
+                                    struct bfq_queue *bfqq,
+                                    enum bfqq_expiration reason)
+{
+       struct request *next_rq;
+       int budget, min_budget;
+
+       min_budget = bfq_min_budget(bfqd);
+
+       if (bfqq->wr_coeff == 1)
+               budget = bfqq->max_budget;
+       else /*
+             * Use a constant, low budget for weight-raised queues,
+             * to help achieve a low latency. Keep it slightly higher
+             * than the minimum possible budget, to cause a little
+             * bit fewer expirations.
+             */
+               budget = 2 * min_budget;
+
+       bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
+               bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
+       bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
+               budget, bfq_min_budget(bfqd));
+       bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
+               bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
+
+       if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
+               switch (reason) {
+               /*
+                * Caveat: in all the following cases we trade latency
+                * for throughput.
+                */
+               case BFQQE_TOO_IDLE:
+                       /*
+                        * This is the only case where we may reduce
+                        * the budget: if there is no request of the
+                        * process still waiting for completion, then
+                        * we assume (tentatively) that the timer has
+                        * expired because the batch of requests of
+                        * the process could have been served with a
+                        * smaller budget.  Hence, betting that
+                        * process will behave in the same way when it
+                        * becomes backlogged again, we reduce its
+                        * next budget.  As long as we guess right,
+                        * this budget cut reduces the latency
+                        * experienced by the process.
+                        *
+                        * However, if there are still outstanding
+                        * requests, then the process may have not yet
+                        * issued its next request just because it is
+                        * still waiting for the completion of some of
+                        * the still outstanding ones.  So in this
+                        * subcase we do not reduce its budget, on the
+                        * contrary we increase it to possibly boost
+                        * the throughput, as discussed in the
+                        * comments to the BUDGET_TIMEOUT case.
+                        *
+                        * The reduction, when applied, amounts to
+                        * four times the minimum budget, and never
+                        * brings the budget below the minimum one.
+                        */
+                       if (bfqq->dispatched > 0) /* still outstanding reqs */
+                               budget = min(budget * 2, bfqd->bfq_max_budget);
+                       else {
+                               if (budget > 5 * min_budget)
+                                       budget -= 4 * min_budget;
+                               else
+                                       budget = min_budget;
+                       }
+                       break;
+               case BFQQE_BUDGET_TIMEOUT:
+                       /*
+                        * We double the budget here because it gives
+                        * the chance to boost the throughput if this
+                        * is not a seeky process (and has bumped into
+                        * this timeout because of, e.g., ZBR).
+                        */
+                       budget = min(budget * 2, bfqd->bfq_max_budget);
+                       break;
+               case BFQQE_BUDGET_EXHAUSTED:
+                       /*
+                        * The process still has backlog, and did not
+                        * let either the budget timeout or the disk
+                        * idling timeout expire. Hence it is not
+                        * seeky, has a short thinktime and may be
+                        * happy with a higher budget too. So
+                        * definitely increase the budget of this good
+                        * candidate to boost the disk throughput.
+                        */
+                       budget = min(budget * 4, bfqd->bfq_max_budget);
+                       break;
+               case BFQQE_NO_MORE_REQUESTS:
+                       /*
+                        * For queues that expire for this reason, it
+                        * is particularly important to keep the
+                        * budget close to the actual service they
+                        * need. Doing so reduces the timestamp
+                        * misalignment problem described in the
+                        * comments in the body of
+                        * __bfq_activate_entity. In fact, suppose
+                        * that a queue systematically expires for
+                        * BFQQE_NO_MORE_REQUESTS and presents a
+                        * new request in time to enjoy timestamp
+                        * back-shifting. The larger the budget of the
+                        * queue is with respect to the service the
+                        * queue actually requests in each service
+                        * slot, the more times the queue can be
+                        * reactivated with the same virtual finish
+                        * time. It follows that, even if this finish
+                        * time is pushed to the system virtual time
+                        * to reduce the consequent timestamp
+                        * misalignment, the queue unjustly enjoys for
+                        * many re-activations a lower finish time
+                        * than all newly activated queues.
+                        *
+                        * The service needed by bfqq is measured
+                        * quite precisely by bfqq->entity.service.
+                        * Since bfqq does not enjoy device idling,
+                        * bfqq->entity.service is equal to the number
+                        * of sectors that the process associated with
+                        * bfqq requested to read/write before waiting
+                        * for request completions, or blocking for
+                        * other reasons.
+                        */
+                       budget = max_t(int, bfqq->entity.service, min_budget);
+                       break;
+               default:
+                       return;
+               }
+       } else if (!bfq_bfqq_sync(bfqq)) {
+               /*
+                * Async queues get always the maximum possible
+                * budget, as for them we do not care about latency
+                * (in addition, their ability to dispatch is limited
+                * by the charging factor).
+                */
+               budget = bfqd->bfq_max_budget;
+       }
+
+       bfqq->max_budget = budget;
+
+       if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
+           !bfqd->bfq_user_max_budget)
+               bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
+
+       /*
+        * If there is still backlog, then assign a new budget, making
+        * sure that it is large enough for the next request.  Since
+        * the finish time of bfqq must be kept in sync with the
+        * budget, be sure to call __bfq_bfqq_expire() *after* this
+        * update.
+        *
+        * If there is no backlog, then no need to update the budget;
+        * it will be updated on the arrival of a new request.
+        */
+       next_rq = bfqq->next_rq;
+       if (next_rq)
+               bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
+                                           bfq_serv_to_charge(next_rq, bfqq));
+
+       bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
+                       next_rq ? blk_rq_sectors(next_rq) : 0,
+                       bfqq->entity.budget);
+}
+
+/*
+ * Return true if the process associated with bfqq is "slow". The slow
+ * flag is used, in addition to the budget timeout, to reduce the
+ * amount of service provided to seeky processes, and thus reduce
+ * their chances to lower the throughput. More details in the comments
+ * on the function bfq_bfqq_expire().
+ *
+ * An important observation is in order: as discussed in the comments
+ * on the function bfq_update_peak_rate(), with devices with internal
+ * queues, it is hard, if at all possible, to know when and for how long
+ * an I/O request is processed by the device (apart from the trivial
+ * I/O pattern where a new request is dispatched only after the
+ * previous one has been completed). This makes it hard to evaluate
+ * the real rate at which the I/O requests of each bfq_queue are
+ * served.  In fact, for an I/O scheduler like BFQ, serving a
+ * bfq_queue means just dispatching its requests during its service
+ * slot (i.e., until the budget of the queue is exhausted, or the
+ * queue remains idle, or, finally, a timeout fires). But, during the
+ * service slot of a bfq_queue, around 100 ms at most, the device may
+ * be even still processing requests of bfq_queues served in previous
+ * service slots. On the opposite end, the requests of the in-service
+ * bfq_queue may be completed after the service slot of the queue
+ * finishes.
+ *
+ * Anyway, unless more sophisticated solutions are used
+ * (where possible), the sum of the sizes of the requests dispatched
+ * during the service slot of a bfq_queue is probably the only
+ * approximation available for the service received by the bfq_queue
+ * during its service slot. And this sum is the quantity used in this
+ * function to evaluate the I/O speed of a process.
+ */
+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+                                bool compensate, enum bfqq_expiration reason,
+                                unsigned long *delta_ms)
+{
+       /* verdict used when the interval is too short: seekyness */
+       bool slow = BFQQ_SEEKY(bfqq);
+       ktime_t slot_end, elapsed;
+       u32 elapsed_us;
+
+       /*
+        * Async queues are never deemed slow. Note that *delta_ms is
+        * not written in this case (and that reason is not used by
+        * this function).
+        */
+       if (!bfq_bfqq_sync(bfqq))
+               return false;
+
+       /*
+        * With compensate set, the interval ends at the start of the
+        * last idling, and not at the current time.
+        */
+       slot_end = compensate ? bfqd->last_idling_start : ktime_get();
+       elapsed = ktime_sub(slot_end, bfqd->last_budget_start);
+       elapsed_us = ktime_to_us(elapsed);
+
+       /* intervals shorter than 1 ms are not used for measuring */
+       if (elapsed_us < 1000) {
+               if (blk_queue_nonrot(bfqd->queue))
+                       /*
+                        * same worst-case guarantees as idling for
+                        * seeky queues
+                        */
+                       *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
+               else /* charge one seek at least */
+                       *delta_ms = bfq_slice_idle / NSEC_PER_MSEC;
+
+               return slow;
+       }
+
+       *delta_ms = elapsed_us / USEC_PER_MSEC;
+
+       /*
+        * Only long (> 20ms) intervals are used, so as to filter out
+        * excessive spikes in the estimation of the service rate.
+        *
+        * Caveat for rotational devices: a process doing I/O in the
+        * slower zones of the disk tends to be slow(er) even if it
+        * is not seeky, while the estimated peak rate is likely to
+        * be an average over the disk surface. So, to not be too
+        * harsh with unlucky processes, deem a process slow only if
+        * its rate has been lower than half of the estimated peak
+        * rate.
+        */
+       if (elapsed_us > 20000)
+               slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
+
+       bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);
+
+       return slow;
+}
+
+/*
+ * To be deemed as soft real-time, an application must meet two
+ * requirements. First, the application must not require an average
+ * bandwidth higher than the approximate bandwidth required to playback or
+ * record a compressed high-definition video.
+ * The next function is invoked on the completion of the last request of a
+ * batch, to compute the next-start time instant, soft_rt_next_start, such
+ * that, if the next request of the application does not arrive before
+ * soft_rt_next_start, then the above requirement on the bandwidth is met.
+ *
+ * The second requirement is that the request pattern of the application is
+ * isochronous, i.e., that, after issuing a request or a batch of requests,
+ * the application stops issuing new requests until all its pending requests
+ * have been completed. After that, the application may issue a new batch,
+ * and so on.
+ * For this reason the next function is invoked to compute
+ * soft_rt_next_start only for applications that meet this requirement,
+ * whereas soft_rt_next_start is set to infinity for applications that do
+ * not.
+ *
+ * Unfortunately, even a greedy application may happen to behave in an
+ * isochronous way if the CPU load is high. In fact, the application may
+ * stop issuing requests while the CPUs are busy serving other processes,
+ * then restart, then stop again for a while, and so on. In addition, if
+ * the disk achieves a low enough throughput with the request pattern
+ * issued by the application (e.g., because the request pattern is random
+ * and/or the device is slow), then the application may meet the above
+ * bandwidth requirement too. To prevent such a greedy application from
+ * being deemed soft real-time, a further rule is used in the computation of
+ * soft_rt_next_start: soft_rt_next_start must be higher than the current
+ * time plus the maximum time for which the arrival of a request is waited
+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
+ * This filters out greedy applications, as the latter issue instead their
+ * next request as soon as possible after the last one has been completed
+ * (in contrast, when a batch of requests is completed, a soft real-time
+ * application spends some time processing data).
+ *
+ * Unfortunately, the last filter may easily generate false positives if
+ * only bfqd->bfq_slice_idle is used as a reference time interval and one
+ * or both the following cases occur:
+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
+ *    than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
+ *    HZ=100.
+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
+ *    for a while, then suddenly 'jump' by several units to recover the lost
+ *    increments. This seems to happen, e.g., inside virtual machines.
+ * To address this issue, we do not use as a reference time interval just
+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
+ * particular we add the minimum number of jiffies for which the filter
+ * seems to be quite precise also in embedded systems and KVM/QEMU virtual
+ * machines.
+ */
+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
+                                               struct bfq_queue *bfqq)
+{
+       /*
+        * Bandwidth bound: instant before which a new request would
+        * exceed the maximum soft real-time rate, given the service
+        * received since bfqq->last_idle_bklogged.
+        */
+       unsigned long bw_bound = bfqq->last_idle_bklogged +
+               HZ * bfqq->service_from_backlogged /
+               bfqd->bfq_wr_max_softrt_rate;
+       /*
+        * Greedy-application filter: current time, plus the idling
+        * slice, plus a few jiffies of tolerance.
+        */
+       unsigned long idle_bound = jiffies +
+               nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4;
+
+       /* both requirements must hold: take the later instant */
+       return max(bw_bound, idle_bound);
+}
+
+/*
+ * Compute the time instant lying MAX_JIFFY_OFFSET jiffies after the
+ * current one, i.e., the farthest future time instant according to
+ * jiffies macros.
+ */
+static unsigned long bfq_greatest_from_now(void)
+{
+       unsigned long now = jiffies;
+
+       return now + MAX_JIFFY_OFFSET;
+}
+
+/*
+ * Compute the time instant lying MAX_JIFFY_OFFSET jiffies before the
+ * current one, i.e., the farthest past time instant according to
+ * jiffies macros.
+ */
+static unsigned long bfq_smallest_from_now(void)
+{
+       unsigned long now = jiffies;
+
+       return now - MAX_JIFFY_OFFSET;
+}
+
+/**
+ * bfq_bfqq_expire - expire a queue.
+ * @bfqd: device owning the queue.
+ * @bfqq: the queue to expire.
+ * @compensate: if true, compensate for the time spent idling.
+ * @reason: the reason causing the expiration.
+ *
+ * If the process associated with bfqq does slow I/O (e.g., because it
+ * issues random requests), we charge bfqq with the time it has been
+ * in service instead of the service it has received (see
+ * bfq_bfqq_charge_time for details on how this goal is achieved). As
+ * a consequence, bfqq will typically get higher timestamps upon
+ * reactivation, and hence it will be rescheduled as if it had
+ * received more service than what it has actually received. In the
+ * end, bfqq receives less service in proportion to how slowly its
+ * associated process consumes its budgets (and hence how seriously it
+ * tends to lower the throughput). In addition, this time-charging
+ * strategy guarantees time fairness among slow processes. In
+ * contrast, if the process associated with bfqq is not slow, we
+ * charge bfqq exactly with the service it has received.
+ *
+ * Charging time to the first type of queues and the exact service to
+ * the other has the effect of using the WF2Q+ policy to schedule the
+ * former on a timeslice basis, without violating service domain
+ * guarantees among the latter.
+ *
+ * Besides charging, this function updates the soft real-time and
+ * weight-raising bookkeeping of @bfqq, recomputes its budget with
+ * __bfq_bfqq_recalc_budget() and finally takes it out of service
+ * with __bfq_bfqq_expire().
+ *
+ * Note that the local variable delta keeps its initial value, 0, if
+ * @bfqq is async, because bfq_bfqq_is_slow() returns before writing
+ * it in that case.
+ */
+void bfq_bfqq_expire(struct bfq_data *bfqd,
+                    struct bfq_queue *bfqq,
+                    bool compensate,
+                    enum bfqq_expiration reason)
+{
+       bool slow;
+       unsigned long delta = 0;
+       struct bfq_entity *entity = &bfqq->entity;
+       int ref;
+
+       /*
+        * Check whether the process is slow (see bfq_bfqq_is_slow).
+        */
+       slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
+
+       /*
+        * Increase service_from_backlogged before next statement,
+        * because the possible next invocation of
+        * bfq_bfqq_charge_time would likely inflate
+        * entity->service. In contrast, service_from_backlogged must
+        * contain real service, to enable the soft real-time
+        * heuristic to correctly compute the bandwidth consumed by
+        * bfqq.
+        */
+       bfqq->service_from_backlogged += entity->service;
+
+       /*
+        * As above explained, charge slow (typically seeky) and
+        * timed-out queues with the time and not the service
+        * received, to favor sequential workloads.
+        *
+        * Processes doing I/O in the slower disk zones will tend to
+        * be slow(er) even if not seeky. Therefore, since the
+        * estimated peak rate is actually an average over the disk
+        * surface, these processes may timeout just for bad luck. To
+        * avoid punishing them, do not charge time to processes that
+        * succeeded in consuming at least 2/3 of their budget. This
+        * allows BFQ to preserve enough elasticity to still perform
+        * bandwidth, and not time, distribution with little unlucky
+        * or quasi-sequential processes.
+        *
+        * Time is charged only to non-weight-raised queues
+        * (wr_coeff == 1).
+        */
+       if (bfqq->wr_coeff == 1 &&
+           (slow ||
+            (reason == BFQQE_BUDGET_TIMEOUT &&
+             bfq_bfqq_budget_left(bfqq) >=  entity->budget / 3)))
+               bfq_bfqq_charge_time(bfqd, bfqq, delta);
+
+       if (reason == BFQQE_TOO_IDLE &&
+           entity->service <= 2 * entity->budget / 10)
+               bfq_clear_bfqq_IO_bound(bfqq);
+
+       if (bfqd->low_latency && bfqq->wr_coeff == 1)
+               bfqq->last_wr_start_finish = jiffies;
+
+       if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
+           RB_EMPTY_ROOT(&bfqq->sort_list)) {
+               /*
+                * If we get here, and there are no outstanding
+                * requests, then the request pattern is isochronous
+                * (see the comments on the function
+                * bfq_bfqq_softrt_next_start()). Thus we can compute
+                * soft_rt_next_start. If, instead, the queue still
+                * has outstanding requests, then we have to wait for
+                * the completion of all the outstanding requests to
+                * discover whether the request pattern is actually
+                * isochronous.
+                */
+               if (bfqq->dispatched == 0)
+                       bfqq->soft_rt_next_start =
+                               bfq_bfqq_softrt_next_start(bfqd, bfqq);
+               else {
+                       /*
+                        * The application is still waiting for the
+                        * completion of one or more requests:
+                        * prevent it from possibly being incorrectly
+                        * deemed as soft real-time by setting its
+                        * soft_rt_next_start to infinity. In fact,
+                        * without this assignment, the application
+                        * would be incorrectly deemed as soft
+                        * real-time if:
+                        * 1) it issued a new request before the
+                        *    completion of all its in-flight
+                        *    requests, and
+                        * 2) at that time, its soft_rt_next_start
+                        *    happened to be in the past.
+                        */
+                       bfqq->soft_rt_next_start =
+                               bfq_greatest_from_now();
+                       /*
+                        * Schedule an update of soft_rt_next_start to when
+                        * the task may be discovered to be isochronous.
+                        */
+                       bfq_mark_bfqq_softrt_update(bfqq);
+               }
+       }
+
+       bfq_log_bfqq(bfqd, bfqq,
+               "expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
+               slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
+
+       /*
+        * Increase, decrease or leave budget unchanged according to
+        * reason.
+        *
+        * The reference counter of bfqq is sampled after the budget
+        * update and before __bfq_bfqq_expire(); the sampled value
+        * is the one checked below.
+        * NOTE(review): presumably because __bfq_bfqq_expire() may
+        * drop references to bfqq - confirm.
+        */
+       __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
+       ref = bfqq->ref;
+       __bfq_bfqq_expire(bfqd, bfqq);
+
+       /*
+        * mark bfqq as waiting a request only if a bic still points to it
+        * (sampled ref > 1), bfqq is not busy, and the expiration is
+        * due neither to a budget timeout nor to budget exhaustion
+        */
+       if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
+           reason != BFQQE_BUDGET_TIMEOUT &&
+           reason != BFQQE_BUDGET_EXHAUSTED)
+               bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
+}
+
+/*
+ * Tell whether the budget timeout of bfqq has been reached. No
+ * dedicated timer is armed for it: the deadline stored in
+ * bfqq->budget_timeout is just tested on request arrivals and
+ * completions, as well as on idle timer expirations.
+ */
+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
+{
+       bool deadline_reached;
+
+       deadline_reached = time_is_before_eq_jiffies(bfqq->budget_timeout);
+
+       return deadline_reached;
+}
+
+/*
+ * Tell whether bfqq may be expired because its budget timeout elapsed.
+ *
+ * Expiring a queue while the device is being idled for it (i.e.,
+ * while the queue is waiting for the arrival of a new request) may
+ * cause the timestamp misalignment problem described in the body of
+ * the function __bfq_activate_entity. So a timed-out queue is
+ * reported as expirable only if it is not in that waiting state, or
+ * if at least one third of its budget is still left, i.e., the queue
+ * is slow enough to deserve being kicked off to preserve a high
+ * throughput.
+ */
+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
+{
+       bool expirable;
+
+       bfq_log_bfqq(bfqq->bfqd, bfqq,
+                    "may_budget_timeout: wait_request %d left %d timeout %d",
+                    bfq_bfqq_wait_request(bfqq),
+                    bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
+                    bfq_bfqq_budget_timeout(bfqq));
+
+       /* First the waiting state and the budget left ... */
+       expirable = !bfq_bfqq_wait_request(bfqq) ||
+                   bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3;
+       if (!expirable)
+               return false;
+
+       /* ... then, only if those allow it, the timeout itself */
+       return bfq_bfqq_budget_timeout(bfqq);
+}
+
+/*
+ * For a queue that becomes empty, device idling is allowed only if
+ * this function returns true for the queue. As a consequence, since
+ * device idling plays a critical role in both throughput boosting and
+ * service guarantees, the return value of this function plays a
+ * critical role in both these aspects as well.
+ *
+ * In a nutshell, this function returns true only if idling is
+ * beneficial for throughput or, even if detrimental for throughput,
+ * idling is however necessary to preserve service guarantees (low
+ * latency, desired throughput distribution, ...). In particular, on
+ * NCQ-capable devices, this function tries to return false, so as to
+ * help keep the drives' internal queues full, whenever this helps the
+ * device boost the throughput without causing any service-guarantee
+ * issue.
+ *
+ * In more detail, the return value of this function is obtained by,
+ * first, computing a number of boolean variables that take into
+ * account throughput and service-guarantee issues, and, then,
+ * combining these variables in a logical expression. Most of the
+ * issues taken into account are not trivial. We discuss these issues
+ * individually while introducing the variables.
+ *
+ * The only exception is when bfqd->strict_guarantees is set: in that
+ * case the function returns true right away, without evaluating any
+ * of these variables.
+ */
+static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
+{
+       struct bfq_data *bfqd = bfqq->bfqd;
+       bool idling_boosts_thr, idling_boosts_thr_without_issues,
+               idling_needed_for_service_guarantees,
+               asymmetric_scenario;
+
+       if (bfqd->strict_guarantees)
+               return true;
+
+       /*
+        * The next variable takes into account the cases where idling
+        * boosts the throughput.
+        *
+        * The value of the variable is computed considering, first, that
+        * idling is virtually always beneficial for the throughput if:
+        * (a) the device is not NCQ-capable, or
+        * (b) regardless of the presence of NCQ, the device is rotational
+        *     and the request pattern for bfqq is I/O-bound and sequential.
+        *
+        * Secondly, and in contrast to the above item (b), idling an
+        * NCQ-capable flash-based device would not boost the
+        * throughput even with sequential I/O; rather it would lower
+        * the throughput in proportion to how fast the device
+        * is. Accordingly, the next variable is true if any of the
+        * above conditions (a) and (b) is true, and, in particular,
+        * happens to be false if bfqd is an NCQ-capable flash-based
+        * device.
+        */
+       idling_boosts_thr = !bfqd->hw_tag ||
+               (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) &&
+                bfq_bfqq_idle_window(bfqq));
+
+       /*
+        * The value of the next variable,
+        * idling_boosts_thr_without_issues, is equal to that of
+        * idling_boosts_thr, unless a special case holds. In this
+        * special case, described below, idling may cause problems to
+        * weight-raised queues.
+        *
+        * When the request pool is saturated (e.g., in the presence
+        * of write hogs), if the processes associated with
+        * non-weight-raised queues ask for requests at a lower rate,
+        * then processes associated with weight-raised queues have a
+        * higher probability to get a request from the pool
+        * immediately (or at least soon) when they need one. Thus
+        * they have a higher probability to actually get a fraction
+        * of the device throughput proportional to their high
+        * weight. This is especially true with NCQ-capable drives,
+        * which enqueue several requests in advance, and further
+        * reorder internally-queued requests.
+        *
+        * For this reason, we force to false the value of
+        * idling_boosts_thr_without_issues if there are weight-raised
+        * busy queues. In this case, and if bfqq is not weight-raised,
+        * this guarantees that the device is not idled for bfqq (if,
+        * instead, bfqq is weight-raised, then idling will be
+        * guaranteed by another variable, see below). Combined with
+        * the timestamping rules of BFQ (see [1] for details), this
+        * behavior causes bfqq, and hence any sync non-weight-raised
+        * queue, to get a lower number of requests served, and thus
+        * to ask for a lower number of requests from the request
+        * pool, before the busy weight-raised queues get served
+        * again. This often mitigates starvation problems in the
+        * presence of heavy write workloads and NCQ, thereby
+        * guaranteeing a higher application and system responsiveness
+        * in these hostile scenarios.
+        */
+       idling_boosts_thr_without_issues = idling_boosts_thr &&
+               bfqd->wr_busy_queues == 0;
+
+       /*
+        * There is then a case where idling must be performed not
+        * for throughput concerns, but to preserve service
+        * guarantees.
+        *
+        * To introduce this case, we can note that allowing the drive
+        * to enqueue more than one request at a time, and hence
+        * delegating de facto final scheduling decisions to the
+        * drive's internal scheduler, entails loss of control on the
+        * actual request service order. In particular, the critical
+        * situation is when requests from different processes happen
+        * to be present, at the same time, in the internal queue(s)
+        * of the drive. In such a situation, the drive, by deciding
+        * the service order of the internally-queued requests, does
+        * determine also the actual throughput distribution among
+        * these processes. But the drive typically has no notion or
+        * concern about per-process throughput distribution, and
+        * makes its decisions only on a per-request basis. Therefore,
+        * the service distribution enforced by the drive's internal
+        * scheduler is likely to coincide with the desired
+        * device-throughput distribution only in a completely
+        * symmetric scenario where:
+        * (i)  each of these processes must get the same throughput as
+        *      the others;
+        * (ii) all these processes have the same I/O pattern
+        *      (either sequential or random).
+        * In fact, in such a scenario, the drive will tend to treat
+        * the requests of each of these processes in about the same
+        * way as the requests of the others, and thus to provide
+        * each of these processes with about the same throughput
+        * (which is exactly the desired throughput distribution). In
+        * contrast, in any asymmetric scenario, device idling is
+        * certainly needed to guarantee that bfqq receives its
+        * assigned fraction of the device throughput (see [1] for
+        * details).
+        *
+        * We address this issue by controlling, actually, only the
+        * symmetry sub-condition (i), i.e., provided that
+        * sub-condition (i) holds, idling is not performed,
+        * regardless of whether sub-condition (ii) holds. In other
+        * words, only if sub-condition (i) does not hold, then idling
+        * is allowed, and the device tends to be prevented from
+        * queueing many requests, possibly of several processes. The
+        * reason for not controlling also sub-condition (ii) is that
+        * we exploit preemption to preserve guarantees in case of
+        * symmetric scenarios, even if (ii) does not hold, as
+        * explained in the next two paragraphs.
+        *
+        * Even if a queue, say Q, is expired when it remains idle, Q
+        * can still preempt the new in-service queue if the next
+        * request of Q arrives soon (see the comments on
+        * bfq_bfqq_update_budg_for_activation). If all queues and
+        * groups have the same weight, this form of preemption,
+        * combined with the hole-recovery heuristic described in the
+        * comments on function bfq_bfqq_update_budg_for_activation,
+        * is enough to preserve a correct bandwidth distribution in
+        * the mid term, even without idling. In fact, even if not
+        * idling allows the internal queues of the device to contain
+        * many requests, and thus to reorder requests, we can rather
+        * safely assume that the internal scheduler still preserves a
+        * minimum of mid-term fairness. The motivation for using
+        * preemption instead of idling is that, by not idling,
+        * service guarantees are preserved without minimally
+        * sacrificing throughput. In other words, both a high
+        * throughput and its desired distribution are obtained.
+        *
+        * More precisely, this preemption-based, idleless approach
+        * provides fairness in terms of IOPS, and not sectors per
+        * second. This can be seen with a simple example. Suppose
+        * that there are two queues with the same weight, but that
+        * the first queue receives requests of 8 sectors, while the
+        * second queue receives requests of 1024 sectors. In
+        * addition, suppose that each of the two queues contains at
+        * most one request at a time, which implies that each queue
+        * always remains idle after it is served. Finally, after
+        * remaining idle, each queue receives very quickly a new
+        * request. It follows that the two queues are served
+        * alternately, preempting each other if needed. This
+        * implies that, although both queues have the same weight,
+        * the queue with large requests receives a service that is
+        * 1024/8 times as high as the service received by the other
+        * queue.
+        *
+        * On the other hand, device idling is performed, and thus
+        * pure sector-domain guarantees are provided, for the
+        * following queues, which are likely to need stronger
+        * throughput guarantees: weight-raised queues, and queues
+        * with a higher weight than other queues. When such queues
+        * are active, sub-condition (i) is false, which triggers
+        * device idling.
+        *
+        * According to the above considerations, the next variable is
+        * true (only) if sub-condition (i) does not hold. To compute
+        * the value of this variable, we not only use the return value
+        * of the function bfq_symmetric_scenario(), but also check
+        * whether bfqq is being weight-raised, because
+        * bfq_symmetric_scenario() does not take into account also
+        * weight-raised queues (see comments on
+        * bfq_weights_tree_add()).
+        *
+        * As a side note, it is worth considering that the above
+        * device-idling countermeasures may however fail in the
+        * following unlucky scenario: if idling is (correctly)
+        * disabled in a time period during which all symmetry
+        * sub-conditions hold, and hence the device is allowed to
+        * enqueue many requests, but at some later point in time some
+        * sub-condition stops holding, then it may become impossible
+        * to let requests be served in the desired order until all
+        * the requests already queued in the device have been served.
+        */
+       asymmetric_scenario = bfqq->wr_coeff > 1 ||
+               !bfq_symmetric_scenario(bfqd);
+
+       /*
+        * Finally, there is a case where maximizing throughput is the
+        * best choice even if it may cause unfairness toward
+        * bfqq. Such a case is when bfqq became active in a burst of
+        * queue activations. Queues that became active during a large
+        * burst benefit only from throughput, as discussed in the
+        * comments on bfq_handle_burst. Thus, if bfqq became active
+        * in a burst and not idling the device maximizes throughput,
+        * then the device must not be idled, because not idling the
+        * device provides bfqq and all other queues in the burst with
+        * maximum benefit. Combining this and the above case, we can
+        * now establish when idling is actually needed to preserve
+        * service guarantees.
+        */
+       idling_needed_for_service_guarantees =
+               asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
+
+       /*
+        * We have now all the components we need to compute the return
+        * value of the function, which is true only if both the following
+        * conditions hold:
+        * 1) bfqq is sync, because idling makes sense only for sync queues;
+        * 2) idling either boosts the throughput (without issues), or
+        *    is necessary to preserve service guarantees.
+        */
+       return bfq_bfqq_sync(bfqq) &&
+               (idling_boosts_thr_without_issues ||
+                idling_needed_for_service_guarantees);
+}
+
+/*
+ * Tell whether the device must be idled for bfqq. This is the case
+ * when bfqq has no queued request left, idling is not disabled
+ * (bfq_slice_idle is non-zero) and bfq_bfqq_may_idle returns true.
+ *
+ * When the in-service queue is empty and this function returns true:
+ * 1) the queue must remain in service and cannot be expired, and
+ * 2) the device must be idled to wait for the possible arrival of a
+ *    new request for the queue.
+ * The comments on bfq_bfqq_may_idle explain why idling is then the
+ * best choice to boost the throughput and preserve service
+ * guarantees.
+ */
+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
+{
+       /* A queue that still has requests needs no idling */
+       if (!RB_EMPTY_ROOT(&bfqq->sort_list))
+               return false;
+
+       /* A zero bfq_slice_idle means that idling is switched off */
+       if (bfqq->bfqd->bfq_slice_idle == 0)
+               return false;
+
+       return bfq_bfqq_may_idle(bfqq);
+}
+
+/*
+ * Select a queue for service.  If we have a current queue in service,
+ * check whether to continue servicing it, or retrieve and set a new one.
+ *
+ * Returns the queue whose next request is to be dispatched, or NULL if
+ * nothing is to be dispatched now. NULL is returned both when
+ * bfq_set_in_service_queue() yields no queue and when the in-service
+ * queue has no request pending but is kept in service anyway (see the
+ * "No requests pending" case below).
+ *
+ * An expiration triggered by the first check below uses the initial
+ * value of reason, BFQQE_BUDGET_TIMEOUT.
+ */
+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
+{
+       struct bfq_queue *bfqq;
+       struct request *next_rq;
+       enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;
+
+       bfqq = bfqd->in_service_queue;
+       if (!bfqq)
+               goto new_queue;
+
+       bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
+
+       if (bfq_may_expire_for_budg_timeout(bfqq) &&
+           !bfq_bfqq_wait_request(bfqq) &&
+           !bfq_bfqq_must_idle(bfqq))
+               goto expire;
+
+check_queue:
+       /*
+        * This loop is rarely executed more than once. Even when it
+        * happens, it is much more convenient to re-execute this loop
+        * than to return NULL and trigger a new dispatch to get a
+        * request served.
+        */
+       next_rq = bfqq->next_rq;
+       /*
+        * If bfqq has requests queued and it has enough budget left to
+        * serve them, keep the queue, otherwise expire it.
+        */
+       if (next_rq) {
+               if (bfq_serv_to_charge(next_rq, bfqq) >
+                       bfq_bfqq_budget_left(bfqq)) {
+                       /*
+                        * Expire the queue for budget exhaustion,
+                        * which makes sure that the next budget is
+                        * enough to serve the next request, even if
+                        * it comes from the fifo expired path.
+                        */
+                       reason = BFQQE_BUDGET_EXHAUSTED;
+                       goto expire;
+               } else {
+                       /*
+                        * The idle timer may be pending because we may
+                        * not disable disk idling even when a new request
+                        * arrives.
+                        */
+                       if (bfq_bfqq_wait_request(bfqq)) {
+                               /*
+                                * If we get here: 1) at least a new request
+                                * has arrived but we have not disabled the
+                                * timer because the request was too small,
+                                * 2) then the block layer has unplugged
+                                * the device, causing the dispatch to be
+                                * invoked.
+                                *
+                                * Since the device is unplugged, now the
+                                * requests are probably large enough to
+                                * provide a reasonable throughput.
+                                * So we disable idling.
+                                */
+                               bfq_clear_bfqq_wait_request(bfqq);
+                               hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+                               bfqg_stats_update_idle_time(bfqq_group(bfqq));
+                       }
+                       goto keep_queue;
+               }
+       }
+
+       /*
+        * No requests pending. However, if the in-service queue is idling
+        * for a new request, or has requests waiting for a completion and
+        * may idle after their completion, then keep it anyway.
+        *
+        * Keeping it means returning NULL without expiring the queue:
+        * nothing is dispatched now, and bfqd->in_service_queue is not
+        * touched on this path.
+        */
+       if (bfq_bfqq_wait_request(bfqq) ||
+           (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
+               bfqq = NULL;
+               goto keep_queue;
+       }
+
+       reason = BFQQE_NO_MORE_REQUESTS;
+expire:
+       bfq_bfqq_expire(bfqd, bfqq, false, reason);
+new_queue:
+       bfqq = bfq_set_in_service_queue(bfqd);
+       if (bfqq) {
+               bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
+               goto check_queue;
+       }
+keep_queue:
+       if (bfqq)
+               bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
+       else
+               bfq_log(bfqd, "select_queue: no queue returned");
+
+       return bfqq;
+}
+
+/*
+ * Refresh the weight-raising state of bfqq and, if that state no
+ * longer matches the weight currently set in its entity, have the
+ * entity weight updated.
+ */
+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+       struct bfq_entity *entity = &bfqq->entity;
+       bool weight_is_raised, wr_in_progress;
+
+       if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
+               bfq_log_bfqq(bfqd, bfqq,
+                       "raising period dur %u/%u msec, old coeff %u, w %d(%d)",
+                       jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
+                       jiffies_to_msecs(bfqq->wr_cur_max_time),
+                       bfqq->wr_coeff,
+                       entity->weight, entity->orig_weight);
+
+               if (entity->prio_changed)
+                       bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
+
+               if (bfq_bfqq_in_large_burst(bfqq)) {
+                       /* Queues activated in a large burst lose raising */
+                       bfq_bfqq_end_wr(bfqq);
+               } else if (time_is_before_jiffies(bfqq->last_wr_start_finish +
+                                                 bfqq->wr_cur_max_time)) {
+                       /*
+                        * The current weight-raising period is over.
+                        * If its duration was bfq_wr_rt_max_time and
+                        * less than bfq_wr_duration() has elapsed
+                        * since wr_start_at_switch_to_srt, switch
+                        * back to interactive weight raising;
+                        * otherwise end weight raising altogether.
+                        */
+                       if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
+                           !time_is_before_jiffies(
+                                       bfqq->wr_start_at_switch_to_srt +
+                                       bfq_wr_duration(bfqd))) {
+                               bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+                               bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+                               bfqq->last_wr_start_finish =
+                                       bfqq->wr_start_at_switch_to_srt;
+                               entity->prio_changed = 1;
+                       } else {
+                               bfq_bfqq_end_wr(bfqq);
+                       }
+               }
+       }
+
+       /*
+        * The weight must be updated both when it has to be raised and
+        * when it has to be lowered, i.e., whenever "weight above
+        * orig_weight" and "raising in progress" disagree.
+        */
+       weight_is_raised = entity->weight > entity->orig_weight;
+       wr_in_progress = bfqq->wr_coeff > 1;
+       if (weight_is_raised != wr_in_progress)
+               __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity),
+                                               entity);
+}
+
+/*
+ * Dispatch the next request of bfqq: charge bfqq for the service,
+ * remove the request from the scheduler queues and return it. bfqq
+ * may be expired as a side effect (see below).
+ */
+static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
+                                                struct bfq_queue *bfqq)
+{
+       struct request *rq = bfqq->next_rq;
+
+       bfq_bfqq_served(bfqq, bfq_serv_to_charge(rq, bfqq));
+
+       bfq_dispatch_remove(bfqd->queue, rq);
+
+       /*
+        * If weight raising has to terminate for bfqq, the call below
+        * updates the weight of bfqq immediately, without waiting for
+        * the next activation. So, on expiration, bfqq gets
+        * timestamped as if it had never been weight-raised during
+        * this service slot, even if it received part or even most of
+        * the service as a weight-raised queue. The resulting
+        * inflated timestamps are beneficial: bfqq is then more
+        * willing to leave the device immediately to possible other
+        * weight-raised queues.
+        */
+       bfq_update_wr_data(bfqd, bfqq);
+
+       /*
+        * A CLASS_IDLE queue is expired, pretending that its budget
+        * is exhausted, as soon as other queues are waiting for
+        * service. The request just removed is returned in any case.
+        */
+       if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
+               bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
+
+       return rq;
+}
+
+/*
+ * Tell whether there may be something to dispatch for hctx: either a
+ * request on the bfqd->dispatch list or at least one busy queue.
+ */
+static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
+{
+       struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+
+       if (!list_empty_careful(&bfqd->dispatch))
+               return true;
+
+       /*
+        * busy_queues is read without taking the lock: a race on it
+        * should cause at most a call to dispatch for nothing.
+        */
+       return bfqd->busy_queues > 0;
+}
+
+/*
+ * Pick the next request to be started, or return NULL if nothing can
+ * be dispatched now. Requests on the bfqd->dispatch list are served
+ * first; otherwise a queue is selected and its next request is taken.
+ * Every returned request is flagged RQF_STARTED.
+ *
+ * bfq_dispatch_request() invokes this function with bfqd->lock held.
+ */
+static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+       struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+       struct bfq_queue *bfqq;
+       struct request *rq;
+
+       if (!list_empty(&bfqd->dispatch)) {
+               rq = list_first_entry(&bfqd->dispatch, struct request,
+                                     queuelist);
+               list_del_init(&rq->queuelist);
+
+               bfqq = RQ_BFQQ(rq);
+               if (bfqq) {
+                       /*
+                        * This dispatch does not go through the
+                        * standard dispatch flow, where the counters
+                        * are incremented, so increment them here.
+                        */
+                       bfqq->dispatched++;
+                       bfqd->rq_in_driver++;
+               }
+               /*
+                * For a request with no bfqq, rq_in_driver is left
+                * alone. The counter is decremented through the
+                * put_rq_private hook, which will not be invoked on
+                * such a request; incrementing it here would leave
+                * it unbalanced. The downside is that rq_in_driver is
+                * deceptively lower than it should be while this
+                * request is in service, which may cause
+                * bfq_schedule_dispatch to be invoked uselessly.
+                *
+                * An exact solution could rely on the put_request
+                * hook, which, if defined, is probably invoked also
+                * on this request: 1) increment rq_in_driver here,
+                * and 2) decrement it in put_request. The counter
+                * would then always be accurate, at the price of an
+                * extra interface function. This cost seems higher
+                * than the benefit, as non-elevator-private requests
+                * are very infrequent.
+                */
+               rq->rq_flags |= RQF_STARTED;
+               return rq;
+       }
+
+       bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
+
+       if (bfqd->busy_queues == 0)
+               return NULL;
+
+       /*
+        * With strict_guarantees set, the device is forced to serve
+        * one request at a time. This is currently the ONLY way to
+        * guarantee that a queueing device respects the request
+        * service order enforced by the scheduler; otherwise the
+        * device is free even to make some unlucky request wait for
+        * as long as the device wishes.
+        *
+        * Of course, serving one request at a time may cause loss of
+        * throughput.
+        */
+       if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
+               return NULL;
+
+       bfqq = bfq_select_queue(bfqd);
+       if (!bfqq)
+               return NULL;
+
+       rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);
+       if (rq) {
+               bfqd->rq_in_driver++;
+               rq->rq_flags |= RQF_STARTED;
+       }
+
+       return rq;
+}
+
+/*
+ * Locked wrapper around __bfq_dispatch_request(): take bfqd->lock
+ * (disabling interrupts), pick the request to dispatch, release the
+ * lock and return that request (NULL if there is none).
+ */
+static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+       struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+       struct request *dispatched;
+
+       spin_lock_irq(&bfqd->lock);
+       dispatched = __bfq_dispatch_request(hctx);
+       spin_unlock_irq(&bfqd->lock);
+
+       return dispatched;
+}
+
+/*
+ * Drop one reference to bfqq and free the queue when the last
+ * reference goes away.
+ *
+ * The task holds one reference to the queue, dropped when the task
+ * exits. Each rq in-flight on this queue also holds a reference,
+ * dropped when the rq is freed.
+ *
+ * The scheduler lock must be held here. Do not use bfqq after calling
+ * this function on it.
+ */
+void bfq_put_queue(struct bfq_queue *bfqq)
+{
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+       /* Looked up now, because bfqq may be freed below */
+       struct bfq_group *bfqg = bfqq_group(bfqq);
+#endif
+
+       if (bfqq->bfqd) {
+               bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
+                            bfqq, bfqq->ref);
+       }
+
+       if (--bfqq->ref != 0)
+               return;
+
+       if (bfq_bfqq_sync(bfqq)) {
+               /*
+                * The queue may have been activated during the
+                * current burst, and being destroyed does not change
+                * that. So the queue, which does not exist anymore,
+                * has to be removed from the burst list if there, but
+                * the burst size must not be decremented.
+                */
+               hlist_del_init(&bfqq->burst_list_node);
+       }
+
+       kmem_cache_free(bfq_pool, bfqq);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+       bfqg_put(bfqg);
+#endif
+}
+
+/*
+ * If bfqq was scheduled to merge with another queue, drop the
+ * reference taken on that queue and on the others in the merge chain
+ * (see bfq_setup_merge and bfq_merge_bfqqs). The new_bfqq links are
+ * followed until the chain ends or leads back to bfqq itself.
+ */
+static void bfq_put_cooperator(struct bfq_queue *bfqq)
+{
+       struct bfq_queue *cur, *following;
+
+       for (cur = bfqq->new_bfqq; cur && cur != bfqq; cur = following) {
+               /* Read the link first: the put below may free cur */
+               following = cur->new_bfqq;
+               bfq_put_queue(cur);
+       }
+}
+
+/*
+ * Release bfqq on behalf of its exiting process: expire it if it is
+ * the in-service queue, drop the references held on the queues it was
+ * scheduled to merge with, then drop the process reference on bfqq
+ * itself.
+ */
+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+       const bool in_service = (bfqd->in_service_queue == bfqq);
+
+       if (in_service) {
+               __bfq_bfqq_expire(bfqd, bfqq);
+               bfq_schedule_dispatch(bfqd);
+       }
+
+       bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
+
+       bfq_put_cooperator(bfqq);
+
+       /* release process reference */
+       bfq_put_queue(bfqq);
+}
+
+/*
+ * Release the sync or async (according to is_sync) queue that bic
+ * points to, if any, and clear the corresponding pointer in bic.
+ * Nothing is done if there is no such queue or if the queue has no
+ * bfq_data any longer.
+ */
+static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
+{
+       struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
+       struct bfq_data *bfqd;
+       unsigned long flags;
+
+       if (!bfqq)
+               return;
+
+       bfqd = bfqq->bfqd;
+       if (!bfqd) /* NULL if scheduler already exited */
+               return;
+
+       spin_lock_irqsave(&bfqd->lock, flags);
+       bfq_exit_bfqq(bfqd, bfqq);
+       bic_set_bfqq(bic, NULL, is_sync);
+       spin_unlock_irqrestore(&bfqd->lock, flags);
+}
+
+/*
+ * Release the queues attached to icq: first the sync one, then the
+ * async one.
+ */
+static void bfq_exit_icq(struct io_cq *icq)
+{
+       struct bfq_io_cq *exiting_bic = icq_to_bic(icq);
+
+       bfq_exit_icq_bfqq(exiting_bic, true);   /* sync queue */
+       bfq_exit_icq_bfqq(exiting_bic, false);  /* async queue */
+}
+
+/*
+ * Update the entity prio values of bfqq from the ioprio stored in
+ * bic: the new class and level are recorded in new_ioprio_class and
+ * new_ioprio, the matching weight in entity.new_weight, and
+ * entity.prio_changed is set. Note that the new values will not be
+ * used until the next (re)activation.
+ *
+ * Nothing is done if bfqq has no bfq_data attached.
+ */
+static void
+bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
+{
+       struct task_struct *tsk = current;
+       int ioprio_class;
+       struct bfq_data *bfqd = bfqq->bfqd;
+
+       if (!bfqd)
+               return;
+
+       ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
+       switch (ioprio_class) {
+       default:
+               dev_err(bfqd->queue->backing_dev_info->dev,
+                       "bfq: bad prio class %d\n", ioprio_class);
+               /* fall through: handle an unknown class as no class */
+       case IOPRIO_CLASS_NONE:
+               /*
+                * No prio set, inherit CPU scheduling settings.
+                */
+               bfqq->new_ioprio = task_nice_ioprio(tsk);
+               bfqq->new_ioprio_class = task_nice_ioclass(tsk);
+               break;
+       case IOPRIO_CLASS_RT:
+               bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+               bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
+               break;
+       case IOPRIO_CLASS_BE:
+               bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+               bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
+               break;
+       case IOPRIO_CLASS_IDLE:
+               bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
+               bfqq->new_ioprio = 7;
+               bfq_clear_bfqq_idle_window(bfqq);
+               break;
+       }
+
+       if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
+               pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
+                       bfqq->new_ioprio);
+               /*
+                * Clamp to the last valid level. IOPRIO_BE_NR itself
+                * is out of range, as the test above shows, so it
+                * must not be used as the fallback value.
+                */
+               bfqq->new_ioprio = IOPRIO_BE_NR - 1;
+       }
+
+       bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
+       bfqq->entity.prio_changed = 1;
+}
+
+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
+                                      struct bio *bio, bool is_sync,
+                                      struct bfq_io_cq *bic);
+
+/*
+ * Bring bic and its queues up to date if the ioprio of the owning
+ * io context differs from the one cached in bic. In that case the
+ * async queue, if any, is released and looked up again, while the
+ * sync queue, if any, just gets its next prio values recomputed.
+ */
+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
+{
+       struct bfq_data *bfqd = bic_to_bfqd(bic);
+       int ioprio = bic->icq.ioc->ioprio;
+       struct bfq_queue *async_bfqq, *sync_bfqq;
+
+       /* No bfq_data: this may happen on a newly created bic */
+       if (unlikely(!bfqd))
+               return;
+
+       /* Common case: the cached ioprio is still the current one */
+       if (likely(bic->ioprio == ioprio))
+               return;
+
+       bic->ioprio = ioprio;
+
+       async_bfqq = bic_to_bfqq(bic, false);
+       if (async_bfqq) {
+               /* release process reference on this queue */
+               bfq_put_queue(async_bfqq);
+               async_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
+               bic_set_bfqq(bic, async_bfqq, false);
+       }
+
+       sync_bfqq = bic_to_bfqq(bic, true);
+       if (sync_bfqq)
+               bfq_set_next_ioprio_data(sync_bfqq, bic);
+}
+
+/*
+ * Set up the fields of bfqq for use with bfqd. The reference counter starts
+ * at zero; bic may be NULL, in which case no ioprio data is computed here.
+ */
+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+                         struct bfq_io_cq *bic, pid_t pid, int is_sync)
+{
+       /* List/tree linkage. */
+       RB_CLEAR_NODE(&bfqq->entity.rb_node);
+       INIT_LIST_HEAD(&bfqq->fifo);
+       INIT_HLIST_NODE(&bfqq->burst_list_node);
+
+       bfqq->ref = 0;
+       /* bfqd must be set before the ioprio data is computed below. */
+       bfqq->bfqd = bfqd;
+
+       if (bic)
+               bfq_set_next_ioprio_data(bfqq, bic);
+
+       if (!is_sync) {
+               bfq_clear_bfqq_sync(bfqq);
+       } else {
+               /* Queues in the idle class do not get an idle window. */
+               if (!bfq_class_idle(bfqq))
+                       bfq_mark_bfqq_idle_window(bfqq);
+               bfq_mark_bfqq_sync(bfqq);
+               bfq_mark_bfqq_just_created(bfqq);
+       }
+
+       /*
+        * Original note: "set end request to minus infinity from now".
+        * The value stored is one nanosecond past the current time.
+        */
+       bfqq->ttime.last_end_request = ktime_get_ns() + 1;
+
+       bfq_mark_bfqq_IO_bound(bfqq);
+
+       bfqq->pid = pid;
+
+       /* Tentative budget: two thirds of the device-wide maximum. */
+       bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
+       bfqq->budget_timeout = bfq_smallest_from_now();
+
+       /* Weight-raising state: coefficient 1 to start with. */
+       bfqq->wr_coeff = 1;
+       bfqq->last_wr_start_finish = jiffies;
+       bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();
+       bfqq->split_time = bfq_smallest_from_now();
+
+       /*
+        * Value chosen so that bfqq is not deemed soft rt when it
+        * becomes backlogged.
+        */
+       bfqq->soft_rt_next_start = bfq_greatest_from_now();
+
+       /* The first request is assumed to be seeky. */
+       bfqq->seek_history = 1;
+}
+
+/*
+ * Return the address of the slot, inside bfqg, holding the async queue for
+ * the given ioprio class and level, or NULL if the class is not one of
+ * RT, NONE, BE or IDLE.
+ */
+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
+                                              struct bfq_group *bfqg,
+                                              int ioprio_class, int ioprio)
+{
+       if (ioprio_class == IOPRIO_CLASS_RT)
+               return &bfqg->async_bfqq[0][ioprio];
+
+       if (ioprio_class == IOPRIO_CLASS_IDLE)
+               return &bfqg->async_idle_bfqq;
+
+       /* No class set: same row as best effort, at the IOPRIO_NORM level. */
+       if (ioprio_class == IOPRIO_CLASS_NONE)
+               return &bfqg->async_bfqq[1][IOPRIO_NORM];
+
+       if (ioprio_class == IOPRIO_CLASS_BE)
+               return &bfqg->async_bfqq[1][ioprio];
+
+       return NULL;
+}
+
+/*
+ * Return a bfq_queue for bic, with one more reference taken on it (see the
+ * increment at the out label).
+ *
+ * For async I/O the queue stored in the group's per-priority slot is
+ * reused when present; otherwise a new queue is allocated and, for async
+ * I/O, published in that slot with an extra reference. bfqd->oom_bfqq is
+ * returned when no group is found, when the allocation fails, or when the
+ * ioprio class has no async slot.
+ */
+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
+                                      struct bio *bio, bool is_sync,
+                                      struct bfq_io_cq *bic)
+{
+       const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+       const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
+       struct bfq_queue **async_bfqq = NULL;
+       struct bfq_queue *bfqq;
+       struct bfq_group *bfqg;
+
+       rcu_read_lock();
+
+       bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
+       if (!bfqg) {
+               bfqq = &bfqd->oom_bfqq;
+               goto out;
+       }
+
+       if (!is_sync) {
+               async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
+                                                 ioprio);
+               /*
+                * bfq_async_queue_prio() returns NULL for an unknown
+                * ioprio class: use the oom queue rather than
+                * dereferencing a NULL slot.
+                */
+               if (unlikely(!async_bfqq)) {
+                       bfqq = &bfqd->oom_bfqq;
+                       goto out;
+               }
+               bfqq = *async_bfqq;
+               if (bfqq)
+                       goto out;
+       }
+
+       bfqq = kmem_cache_alloc_node(bfq_pool,
+                                    GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
+                                    bfqd->queue->node);
+
+       if (bfqq) {
+               bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
+                             is_sync);
+               bfq_init_entity(&bfqq->entity, bfqg);
+               bfq_log_bfqq(bfqd, bfqq, "allocated");
+       } else {
+               bfqq = &bfqd->oom_bfqq;
+               bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
+               goto out;
+       }
+
+       /*
+        * Pin the queue now that it's allocated, scheduler exit will
+        * prune it.
+        */
+       if (async_bfqq) {
+               bfqq->ref++; /*
+                             * Extra group reference, w.r.t. sync
+                             * queue. This extra reference is removed
+                             * only if bfqq->bfqg disappears, to
+                             * guarantee that this queue is not freed
+                             * until its group goes away.
+                             */
+               bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
+                            bfqq, bfqq->ref);
+               *async_bfqq = bfqq;
+       }
+
+out:
+       bfqq->ref++; /* get a process reference to this queue */
+       bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
+       rcu_read_unlock();
+       return bfqq;
+}
+
+/*
+ * Fold the time elapsed since bfqq's last recorded request completion into
+ * its think-time statistics (sample count, total and mean).
+ */
+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
+                                   struct bfq_queue *bfqq)
+{
+       struct bfq_ttime *tt = &bfqq->ttime;
+       u64 delta = ktime_get_ns() - tt->last_end_request;
+       const u64 cap = 2ULL * bfqd->bfq_slice_idle;
+
+       /* A single sample counts for at most twice the idle slice. */
+       if (delta > cap)
+               delta = cap;
+
+       /* Weighted update: 7/8 of the old value plus the new sample. */
+       tt->ttime_samples = (7 * tt->ttime_samples + 256) / 8;
+       tt->ttime_total = div_u64(7 * tt->ttime_total + 256 * delta, 8);
+       tt->ttime_mean = div64_ul(tt->ttime_total + 128, tt->ttime_samples);
+}
+
+/*
+ * Shift bfqq's seek history by one position and record, in the lowest bit,
+ * whether rq counts as a seeky request.
+ */
+static void
+bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+                      struct request *rq)
+{
+       int seeky = 0;
+
+       bfqq->seek_history <<= 1;
+
+       /*
+        * rq is seeky if it is far from the previous request and, on a
+        * non-rotational queue, also smaller than BFQQ_SECT_THR_NONROT.
+        */
+       if (get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR) {
+               if (!blk_queue_nonrot(bfqd->queue) ||
+                   blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT)
+                       seeky = 1;
+       }
+
+       bfqq->seek_history |= seeky;
+}
+
+/*
+ * Decide again whether the idle window is to be enabled for bfqq, and set
+ * or clear the corresponding flag. Idling is turned off if the process
+ * thinks for too long or seeks so much that idling would not pay off.
+ */
+static void bfq_update_idle_window(struct bfq_data *bfqd,
+                                  struct bfq_queue *bfqq,
+                                  struct bfq_io_cq *bic)
+{
+       int enable_idle;
+
+       /* Async queues and idle-class queues are left untouched. */
+       if (!bfq_bfqq_sync(bfqq))
+               return;
+       if (bfq_class_idle(bfqq))
+               return;
+
+       /* Idle window just restored, statistics are meaningless. */
+       if (time_is_after_eq_jiffies(bfqq->split_time +
+                                    bfqd->bfq_wr_min_idle_time))
+               return;
+
+       /* Start from the current setting; it is kept if no rule applies. */
+       enable_idle = bfq_bfqq_idle_window(bfqq);
+
+       if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
+           bfqd->bfq_slice_idle == 0 ||
+           (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1)) {
+               enable_idle = 0;
+       } else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
+               /*
+                * Enough samples: disable idling only for a queue that
+                * is not weight-raised and whose mean think time exceeds
+                * the idle slice.
+                */
+               enable_idle = !(bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle &&
+                               bfqq->wr_coeff == 1);
+       }
+
+       bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
+               enable_idle);
+
+       if (!enable_idle)
+               bfq_clear_bfqq_idle_window(bfqq);
+       else
+               bfq_mark_bfqq_idle_window(bfqq);
+}
+
+/*
+ * Invoked after a new fs request (rq) has been added to bfqq: update the
+ * queue statistics and, if the device is being idled waiting for this very
+ * queue, decide whether idling must stop.
+ */
+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+                           struct request *rq)
+{
+       struct bfq_io_cq *bic = RQ_BIC(rq);
+       bool small_req, budget_timeout;
+
+       if (rq->cmd_flags & REQ_META)
+               bfqq->meta_pending++;
+
+       bfq_update_io_thinktime(bfqd, bfqq);
+       bfq_update_io_seektime(bfqd, bfqq, rq);
+       if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
+           !BFQQ_SEEKY(bfqq))
+               bfq_update_idle_window(bfqd, bfqq, bic);
+
+       bfq_log_bfqq(bfqd, bfqq,
+                    "rq_enqueued: idle_window=%d (seeky %d)",
+                    bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));
+
+       bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
+
+       /* The rest only matters while the device is idling for bfqq. */
+       if (bfqq != bfqd->in_service_queue || !bfq_bfqq_wait_request(bfqq))
+               return;
+
+       small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
+                   blk_rq_sectors(rq) < 32;
+       budget_timeout = bfq_bfqq_budget_timeout(bfqq);
+
+       /*
+        * Only this request is queued, it is small, and the queue is not
+        * to be expired: keep idling. The device is thus not committed to
+        * serving a tiny request; the block layer decides when to unplug,
+        * hopefully after further requests have been merged into this one
+        * and larger requests can be dispatched.
+        */
+       if (small_req && !budget_timeout)
+               return;
+
+       /*
+        * Either the request is large enough or the queue has to be
+        * expired: in both cases idling ends here, so the wait_request
+        * flag is cleared and the timer is cancelled.
+        */
+       bfq_clear_bfqq_wait_request(bfqq);
+       hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+       bfqg_stats_update_idle_time(bfqq_group(bfqq));
+
+       /*
+        * A request has just arrived, hence the queue is not empty and
+        * can be expired on budget timeout without the risk of its
+        * timestamps being updated incorrectly. See [1] for details.
+        */
+       if (budget_timeout)
+               bfq_bfqq_expire(bfqd, bfqq, false,
+                               BFQQE_BUDGET_TIMEOUT);
+}
+
+/*
+ * Insert rq into its bfq_queue. If bfq_setup_cooperator() returns a queue
+ * to merge with, rq is first redirected to that queue: the allocated and
+ * reference counters are moved from the old queue to the new one and
+ * rq->elv.priv[1] is updated. rq is then added to the queue, stamped with
+ * its fifo expiration time and appended to the queue's fifo list.
+ */
+static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
+{
+       struct bfq_queue *bfqq = RQ_BFQQ(rq),
+               *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
+
+       if (new_bfqq) {
+               /*
+                * If the bic's sync queue is no longer bfqq, use that
+                * queue as the target instead of the one returned above.
+                */
+               if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
+                       new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
+               /*
+                * Release the request's reference to the old bfqq
+                * and make sure one is taken to the shared queue.
+                */
+               new_bfqq->allocated++;
+               bfqq->allocated--;
+               new_bfqq->ref++;
+               bfq_clear_bfqq_just_created(bfqq);
+               /*
+                * If the bic associated with the process
+                * issuing this request still points to bfqq
+                * (and thus has not been already redirected
+                * to new_bfqq or even some other bfq_queue),
+                * then complete the merge and redirect it to
+                * new_bfqq.
+                */
+               if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
+                       bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
+                                       bfqq, new_bfqq);
+               /*
+                * rq is about to be enqueued into new_bfqq,
+                * release rq reference on bfqq
+                */
+               bfq_put_queue(bfqq);
+               rq->elv.priv[1] = new_bfqq;
+               bfqq = new_bfqq;
+       }
+
+       bfq_add_request(rq);
+
+       /* Expiration time depends on whether rq is sync or async. */
+       rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
+       list_add_tail(&rq->queuelist, &bfqq->fifo);
+
+       bfq_rq_enqueued(bfqd, bfqq, rq);
+}
+
+/*
+ * Insert a single request: try to merge it into an already queued request
+ * first; otherwise put it either on the dispatch list (at_head or
+ * passthrough requests) or into its bfq_queue.
+ */
+static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+                              bool at_head)
+{
+       struct request_queue *q = hctx->queue;
+       struct bfq_data *bfqd = q->elevator->elevator_data;
+       bool merged;
+
+       spin_lock_irq(&bfqd->lock);
+       merged = blk_mq_sched_try_insert_merge(q, rq);
+       spin_unlock_irq(&bfqd->lock);
+
+       if (merged)
+               return;
+
+       /* Called with bfqd->lock released. */
+       blk_mq_sched_request_inserted(rq);
+
+       spin_lock_irq(&bfqd->lock);
+
+       if (at_head) {
+               list_add(&rq->queuelist, &bfqd->dispatch);
+       } else if (blk_rq_is_passthrough(rq)) {
+               list_add_tail(&rq->queuelist, &bfqd->dispatch);
+       } else {
+               __bfq_insert_request(bfqd, rq);
+
+               if (rq_mergeable(rq)) {
+                       elv_rqhash_add(q, rq);
+                       if (!q->last_merge)
+                               q->last_merge = rq;
+               }
+       }
+
+       spin_unlock_irq(&bfqd->lock);
+}
+
+/*
+ * Drain list: each request is unlinked from the front of the list and
+ * handed to bfq_insert_request(), until the list is empty.
+ */
+static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
+                               struct list_head *list, bool at_head)
+{
+       struct request *first;
+
+       while (!list_empty(list)) {
+               first = list_first_entry(list, struct request, queuelist);
+               list_del_init(&first->queuelist);
+               bfq_insert_request(hctx, first, at_head);
+       }
+}
+
+/*
+ * Track the maximum number of requests seen in the driver and, once enough
+ * valid samples have been collected, set bfqd->hw_tag according to whether
+ * that maximum exceeded BFQ_HW_QUEUE_THRESHOLD.
+ */
+static void bfq_update_hw_tag(struct bfq_data *bfqd)
+{
+       bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
+                                      bfqd->rq_in_driver);
+
+       /*
+        * Stop here if hw_tag is already 1, if too few requests are
+        * outstanding for this sample to be valid (the sum is not exact,
+        * as deactivated requests are not taken into account), or if more
+        * samples are still needed. The sample counter is incremented only
+        * when the first two checks do not apply.
+        */
+       if (bfqd->hw_tag == 1 ||
+           bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD ||
+           bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
+               return;
+
+       bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
+
+       /* Start a new sampling round. */
+       bfqd->hw_tag_samples = 0;
+       bfqd->max_rq_in_driver = 0;
+}
+
+/*
+ * Account for the completion of a request of bfqq: update the in-driver and
+ * dispatched counters, the completion timestamps, possibly reset the
+ * peak-rate observation interval, and, if bfqq is the in-service queue,
+ * either arm the idle timer or expire the queue when appropriate.
+ */
+static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
+{
+       u64 now_ns;
+       u32 delta_us;
+
+       /* Sampled before rq_in_driver is decremented below. */
+       bfq_update_hw_tag(bfqd);
+
+       bfqd->rq_in_driver--;
+       bfqq->dispatched--;
+
+       if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
+               /*
+                * Set budget_timeout (which we overload to store the
+                * time at which the queue remains with no backlog and
+                * no outstanding request; used by the weight-raising
+                * mechanism).
+                */
+               bfqq->budget_timeout = jiffies;
+
+               bfq_weights_tree_remove(bfqd, &bfqq->entity,
+                                       &bfqd->queue_weights_tree);
+       }
+
+       now_ns = ktime_get_ns();
+
+       bfqq->ttime.last_end_request = now_ns;
+
+       /*
+        * Using us instead of ns, to get a reasonable precision in
+        * computing rate in next check.
+        */
+       delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
+
+       /*
+        * If the request took rather long to complete, and, according
+        * to the maximum request size recorded, this completion latency
+        * implies that the request was certainly served at a very low
+        * rate (less than 1M sectors/sec), then the whole observation
+        * interval that lasts up to this time instant cannot be a
+        * valid time interval for computing a new peak rate.  Invoke
+        * bfq_update_rate_reset to have the following three steps
+        * taken:
+        * - close the observation interval at the last (previous)
+        *   request dispatch or completion
+        * - compute rate, if possible, for that observation interval
+        * - reset to zero samples, which will trigger a proper
+        *   re-initialization of the observation interval on next
+        *   dispatch
+        */
+       if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
+          (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
+                       1UL<<(BFQ_RATE_SHIFT - 10))
+               bfq_update_rate_reset(bfqd, NULL);
+       bfqd->last_completion = now_ns;
+
+       /*
+        * If we are waiting to discover whether the request pattern
+        * of the task associated with the queue is actually
+        * isochronous, and both requisites for this condition to hold
+        * are now satisfied, then compute soft_rt_next_start (see the
+        * comments on the function bfq_bfqq_softrt_next_start()). We
+        * schedule this delayed check when bfqq expires, if it still
+        * has in-flight requests.
+        */
+       if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
+           RB_EMPTY_ROOT(&bfqq->sort_list))
+               bfqq->soft_rt_next_start =
+                       bfq_bfqq_softrt_next_start(bfqd, bfqq);
+
+       /*
+        * If this is the in-service queue, check if it needs to be expired,
+        * or if we want to idle in case it has no pending requests.
+        */
+       if (bfqd->in_service_queue == bfqq) {
+               if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
+                       /* Nothing in flight and idling is required. */
+                       bfq_arm_slice_timer(bfqd);
+                       return;
+               } else if (bfq_may_expire_for_budg_timeout(bfqq))
+                       bfq_bfqq_expire(bfqd, bfqq, false,
+                                       BFQQE_BUDGET_TIMEOUT);
+               else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
+                        (bfqq->dispatched == 0 ||
+                         !bfq_bfqq_may_idle(bfqq)))
+                       bfq_bfqq_expire(bfqd, bfqq, false,
+                                       BFQQE_NO_MORE_REQUESTS);
+       }
+}
+
+/* Undo one request's accounting on bfqq: allocated count and reference. */
+static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
+{
+       --bfqq->allocated;
+       bfq_put_queue(bfqq);
+}
+
+/*
+ * Release the scheduler data attached to rq. A started request is accounted
+ * as completed under bfqd->lock; a request that was never started is removed
+ * from the scheduler if it is still linked there. In both cases the request's
+ * hold on its bfq_queue is dropped and rq->elv.priv[] is cleared.
+ */
+static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
+{
+       struct bfq_queue *bfqq = RQ_BFQQ(rq);
+       struct bfq_data *bfqd = bfqq->bfqd;
+       const bool started = rq->rq_flags & RQF_STARTED;
+       unsigned long flags;
+
+       /* Group statistics are updated before the lock is taken. */
+       if (started)
+               bfqg_stats_update_completion(bfqq_group(bfqq),
+                                            rq_start_time_ns(rq),
+                                            rq_io_start_time_ns(rq),
+                                            rq->cmd_flags);
+
+       if (likely(started)) {
+               spin_lock_irqsave(&bfqd->lock, flags);
+
+               bfq_completed_request(bfqq, bfqd);
+               bfq_put_rq_priv_body(bfqq);
+
+               spin_unlock_irqrestore(&bfqd->lock, flags);
+       } else {
+               /*
+                * rq may be still/already in the scheduler, and then it
+                * has to be removed right now: deferring the check and
+                * the removal would leave room for inconsistencies
+                * between the end of this function and the start of the
+                * deferred work. This case seems to occur only in
+                * process context, as a consequence of a merge, which in
+                * the current version of the code implies that the lock
+                * is held.
+                */
+               if (!RB_EMPTY_NODE(&rq->rb_node))
+                       bfq_remove_request(q, rq);
+               bfq_put_rq_priv_body(bfqq);
+       }
+
+       rq->elv.priv[0] = NULL;
+       rq->elv.priv[1] = NULL;
+}
+
+/*
+ * Detach bic from the shared queue bfqq. Returns bfqq itself when only one
+ * process still refers to it (the queue is then simply taken over), or NULL
+ * when the caller has to obtain a new queue.
+ */
+static struct bfq_queue *
+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
+{
+       bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
+
+       if (bfqq_process_refs(bfqq) != 1) {
+               /* Still shared: unlink bic and drop its hold on bfqq. */
+               bic_set_bfqq(bic, NULL, 1);
+
+               bfq_put_cooperator(bfqq);
+
+               bfq_put_queue(bfqq);
+               return NULL;
+       }
+
+       /* Last process reference: current becomes the owner. */
+       bfqq->pid = current->pid;
+       bfq_clear_bfqq_coop(bfqq);
+       bfq_clear_bfqq_split_coop(bfqq);
+       return bfqq;
+}
+
+/*
+ * Return the queue bic uses for sync or async I/O. If bic has none, or only
+ * the oom queue, a queue is fetched with bfq_get_queue() and stored in bic;
+ * *new_queue, if provided, is then set to true. When this happens right
+ * after a split of a sync queue, the burst state saved in bic is applied to
+ * the queue and its split time is recorded.
+ */
+static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
+                                                  struct bfq_io_cq *bic,
+                                                  struct bio *bio,
+                                                  bool split, bool is_sync,
+                                                  bool *new_queue)
+{
+       struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
+
+       /* Common case: a regular queue is already attached. */
+       if (likely(bfqq && bfqq != &bfqd->oom_bfqq))
+               return bfqq;
+
+       if (new_queue)
+               *new_queue = true;
+
+       /* Drop the reference held on the queue being replaced, if any. */
+       if (bfqq)
+               bfq_put_queue(bfqq);
+
+       bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
+       bic_set_bfqq(bic, bfqq, is_sync);
+
+       if (!split || !is_sync)
+               return bfqq;
+
+       if (bic->saved_in_large_burst ||
+           (bic->was_in_burst_list && bfqd->large_burst)) {
+               bfq_mark_bfqq_in_large_burst(bfqq);
+       } else {
+               bfq_clear_bfqq_in_large_burst(bfqq);
+               if (bic->was_in_burst_list)
+                       hlist_add_head(&bfqq->burst_list_node,
+                                      &bfqd->burst_list);
+       }
+       bfqq->split_time = jiffies;
+
+       return bfqq;
+}
+
+/*
+ * Allocate bfq data structures associated with this request.
+ *
+ * Under bfqd->lock: pick (or create) the bfq_queue for rq, splitting a
+ * shared queue if it is flagged for that, take a reference on the queue for
+ * rq and store bic and the queue in rq->elv.priv[]. Returns 0 on success,
+ * or 1 if rq has no bic.
+ */
+static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
+                             struct bio *bio)
+{
+       struct bfq_data *bfqd = q->elevator->elevator_data;
+       struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
+       const int is_sync = rq_is_sync(rq);
+       struct bfq_queue *bfqq;
+       bool new_queue = false;
+       bool split = false;
+
+       spin_lock_irq(&bfqd->lock);
+
+       if (!bic)
+               goto queue_fail;
+
+       bfq_check_ioprio_change(bic, bio);
+
+       bfq_bic_update_cgroup(bic, bio);
+
+       bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync,
+                                        &new_queue);
+
+       if (likely(!new_queue)) {
+               /* If the queue was seeky for too long, break it apart. */
+               if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
+                       bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
+
+                       /* Update bic before losing reference to bfqq */
+                       if (bfq_bfqq_in_large_burst(bfqq))
+                               bic->saved_in_large_burst = true;
+
+                       bfqq = bfq_split_bfqq(bic, bfqq);
+                       split = true;
+
+                       /* NULL means the split requires a new queue. */
+                       if (!bfqq)
+                               bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
+                                                                true, is_sync,
+                                                                NULL);
+               }
+       }
+
+       /* Account for rq on the queue: one more request, one more ref. */
+       bfqq->allocated++;
+       bfqq->ref++;
+       bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
+                    rq, bfqq, bfqq->ref);
+
+       rq->elv.priv[0] = bic;
+       rq->elv.priv[1] = bfqq;
+
+       /*
+        * If a bfq_queue has only one process reference, it is owned
+        * by only this bic: we can then set bfqq->bic = bic. in
+        * addition, if the queue has also just been split, we have to
+        * resume its state.
+        */
+       if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
+               bfqq->bic = bic;
+               if (split) {
+                       /*
+                        * The queue has just been split from a shared
+                        * queue: restore the idle window and the
+                        * possible weight raising period.
+                        */
+                       bfq_bfqq_resume_state(bfqq, bic);
+               }
+       }
+
+       if (unlikely(bfq_bfqq_just_created(bfqq)))
+               bfq_handle_burst(bfqd, bfqq);
+
+       spin_unlock_irq(&bfqd->lock);
+
+       return 0;
+
+queue_fail:
+       spin_unlock_irq(&bfqd->lock);
+
+       return 1;
+}
+
+/*
+ * Work done when the idle slice timer fires for bfqq: clear its wait_request
+ * flag and, if it is still the in-service queue, expire it when its budget
+ * timed out or it has no queued requests; then schedule a dispatch.
+ */
+static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
+{
+       struct bfq_data *bfqd = bfqq->bfqd;
+       unsigned long flags;
+
+       spin_lock_irqsave(&bfqd->lock, flags);
+       bfq_clear_bfqq_wait_request(bfqq);
+
+       if (bfqq != bfqd->in_service_queue) {
+               /* No longer in service: no dispatch is scheduled. */
+               spin_unlock_irqrestore(&bfqd->lock, flags);
+               return;
+       }
+
+       if (bfq_bfqq_budget_timeout(bfqq)) {
+               /*
+                * Here too the queue can be safely expired for budget
+                * timeout without wasting guarantees.
+                */
+               bfq_bfqq_expire(bfqd, bfqq, true, BFQQE_BUDGET_TIMEOUT);
+       } else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) {
+               /*
+                * Emptiness has to be checked explicitly: the timer may
+                * not have been disabled when the first request of the
+                * in-service queue arrived during disk idling, so the
+                * queue may hold requests at this point.
+                */
+               bfq_bfqq_expire(bfqd, bfqq, true, BFQQE_TOO_IDLE);
+       }
+
+       spin_unlock_irqrestore(&bfqd->lock, flags);
+       bfq_schedule_dispatch(bfqd);
+}
+
+/*
+ * hrtimer callback, run when the timer armed while the in-service queue is
+ * idling inside its time slice expires. The timer is never restarted from
+ * here.
+ */
+static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
+{
+       struct bfq_data *bfqd;
+       struct bfq_queue *in_serv;
+
+       bfqd = container_of(timer, struct bfq_data, idle_slice_timer);
+       in_serv = bfqd->in_service_queue;
+
+       /*
+        * Theoretical race: by now the in-service queue may be NULL, or
+        * differ from the queue that was idling, if a request arrived for
+        * the latter and a full dispatch cycle changed the in-service
+        * queue. This can hardly happen, and at worst a queue is expired
+        * too early.
+        */
+       if (in_serv)
+               bfq_idle_slice_timer_body(in_serv);
+
+       return HRTIMER_NORESTART;
+}
+
+/*
+ * If the slot *bfqq_ptr holds a queue, move that queue to the root group,
+ * drop one reference on it and clear the slot.
+ */
+static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
+                                struct bfq_queue **bfqq_ptr)
+{
+       struct bfq_queue *async_q = *bfqq_ptr;
+
+       bfq_log(bfqd, "put_async_bfqq: %p", async_q);
+       if (!async_q)
+               return;
+
+       bfq_bfqq_move(bfqd, async_q, bfqd->root_group);
+
+       bfq_log_bfqq(bfqd, async_q, "put_async_bfqq: putting %p, %d",
+                    async_q, async_q->ref);
+       bfq_put_queue(async_q);
+       *bfqq_ptr = NULL;
+}
+
+/*
+ * Drop every reference bfqg holds on its async queues. The group may be
+ * going away while these queues still contain requests, hence each queue is
+ * reparented to the root cgroup (i.e., the only one guaranteed to exist
+ * until all the requests on a device are gone).
+ */
+void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
+{
+       int row, level;
+
+       for (row = 0; row < 2; row++) {
+               for (level = 0; level < IOPRIO_BE_NR; level++)
+                       __bfq_put_async_bfqq(bfqd,
+                                            &bfqg->async_bfqq[row][level]);
+       }
+
+       __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
+}
+
+/*
+ * Tear down the scheduler instance attached to elevator queue e: stop the
+ * idle slice timer, deactivate every queue on the idle list, release the
+ * group data and free the bfq_data.
+ */
+static void bfq_exit_queue(struct elevator_queue *e)
+{
+       struct bfq_data *bfqd = e->elevator_data;
+       struct bfq_queue *bfqq, *n;
+
+       hrtimer_cancel(&bfqd->idle_slice_timer);
+
+       spin_lock_irq(&bfqd->lock);
+       list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
+               bfq_deactivate_bfqq(bfqd, bfqq, false, false);
+       spin_unlock_irq(&bfqd->lock);
+
+       /*
+        * NOTE(review): second cancel of the same timer; presumably
+        * guards against a re-arm during the loop above -- confirm.
+        */
+       hrtimer_cancel(&bfqd->idle_slice_timer);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+       blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
+#else
+       /* Without cgroup support only the root group exists. */
+       spin_lock_irq(&bfqd->lock);
+       bfq_put_async_queues(bfqd, bfqd->root_group);
+       kfree(bfqd->root_group);
+       spin_unlock_irq(&bfqd->lock);
+#endif
+
+       kfree(bfqd);
+}
+
+/*
+ * Initialize the root group: position tree, per-class service trees and the
+ * time of last service of the idle class; with cgroup support, also its
+ * entity linkage and its back-pointer to bfqd.
+ */
+static void bfq_init_root_group(struct bfq_group *root_group,
+                               struct bfq_data *bfqd)
+{
+       int cls;
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+       /* The root group has no parent and no entity of its own. */
+       root_group->bfqd = bfqd;
+       root_group->my_entity = NULL;
+       root_group->entity.parent = NULL;
+#endif
+       root_group->rq_pos_tree = RB_ROOT;
+
+       for (cls = 0; cls < BFQ_IOPRIO_CLASSES; cls++)
+               root_group->sched_data.service_tree[cls] =
+                       BFQ_SERVICE_TREE_INIT;
+
+       root_group->sched_data.bfq_class_idle_last_service = jiffies;
+}
+
+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
+{
+       struct bfq_data *bfqd;
+       struct elevator_queue *eq;
+
+       eq = elevator_alloc(q, e);
+       if (!eq)
+               return -ENOMEM;
+
+       bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
+       if (!bfqd) {
+               kobject_put(&eq->kobj);
+               return -ENOMEM;
+       }
+       eq->elevator_data = bfqd;
+
+       spin_lock_irq(q->queue_lock);
+       q->elevator = eq;
+       spin_unlock_irq(q->queue_lock);
+
+       /*
+        * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
+        * Grab a permanent reference to it, so that the normal code flow
+        * will not attempt to free it.
+        */
+       bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
+       bfqd->oom_bfqq.ref++;
+       bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
+       bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
+       bfqd->oom_bfqq.entity.new_weight =
+               bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
+
+       /* oom_bfqq does not participate to bursts */
+       bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
+
+       /*
+        * Trigger weight initialization, according to ioprio, at the
+        * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
+        * class won't be changed any more.
+        */
+       bfqd->oom_bfqq.entity.prio_changed = 1;
+
+       bfqd->queue = q;
+
+       INIT_LIST_HEAD(&bfqd->dispatch);
+
+       hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
+                    HRTIMER_MODE_REL);
+       bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
+
+       bfqd->queue_weights_tree = RB_ROOT;
+       bfqd->group_weights_tree = RB_ROOT;
+
+       INIT_LIST_HEAD(&bfqd->active_list);
+       INIT_LIST_HEAD(&bfqd->idle_list);
+       INIT_HLIST_HEAD(&bfqd->burst_list);
+
+       bfqd->hw_tag = -1;
+
+       bfqd->bfq_max_budget = bfq_default_max_budget;
+
+       bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
+       bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
+       bfqd->bfq_back_max = bfq_back_max;
+       bfqd->bfq_back_penalty = bfq_back_penalty;
+       bfqd->bfq_slice_idle = bfq_slice_idle;
+       bfqd->bfq_timeout = bfq_timeout;
+
+       bfqd->bfq_requests_within_timer = 120;
+
+       bfqd->bfq_large_burst_thresh = 8;
+       bfqd->bfq_burst_interval = msecs_to_jiffies(180);
+
+       bfqd->low_latency = true;
+
+       /*
+        * Trade-off between responsiveness and fairness.
+        */
+       bfqd->bfq_wr_coeff = 30;
+       bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
+       bfqd->bfq_wr_max_time = 0;
+       bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
+       bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
+       bfqd->bfq_wr_max_softrt_rate = 7000; /*
+                                             * Approximate rate required
+                                             * to playback or record a
+                                             * high-definition compressed
+                                             * video.
+                                             */
+       bfqd->wr_busy_queues = 0;
+
+       /*
+        * Begin by assuming, optimistically, that the device is a
+        * high-speed one, and that its peak rate is equal to 2/3 of
+        * the highest reference rate.
+        */
+       bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
+                       T_fast[blk_queue_nonrot(bfqd->queue)];
+       bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
+       bfqd->device_speed = BFQ_BFQD_FAST;
+
+       spin_lock_init(&bfqd->lock);
+
+       /*
+        * The invocation of the next bfq_create_group_hierarchy
+        * function is the head of a chain of function calls
+        * (bfq_create_group_hierarchy->blkcg_activate_policy->
+        * blk_mq_freeze_queue) that may lead to the invocation of the
+        * has_work hook function. For this reason,
+        * bfq_create_group_hierarchy is invoked only after all
+        * scheduler data has been initialized, apart from the fields
+        * that can be initialized only after invoking
+        * bfq_create_group_hierarchy. This, in particular, enables
+        * has_work to correctly return false. Of course, to avoid
+        * other inconsistencies, the blk-mq stack must then refrain
+        * from invoking further scheduler hooks before this init
+        * function is finished.
+        */
+       bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
+       if (!bfqd->root_group)
+               goto out_free;
+       bfq_init_root_group(bfqd->root_group, bfqd);
+       bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
+
+
+       return 0;
+
+out_free:
+       kfree(bfqd);
+       kobject_put(&eq->kobj);
+       return -ENOMEM;
+}
+
+/* Destroy the bfq_pool slab cache set up by bfq_slab_setup(). */
+static void bfq_slab_kill(void)
+{
+       kmem_cache_destroy(bfq_pool);
+}
+
+/*
+ * Create bfq_pool, the slab cache for struct bfq_queue.
+ *
+ * Returns 0 on success, -ENOMEM if the cache could not be created.
+ */
+static int __init bfq_slab_setup(void)
+{
+       bfq_pool = KMEM_CACHE(bfq_queue, 0);
+
+       return bfq_pool ? 0 : -ENOMEM;
+}
+
+/*
+ * Print @var into @page as an unsigned decimal followed by a newline.
+ * Returns whatever sprintf() returns.
+ */
+static ssize_t bfq_var_show(unsigned int var, char *page)
+{
+       return sprintf(page, "%u\n", var);
+}
+
+/*
+ * Parse @page as an unsigned decimal number into *@var.
+ *
+ * Returns @count on success. If kstrtoul() rejects the input, its
+ * negative error code is returned and *@var is left untouched, so
+ * callers must check the result before reading *@var.
+ */
+static ssize_t bfq_var_store(unsigned long *var, const char *page,
+                            size_t count)
+{
+       unsigned long new_val;
+       int ret = kstrtoul(page, 10, &new_val);
+
+       if (ret)
+               return ret;
+
+       *var = new_val;
+       return count;
+}
+
+/*
+ * Generate a show handler that prints __VAR through bfq_var_show().
+ * __CONV selects the conversion applied first: 1 passes the value
+ * through jiffies_to_msecs(), 2 divides it by NSEC_PER_MSEC, any other
+ * value prints it unchanged.
+ */
+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                           \
+static ssize_t __FUNC(struct elevator_queue *e, char *page)            \
+{                                                                      \
+       struct bfq_data *bfqd = e->elevator_data;                       \
+       u64 __data = __VAR;                                             \
+       if (__CONV == 1)                                                \
+               __data = jiffies_to_msecs(__data);                      \
+       else if (__CONV == 2)                                           \
+               __data = div_u64(__data, NSEC_PER_MSEC);                \
+       return bfq_var_show(__data, (page));                            \
+}
+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
+SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
+#undef SHOW_FUNCTION
+
+/* Same as SHOW_FUNCTION, but the value is divided by NSEC_PER_USEC. */
+#define USEC_SHOW_FUNCTION(__FUNC, __VAR)                              \
+static ssize_t __FUNC(struct elevator_queue *e, char *page)            \
+{                                                                      \
+       struct bfq_data *bfqd = e->elevator_data;                       \
+       u64 __data = __VAR;                                             \
+       __data = div_u64(__data, NSEC_PER_USEC);                        \
+       return bfq_var_show(__data, (page));                            \
+}
+USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
+#undef USEC_SHOW_FUNCTION
+
+/*
+ * Generate a store handler: parse the input, clamp it to [MIN, MAX]
+ * and write it through __PTR. __CONV selects the conversion applied
+ * before storing: 1 uses msecs_to_jiffies(), 2 multiplies by
+ * NSEC_PER_MSEC, any other value stores the number unchanged.
+ *
+ * Input that cannot be parsed is rejected with the error returned by
+ * bfq_var_store() and the target is left unmodified.
+ */
+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                        \
+static ssize_t                                                         \
+__FUNC(struct elevator_queue *e, const char *page, size_t count)       \
+{                                                                      \
+       struct bfq_data *bfqd = e->elevator_data;                       \
+       unsigned long __data, __min = (MIN), __max = (MAX);             \
+       ssize_t ret = bfq_var_store(&__data, (page), count);            \
+                                                                       \
+       if (ret < 0)                                                    \
+               return ret;                                             \
+       if (__data < __min)                                             \
+               __data = __min;                                         \
+       else if (__data > __max)                                        \
+               __data = __max;                                         \
+       if (__CONV == 1)                                                \
+               *(__PTR) = msecs_to_jiffies(__data);                    \
+       else if (__CONV == 2)                                           \
+               *(__PTR) = (u64)__data * NSEC_PER_MSEC;                 \
+       else                                                            \
+               *(__PTR) = __data;                                      \
+       return ret;                                                     \
+}
+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
+               INT_MAX, 2);
+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
+               INT_MAX, 2);
+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
+               INT_MAX, 0);
+/*
+ * NOTE(review): bfq_slice_idle is declared u32 in bfq-iosched.h, so the
+ * nanosecond value computed by this handler (and by the _us variant
+ * below) is truncated for large inputs - confirm whether MAX should be
+ * lowered.
+ */
+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
+#undef STORE_FUNCTION
+
+/* Same as STORE_FUNCTION, but the value is multiplied by NSEC_PER_USEC. */
+#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)                   \
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
+{                                                                      \
+       struct bfq_data *bfqd = e->elevator_data;                       \
+       unsigned long __data, __min = (MIN), __max = (MAX);             \
+       ssize_t ret = bfq_var_store(&__data, (page), count);            \
+                                                                       \
+       if (ret < 0)                                                    \
+               return ret;                                             \
+       if (__data < __min)                                             \
+               __data = __min;                                         \
+       else if (__data > __max)                                        \
+               __data = __max;                                         \
+       *(__PTR) = (u64)__data * NSEC_PER_USEC;                         \
+       return ret;                                                     \
+}
+USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
+                   UINT_MAX);
+#undef USEC_STORE_FUNCTION
+
+/*
+ * Set the user-configured maximum budget. Writing 0 selects
+ * auto-tuning: bfq_max_budget is then recomputed with
+ * bfq_calc_max_budget(). Non-zero values are capped at INT_MAX.
+ * Unparsable input is rejected without touching the current settings.
+ */
+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
+                                   const char *page, size_t count)
+{
+       struct bfq_data *bfqd = e->elevator_data;
+       unsigned long __data;
+       ssize_t ret = bfq_var_store(&__data, (page), count);
+
+       if (ret < 0)
+               return ret;
+
+       if (__data == 0)
+               bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
+       else {
+               if (__data > INT_MAX)
+                       __data = INT_MAX;
+               bfqd->bfq_max_budget = __data;
+       }
+
+       bfqd->bfq_user_max_budget = __data;
+
+       return ret;
+}
+
+/*
+ * Leaving this name to preserve name compatibility with cfq
+ * parameters, but this timeout is used for both sync and async.
+ *
+ * The value is given in milliseconds and clamped to [1, INT_MAX]. If
+ * the max budget is auto-tuned (bfq_user_max_budget == 0), it is
+ * recomputed after the timeout changes. Unparsable input is rejected
+ * without touching the current settings.
+ */
+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
+                                     const char *page, size_t count)
+{
+       struct bfq_data *bfqd = e->elevator_data;
+       unsigned long __data;
+       ssize_t ret = bfq_var_store(&__data, (page), count);
+
+       if (ret < 0)
+               return ret;
+
+       if (__data < 1)
+               __data = 1;
+       else if (__data > INT_MAX)
+               __data = INT_MAX;
+
+       bfqd->bfq_timeout = msecs_to_jiffies(__data);
+       if (bfqd->bfq_user_max_budget == 0)
+               bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
+
+       return ret;
+}
+
+/*
+ * Enable (non-zero) or disable (0) strict guarantees. On a 0 -> 1
+ * transition, bfq_slice_idle is raised to at least 8 ms. Unparsable
+ * input is rejected without touching the current settings.
+ */
+static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
+                                    const char *page, size_t count)
+{
+       struct bfq_data *bfqd = e->elevator_data;
+       unsigned long __data;
+       ssize_t ret = bfq_var_store(&__data, (page), count);
+
+       if (ret < 0)
+               return ret;
+
+       if (__data > 1)
+               __data = 1;
+       if (!bfqd->strict_guarantees && __data == 1
+           && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
+               bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;
+
+       bfqd->strict_guarantees = __data;
+
+       return ret;
+}
+
+/*
+ * Enable (non-zero) or disable (0) low-latency mode. bfq_end_wr() is
+ * called when the mode is switched off (presumably to stop any
+ * weight raising in progress - see bfq_end_wr()). Unparsable input is
+ * rejected without touching the current settings.
+ */
+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
+                                    const char *page, size_t count)
+{
+       struct bfq_data *bfqd = e->elevator_data;
+       unsigned long __data;
+       ssize_t ret = bfq_var_store(&__data, (page), count);
+
+       if (ret < 0)
+               return ret;
+
+       if (__data > 1)
+               __data = 1;
+       if (__data == 0 && bfqd->low_latency != 0)
+               bfq_end_wr(bfqd);
+       bfqd->low_latency = __data;
+
+       return ret;
+}
+
+/*
+ * Build a mode-0644 attribute entry called @name, wired to the
+ * bfq_<name>_show and bfq_<name>_store handlers.
+ */
+#define BFQ_ATTR(name) \
+       __ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)
+
+/* Tunables referenced by iosched_bfq_mq.elevator_attrs; NULL-terminated. */
+static struct elv_fs_entry bfq_attrs[] = {
+       BFQ_ATTR(fifo_expire_sync),
+       BFQ_ATTR(fifo_expire_async),
+       BFQ_ATTR(back_seek_max),
+       BFQ_ATTR(back_seek_penalty),
+       BFQ_ATTR(slice_idle),
+       BFQ_ATTR(slice_idle_us),
+       BFQ_ATTR(max_budget),
+       BFQ_ATTR(timeout_sync),
+       BFQ_ATTR(strict_guarantees),
+       BFQ_ATTR(low_latency),
+       __ATTR_NULL
+};
+
+/*
+ * Elevator type descriptor for BFQ: the blk-mq scheduler hooks plus
+ * the per-icq size/alignment, sysfs attributes and name. Registered in
+ * bfq_init() and unregistered in bfq_exit().
+ */
+static struct elevator_type iosched_bfq_mq = {
+       .ops.mq = {
+               .get_rq_priv            = bfq_get_rq_private,
+               .put_rq_priv            = bfq_put_rq_private,
+               .exit_icq               = bfq_exit_icq,
+               .insert_requests        = bfq_insert_requests,
+               .dispatch_request       = bfq_dispatch_request,
+               .next_request           = elv_rb_latter_request,
+               .former_request         = elv_rb_former_request,
+               .allow_merge            = bfq_allow_bio_merge,
+               .bio_merge              = bfq_bio_merge,
+               .request_merge          = bfq_request_merge,
+               .requests_merged        = bfq_requests_merged,
+               .request_merged         = bfq_request_merged,
+               .has_work               = bfq_has_work,
+               .init_sched             = bfq_init_queue,
+               .exit_sched             = bfq_exit_queue,
+       },
+
+       .uses_mq =              true,
+       .icq_size =             sizeof(struct bfq_io_cq),
+       .icq_align =            __alignof__(struct bfq_io_cq),
+       .elevator_attrs =       bfq_attrs,
+       .elevator_name =        "bfq",
+       .elevator_owner =       THIS_MODULE,
+};
+
+/*
+ * Module initialization: register the blkcg policy (only with
+ * CONFIG_BFQ_GROUP_IOSCHED), create the bfq_queue slab cache, set up
+ * the reference-time and speed-threshold tables, and finally register
+ * the elevator.
+ *
+ * Returns 0 on success or a negative error code. On failure every step
+ * already completed is undone, including the slab cache when
+ * elv_register() fails.
+ */
+static int __init bfq_init(void)
+{
+       int ret;
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+       ret = blkcg_policy_register(&blkcg_policy_bfq);
+       if (ret)
+               return ret;
+#endif
+
+       ret = -ENOMEM;
+       if (bfq_slab_setup())
+               goto err_pol_unreg;
+
+       /*
+        * Times to load large popular applications for the typical
+        * systems installed on the reference devices (see the
+        * comments before the definitions of the next two
+        * arrays). Actually, we use slightly slower values, as the
+        * estimated peak rate tends to be smaller than the actual
+        * peak rate.  The reason for this last fact is that estimates
+        * are computed over much shorter time intervals than the long
+        * intervals typically used for benchmarking. Why? First, to
+        * adapt more quickly to variations. Second, because an I/O
+        * scheduler cannot rely on a peak-rate-evaluation workload to
+        * be run for a long time.
+        */
+       T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
+       T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
+       T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
+       T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */
+
+       /*
+        * Thresholds that determine the switch between speed classes
+        * (see the comments before the definition of the array
+        * device_speed_thresh). These thresholds are biased towards
+        * transitions to the fast class. This is safer than the
+        * opposite bias. In fact, a wrong transition to the slow
+        * class results in short weight-raising periods, because the
+        * speed of the device then tends to be higher than the
+        * reference peak rate. On the opposite end, a wrong
+        * transition to the fast class tends to increase
+        * weight-raising periods, because of the opposite reason.
+        */
+       device_speed_thresh[0] = (4 * R_slow[0]) / 3;
+       device_speed_thresh[1] = (4 * R_slow[1]) / 3;
+
+       ret = elv_register(&iosched_bfq_mq);
+       if (ret)
+               goto slab_kill;
+
+       return 0;
+
+slab_kill:
+       /* Do not leak the cache created by bfq_slab_setup(). */
+       bfq_slab_kill();
+err_pol_unreg:
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+       blkcg_policy_unregister(&blkcg_policy_bfq);
+#endif
+       return ret;
+}
+
+/*
+ * Module exit: unregister the elevator, then the blkcg policy (only
+ * with CONFIG_BFQ_GROUP_IOSCHED), and finally destroy the slab cache.
+ */
+static void __exit bfq_exit(void)
+{
+       elv_unregister(&iosched_bfq_mq);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+       blkcg_policy_unregister(&blkcg_policy_bfq);
+#endif
+       bfq_slab_kill();
+}
+
+module_init(bfq_init);
+module_exit(bfq_exit);
+
+MODULE_AUTHOR("Paolo Valente");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
new file mode 100644 (file)
index 0000000..ae783c0
--- /dev/null
@@ -0,0 +1,941 @@
+/*
+ * Header file for the BFQ I/O scheduler: data structures and
+ * prototypes of interface functions among BFQ components.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation; either version 2 of the
+ *  License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ */
+#ifndef _BFQ_H
+#define _BFQ_H
+
+#include <linux/blktrace_api.h>
+#include <linux/hrtimer.h>
+#include <linux/blk-cgroup.h>
+
+#define BFQ_IOPRIO_CLASSES     3
+#define BFQ_CL_IDLE_TIMEOUT    (HZ/5)
+
+#define BFQ_MIN_WEIGHT                 1
+#define BFQ_MAX_WEIGHT                 1000
+#define BFQ_WEIGHT_CONVERSION_COEFF    10
+
+#define BFQ_DEFAULT_QUEUE_IOPRIO       4
+
+#define BFQ_WEIGHT_LEGACY_DFL  100
+#define BFQ_DEFAULT_GRP_IOPRIO 0
+#define BFQ_DEFAULT_GRP_CLASS  IOPRIO_CLASS_BE
+
+/*
+ * Soft real-time applications are far more latency sensitive
+ * than interactive ones. Over-raise the weight of the former to
+ * favor them over the latter.
+ */
+#define BFQ_SOFTRT_WEIGHT_FACTOR       100
+
+struct bfq_entity;
+
+/**
+ * struct bfq_service_tree - per ioprio_class service tree.
+ *
+ * Each service tree represents a B-WF2Q+ scheduler on its own.  Each
+ * ioprio_class has its own independent scheduler, and so its own
+ * bfq_service_tree.  All the fields are protected by the queue lock
+ * of the containing bfqd.
+ */
+struct bfq_service_tree {
+       /* tree for active entities (i.e., those backlogged) */
+       struct rb_root active;
+       /* tree for idle entities (i.e., not backlogged, with V <= F_i) */
+       struct rb_root idle;
+
+       /* idle entity with minimum F_i */
+       struct bfq_entity *first_idle;
+       /* idle entity with maximum F_i */
+       struct bfq_entity *last_idle;
+
+       /* scheduler virtual time */
+       u64 vtime;
+       /* scheduler weight sum; active and idle entities contribute to it */
+       unsigned long wsum;
+};
+
+/**
+ * struct bfq_sched_data - multi-class scheduler.
+ *
+ * bfq_sched_data is the basic scheduler queue.  It supports three
+ * ioprio_classes, and can be used either as a toplevel queue or as an
+ * intermediate queue in a hierarchical setup.  @next_in_service
+ * points to the active entity of the sched_data service trees that
+ * will be scheduled next. It is used to reduce the number of steps
+ * needed for each hierarchical-schedule update.
+ *
+ * The supported ioprio_classes are the same as in CFQ, in descending
+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
+ * Requests from higher priority queues are served before all the
+ * requests from lower priority queues; among requests of the same
+ * queue, requests are served according to B-WF2Q+.
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct bfq_sched_data {
+       /* entity in service */
+       struct bfq_entity *in_service_entity;
+       /* head-of-line entity (see comments above) */
+       struct bfq_entity *next_in_service;
+       /* array of service trees, one per ioprio_class */
+       struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
+       /* last time CLASS_IDLE was served */
+       unsigned long bfq_class_idle_last_service;
+
+};
+
+/**
+ * struct bfq_weight_counter - counter of the number of all active entities
+ *                             with a given weight.
+ */
+struct bfq_weight_counter {
+       /* weight of the entities this counter refers to */
+       unsigned int weight;
+       /* number of active entities with this weight */
+       unsigned int num_active;
+       /*
+        * Weights tree member (see bfq_data's @queue_weights_tree and
+        * @group_weights_tree)
+        */
+       struct rb_node weights_node;
+};
+
+/**
+ * struct bfq_entity - schedulable entity.
+ *
+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
+ * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each
+ * entity belongs to the sched_data of the parent group in the cgroup
+ * hierarchy.  Non-leaf entities also have their own sched_data, stored
+ * in @my_sched_data.
+ *
+ * Each entity stores independently its priority values; this would
+ * allow different weights on different devices, but this
+ * functionality is not exported to userspace for now.  Priorities and
+ * weights are updated lazily, first storing the new values into the
+ * new_* fields, then setting the @prio_changed flag.  As soon as
+ * there is a transition in the entity state that allows the priority
+ * update to take place, the effective and the requested priority
+ * values are synchronized.
+ *
+ * Unless cgroups are used, the weight value is calculated from the
+ * ioprio to export the same interface as CFQ.  When dealing with
+ * ``well-behaved'' queues (i.e., queues that do not spend too much
+ * time to consume their budget and have true sequential behavior, and
+ * when there are no external factors breaking anticipation) the
+ * relative weights at each level of the cgroups hierarchy should be
+ * guaranteed.  All the fields are protected by the queue lock of the
+ * containing bfqd.
+ */
+struct bfq_entity {
+       /* service_tree member */
+       struct rb_node rb_node;
+       /* pointer to the weight counter associated with this entity */
+       struct bfq_weight_counter *weight_counter;
+
+       /*
+        * Flag, true if the entity is on a tree (either the active or
+        * the idle one of its service_tree) or is in service.
+        */
+       bool on_st;
+
+       /* B-WF2Q+ start and finish timestamps [sectors/weight] */
+       u64 start, finish;
+
+       /* tree the entity is enqueued into; %NULL if not on a tree */
+       struct rb_root *tree;
+
+       /*
+        * minimum start time of the (active) subtree rooted at this
+        * entity; used for O(log N) lookups into active trees
+        */
+       u64 min_start;
+
+       /* amount of service received during the last service slot */
+       int service;
+
+       /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
+       int budget;
+
+       /* weight of the queue */
+       int weight;
+       /* next weight if a change is in progress */
+       int new_weight;
+
+       /* original weight, used to implement weight boosting */
+       int orig_weight;
+
+       /* parent entity, for hierarchical scheduling */
+       struct bfq_entity *parent;
+
+       /*
+        * For non-leaf nodes in the hierarchy, the associated
+        * scheduler queue, %NULL on leaf nodes.
+        */
+       struct bfq_sched_data *my_sched_data;
+       /* the scheduler queue this entity belongs to */
+       struct bfq_sched_data *sched_data;
+
+       /* flag, set to request a weight, ioprio or ioprio_class change */
+       int prio_changed;
+};
+
+struct bfq_group;
+
+/**
+ * struct bfq_ttime - per-process think-time statistics.
+ */
+struct bfq_ttime {
+       /* completion time of the last request */
+       u64 last_end_request;
+
+       /* total process thinktime */
+       u64 ttime_total;
+       /* number of thinktime samples */
+       unsigned long ttime_samples;
+       /* average process thinktime */
+       u64 ttime_mean;
+};
+
+/**
+ * struct bfq_queue - leaf schedulable entity.
+ *
+ * A bfq_queue is a leaf request queue; it can be associated with an
+ * io_context or more, if it is async or shared between cooperating
+ * processes. @cgroup holds a reference to the cgroup, to be sure that it
+ * does not disappear while a bfqq still references it (mostly to avoid
+ * races between request issuing and task migration followed by cgroup
+ * destruction).
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct bfq_queue {
+       /* reference counter */
+       int ref;
+       /* parent bfq_data */
+       struct bfq_data *bfqd;
+
+       /* current ioprio and ioprio class */
+       unsigned short ioprio, ioprio_class;
+       /* next ioprio and ioprio class if a change is in progress */
+       unsigned short new_ioprio, new_ioprio_class;
+
+       /*
+        * Shared bfq_queue if queue is cooperating with one or more
+        * other queues.
+        */
+       struct bfq_queue *new_bfqq;
+       /* request-position tree member (see bfq_group's @rq_pos_tree) */
+       struct rb_node pos_node;
+       /* request-position tree root (see bfq_group's @rq_pos_tree) */
+       struct rb_root *pos_root;
+
+       /* sorted list of pending requests */
+       struct rb_root sort_list;
+       /* if fifo isn't expired, next request to serve */
+       struct request *next_rq;
+       /* number of sync and async requests queued */
+       int queued[2];
+       /* number of requests currently allocated */
+       int allocated;
+       /* number of pending metadata requests */
+       int meta_pending;
+       /* fifo list of requests in sort_list */
+       struct list_head fifo;
+
+       /* entity representing this queue in the scheduler */
+       struct bfq_entity entity;
+
+       /* maximum budget allowed from the feedback mechanism */
+       int max_budget;
+       /* budget expiration (in jiffies) */
+       unsigned long budget_timeout;
+
+       /* number of requests on the dispatch list or inside driver */
+       int dispatched;
+
+       /* status flags */
+       unsigned long flags;
+
+       /* node for active/idle bfqq list inside parent bfqd */
+       struct list_head bfqq_list;
+
+       /* associated @bfq_ttime struct */
+       struct bfq_ttime ttime;
+
+       /* bit vector: a 1 for each seeky request in history */
+       u32 seek_history;
+
+       /* node for the device's burst list */
+       struct hlist_node burst_list_node;
+
+       /* position of the last request enqueued */
+       sector_t last_request_pos;
+
+       /* Number of consecutive pairs of request completion and
+        * arrival, such that the queue becomes idle after the
+        * completion, but the next request arrives within an idle
+        * time slice; used only if the queue's IO_bound flag has been
+        * cleared.
+        */
+       unsigned int requests_within_timer;
+
+       /* pid of the process owning the queue, used for logging purposes */
+       pid_t pid;
+
+       /*
+        * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
+        * if the queue is shared.
+        */
+       struct bfq_io_cq *bic;
+
+       /* current maximum weight-raising time for this queue */
+       unsigned long wr_cur_max_time;
+       /*
+        * Minimum time instant such that, only if a new request is
+        * enqueued after this time instant in an idle @bfq_queue with
+        * no outstanding requests, then the task associated with the
+        * queue is deemed soft real-time (see the comments on
+        * the function bfq_bfqq_softrt_next_start())
+        */
+       unsigned long soft_rt_next_start;
+       /*
+        * Start time of the current weight-raising period if
+        * the @bfq_queue is being weight-raised, otherwise
+        * finish time of the last weight-raising period.
+        */
+       unsigned long last_wr_start_finish;
+       /* factor by which the weight of this queue is multiplied */
+       unsigned int wr_coeff;
+       /*
+        * Time of the last transition of the @bfq_queue from idle to
+        * backlogged.
+        */
+       unsigned long last_idle_bklogged;
+       /*
+        * Cumulative service received from the @bfq_queue since the
+        * last transition from idle to backlogged.
+        */
+       unsigned long service_from_backlogged;
+
+       /*
+        * Value of wr start time when switching to soft rt
+        */
+       unsigned long wr_start_at_switch_to_srt;
+
+       unsigned long split_time; /* time of last split */
+};
+
+/**
+ * struct bfq_io_cq - per (request_queue, io_context) structure.
+ */
+struct bfq_io_cq {
+       /* associated io_cq structure */
+       struct io_cq icq; /* must be the first member */
+       /* array of two process queues, the sync and the async */
+       struct bfq_queue *bfqq[2];
+       /* per (request_queue, blkcg) ioprio */
+       int ioprio;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+       uint64_t blkcg_serial_nr; /* the current blkcg serial */
+#endif
+       /*
+        * Snapshot of the idle window before merging; taken to
+        * remember this value while the queue is merged, so as to be
+        * able to restore it in case of split.
+        */
+       bool saved_idle_window;
+       /*
+        * Same purpose as the previous field for the I/O bound
+        * classification of a queue.
+        */
+       bool saved_IO_bound;
+
+       /*
+        * Same purpose as the previous fields for the flag recording
+        * whether the queue belongs to a large burst.
+        */
+       bool saved_in_large_burst;
+       /*
+        * True if the queue belonged to a burst list before its merge
+        * with another cooperating queue.
+        */
+       bool was_in_burst_list;
+
+       /*
+        * Similar to previous fields: save wr information.
+        */
+       unsigned long saved_wr_coeff;
+       unsigned long saved_last_wr_start_finish;
+       unsigned long saved_wr_start_at_switch_to_srt;
+       unsigned int saved_wr_cur_max_time;
+       struct bfq_ttime saved_ttime;
+};
+
+/*
+ * Speed class of a device, stored in bfq_data's device_speed field;
+ * bfq_init_queue() starts from BFQ_BFQD_FAST.
+ */
+enum bfq_device_speed {
+       BFQ_BFQD_FAST,
+       BFQ_BFQD_SLOW,
+};
+
+/**
+ * struct bfq_data - per-device data structure.
+ *
+ * All the fields are protected by @lock.
+ */
+struct bfq_data {
+       /* device request queue */
+       struct request_queue *queue;
+       /* dispatch queue */
+       struct list_head dispatch;
+
+       /* root bfq_group for the device */
+       struct bfq_group *root_group;
+
+       /*
+        * rbtree of weight counters of @bfq_queues, sorted by
+        * weight. Used to keep track of whether all @bfq_queues have
+        * the same weight. The tree contains one counter for each
+        * distinct weight associated to some active and not
+        * weight-raised @bfq_queue (see the comments to the functions
+        * bfq_weights_tree_[add|remove] for further details).
+        */
+       struct rb_root queue_weights_tree;
+       /*
+        * rbtree of non-queue @bfq_entity weight counters, sorted by
+        * weight. Used to keep track of whether all @bfq_groups have
+        * the same weight. The tree contains one counter for each
+        * distinct weight associated to some active @bfq_group (see
+        * the comments to the functions bfq_weights_tree_[add|remove]
+        * for further details).
+        */
+       struct rb_root group_weights_tree;
+
+       /*
+        * Number of bfq_queues containing requests (including the
+        * queue in service, even if it is idling).
+        */
+       int busy_queues;
+       /* number of weight-raised busy @bfq_queues */
+       int wr_busy_queues;
+       /* number of queued requests */
+       int queued;
+       /* number of requests dispatched and waiting for completion */
+       int rq_in_driver;
+
+       /*
+        * Maximum number of requests in driver in the last
+        * @hw_tag_samples completed requests.
+        */
+       int max_rq_in_driver;
+       /* number of samples used to calculate hw_tag */
+       int hw_tag_samples;
+       /* flag set to one if the driver is showing a queueing behavior */
+       int hw_tag;
+
+       /* number of budgets assigned */
+       int budgets_assigned;
+
+       /*
+        * Timer set when idling (waiting) for the next request from
+        * the queue in service.
+        */
+       struct hrtimer idle_slice_timer;
+
+       /* bfq_queue in service */
+       struct bfq_queue *in_service_queue;
+
+       /* on-disk position of the last served request */
+       sector_t last_position;
+
+       /* time of last request completion (ns) */
+       u64 last_completion;
+
+       /* time of first rq dispatch in current observation interval (ns) */
+       u64 first_dispatch;
+       /* time of last rq dispatch in current observation interval (ns) */
+       u64 last_dispatch;
+
+       /* beginning of the last budget */
+       ktime_t last_budget_start;
+       /* beginning of the last idle slice */
+       ktime_t last_idling_start;
+
+       /* number of samples in current observation interval */
+       int peak_rate_samples;
+       /* num of samples of seq dispatches in current observation interval */
+       u32 sequential_samples;
+       /* total num of sectors transferred in current observation interval */
+       u64 tot_sectors_dispatched;
+       /* max rq size seen during current observation interval (sectors) */
+       u32 last_rq_max_size;
+       /* time elapsed from first dispatch in current observ. interval (us) */
+       u64 delta_from_first;
+       /*
+        * Current estimate of the device peak rate, measured in
+        * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
+        * BFQ_RATE_SHIFT is performed to increase precision in
+        * fixed-point calculations.
+        */
+       u32 peak_rate;
+
+       /* maximum budget allotted to a bfq_queue before rescheduling */
+       int bfq_max_budget;
+
+       /* list of all the bfq_queues active on the device */
+       struct list_head active_list;
+       /* list of all the bfq_queues idle on the device */
+       struct list_head idle_list;
+
+       /*
+        * Timeout for async/sync requests; when it fires, requests
+        * are served in fifo order.
+        */
+       u64 bfq_fifo_expire[2];
+       /* weight of backward seeks wrt forward ones */
+       unsigned int bfq_back_penalty;
+       /* maximum allowed backward seek */
+       unsigned int bfq_back_max;
+       /* maximum idling time */
+       u32 bfq_slice_idle;
+
+       /* user-configured max budget value (0 for auto-tuning) */
+       int bfq_user_max_budget;
+       /*
+        * Timeout for bfq_queues to consume their budget; used to
+        * prevent seeky queues from imposing long latencies to
+        * sequential or quasi-sequential ones (this also implies that
+        * seeky queues cannot receive guarantees in the service
+        * domain; after a timeout they are charged for the time they
+        * have been in service, to preserve fairness among them, but
+        * without service-domain guarantees).
+        */
+       unsigned int bfq_timeout;
+
+       /*
+        * Number of consecutive requests that must be issued within
+        * the idle time slice to set again idling to a queue which
+        * was marked as non-I/O-bound (see the definition of the
+        * IO_bound flag for further details).
+        */
+       unsigned int bfq_requests_within_timer;
+
+       /*
+        * Force device idling whenever needed to provide accurate
+        * service guarantees, without caring about throughput
+        * issues. CAVEAT: this may even increase latencies, in case
+        * of useless idling for processes that did stop doing I/O.
+        */
+       bool strict_guarantees;
+
+       /*
+        * Last time at which a queue entered the current burst of
+        * queues being activated shortly after each other; for more
+        * details about this and the following parameters related to
+        * a burst of activations, see the comments on the function
+        * bfq_handle_burst.
+        */
+       unsigned long last_ins_in_burst;
+       /*
+        * Reference time interval used to decide whether a queue has
+        * been activated shortly after @last_ins_in_burst.
+        */
+       unsigned long bfq_burst_interval;
+       /* number of queues in the current burst of queue activations */
+       int burst_size;
+
+       /* common parent entity for the queues in the burst */
+       struct bfq_entity *burst_parent_entity;
+       /* Maximum burst size above which the current queue-activation
+        * burst is deemed as 'large'.
+        */
+       unsigned long bfq_large_burst_thresh;
+       /* true if a large queue-activation burst is in progress */
+       bool large_burst;
+       /*
+        * Head of the burst list (as for the above fields, more
+        * details in the comments on the function bfq_handle_burst).
+        */
+       struct hlist_head burst_list;
+
+       /* if set to true, low-latency heuristics are enabled */
+       bool low_latency;
+       /*
+        * Maximum factor by which the weight of a weight-raised queue
+        * is multiplied.
+        */
+       unsigned int bfq_wr_coeff;
+       /* maximum duration of a weight-raising period (jiffies) */
+       unsigned int bfq_wr_max_time;
+
+       /* Maximum weight-raising duration for soft real-time processes */
+       unsigned int bfq_wr_rt_max_time;
+       /*
+        * Minimum idle period after which weight-raising may be
+        * reactivated for a queue (in jiffies).
+        */
+       unsigned int bfq_wr_min_idle_time;
+       /*
+        * Minimum period between request arrivals after which
+        * weight-raising may be reactivated for an already busy async
+        * queue (in jiffies).
+        */
+       unsigned long bfq_wr_min_inter_arr_async;
+
+       /* Max service-rate for a soft real-time queue, in sectors/sec */
+       unsigned int bfq_wr_max_softrt_rate;
+       /*
+        * Cached value of the product R*T, used for computing the
+        * maximum duration of weight raising automatically.
+        */
+       u64 RT_prod;
+       /* device-speed class for the low-latency heuristic */
+       enum bfq_device_speed device_speed;
+
+       /* fallback dummy bfqq for extreme OOM conditions */
+       struct bfq_queue oom_bfqq;
+
+       spinlock_t lock;
+
+       /*
+        * bic associated with the task issuing current bio for
+        * merging. This and the next field are used as a support to
+        * be able to perform the bic lookup, needed by bio-merge
+        * functions, before the scheduler lock is taken, and thus
+        * avoid taking the request-queue lock while the scheduler
+        * lock is being held.
+        */
+       struct bfq_io_cq *bio_bic;
+       /* bfqq associated with the task issuing current bio for merging */
+       struct bfq_queue *bio_bfqq;
+};
+
+/*
+ * State flags of a bfq_queue. For each flag BFQQF_<name> listed here,
+ * the BFQ_BFQQ_FNS() invocations that follow declare the accessors
+ * bfq_mark_bfqq_<name>(), bfq_clear_bfqq_<name>() and bfq_bfqq_<name>().
+ * NOTE(review): presumably each value is a bit position in a per-queue
+ * flags word - confirm against the accessor definitions.
+ */
+enum bfqq_state_flags {
+       BFQQF_just_created = 0, /* queue just allocated */
+       BFQQF_busy,             /* has requests or is in service */
+       BFQQF_wait_request,     /* waiting for a request */
+       BFQQF_non_blocking_wait_rq, /*
+                                    * waiting for a request
+                                    * without idling the device
+                                    */
+       BFQQF_fifo_expire,      /* FIFO checked in this slice */
+       BFQQF_idle_window,      /* slice idling enabled */
+       BFQQF_sync,             /* synchronous queue */
+       BFQQF_IO_bound,         /*
+                                * bfqq has timed-out at least once
+                                * having consumed at most 2/10 of
+                                * its budget
+                                */
+       BFQQF_in_large_burst,   /*
+                                * bfqq activated in a large burst,
+                                * see comments to bfq_handle_burst.
+                                */
+       BFQQF_softrt_update,    /*
+                                * may need softrt-next-start
+                                * update
+                                */
+       BFQQF_coop,             /* bfqq is shared */
+       BFQQF_split_coop        /* shared bfqq will be split */
+};
+
+#define BFQ_BFQQ_FNS(name)                                             \
+void bfq_mark_bfqq_##name(struct bfq_queue *bfqq);                     \
+void bfq_clear_bfqq_##name(struct bfq_queue *bfqq);                    \
+int bfq_bfqq_##name(const struct bfq_queue *bfqq);
+
+BFQ_BFQQ_FNS(just_created);
+BFQ_BFQQ_FNS(busy);
+BFQ_BFQQ_FNS(wait_request);
+BFQ_BFQQ_FNS(non_blocking_wait_rq);
+BFQ_BFQQ_FNS(fifo_expire);
+BFQ_BFQQ_FNS(idle_window);
+BFQ_BFQQ_FNS(sync);
+BFQ_BFQQ_FNS(IO_bound);
+BFQ_BFQQ_FNS(in_large_burst);
+BFQ_BFQQ_FNS(coop);
+BFQ_BFQQ_FNS(split_coop);
+BFQ_BFQQ_FNS(softrt_update);
+#undef BFQ_BFQQ_FNS
+
+/*
+ * Expiration reasons: the possible values of the @reason argument of
+ * bfq_bfqq_expire().
+ */
+enum bfqq_expiration {
+       BFQQE_TOO_IDLE = 0,             /*
+                                        * queue has been idling for
+                                        * too long
+                                        */
+       BFQQE_BUDGET_TIMEOUT,   /* budget took too long to be used */
+       BFQQE_BUDGET_EXHAUSTED, /* budget consumed */
+       BFQQE_NO_MORE_REQUESTS, /* the queue has no more requests */
+       BFQQE_PREEMPTED         /* preemption in progress */
+};
+
+/*
+ * Per-group I/O statistics. Every member is compiled in only when
+ * CONFIG_BFQ_GROUP_IOSCHED is set; otherwise the structure is empty.
+ */
+struct bfqg_stats {
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+       /* number of ios merged */
+       struct blkg_rwstat              merged;
+       /* total time spent on device in ns, may not be accurate w/ queueing */
+       struct blkg_rwstat              service_time;
+       /* total time spent waiting in scheduler queue in ns */
+       struct blkg_rwstat              wait_time;
+       /* number of IOs queued up */
+       struct blkg_rwstat              queued;
+       /* total disk time and nr sectors dispatched by this group */
+       struct blkg_stat                time;
+       /* sum of number of ios queued across all samples */
+       struct blkg_stat                avg_queue_size_sum;
+       /* count of samples taken for average */
+       struct blkg_stat                avg_queue_size_samples;
+       /* how many times this group has been removed from service tree */
+       struct blkg_stat                dequeue;
+       /* total time spent waiting for it to be assigned a timeslice. */
+       struct blkg_stat                group_wait_time;
+       /* time spent idling for this blkcg_gq */
+       struct blkg_stat                idle_time;
+       /* total time with empty current active q with other requests queued */
+       struct blkg_stat                empty_time;
+       /* fields after this shouldn't be cleared on stat reset */
+       uint64_t                        start_group_wait_time;
+       uint64_t                        start_idle_time;
+       uint64_t                        start_empty_time;
+       uint16_t                        flags;
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+};
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+/*
+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
+ *
+ * @pd: @blkcg_policy_data that this structure inherits
+ * @weight: weight of the bfq_group
+ */
+struct bfq_group_data {
+       /* must be the first member */
+       struct blkcg_policy_data pd;
+
+       unsigned int weight;
+};
+
+/**
+ * struct bfq_group - per (device, cgroup) data structure.
+ * @pd: blkg_policy_data embedded in the group; must be the first member.
+ * @entity: schedulable entity to insert into the parent group sched_data.
+ * @sched_data: own sched_data, to contain child entities (they may be
+ *              both bfq_queues and bfq_groups).
+ * @bfqd: the bfq_data for the device this group acts upon.
+ * @async_bfqq: array of async queues for all the tasks belonging to
+ *              the group, one queue per ioprio value per ioprio_class,
+ *              except for the idle class that has only one queue.
+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
+ *             to avoid too many special cases during group creation/
+ *             migration.
+ * @stats: stats for this bfqg.
+ * @active_entities: number of active entities belonging to the group;
+ *                   unused for the root group. Used to know whether there
+ *                   are groups with more than one active @bfq_entity
+ *                   (see the comments to the function
+ *                   bfq_bfqq_may_idle()).
+ * @rq_pos_tree: rbtree sorted by next_request position, used when
+ *               determining if two or more queues have interleaving
+ *               requests (see bfq_find_close_cooperator()).
+ *
+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
+ * there is a set of bfq_groups, each one collecting the lower-level
+ * entities belonging to the group that are acting on the same device.
+ *
+ * Locking works as follows:
+ *    o @bfqd is protected by the queue lock, RCU is used to access it
+ *      from the readers.
+ *    o All the other fields are protected by the @bfqd queue lock.
+ */
+struct bfq_group {
+       /* must be the first member */
+       struct blkg_policy_data pd;
+
+       struct bfq_entity entity;
+       struct bfq_sched_data sched_data;
+
+       void *bfqd;
+
+       struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+       struct bfq_queue *async_idle_bfqq;
+
+       struct bfq_entity *my_entity;
+
+       int active_entities;
+
+       struct rb_root rq_pos_tree;
+
+       struct bfqg_stats stats;
+};
+
+#else
+/*
+ * Reduced bfq_group used when CONFIG_BFQ_GROUP_IOSCHED is not set:
+ * only the scheduling data, the async queues and the position tree
+ * are kept.
+ */
+struct bfq_group {
+       struct bfq_sched_data sched_data;
+
+       struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+       struct bfq_queue *async_idle_bfqq;
+
+       struct rb_root rq_pos_tree;
+};
+#endif
+
+/* Declared again in the B-WF2Q+ interface section of this header. */
+struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
+
+/* --------------- main algorithm interface ----------------- */
+
+/*
+ * Positional compound-literal initializer for a struct
+ * bfq_service_tree: two RB_ROOT trees, two NULL pointers, two zeroes.
+ */
+#define BFQ_SERVICE_TREE_INIT  ((struct bfq_service_tree)              \
+                               { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
+
+extern const int bfq_timeout;
+
+struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync);
+void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
+struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
+/* bfq_requeue_bfqq() is declared again in the B-WF2Q+ interface section. */
+void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
+                         struct rb_root *root);
+void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
+                            struct rb_root *root);
+void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+                    bool compensate, enum bfqq_expiration reason);
+void bfq_put_queue(struct bfq_queue *bfqq);
+void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
+void bfq_schedule_dispatch(struct bfq_data *bfqd);
+void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
+
+/* ------------ end of main algorithm interface -------------- */
+
+/* ---------------- cgroups-support interface ---------------- */
+
+/*
+ * The functions of this section are declared unconditionally; only
+ * the cftype arrays and the blkcg policy object at the end depend on
+ * CONFIG_BFQ_GROUP_IOSCHED.
+ */
+void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
+                             unsigned int op);
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op);
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op);
+void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
+                                 uint64_t io_start_time, unsigned int op);
+void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
+void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
+void bfqg_stats_update_idle_time(struct bfq_group *bfqg);
+void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg);
+void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg);
+void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+                  struct bfq_group *bfqg);
+
+void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg);
+void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio);
+void bfq_end_wr_async(struct bfq_data *bfqd);
+struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
+                                    struct blkcg *blkcg);
+struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
+/* bfqq_group() is declared again next to the logging macros. */
+struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node);
+void bfqg_put(struct bfq_group *bfqg);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+extern struct cftype bfq_blkcg_legacy_files[];
+extern struct cftype bfq_blkg_files[];
+extern struct blkcg_policy blkcg_policy_bfq;
+#endif
+
+/* ------------- end of cgroups-support interface ------------- */
+
+/* - interface of the internal hierarchical B-WF2Q+ scheduler - */
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+/*
+ * Both next loops stop at one of the child entities of the root group.
+ * They use @entity itself as the loop cursor: the caller's variable is
+ * overwritten while walking up through ->parent, and is NULL when the
+ * loop runs to completion.
+ */
+#define for_each_entity(entity)        \
+       for (; entity ; entity = entity->parent)
+
+/*
+ * For each iteration, compute parent in advance, so as to be safe if
+ * entity is deallocated during the iteration. Such a deallocation may
+ * happen as a consequence of a bfq_put_queue that frees the bfq_queue
+ * containing entity.
+ */
+#define for_each_entity_safe(entity, parent) \
+       for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+/*
+ * Next two macros are fake loops when cgroups support is not
+ * enabled. In fact, in such a case, there is only one level to go up
+ * (to reach the root group). The body is executed at most once, after
+ * which entity is set to NULL.
+ */
+#define for_each_entity(entity)        \
+       for (; entity ; entity = NULL)
+
+#define for_each_entity_safe(entity, parent) \
+       for (parent = NULL; entity ; entity = parent)
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+
+/*
+ * Functions exported by the B-WF2Q+ scheduler. bfq_entity_to_bfqq()
+ * and bfq_requeue_bfqq() are also declared earlier in this header,
+ * with identical prototypes.
+ */
+struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
+struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
+struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
+struct bfq_entity *bfq_entity_of(struct rb_node *node);
+unsigned short bfq_ioprio_to_weight(int ioprio);
+void bfq_put_idle_entity(struct bfq_service_tree *st,
+                        struct bfq_entity *entity);
+struct bfq_service_tree *
+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
+                               struct bfq_entity *entity);
+void bfq_bfqq_served(struct bfq_queue *bfqq, int served);
+void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+                         unsigned long time_ms);
+bool __bfq_deactivate_entity(struct bfq_entity *entity,
+                            bool ins_into_idle_tree);
+bool next_queue_may_preempt(struct bfq_data *bfqd);
+struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd);
+void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd);
+void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+                        bool ins_into_idle_tree, bool expiration);
+void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+                      bool expiration);
+void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+
+/* --------------- end of interface of B-WF2Q+ ---------------- */
+
+/*
+ * Logging facilities. All three macros emit their message through
+ * blk_add_trace_msg() on (bfqd)->queue.
+ */
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+
+/*
+ * Log a per-queue message prefixed with "bfq<pid><S|A>" and with the
+ * blkg path of the group of the queue. The path is formatted into a
+ * 128-byte on-stack buffer.
+ */
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do {                    \
+       char __pbuf[128];                                               \
+                                                                       \
+       blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
+       blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
+                       bfq_bfqq_sync((bfqq)) ? 'S' : 'A',              \
+                         __pbuf, ##args);                              \
+} while (0)
+
+/* Log a per-group message prefixed with the blkg path of the group. */
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {                    \
+       char __pbuf[128];                                               \
+                                                                       \
+       blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));          \
+       blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args);    \
+} while (0)
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
+/*
+ * Without cgroups support the per-queue prefix carries no group path,
+ * and per-group logging expands to an empty statement.
+ */
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
+       blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid,   \
+                       bfq_bfqq_sync((bfqq)) ? 'S' : 'A',              \
+                               ##args)
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)         do {} while (0)
+
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+
+/* Log a scheduler-wide message, prefixed with "bfq ". */
+#define bfq_log(bfqd, fmt, args...) \
+       blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
+
+#endif /* _BFQ_H */
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
new file mode 100644 (file)
index 0000000..b4fc3e4
--- /dev/null
@@ -0,0 +1,1616 @@
+/*
+ * Hierarchical Budget Worst-case Fair Weighted Fair Queueing
+ * (B-WF2Q+): hierarchical scheduling algorithm by which the BFQ I/O
+ * scheduler schedules generic entities. The latter can represent
+ * either single bfq queues (associated with processes) or groups of
+ * bfq queues (associated with cgroups).
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation; either version 2 of the
+ *  License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ */
+#include "bfq-iosched.h"
+
+/**
+ * bfq_gt - wrap-safe "greater than" for timestamps.
+ * @a: first ts.
+ * @b: second ts.
+ *
+ * Return non-zero iff @a > @b. The unsigned difference is
+ * reinterpreted as a signed value, which is what makes the comparison
+ * deal with wrapping correctly.
+ */
+static int bfq_gt(u64 a, u64 b)
+{
+       s64 diff = (s64)(a - b);
+
+       return diff > 0;
+}
+
+/*
+ * Map the root node of @tree to the bfq_entity embedding it. No
+ * emptiness check is performed here.
+ * NOTE(review): callers presumably guarantee a non-empty tree - confirm.
+ */
+static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
+{
+       return rb_entry(tree->rb_node, struct bfq_entity, rb_node);
+}
+
+/*
+ * Zero-based class index of @entity: ioprio_class - 1 if the entity
+ * is a queue, BFQ_DEFAULT_GRP_CLASS - 1 otherwise.
+ */
+static unsigned int bfq_class_idx(struct bfq_entity *entity)
+{
+       struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+       if (!bfqq)
+               return BFQ_DEFAULT_GRP_CLASS - 1;
+
+       return bfqq->ioprio_class - 1;
+}
+
+/*
+ * Forward declarations: both functions are called by
+ * bfq_update_next_in_service() ahead of their definitions.
+ */
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);
+
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
+
+/**
+ * bfq_update_next_in_service - update sd->next_in_service
+ * @sd: sched_data for which to perform the update.
+ * @new_entity: if not NULL, pointer to the entity whose activation,
+ *             requeueing or repositioning triggered the invocation of
+ *             this function.
+ *
+ * This function is called to update sd->next_in_service, which, in
+ * its turn, may change as a consequence of the insertion or
+ * extraction of an entity into/from one of the active trees of
+ * sd. These insertions/extractions occur as a consequence of
+ * activations/deactivations of entities, with some activations being
+ * 'true' activations, and other activations being requeueings (i.e.,
+ * implementing the second, requeueing phase of the mechanism used to
+ * reposition an entity in its active tree; see comments on
+ * __bfq_activate_entity and __bfq_requeue_entity for details). In
+ * both the last two activation sub-cases, new_entity points to the
+ * just activated or requeued entity.
+ *
+ * Returns true if sd->next_in_service changes in such a way that
+ * entity->parent may become the next_in_service for its parent
+ * entity. In particular, false is returned whenever the resulting
+ * sd->next_in_service is NULL.
+ */
+static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
+                                      struct bfq_entity *new_entity)
+{
+       struct bfq_entity *next_in_service = sd->next_in_service;
+       bool parent_sched_may_change = false;
+
+       /*
+        * If this update is triggered by the activation, requeueing
+        * or repositioning of an entity that does not coincide with
+        * sd->next_in_service, then a full lookup in the active tree
+        * can be avoided. In fact, it is enough to check whether the
+        * just-modified entity has a higher priority than
+        * sd->next_in_service, or, even if it has the same priority
+        * as sd->next_in_service, is eligible and has a lower virtual
+        * finish time than sd->next_in_service. If this compound
+        * condition holds, then the new entity becomes the new
+        * next_in_service. Otherwise no change is needed.
+        */
+       if (new_entity && new_entity != sd->next_in_service) {
+               /*
+                * Flag used to decide whether to replace
+                * sd->next_in_service with new_entity. Tentatively
+                * set to true, and left as true if
+                * sd->next_in_service is NULL.
+                */
+               bool replace_next = true;
+
+               /*
+                * If there is already a next_in_service candidate
+                * entity, then compare class priorities or timestamps
+                * to decide whether to replace sd->next_in_service
+                * with new_entity.
+                */
+               if (next_in_service) {
+                       unsigned int new_entity_class_idx =
+                               bfq_class_idx(new_entity);
+                       struct bfq_service_tree *st =
+                               sd->service_tree + new_entity_class_idx;
+
+                       /*
+                        * For efficiency, evaluate the most likely
+                        * sub-condition first.
+                        */
+                       replace_next =
+                               (new_entity_class_idx ==
+                                bfq_class_idx(next_in_service)
+                                &&
+                                !bfq_gt(new_entity->start, st->vtime)
+                                &&
+                                bfq_gt(next_in_service->finish,
+                                       new_entity->finish))
+                               ||
+                               new_entity_class_idx <
+                               bfq_class_idx(next_in_service);
+               }
+
+               if (replace_next)
+                       next_in_service = new_entity;
+       } else /* invoked because of a deactivation: lookup needed */
+               next_in_service = bfq_lookup_next_entity(sd);
+
+       /*
+        * The old value of sd->next_in_service is still needed here,
+        * so the field is overwritten only after this check. If there
+        * is no next entity, the flag simply keeps its initial false
+        * value.
+        */
+       if (next_in_service) {
+               parent_sched_may_change = !sd->next_in_service ||
+                       bfq_update_parent_budget(next_in_service);
+       }
+
+       sd->next_in_service = next_in_service;
+
+       return parent_sched_may_change;
+}
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+/*
+ * Return the group @bfqq belongs to: the group embedding the parent
+ * entity of the queue or, if the queue has no parent entity, the root
+ * group of the device.
+ */
+struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
+{
+       struct bfq_entity *parent = bfqq->entity.parent;
+
+       if (parent)
+               return container_of(parent, struct bfq_group, entity);
+
+       return container_of(&bfqq->bfqd->root_group->entity,
+                           struct bfq_group, entity);
+}
+
+/*
+ * Propagate the budget of @next_in_service to the entity of the group
+ * that owns its sched_data.
+ *
+ * Returns true if this budget change may let next_in_service->parent
+ * become the next_in_service entity for its parent entity, i.e., if
+ * the budget of the group entity has been lowered.
+ */
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
+{
+       struct bfq_group *bfqg = container_of(next_in_service->sched_data,
+                                             struct bfq_group, sched_data);
+       struct bfq_entity *bfqg_entity = bfqg->my_entity;
+       bool budget_lowered;
+
+       /*
+        * my_entity is NULL only for the root group, whose entity
+        * must be left untouched, as it must never become an
+        * in-service entity.
+        */
+       if (!bfqg_entity)
+               return false;
+
+       budget_lowered = bfqg_entity->budget > next_in_service->budget;
+       bfqg_entity->budget = next_in_service->budget;
+
+       return budget_lowered;
+}
+
+/*
+ * Tell whether @entity, which is about to be set in service, stops
+ * being a candidate for next service (i.e., a candidate entity to
+ * serve after the in-service entity is expired).
+ *
+ * A queue is never a candidate any longer, so true is returned for
+ * it. A non-queue entity, instead, could still be a candidate if it
+ * has more than one child: even if one of its children is about to
+ * be set in service, other children may still be the next to serve.
+ * For a non-queue entity, true is therefore returned only if it has
+ * exactly one active child.
+ */
+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
+{
+       struct bfq_group *bfqg;
+
+       if (bfq_entity_to_bfqq(entity))
+               return true;
+
+       bfqg = container_of(entity, struct bfq_group, entity);
+
+       return bfqg->active_entities == 1;
+}
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
+/* Without cgroups support, every queue maps to the root group. */
+struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
+{
+       return bfqq->bfqd->root_group;
+}
+
+/*
+ * Without cgroups support no budget is propagated: always report
+ * that the parent schedule cannot change.
+ */
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
+{
+       return false;
+}
+
+/*
+ * Without cgroups support this always returns true, whatever the
+ * entity.
+ */
+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
+{
+       return true;
+}
+
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+
+/*
+ * Shift for timestamp calculations.  This actually limits the maximum
+ * service allowed in one timestamp delta (small shift values increase it),
+ * the maximum total weight that can be used for the queues in the system
+ * (big shift values increase it), and the period of virtual time
+ * wraparounds. bfq_delta() left-shifts the service by this amount
+ * before dividing it by the weight.
+ */
+#define WFQ_SERVICE_SHIFT      22
+
+/*
+ * Return the bfq_queue embedding @entity, or NULL if the entity has
+ * its own sched_data (my_sched_data set), i.e., it is not a queue.
+ */
+struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
+{
+       if (entity->my_sched_data)
+               return NULL;
+
+       return container_of(entity, struct bfq_queue, entity);
+}
+
+
+/**
+ * bfq_delta - map service into the virtual time domain.
+ * @service: amount of service.
+ * @weight: scale factor (weight of an entity or weight sum).
+ *
+ * Return @service, left-shifted by WFQ_SERVICE_SHIFT in 64-bit
+ * arithmetic, divided by @weight through do_div().
+ */
+static u64 bfq_delta(unsigned long service, unsigned long weight)
+{
+       u64 vt_delta = service;
+
+       vt_delta <<= WFQ_SERVICE_SHIFT;
+       do_div(vt_delta, weight);
+
+       return vt_delta;
+}
+
+/**
+ * bfq_calc_finish - assign the finish time to an entity.
+ * @entity: the entity to act upon.
+ * @service: the service to be charged to the entity.
+ *
+ * The finish time is the start time of @entity plus @service mapped
+ * into the virtual time domain with the weight of the entity. The
+ * computation is logged only if the entity is a queue.
+ */
+static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
+{
+       struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+       u64 delta = bfq_delta(service, entity->weight);
+
+       entity->finish = entity->start + delta;
+
+       if (!bfqq)
+               return;
+
+       bfq_log_bfqq(bfqq->bfqd, bfqq,
+               "calc_finish: serv %lu, w %d",
+               service, entity->weight);
+       bfq_log_bfqq(bfqq->bfqd, bfqq,
+               "calc_finish: start %llu, finish %llu, delta %llu",
+               entity->start, entity->finish, delta);
+}
+
+/**
+ * bfq_entity_of - get an entity from a node.
+ * @node: the node field of the entity, possibly %NULL.
+ *
+ * NULL-tolerant conversion of a node pointer to the entity embedding
+ * it: a %NULL @node yields a %NULL entity. This is used only to
+ * simplify the logic of some functions and not as the generic
+ * conversion mechanism because, e.g., in the tree walking functions,
+ * the check for a %NULL value would be redundant.
+ */
+struct bfq_entity *bfq_entity_of(struct rb_node *node)
+{
+       if (!node)
+               return NULL;
+
+       return rb_entry(node, struct bfq_entity, rb_node);
+}
+
+/**
+ * bfq_extract - remove an entity from a tree.
+ * @root: the tree root.
+ * @entity: the entity to remove.
+ *
+ * Besides erasing the node of @entity from @root, reset the
+ * back-pointer of the entity to its tree.
+ */
+static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
+{
+       rb_erase(&entity->rb_node, root);
+       entity->tree = NULL;
+}
+
+/**
+ * bfq_idle_extract - extract an entity from the idle tree.
+ * @st: the service tree of the owning @entity.
+ * @entity: the entity being removed.
+ *
+ * Keep the cached first/last idle entities of @st up to date and, if
+ * the entity is a queue, also unlink the queue from its bfqq_list.
+ */
+static void bfq_idle_extract(struct bfq_service_tree *st,
+                            struct bfq_entity *entity)
+{
+       struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+       /* Neighbours are looked up while @entity is still in the tree. */
+       if (st->first_idle == entity)
+               st->first_idle = bfq_entity_of(rb_next(&entity->rb_node));
+
+       if (st->last_idle == entity)
+               st->last_idle = bfq_entity_of(rb_prev(&entity->rb_node));
+
+       bfq_extract(&st->idle, entity);
+
+       if (!bfqq)
+               return;
+
+       list_del(&bfqq->bfqq_list);
+}
+
+/**
+ * bfq_insert - insert an entity into a finish-time ordered rb tree.
+ * @root: root of the destination tree.
+ * @entity: the entity to insert.
+ *
+ * Shared by the idle and the active tree, which are both keyed by
+ * finish time.  An entity whose finish time is not smaller than that of
+ * an existing node is placed to the right of that node.  On return
+ * @entity->tree points to @root.
+ */
+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
+{
+       struct rb_node **link = &root->rb_node;
+       struct rb_node *parent = NULL;
+
+       /* Walk down to the empty slot where the new node belongs. */
+       while (*link) {
+               struct bfq_entity *cur;
+
+               parent = *link;
+               cur = rb_entry(parent, struct bfq_entity, rb_node);
+
+               link = bfq_gt(cur->finish, entity->finish) ?
+                       &parent->rb_left : &parent->rb_right;
+       }
+
+       rb_link_node(&entity->rb_node, parent, link);
+       rb_insert_color(&entity->rb_node, root);
+
+       entity->tree = root;
+}
+
+/**
+ * bfq_update_min - lower the min_start of an entity using one child.
+ * @entity: the entity whose min_start may be stale.
+ * @node: its left or right child in the active tree, possibly %NULL.
+ *
+ * Called when updates to the active tree may have left @entity with an
+ * invalid min_start.  The subtree rooted at @node is assumed to carry a
+ * valid min_start already; if that value is smaller than the one stored
+ * in @entity, it is copied over.  A %NULL @node is a no-op.
+ */
+static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
+{
+       struct bfq_entity *child;
+
+       if (!node)
+               return;
+
+       child = rb_entry(node, struct bfq_entity, rb_node);
+       if (bfq_gt(entity->min_start, child->min_start))
+               entity->min_start = child->min_start;
+}
+
+/**
+ * bfq_update_active_node - recompute the min_start of one node.
+ * @node: the node to refresh.
+ *
+ * To be used when @node has changed position or one of its children has
+ * moved.  Both subtrees of @node are assumed to already hold correct
+ * min_start values.
+ */
+static void bfq_update_active_node(struct rb_node *node)
+{
+       struct bfq_entity *self = rb_entry(node, struct bfq_entity, rb_node);
+
+       /* Seed with the node's own start time ... */
+       self->min_start = self->start;
+       /* ... then lower it to a child's min_start where that is smaller. */
+       bfq_update_min(self, node->rb_right);
+       bfq_update_min(self, node->rb_left);
+}
+
+/**
+ * bfq_update_active_tree - propagate min_start updates up to the root.
+ * @node: the deepest node modified by the last tree update.
+ *
+ * Refreshes the min_start of @node from its (unchanged) children and
+ * then climbs to the root, refreshing every node on the way.  At each
+ * level a sibling is refreshed as well, because the only nodes that may
+ * have changed are those on the path or their siblings.
+ */
+static void bfq_update_active_tree(struct rb_node *node)
+{
+       for (;;) {
+               struct rb_node *parent;
+
+               bfq_update_active_node(node);
+
+               parent = rb_parent(node);
+               if (!parent)
+                       return; /* root reached */
+
+               /*
+                * Refresh the right sibling when coming from the left;
+                * in every other case refresh the parent's left child,
+                * if there is one.
+                */
+               if (node == parent->rb_left && parent->rb_right)
+                       bfq_update_active_node(parent->rb_right);
+               else if (parent->rb_left)
+                       bfq_update_active_node(parent->rb_left);
+
+               node = parent;
+       }
+}
+
+/**
+ * bfq_active_insert - insert an entity in the active tree of its
+ *                     group/device.
+ * @st: the service tree of the entity.
+ * @entity: the entity being inserted.
+ *
+ * The active tree is ordered by finish time, but an extra key is kept
+ * per each node, containing the minimum value for the start times of
+ * its children (and the node itself), so it's possible to search for
+ * the eligible node with the lowest finish time in logarithmic time.
+ *
+ * After the rb-tree insertion, the min_start keys are refreshed
+ * starting from a child of the new node when it has one (the left
+ * child is preferred), otherwise from the new node itself, and then up
+ * to the root.
+ *
+ * Bookkeeping: an entity backed by a queue is linked into the
+ * active_list of the queue's device.  With CONFIG_BFQ_GROUP_IOSCHED,
+ * any other entity is instead added to the group weights tree, and the
+ * active_entities counter of the group embedding @entity->sched_data
+ * is incremented unless that group is the root group.
+ */
+static void bfq_active_insert(struct bfq_service_tree *st,
+                             struct bfq_entity *entity)
+{
+       struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+       struct rb_node *node = &entity->rb_node;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+       struct bfq_sched_data *sd = NULL;
+       struct bfq_group *bfqg = NULL;
+       struct bfq_data *bfqd = NULL;
+#endif
+
+       bfq_insert(&st->active, entity);
+
+       if (node->rb_left)
+               node = node->rb_left;
+       else if (node->rb_right)
+               node = node->rb_right;
+
+       bfq_update_active_tree(node);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+       sd = entity->sched_data;
+       bfqg = container_of(sd, struct bfq_group, sched_data);
+       bfqd = (struct bfq_data *)bfqg->bfqd;
+#endif
+       if (bfqq)
+               list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+       else /* bfq_group */
+               bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
+
+       if (bfqg != bfqd->root_group)
+               bfqg->active_entities++;
+#endif
+}
+
+/**
+ * bfq_ioprio_to_weight - derive a weight from an ioprio level.
+ * @ioprio: the ioprio value to convert.
+ *
+ * The weight is the distance of @ioprio from IOPRIO_BE_NR, multiplied
+ * by BFQ_WEIGHT_CONVERSION_COEFF.
+ */
+unsigned short bfq_ioprio_to_weight(int ioprio)
+{
+       const unsigned short weight =
+               (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
+
+       return weight;
+}
+
+/**
+ * bfq_weight_to_ioprio - derive an ioprio level from a weight.
+ * @weight: the weight value to convert.
+ *
+ * To stay as close as possible to the old ioprio-only user interface,
+ * every weight numerically equal to or larger than
+ * IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF maps to the escape
+ * ioprio value 0.
+ */
+static unsigned short bfq_weight_to_ioprio(int weight)
+{
+       int ioprio = IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight;
+
+       /* Clamp at zero rather than returning a negative level. */
+       return ioprio > 0 ? ioprio : 0;
+}
+
+/*
+ * Take a reference on the queue backing @entity, if there is one, and
+ * log the new reference count.  Entities that are not backed by a queue
+ * are left untouched.
+ */
+static void bfq_get_entity(struct bfq_entity *entity)
+{
+       struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+       if (!bfqq)
+               return;
+
+       bfqq->ref++;
+       bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
+                    bfqq, bfqq->ref);
+}
+
+/**
+ * bfq_find_deepest - find the deepest node that an extraction can modify.
+ * @node: the node about to be removed.
+ *
+ * Performs the first step of an rb-tree removal: works out which node
+ * will take the place of @node and returns the deepest node that the
+ * subsequent tree modifications can touch.  Returns %NULL when @node
+ * is the only node left in the tree.
+ */
+static struct rb_node *bfq_find_deepest(struct rb_node *node)
+{
+       struct rb_node *succ;
+
+       /* Leaf: only the parent (if any) is affected. */
+       if (!node->rb_right && !node->rb_left)
+               return rb_parent(node);
+
+       /* Single child: that child replaces the node. */
+       if (!node->rb_right)
+               return node->rb_left;
+       if (!node->rb_left)
+               return node->rb_right;
+
+       /* Two children: the in-order successor replaces the node. */
+       succ = rb_next(node);
+       if (succ->rb_right)
+               return succ->rb_right;
+       if (rb_parent(succ) != node)
+               return rb_parent(succ);
+
+       return succ;
+}
+
+/**
+ * bfq_active_extract - remove an entity from the active tree.
+ * @st: the service_tree containing the tree.
+ * @entity: the entity being removed.
+ *
+ * The deepest node that the removal can affect is computed before the
+ * entity is erased; the min_start keys are then refreshed from that
+ * node up to the root.  The refresh is skipped when
+ * bfq_find_deepest() returns %NULL.
+ *
+ * Bookkeeping: an entity backed by a queue is unlinked from the list
+ * it sits on through its bfqq_list field.  With
+ * CONFIG_BFQ_GROUP_IOSCHED, any other entity is instead removed from
+ * the group weights tree, and the active_entities counter of the group
+ * embedding @entity->sched_data is decremented unless that group is
+ * the root group.
+ */
+static void bfq_active_extract(struct bfq_service_tree *st,
+                              struct bfq_entity *entity)
+{
+       struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+       struct rb_node *node;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+       struct bfq_sched_data *sd = NULL;
+       struct bfq_group *bfqg = NULL;
+       struct bfq_data *bfqd = NULL;
+#endif
+
+       node = bfq_find_deepest(&entity->rb_node);
+       bfq_extract(&st->active, entity);
+
+       if (node)
+               bfq_update_active_tree(node);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+       sd = entity->sched_data;
+       bfqg = container_of(sd, struct bfq_group, sched_data);
+       bfqd = (struct bfq_data *)bfqg->bfqd;
+#endif
+       if (bfqq)
+               list_del(&bfqq->bfqq_list);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+       else /* bfq_group */
+               bfq_weights_tree_remove(bfqd, entity,
+                                       &bfqd->group_weights_tree);
+
+       if (bfqg != bfqd->root_group)
+               bfqg->active_entities--;
+#endif
+}
+
+/**
+ * bfq_idle_insert - park an entity on the idle tree.
+ * @st: the service tree owning the idle tree.
+ * @entity: the entity to park.
+ *
+ * Besides linking @entity into the idle tree, updates the cached
+ * first/last idle pointers of @st when @entity becomes the new extreme
+ * by finish time.  If @entity is backed by a queue, the queue is also
+ * linked into the idle_list of its device.
+ */
+static void bfq_idle_insert(struct bfq_service_tree *st,
+                           struct bfq_entity *entity)
+{
+       struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+       struct bfq_entity *head = st->first_idle;
+       struct bfq_entity *tail = st->last_idle;
+
+       /* Both checks use the extremes as they were on entry. */
+       if (!tail || bfq_gt(entity->finish, tail->finish))
+               st->last_idle = entity;
+       if (!head || bfq_gt(head->finish, entity->finish))
+               st->first_idle = entity;
+
+       bfq_insert(&st->idle, entity);
+
+       if (bfqq)
+               list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
+}
+
+/**
+ * bfq_forget_entity - stop considering an entity for scheduling.
+ * @st: the service tree the entity is accounted on.
+ * @entity: the entity being dropped.
+ * @is_in_service: true if entity is currently the in-service entity.
+ *
+ * Clears the on_st flag of @entity and removes its weight from the
+ * weight sum of @st.  If @entity represents a queue that is not in
+ * service, the service reference to the queue (the one taken through
+ * bfq_get_entity) is released as well: the queue is then outside any
+ * service tree, so no service reference remains.  If the queue is in
+ * service instead, the reference is kept here and is put by
+ * __bfq_bfqd_reset_in_service when the queue finally stops being
+ * served.
+ */
+static void bfq_forget_entity(struct bfq_service_tree *st,
+                             struct bfq_entity *entity,
+                             bool is_in_service)
+{
+       struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+       st->wsum -= entity->weight;
+       entity->on_st = false;
+
+       /* No queue behind the entity, or reference still needed. */
+       if (!bfqq || is_in_service)
+               return;
+
+       bfq_put_queue(bfqq);
+}
+
+/**
+ * bfq_put_idle_entity - drop an entity from the idle tree for good.
+ * @st: service tree for the entity.
+ * @entity: the entity being released.
+ *
+ * Extracts @entity from the idle tree of @st and then forgets it,
+ * telling bfq_forget_entity() whether @entity is the in-service entity
+ * of its sched_data.
+ */
+void bfq_put_idle_entity(struct bfq_service_tree *st, struct bfq_entity *entity)
+{
+       bool in_service;
+
+       bfq_idle_extract(st, entity);
+
+       in_service = (entity == entity->sched_data->in_service_entity);
+       bfq_forget_entity(st, entity, in_service);
+}
+
+/**
+ * bfq_forget_idle - trim the idle tree if necessary.
+ * @st: the service tree to act upon.
+ *
+ * At most one entry is removed per call, to preserve the global
+ * O(log N) complexity; this is safe because the idle tree does not
+ * grow indefinitely.
+ */
+static void bfq_forget_idle(struct bfq_service_tree *st)
+{
+       struct bfq_entity *head = st->first_idle;
+       struct bfq_entity *tail = st->last_idle;
+       bool no_active = RB_EMPTY_ROOT(&st->active);
+
+       /*
+        * Nothing is active and even the last idle entity has a finish
+        * time not beyond vtime: the whole idle tree can be forgotten,
+        * so align vtime with the last idle finish time.
+        * NOTE(review): the guard selects finish <= vtime, so this
+        * assignment cannot move vtime forward, although the original
+        * comment spoke of increasing it - confirm the intent.
+        */
+       if (no_active && tail && !bfq_gt(tail->finish, st->vtime))
+               st->vtime = tail->finish;
+
+       /* Drop the first idle entity once vtime has reached its finish. */
+       if (head && !bfq_gt(head->finish, st->vtime))
+               bfq_put_idle_entity(st, head);
+}
+
+/*
+ * Return the service tree of @entity: the element of the service_tree
+ * array of the entity's sched_data selected by bfq_class_idx().
+ */
+struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity)
+{
+       struct bfq_sched_data *sd = entity->sched_data;
+
+       return &sd->service_tree[bfq_class_idx(entity)];
+}
+
+/**
+ * __bfq_entity_update_weight_prio - apply a pending weight/prio change.
+ * @old_st: the service tree currently accounting for the entity's weight.
+ * @entity: the entity to update.
+ *
+ * Does nothing unless @entity->prio_changed is set.  Otherwise:
+ *  - the current weight is subtracted from the weight sum of @old_st;
+ *  - if new_weight differs from orig_weight, new_weight is clamped to
+ *    [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT] (an out-of-range value is
+ *    reported with pr_crit), becomes the new orig_weight and, for a
+ *    queue, the queue ioprio is recomputed from it;
+ *  - for a queue, new_ioprio_class is copied to ioprio_class;
+ *  - prio_changed is cleared and the service tree is looked up again;
+ *  - the effective weight becomes orig_weight, multiplied by wr_coeff
+ *    for a queue; if it changed, the entity is removed from the queue
+ *    or group weights tree and added back only when it is not a
+ *    weight-raised queue;
+ *  - the new weight is added to the weight sum of the new service tree
+ *    and, if the tree changed, the start time of the entity is reset
+ *    to the virtual time of the new tree.
+ *
+ * Return: the service tree to use for @entity from now on; this is
+ * @old_st when nothing was pending.
+ */
+struct bfq_service_tree *
+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
+                               struct bfq_entity *entity)
+{
+       struct bfq_service_tree *new_st = old_st;
+
+       if (entity->prio_changed) {
+               struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+               unsigned int prev_weight, new_weight;
+               struct bfq_data *bfqd = NULL;
+               struct rb_root *root;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+               struct bfq_sched_data *sd;
+               struct bfq_group *bfqg;
+#endif
+
+               if (bfqq)
+                       bfqd = bfqq->bfqd;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+               else {
+                       sd = entity->my_sched_data;
+                       bfqg = container_of(sd, struct bfq_group, sched_data);
+                       bfqd = (struct bfq_data *)bfqg->bfqd;
+               }
+#endif
+
+               old_st->wsum -= entity->weight;
+
+               if (entity->new_weight != entity->orig_weight) {
+                       if (entity->new_weight < BFQ_MIN_WEIGHT ||
+                           entity->new_weight > BFQ_MAX_WEIGHT) {
+                               pr_crit("update_weight_prio: new_weight %d\n",
+                                       entity->new_weight);
+                               if (entity->new_weight < BFQ_MIN_WEIGHT)
+                                       entity->new_weight = BFQ_MIN_WEIGHT;
+                               else
+                                       entity->new_weight = BFQ_MAX_WEIGHT;
+                       }
+                       entity->orig_weight = entity->new_weight;
+                       if (bfqq)
+                               bfqq->ioprio =
+                                 bfq_weight_to_ioprio(entity->orig_weight);
+               }
+
+               if (bfqq)
+                       bfqq->ioprio_class = bfqq->new_ioprio_class;
+               entity->prio_changed = 0;
+
+               /*
+                * NOTE: here we may be changing the weight too early,
+                * this will cause unfairness.  The correct approach
+                * would have required additional complexity to defer
+                * weight changes to the proper time instants (i.e.,
+                * when entity->finish <= old_st->vtime).
+                */
+               new_st = bfq_entity_service_tree(entity);
+
+               prev_weight = entity->weight;
+               new_weight = entity->orig_weight *
+                            (bfqq ? bfqq->wr_coeff : 1);
+               /*
+                * If the weight of the entity changes, remove the entity
+                * from its old weight counter (if there is a counter
+                * associated with the entity), and add it to the counter
+                * associated with its new weight.
+                */
+               if (prev_weight != new_weight) {
+                       root = bfqq ? &bfqd->queue_weights_tree :
+                                     &bfqd->group_weights_tree;
+                       bfq_weights_tree_remove(bfqd, entity, root);
+               }
+               entity->weight = new_weight;
+               /*
+                * Add the entity to its weights tree only if it is
+                * not associated with a weight-raised queue.
+                */
+               if (prev_weight != new_weight &&
+                   (bfqq ? bfqq->wr_coeff == 1 : 1))
+                       /* If we get here, root has been initialized. */
+                       bfq_weights_tree_add(bfqd, entity, root);
+
+               new_st->wsum += entity->weight;
+
+               if (new_st != old_st)
+                       entity->start = new_st->vtime;
+       }
+
+       return new_st;
+}
+
+/**
+ * bfq_bfqq_served - update the scheduler status after selection for
+ *                   service.
+ * @bfqq: the queue being served.
+ * @served: amount of service to account (the log line below labels it
+ *          "secs").
+ *
+ * For the entity of @bfqq and for each entity visited by
+ * for_each_entity() after it, adds @served to the received service,
+ * advances the virtual time of the entity's service tree accordingly
+ * and gives the idle tree a chance to be trimmed.
+ *
+ * NOTE: this can be optimized, as the timestamps of upper level entities
+ * are synchronized every time a new bfqq is selected for service.  For
+ * now it is kept as is, to make consistency easier to check.
+ */
+void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
+{
+       struct bfq_entity *entity = &bfqq->entity;
+
+       for_each_entity(entity) {
+               struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+
+               entity->service += served;
+               st->vtime += bfq_delta(served, st->wsum);
+               bfq_forget_idle(st);
+       }
+
+       bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
+       bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
+}
+
+/**
+ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
+ *                       of the time interval during which bfqq has been in
+ *                       service.
+ * @bfqd: the device
+ * @bfqq: the queue that needs a service update.
+ * @time_ms: the amount of time during which the queue has received service
+ *
+ * Giving service fairness to a queue that consumes its budget too
+ * slowly may hurt throughput, more or less severely.  Such queues are
+ * therefore given time fairness instead.  Since the BFQ scheduling
+ * engine works in the service domain and not in the time domain, this
+ * is obtained with a trick: the queue is charged an inflated amount of
+ * service, equal to what it would have received during its service
+ * slot had it been fast, i.e., had its requests been dispatched at a
+ * rate equal to the estimated peak rate.
+ *
+ * Here the inflated amount is the fraction @time_ms / timeout of the
+ * maximum budget of the device, used only when @time_ms is positive
+ * and shorter than the timeout.  The charge is never lower than the
+ * service already received, and the budget of the entity is raised to
+ * the charge if needed.  Only the part of the charge exceeding the
+ * service already received is passed to bfq_bfqq_served().
+ *
+ * Note that time fairness can distort bandwidth distribution
+ * considerably on devices with internal queueing: I/O requests
+ * dispatched during the service slot of a queue may be served after
+ * the slot has finished, and their total processing time may be only
+ * loosely correlated with the duration of the slot.  This holds
+ * especially for short service slots.
+ */
+void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+                         unsigned long time_ms)
+{
+       struct bfq_entity *entity = &bfqq->entity;
+       unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);
+       int charge = entity->service;
+       int extra;
+
+       if (time_ms > 0 && time_ms < timeout_ms)
+               charge = (bfqd->bfq_max_budget * time_ms) / timeout_ms;
+
+       /* Never charge less than what has actually been served. */
+       if (charge < entity->service)
+               charge = entity->service;
+
+       /* Increase budget to avoid inconsistencies */
+       if (charge > entity->budget)
+               entity->budget = charge;
+
+       /* Account only the part not yet recorded as received service. */
+       extra = charge - entity->service;
+       bfq_bfqq_served(bfqq, extra > 0 ? extra : 0);
+}
+
+/*
+ * Apply any pending weight/priority change to @entity, recompute its
+ * finish time from its budget, optionally push up backshifted
+ * timestamps, and insert the entity into the active tree of the
+ * (possibly changed) service tree.
+ */
+static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
+                                       struct bfq_service_tree *st,
+                                       bool backshifted)
+{
+       struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+       /* The update may move the entity to a different service tree. */
+       st = __bfq_entity_update_weight_prio(st, entity);
+       bfq_calc_finish(entity, entity->budget);
+
+       /*
+        * Why backshifted timestamps get pushed up.
+        *
+        * Queues that enjoy backshifting for a while may see their
+        * (virtual) finish timestamps fall further and further behind
+        * the system virtual time.  This happens in particular when
+        * such queues are often idle for short periods while other
+        * queues with higher timestamps are busy: to serve the latter,
+        * the system virtual time may be pushed up well beyond the
+        * finish timestamps of the idle queues.  The finish timestamps
+        * of all new or newly activated queues may then end up much
+        * larger than those of the lucky backshifted queues, which
+        * could monopolize the device for a long time and simply break
+        * service guarantees.
+        *
+        * To mitigate this, the backshifted timestamps of the queue
+        * associated with this entity (only a queue can have the
+        * backshifted flag set) are pushed up a little: just enough to
+        * make its finish timestamp equal to the current system
+        * virtual time.  This may introduce some unfairness among
+        * queues with backshifted timestamps, but worst-case fairness
+        * guarantees are not broken.
+        *
+        * Weight-raised queues are a special case: their timestamps
+        * are pushed up much less, to keep very low the probability
+        * that the push-up makes their backshifted finish timestamps
+        * higher than those of non weight-raised queues.
+        *
+        * NOTE(review): the timestamps are printed as 64-bit values
+        * elsewhere in this file, while the shift is held in an
+        * unsigned long; presumably it can be truncated where long is
+        * 32 bits wide - confirm.
+        */
+       if (backshifted && bfq_gt(st->vtime, entity->finish)) {
+               unsigned long shift = st->vtime - entity->finish;
+
+               if (bfqq)
+                       shift /= bfqq->wr_coeff;
+
+               entity->start += shift;
+               entity->finish += shift;
+       }
+
+       bfq_active_insert(st, entity);
+}
+
+/**
+ * __bfq_activate_entity - handle activation of entity.
+ * @entity: the entity being activated.
+ * @non_blocking_wait_rq: true if entity was waiting for a request
+ *
+ * Called for a 'true' activation, i.e., if entity is not active and
+ * one of its children receives a new request.
+ *
+ * Basically, this function updates the timestamps of entity and
+ * inserts entity into its active tree, after possibly extracting it
+ * from its idle tree.
+ */
+static void __bfq_activate_entity(struct bfq_entity *entity,
+                                 bool non_blocking_wait_rq)
+{
+       struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+       unsigned long long min_vstart = st->vtime;
+       bool backshifted;
+
+       /* See comments on bfq_bfqq_update_budg_for_activation */
+       backshifted = non_blocking_wait_rq &&
+                     bfq_gt(st->vtime, entity->finish);
+       if (backshifted)
+               min_vstart = entity->finish;
+
+       if (entity->tree != &st->idle) {
+               /*
+                * The finish time of the entity may be invalid, and
+                * it is in the past for sure, otherwise the queue
+                * would have been on the idle tree.
+                */
+               entity->start = min_vstart;
+               st->wsum += entity->weight;
+               /*
+                * The entity is about to enter a service tree and
+                * then be set in service: take a reference so that it
+                * cannot disappear while it is in service or
+                * scheduled for service.
+                */
+               bfq_get_entity(entity);
+
+               entity->on_st = true;
+       } else {
+               /*
+                * The entity is parked on the idle tree: take it off
+                * and start it at min_vstart or at its old finish
+                * time, whichever comes later.
+                */
+               bfq_idle_extract(st, entity);
+               if (bfq_gt(min_vstart, entity->finish))
+                       entity->start = min_vstart;
+               else
+                       entity->start = entity->finish;
+       }
+
+       bfq_update_fin_time_enqueue(entity, st, backshifted);
+}
+
+/**
+ * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
+ * @entity: the entity being requeued or repositioned.
+ *
+ * Requeueing is needed if this entity stops being served, which
+ * happens if a leaf descendant entity has expired.  Repositioning is
+ * needed instead if the next_in_service entity for the child entity
+ * has changed.  Details are given in the comments inside the function.
+ *
+ * In short, the function: 1) removes entity from its active tree if
+ * it is there, 2) updates the timestamps of entity and 3) inserts
+ * entity back into its active tree, in the position that matches the
+ * new timestamps.
+ */
+static void __bfq_requeue_entity(struct bfq_entity *entity)
+{
+       struct bfq_sched_data *sd = entity->sched_data;
+       struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+       bool in_service = (entity == sd->in_service_entity);
+
+       if (in_service) {
+               /*
+                * The current in-service entity is being requeued,
+                * for one of these reasons:
+                * - it represents the in-service queue, which is
+                *   being requeued after an expiration;
+                * - it represents a group whose budget has changed
+                *   because one of its child entities has just been
+                *   activated or requeued; its timestamps must then
+                *   be updated, and the entity enqueued or
+                *   repositioned accordingly.
+                *
+                * Before requeueing, the start time must be moved
+                * forward by the service received while in service:
+                * the finish time computed from that service becomes
+                * the new start time.  The finish time is recomputed
+                * further below from this new start time and from the
+                * budget of the entity.
+                */
+               bfq_calc_finish(entity, entity->service);
+               entity->start = entity->finish;
+       }
+
+       /*
+        * Take the entity off the active tree before its timestamps
+        * change, so that it can be inserted again in the right place.
+        *
+        * An in-service entity is still on the tree only if it had
+        * more than one child when it was set in service (it was not
+        * extracted then), hence the check on entity->tree.
+        *
+        * An entity that is not in service is already active; this
+        * function is called for it only when the next_in_service
+        * entity below it has changed, which changes its budget and
+        * therefore its finish time and, possibly, its position.
+        */
+       if (!in_service || entity->tree)
+               bfq_active_extract(st, entity);
+
+       bfq_update_fin_time_enqueue(entity, st, false);
+}
+
+/*
+ * Dispatch between a true activation and a requeue/reposition of
+ * @entity, depending on whether @entity is currently in service on
+ * @sd or queued on the active tree of its service tree.
+ */
+static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
+                                         struct bfq_sched_data *sd,
+                                         bool non_blocking_wait_rq)
+{
+       struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+
+       if (sd->in_service_entity != entity && entity->tree != &st->active) {
+               /*
+                * Not in service and not queued on its active tree:
+                * the entity is idle and this is a true activation.
+                */
+               __bfq_activate_entity(entity, non_blocking_wait_rq);
+               return;
+       }
+
+       /*
+        * In service or already queued on the active tree: requeue or
+        * reposition.
+        */
+       __bfq_requeue_entity(entity);
+}
+
+
+/**
+ * bfq_activate_requeue_entity - activate or requeue an entity representing a
+ *                              bfq_queue, and activate, requeue or reposition
+ *                              all ancestors for which such an update becomes
+ *                              necessary.
+ * @entity: the entity to activate.
+ * @non_blocking_wait_rq: true if this entity was waiting for a request
+ * @requeue: true if this is a requeue, which implies that bfqq is
+ *          being expired; thus ALL its ancestors stop being served and must
+ *          therefore be requeued
+ */
+static void bfq_activate_requeue_entity(struct bfq_entity *entity,
+                                       bool non_blocking_wait_rq,
+                                       bool requeue)
+{
+       for_each_entity(entity) {
+               struct bfq_sched_data *sd = entity->sched_data;
+               bool go_up;
+
+               __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
+
+               /*
+                * next_in_service is refreshed at every level; the
+                * walk stops early only when that call returns false
+                * and this is not a requeue.
+                */
+               go_up = bfq_update_next_in_service(sd, entity);
+               if (!(go_up || requeue))
+                       break;
+       }
+}
+
+/**
+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
+ * @entity: the entity to deactivate.
+ * @ins_into_idle_tree: if false, the entity will not be put into the
+ *                     idle tree.
+ *
+ * Deactivates an entity, independently of its previous state.  It is
+ * meant for entities that are on a service tree; an entity whose on_st
+ * flag is clear is tolerated and left untouched.  Extracts the entity
+ * from the tree it is on and, if necessary and allowed, puts it on the
+ * idle tree.  The entity goes to the idle tree only if
+ * @ins_into_idle_tree is true and its finish time is still ahead of
+ * the virtual time of its service tree; otherwise it is forgotten.
+ *
+ * Return: false if the entity was not on a service tree (never
+ * activated, or already inactive), in which case nothing is done;
+ * true otherwise.
+ */
+bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree)
+{
+       struct bfq_sched_data *sd = entity->sched_data;
+       struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+       /* A truth value, later handed to a bool parameter: keep it bool. */
+       bool is_in_service = entity == sd->in_service_entity;
+
+       if (!entity->on_st) /* entity never activated, or already inactive */
+               return false;
+
+       /* Bring the finish time up to date with the service received. */
+       if (is_in_service)
+               bfq_calc_finish(entity, entity->service);
+
+       if (entity->tree == &st->active)
+               bfq_active_extract(st, entity);
+       else if (!is_in_service && entity->tree == &st->idle)
+               bfq_idle_extract(st, entity);
+
+       if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
+               bfq_forget_entity(st, entity, is_in_service);
+       else
+               bfq_idle_insert(st, entity);
+
+       return true;
+}
+
+/**
+ * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
+ * @entity: the entity to deactivate.
+ * @ins_into_idle_tree: true if the entity can be put on the idle tree
+ */
+static void bfq_deactivate_entity(struct bfq_entity *entity,
+                                 bool ins_into_idle_tree,
+                                 bool expiration)
+{
+       struct bfq_sched_data *sd;
+       struct bfq_entity *parent = NULL;
+
+       for_each_entity_safe(entity, parent) {
+               sd = entity->sched_data;
+
+               if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
+                       /*
+                        * entity is not in any tree any more, so
+                        * this deactivation is a no-op, and there is
+                        * nothing to change for upper-level entities
+                        * (in case of expiration, this can never
+                        * happen).
+                        */
+                       return;
+               }
+
+               if (sd->next_in_service == entity)
+                       /*
+                        * entity was the next_in_service entity,
+                        * then, since entity has just been
+                        * deactivated, a new one must be found.
+                        */
+                       bfq_update_next_in_service(sd, NULL);
+
+               if (sd->next_in_service)
+                       /*
+                        * The parent entity is still backlogged,
+                        * because next_in_service is not NULL. So, no
+                        * further upwards deactivation must be
+                        * performed.  Yet, next_in_service has
+                        * changed.  Then the schedule does need to be
+                        * updated upwards.
+                        */
+                       break;
+
+               /*
+                * If we get here, then the parent is no more
+                * backlogged and we need to propagate the
+                * deactivation upwards. Thus let the loop go on.
+                */
+
+               /*
+                * Also let parent be queued into the idle tree on
+                * deactivation, to preserve service guarantees, and
+                * assuming that who invoked this function does not
+                * need parent entities too to be removed completely.
+                */
+               ins_into_idle_tree = true;
+       }
+
+       /*
+        * If the deactivation loop is fully executed, then there are
+        * no more entities to touch and next loop is not executed at
+        * all. Otherwise, requeue remaining entities if they are
+        * about to stop receiving service, or reposition them if this
+        * is not the case.
+        */
+       entity = parent;
+       for_each_entity(entity) {
+               /*
+                * Invoke __bfq_requeue_entity on entity, even if
+                * already active, to requeue/reposition it in the
+                * active tree (because sd->next_in_service has
+                * changed)
+                */
+               __bfq_requeue_entity(entity);
+
+               sd = entity->sched_data;
+               if (!bfq_update_next_in_service(sd, entity) &&
+                   !expiration)
+                       /*
+                        * next_in_service unchanged or not causing
+                        * any change in entity->parent->sd, and no
+                        * requeueing needed for expiration: stop
+                        * here.
+                        */
+                       break;
+       }
+}
+
+/**
+ * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
+ *                       if needed, to have at least one entity eligible.
+ * @st: the service tree to act upon.
+ *
+ * Assumes that st is not empty.
+ */
+static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
+{
+       struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);
+
+       if (bfq_gt(root_entity->min_start, st->vtime))
+               return root_entity->min_start;
+
+       return st->vtime;
+}
+
+static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
+{
+       if (new_value > st->vtime) {
+               st->vtime = new_value;
+               bfq_forget_idle(st);
+       }
+}
+
+/**
+ * bfq_first_active_entity - find the eligible entity with
+ *                           the smallest finish time
+ * @st: the service tree to select from.
+ * @vtime: the system virtual time to use as a reference for eligibility
+ *
+ * This function searches the first schedulable entity, starting from the
+ * root of the tree and going on the left every time on this side there is
+ * a subtree with at least one eligible (start <= vtime) entity. The path on
+ * the right is followed only if a) the left subtree contains no eligible
+ * entities and b) no eligible entity has been found yet.
+ */
+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
+                                                 u64 vtime)
+{
+       struct bfq_entity *entry, *first = NULL;
+       struct rb_node *node = st->active.rb_node;
+
+       while (node) {
+               entry = rb_entry(node, struct bfq_entity, rb_node);
+left:
+               if (!bfq_gt(entry->start, vtime))
+                       first = entry;
+
+               if (node->rb_left) {
+                       entry = rb_entry(node->rb_left,
+                                        struct bfq_entity, rb_node);
+                       if (!bfq_gt(entry->min_start, vtime)) {
+                               node = node->rb_left;
+                               goto left;
+                       }
+               }
+               if (first)
+                       break;
+               node = node->rb_right;
+       }
+
+       return first;
+}
+
+/**
+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
+ * @st: the service tree.
+ *
+ * If there is no in-service entity for the sched_data st belongs to,
+ * then return the entity that will be set in service if:
+ * 1) the parent entity this st belongs to is set in service;
+ * 2) no entity belonging to such parent entity undergoes a state change
+ * that would influence the timestamps of the entity (e.g., becomes idle,
+ * becomes backlogged, changes its budget, ...).
+ *
+ * In this first case, update the virtual time in @st too (see the
+ * comments on this update inside the function).
+ *
+ * In contrast, if there is an in-service entity, then return the
+ * entity that would be set in service if not only the above
+ * conditions, but also the next one held true: the currently
+ * in-service entity, on expiration,
+ * 1) gets a finish time equal to the current one, or
+ * 2) is not eligible any more, or
+ * 3) is idle.
+ */
+static struct bfq_entity *
+__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
+{
+       struct bfq_entity *entity;
+       u64 new_vtime;
+
+       if (RB_EMPTY_ROOT(&st->active))
+               return NULL;
+
+       /*
+        * Get the value of the system virtual time for which at
+        * least one entity is eligible.
+        */
+       new_vtime = bfq_calc_vtime_jump(st);
+
+       /*
+        * If there is no in-service entity for the sched_data this
+        * active tree belongs to, then push the system virtual time
+        * up to the value that guarantees that at least one entity is
+        * eligible. If, instead, there is an in-service entity, then
+        * do not make any such update, because there is already an
+        * eligible entity, namely the in-service one (even if the
+        * entity is not on st, because it was extracted when set in
+        * service).
+        */
+       if (!in_service)
+               bfq_update_vtime(st, new_vtime);
+
+       entity = bfq_first_active_entity(st, new_vtime);
+
+       return entity;
+}
+
+/**
+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
+ * @sd: the sched_data.
+ *
+ * This function is invoked when there has been a change in the trees
+ * for sd, and we need to know which is the new next entity after this
+ * change.
+ */
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
+{
+       struct bfq_service_tree *st = sd->service_tree;
+       struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
+       struct bfq_entity *entity = NULL;
+       int class_idx = 0;
+
+       /*
+        * Choose from idle class, if needed to guarantee a minimum
+        * bandwidth to this class (and if there is some active entity
+        * in idle class). This should also mitigate
+        * priority-inversion problems in case a low priority task is
+        * holding file system resources.
+        */
+       if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
+                                  BFQ_CL_IDLE_TIMEOUT)) {
+               if (!RB_EMPTY_ROOT(&idle_class_st->active))
+                       class_idx = BFQ_IOPRIO_CLASSES - 1;
+               /* About to be served if backlogged, or not yet backlogged */
+               sd->bfq_class_idle_last_service = jiffies;
+       }
+
+       /*
+        * Find the next entity to serve for the highest-priority
+        * class, unless the idle class needs to be served.
+        */
+       for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
+               entity = __bfq_lookup_next_entity(st + class_idx,
+                                                 sd->in_service_entity);
+
+               if (entity)
+                       break;
+       }
+
+       if (!entity)
+               return NULL;
+
+       return entity;
+}
+
+bool next_queue_may_preempt(struct bfq_data *bfqd)
+{
+       struct bfq_sched_data *sd = &bfqd->root_group->sched_data;
+
+       return sd->next_in_service != sd->in_service_entity;
+}
+
+/*
+ * Get next queue for service.
+ */
+struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
+{
+       struct bfq_entity *entity = NULL;
+       struct bfq_sched_data *sd;
+       struct bfq_queue *bfqq;
+
+       if (bfqd->busy_queues == 0)
+               return NULL;
+
+       /*
+        * Traverse the path from the root to the leaf entity to
+        * serve. Set in service all the entities visited along the
+        * way.
+        */
+       sd = &bfqd->root_group->sched_data;
+       for (; sd ; sd = entity->my_sched_data) {
+               /*
+                * WARNING. We are about to set the in-service entity
+                * to sd->next_in_service, i.e., to the (cached) value
+                * returned by bfq_lookup_next_entity(sd) the last
+                * time it was invoked, i.e., the last time when the
+                * service order in sd changed as a consequence of the
+                * activation or deactivation of an entity. In this
+                * respect, if we execute bfq_lookup_next_entity(sd)
+                * in this very moment, it may, although with low
+                * probability, yield a different entity than that
+                * pointed to by sd->next_in_service. This rare event
+                * happens in case there was no CLASS_IDLE entity to
+                * serve for sd when bfq_lookup_next_entity(sd) was
+                * invoked for the last time, while there is now one
+                * such entity.
+                *
+                * If the above event happens, then the scheduling of
+                * such entity in CLASS_IDLE is postponed until the
+                * service of the sd->next_in_service entity
+                * finishes. In fact, when the latter is expired,
+                * bfq_lookup_next_entity(sd) gets called again,
+                * exactly to update sd->next_in_service.
+                */
+
+               /* Make next_in_service entity become in_service_entity */
+               entity = sd->next_in_service;
+               sd->in_service_entity = entity;
+
+               /*
+                * Reset the accumulator of the amount of service that
+                * the entity is about to receive.
+                */
+               entity->service = 0;
+
+               /*
+                * If entity is no longer a candidate for next
+                * service, then we extract it from its active tree,
+                * for the following reason. To further boost the
+                * throughput in some special case, BFQ needs to know
+                * which is the next candidate entity to serve, while
+                * there is already an entity in service. In this
+                * respect, to make it easy to compute/update the next
+                * candidate entity to serve after the current
+                * candidate has been set in service, there is a case
+                * where it is necessary to extract the current
+                * candidate from its service tree. Such a case is
+                * when the entity just set in service cannot be also
+                * a candidate for next service. Details about when
+                * this condition holds are reported in the comments
+                * on the function bfq_no_longer_next_in_service()
+                * invoked below.
+                */
+               if (bfq_no_longer_next_in_service(entity))
+                       bfq_active_extract(bfq_entity_service_tree(entity),
+                                          entity);
+
+               /*
+                * For the same reason why we may have just extracted
+                * entity from its active tree, we may need to update
+                * next_in_service for the sched_data of entity too,
+                * regardless of whether entity has been extracted.
+                * In fact, even if entity has not been extracted, a
+                * descendant entity may get extracted. Such an event
+                * would cause a change in next_in_service for the
+                * level of the descendant entity, and thus possibly
+                * back to upper levels.
+                *
+                * We cannot perform the resulting needed update
+                * before the end of this loop, because, to know which
+                * is the correct next-to-serve candidate entity for
+                * each level, we need first to find the leaf entity
+                * to set in service. In fact, only after we know
+                * which is the next-to-serve leaf entity, we can
+                * discover whether the parent entity of the leaf
+                * entity becomes the next-to-serve, and so on.
+                */
+
+       }
+
+       bfqq = bfq_entity_to_bfqq(entity);
+
+       /*
+        * We can finally update all next-to-serve entities along the
+        * path from the leaf entity just set in service to the root.
+        */
+       for_each_entity(entity) {
+               struct bfq_sched_data *sd = entity->sched_data;
+
+               if (!bfq_update_next_in_service(sd, NULL))
+                       break;
+       }
+
+       return bfqq;
+}
+
+void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
+{
+       struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
+       struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
+       struct bfq_entity *entity = in_serv_entity;
+
+       bfq_clear_bfqq_wait_request(in_serv_bfqq);
+       hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+       bfqd->in_service_queue = NULL;
+
+       /*
+        * When this function is called, all in-service entities have
+        * been properly deactivated or requeued, so we can safely
+        * execute the final step: reset in_service_entity along the
+        * path from entity to the root.
+        */
+       for_each_entity(entity)
+               entity->sched_data->in_service_entity = NULL;
+
+       /*
+        * in_serv_entity is no longer in service, so, if it is in no
+        * service tree either, then release the service reference to
+        * the queue it represents (taken with bfq_get_entity).
+        */
+       if (!in_serv_entity->on_st)
+               bfq_put_queue(in_serv_bfqq);
+}
+
+void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+                        bool ins_into_idle_tree, bool expiration)
+{
+       struct bfq_entity *entity = &bfqq->entity;
+
+       bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
+}
+
+void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+       struct bfq_entity *entity = &bfqq->entity;
+
+       bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
+                                   false);
+       bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
+}
+
+void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+       struct bfq_entity *entity = &bfqq->entity;
+
+       bfq_activate_requeue_entity(entity, false,
+                                   bfqq == bfqd->in_service_queue);
+}
+
+/*
+ * Called when the bfqq no longer has requests pending, remove it from
+ * the service tree. As a special case, it can be invoked during an
+ * expiration.
+ */
+void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+                      bool expiration)
+{
+       bfq_log_bfqq(bfqd, bfqq, "del from busy");
+
+       bfq_clear_bfqq_busy(bfqq);
+
+       bfqd->busy_queues--;
+
+       if (!bfqq->dispatched)
+               bfq_weights_tree_remove(bfqd, &bfqq->entity,
+                                       &bfqd->queue_weights_tree);
+
+       if (bfqq->wr_coeff > 1)
+               bfqd->wr_busy_queues--;
+
+       bfqg_stats_update_dequeue(bfqq_group(bfqq));
+
+       bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
+}
+
+/*
+ * Called when an inactive queue receives a new request.
+ */
+void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+       bfq_log_bfqq(bfqd, bfqq, "add to busy");
+
+       bfq_activate_bfqq(bfqd, bfqq);
+
+       bfq_mark_bfqq_busy(bfqq);
+       bfqd->busy_queues++;
+
+       if (!bfqq->dispatched)
+               if (bfqq->wr_coeff == 1)
+                       bfq_weights_tree_add(bfqd, &bfqq->entity,
+                                            &bfqd->queue_weights_tree);
+
+       if (bfqq->wr_coeff > 1)
+               bfqd->wr_busy_queues++;
+}
index e75878f..f4d2071 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/cgroup.h>
 
 #include <trace/events/block.h>
+#include "blk.h"
 
 /*
  * Test patch to inline a certain number of bi_io_vec's inside the bio
@@ -427,7 +428,8 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
  *   RETURNS:
  *   Pointer to new bio on success, NULL on failure.
  */
-struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
+struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
+                            struct bio_set *bs)
 {
        gfp_t saved_gfp = gfp_mask;
        unsigned front_pad;
@@ -1824,6 +1826,11 @@ static inline bool bio_remaining_done(struct bio *bio)
  *   bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
  *   way to end I/O on a bio. No one should call bi_end_io() directly on a
  *   bio unless they own it and thus know that it has an end_io function.
+ *
+ *   bio_endio() can be called several times on a bio that has been chained
+ *   using bio_chain().  The ->bi_end_io() function will only be called the
+ *   last time.  At this point the BLK_TA_COMPLETE tracing event will be
+ *   generated if BIO_TRACE_COMPLETION is set.
  **/
 void bio_endio(struct bio *bio)
 {
@@ -1844,6 +1851,13 @@ again:
                goto again;
        }
 
+       if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+               trace_block_bio_complete(bdev_get_queue(bio->bi_bdev),
+                                        bio, bio->bi_error);
+               bio_clear_flag(bio, BIO_TRACE_COMPLETION);
+       }
+
+       blk_throtl_bio_endio(bio);
        if (bio->bi_end_io)
                bio->bi_end_io(bio);
 }
@@ -1882,6 +1896,9 @@ struct bio *bio_split(struct bio *bio, int sectors,
 
        bio_advance(bio, split->bi_iter.bi_size);
 
+       if (bio_flagged(bio, BIO_TRACE_COMPLETION)) /* split inherits it */
+               bio_set_flag(split, BIO_TRACE_COMPLETION);
+
        return split;
 }
 EXPORT_SYMBOL(bio_split);
index bbe7ee0..7c29471 100644 (file)
@@ -772,6 +772,27 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
 }
 EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
 
+/* Performs queue bypass and policy enabled checks then looks up blkg. */
+static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
+                                         const struct blkcg_policy *pol,
+                                         struct request_queue *q)
+{
+       WARN_ON_ONCE(!rcu_read_lock_held());
+       lockdep_assert_held(q->queue_lock);
+
+       if (!blkcg_policy_enabled(q, pol))
+               return ERR_PTR(-EOPNOTSUPP);
+
+       /*
+        * This could be the first entry point of blkcg implementation and
+        * we shouldn't allow anything to go through for a bypassing queue.
+        */
+       if (unlikely(blk_queue_bypass(q)))
+               return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
+
+       return __blkg_lookup(blkcg, q, true /* update_hint */);
+}
+
 /**
  * blkg_conf_prep - parse and prepare for per-blkg config update
  * @blkcg: target block cgroup
@@ -789,6 +810,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
        __acquires(rcu) __acquires(disk->queue->queue_lock)
 {
        struct gendisk *disk;
+       struct request_queue *q;
        struct blkcg_gq *blkg;
        struct module *owner;
        unsigned int major, minor;
@@ -807,44 +829,97 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
        if (!disk)
                return -ENODEV;
        if (part) {
-               owner = disk->fops->owner;
-               put_disk(disk);
-               module_put(owner);
-               return -ENODEV;
+               ret = -ENODEV;
+               goto fail;
        }
 
-       rcu_read_lock();
-       spin_lock_irq(disk->queue->queue_lock);
+       q = disk->queue;
 
-       if (blkcg_policy_enabled(disk->queue, pol))
-               blkg = blkg_lookup_create(blkcg, disk->queue);
-       else
-               blkg = ERR_PTR(-EOPNOTSUPP);
+       rcu_read_lock();
+       spin_lock_irq(q->queue_lock);
 
+       blkg = blkg_lookup_check(blkcg, pol, q);
        if (IS_ERR(blkg)) {
                ret = PTR_ERR(blkg);
+               goto fail_unlock;
+       }
+
+       if (blkg)
+               goto success;
+
+       /*
+        * Create blkgs walking down from blkcg_root to @blkcg, so that all
+        * non-root blkgs have access to their parents.
+        */
+       while (true) {
+               struct blkcg *pos = blkcg;
+               struct blkcg *parent;
+               struct blkcg_gq *new_blkg;
+
+               parent = blkcg_parent(blkcg);
+               while (parent && !__blkg_lookup(parent, q, false)) {
+                       pos = parent;
+                       parent = blkcg_parent(parent);
+               }
+
+               /* Drop locks to do new blkg allocation with GFP_KERNEL. */
+               spin_unlock_irq(q->queue_lock);
                rcu_read_unlock();
-               spin_unlock_irq(disk->queue->queue_lock);
-               owner = disk->fops->owner;
-               put_disk(disk);
-               module_put(owner);
-               /*
-                * If queue was bypassing, we should retry.  Do so after a
-                * short msleep().  It isn't strictly necessary but queue
-                * can be bypassing for some time and it's always nice to
-                * avoid busy looping.
-                */
-               if (ret == -EBUSY) {
-                       msleep(10);
-                       ret = restart_syscall();
+
+               new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
+               if (unlikely(!new_blkg)) {
+                       ret = -ENOMEM;
+                       goto fail;
                }
-               return ret;
-       }
 
+               rcu_read_lock();
+               spin_lock_irq(q->queue_lock);
+
+               blkg = blkg_lookup_check(pos, pol, q);
+               if (IS_ERR(blkg)) {
+                       ret = PTR_ERR(blkg);
+                       /* new_blkg was never linked in; free it here */
+                       blkg_free(new_blkg);
+                       goto fail_unlock;
+               }
+
+               if (blkg) {
+                       blkg_free(new_blkg);
+               } else {
+                       blkg = blkg_create(pos, q, new_blkg);
+                       if (unlikely(IS_ERR(blkg))) {
+                               ret = PTR_ERR(blkg);
+                               goto fail_unlock;
+                       }
+               }
+
+               if (pos == blkcg)
+                       goto success;
+       }
+success:
        ctx->disk = disk;
        ctx->blkg = blkg;
        ctx->body = body;
        return 0;
+
+fail_unlock:
+       spin_unlock_irq(q->queue_lock);
+       rcu_read_unlock();
+fail:
+       owner = disk->fops->owner;
+       put_disk(disk);
+       module_put(owner);
+       /*
+        * If queue was bypassing, we should retry.  Do so after a
+        * short msleep().  It isn't strictly necessary but queue
+        * can be bypassing for some time and it's always nice to
+        * avoid busy looping.
+        */
+       if (ret == -EBUSY) {
+               msleep(10);
+               ret = restart_syscall();
+       }
+       return ret;
 }
 EXPORT_SYMBOL_GPL(blkg_conf_prep);
 
index d772c22..24886b6 100644 (file)
@@ -268,10 +268,8 @@ void blk_sync_queue(struct request_queue *q)
                struct blk_mq_hw_ctx *hctx;
                int i;
 
-               queue_for_each_hw_ctx(q, hctx, i) {
-                       cancel_work_sync(&hctx->run_work);
-                       cancel_delayed_work_sync(&hctx->delay_work);
-               }
+               queue_for_each_hw_ctx(q, hctx, i)
+                       cancel_delayed_work_sync(&hctx->run_work);
        } else {
                cancel_delayed_work_sync(&q->delay_work);
        }
@@ -500,6 +498,13 @@ void blk_set_queue_dying(struct request_queue *q)
        queue_flag_set(QUEUE_FLAG_DYING, q);
        spin_unlock_irq(q->queue_lock);
 
+       /*
+        * When queue DYING flag is set, we need to block new req
+        * entering queue, so we call blk_freeze_queue_start() to
+        * prevent I/O from crossing blk_queue_enter().
+        */
+       blk_freeze_queue_start(q);
+
        if (q->mq_ops)
                blk_mq_wake_waiters(q);
        else {
@@ -556,9 +561,13 @@ void blk_cleanup_queue(struct request_queue *q)
         * prevent that q->request_fn() gets invoked after draining finished.
         */
        blk_freeze_queue(q);
-       spin_lock_irq(lock);
-       if (!q->mq_ops)
+       if (!q->mq_ops) {
+               spin_lock_irq(lock);
                __blk_drain_queue(q, true);
+       } else {
+               blk_mq_debugfs_unregister_mq(q);
+               spin_lock_irq(lock);
+       }
        queue_flag_set(QUEUE_FLAG_DEAD, q);
        spin_unlock_irq(lock);
 
@@ -669,6 +678,15 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
                if (nowait)
                        return -EBUSY;
 
+               /*
+                * read pair of barrier in blk_freeze_queue_start(),
+                * we need to order reading __PERCPU_REF_DEAD flag of
+                * .q_usage_counter and reading .mq_freeze_depth or
+                * queue dying flag, otherwise the following wait may
+                * never return if the two reads are reordered.
+                */
+               smp_rmb();
+
                ret = wait_event_interruptible(q->mq_freeze_wq,
                                !atomic_read(&q->mq_freeze_depth) ||
                                blk_queue_dying(q));
@@ -720,6 +738,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
        if (!q->backing_dev_info)
                goto fail_split;
 
+       q->stats = blk_alloc_queue_stats();
+       if (!q->stats)
+               goto fail_stats;
+
        q->backing_dev_info->ra_pages =
                        (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
        q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
@@ -776,6 +798,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 fail_ref:
        percpu_ref_exit(&q->q_usage_counter);
 fail_bdi:
+       blk_free_queue_stats(q->stats);
+fail_stats:
        bdi_put(q->backing_dev_info);
 fail_split:
        bioset_free(q->bio_split);
@@ -889,7 +913,6 @@ out_exit_flush_rq:
                q->exit_rq_fn(q, q->fq->flush_rq);
 out_free_flush_queue:
        blk_free_flush_queue(q->fq);
-       wbt_exit(q);
        return -ENOMEM;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1128,7 +1151,6 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
 
        blk_rq_init(q, rq);
        blk_rq_set_rl(rq, rl);
-       blk_rq_set_prio(rq, ioc);
        rq->cmd_flags = op;
        rq->rq_flags = rq_flags;
 
@@ -1608,17 +1630,23 @@ out:
        return ret;
 }
 
-void init_request_from_bio(struct request *req, struct bio *bio)
+void blk_init_request_from_bio(struct request *req, struct bio *bio)
 {
+       struct io_context *ioc = rq_ioc(bio);
+
        if (bio->bi_opf & REQ_RAHEAD)
                req->cmd_flags |= REQ_FAILFAST_MASK;
 
-       req->errors = 0;
        req->__sector = bio->bi_iter.bi_sector;
        if (ioprio_valid(bio_prio(bio)))
                req->ioprio = bio_prio(bio);
+       else if (ioc)
+               req->ioprio = ioc->ioprio;
+       else
+               req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
        blk_rq_bio_prep(req->q, req, bio);
 }
+EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
 
 static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 {
@@ -1709,7 +1737,7 @@ get_rq:
         * We don't worry about that case for efficiency. It won't happen
         * often, and the elevators are able to handle it.
         */
-       init_request_from_bio(req, bio);
+       blk_init_request_from_bio(req, bio);
 
        if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
                req->cpu = raw_smp_processor_id();
@@ -1936,7 +1964,13 @@ generic_make_request_checks(struct bio *bio)
        if (!blkcg_bio_issue_check(q, bio))
                return false;
 
-       trace_block_bio_queue(q, bio);
+       if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+               trace_block_bio_queue(q, bio);
+               /* Now that enqueuing has been traced, we need to trace
+                * completion as well.
+                */
+               bio_set_flag(bio, BIO_TRACE_COMPLETION);
+       }
        return true;
 
 not_supported:
@@ -2478,7 +2512,7 @@ void blk_start_request(struct request *req)
        blk_dequeue_request(req);
 
        if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
-               blk_stat_set_issue_time(&req->issue_stat);
+               blk_stat_set_issue(&req->issue_stat, blk_rq_sectors(req));
                req->rq_flags |= RQF_STATS;
                wbt_issue(req->q->rq_wb, &req->issue_stat);
        }
@@ -2540,22 +2574,11 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 {
        int total_bytes;
 
-       trace_block_rq_complete(req->q, req, nr_bytes);
+       trace_block_rq_complete(req, error, nr_bytes);
 
        if (!req->bio)
                return false;
 
-       /*
-        * For fs requests, rq is just carrier of independent bio's
-        * and each partial completion should be handled separately.
-        * Reset per-request error on each partial completion.
-        *
-        * TODO: tj: This is too subtle.  It would be better to let
-        * low level drivers do what they see fit.
-        */
-       if (!blk_rq_is_passthrough(req))
-               req->errors = 0;
-
        if (error && !blk_rq_is_passthrough(req) &&
            !(req->rq_flags & RQF_QUIET)) {
                char *error_type;
@@ -2601,6 +2624,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
                if (bio_bytes == bio->bi_iter.bi_size)
                        req->bio = bio->bi_next;
 
+               /* Completion has already been traced */
+               bio_clear_flag(bio, BIO_TRACE_COMPLETION);
                req_bio_endio(req, bio, bio_bytes, error);
 
                total_bytes += bio_bytes;
@@ -2699,7 +2724,7 @@ void blk_finish_request(struct request *req, int error)
        struct request_queue *q = req->q;
 
        if (req->rq_flags & RQF_STATS)
-               blk_stat_add(&q->rq_stats[rq_data_dir(req)], req);
+               blk_stat_add(req);
 
        if (req->rq_flags & RQF_QUEUED)
                blk_queue_end_tag(q, req);
@@ -2776,7 +2801,7 @@ static bool blk_end_bidi_request(struct request *rq, int error,
  *     %false - we are done with this request
  *     %true  - still buffers pending for this request
  **/
-bool __blk_end_bidi_request(struct request *rq, int error,
+static bool __blk_end_bidi_request(struct request *rq, int error,
                                   unsigned int nr_bytes, unsigned int bidi_bytes)
 {
        if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
@@ -2829,43 +2854,6 @@ void blk_end_request_all(struct request *rq, int error)
 EXPORT_SYMBOL(blk_end_request_all);
 
 /**
- * blk_end_request_cur - Helper function to finish the current request chunk.
- * @rq: the request to finish the current chunk for
- * @error: %0 for success, < %0 for error
- *
- * Description:
- *     Complete the current consecutively mapped chunk from @rq.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- */
-bool blk_end_request_cur(struct request *rq, int error)
-{
-       return blk_end_request(rq, error, blk_rq_cur_bytes(rq));
-}
-EXPORT_SYMBOL(blk_end_request_cur);
-
-/**
- * blk_end_request_err - Finish a request till the next failure boundary.
- * @rq: the request to finish till the next failure boundary for
- * @error: must be negative errno
- *
- * Description:
- *     Complete @rq till the next failure boundary.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- */
-bool blk_end_request_err(struct request *rq, int error)
-{
-       WARN_ON(error >= 0);
-       return blk_end_request(rq, error, blk_rq_err_bytes(rq));
-}
-EXPORT_SYMBOL_GPL(blk_end_request_err);
-
-/**
  * __blk_end_request - Helper function for drivers to complete the request.
  * @rq:       the request being processed
  * @error:    %0 for success, < %0 for error
@@ -2924,26 +2912,6 @@ bool __blk_end_request_cur(struct request *rq, int error)
 }
 EXPORT_SYMBOL(__blk_end_request_cur);
 
-/**
- * __blk_end_request_err - Finish a request till the next failure boundary.
- * @rq: the request to finish till the next failure boundary for
- * @error: must be negative errno
- *
- * Description:
- *     Complete @rq till the next failure boundary.  Must be called
- *     with queue lock held.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- */
-bool __blk_end_request_err(struct request *rq, int error)
-{
-       WARN_ON(error >= 0);
-       return __blk_end_request(rq, error, blk_rq_err_bytes(rq));
-}
-EXPORT_SYMBOL_GPL(__blk_end_request_err);
-
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
                     struct bio *bio)
 {
@@ -3106,6 +3074,13 @@ int kblockd_schedule_work_on(int cpu, struct work_struct *work)
 }
 EXPORT_SYMBOL(kblockd_schedule_work_on);
 
+int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
+                               unsigned long delay)
+{
+       return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
+}
+EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
+
 int kblockd_schedule_delayed_work(struct delayed_work *dwork,
                                  unsigned long delay)
 {
index 8cd0e9b..a9451e3 100644 (file)
@@ -69,8 +69,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 
        if (unlikely(blk_queue_dying(q))) {
                rq->rq_flags |= RQF_QUIET;
-               rq->errors = -ENXIO;
-               __blk_end_request_all(rq, rq->errors);
+               __blk_end_request_all(rq, -ENXIO);
                spin_unlock_irq(q->queue_lock);
                return;
        }
@@ -92,11 +91,10 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
  *    Insert a fully prepared request at the back of the I/O scheduler queue
  *    for execution and wait for completion.
  */
-int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
+void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
                   struct request *rq, int at_head)
 {
        DECLARE_COMPLETION_ONSTACK(wait);
-       int err = 0;
        unsigned long hang_check;
 
        rq->end_io_data = &wait;
@@ -108,10 +106,5 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
                while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2)));
        else
                wait_for_completion_io(&wait);
-
-       if (rq->errors)
-               err = -EIO;
-
-       return err;
 }
 EXPORT_SYMBOL(blk_execute_rq);
index 0d5a9c1..c4e0880 100644 (file)
@@ -447,7 +447,7 @@ void blk_insert_flush(struct request *rq)
                if (q->mq_ops)
                        blk_mq_end_request(rq, 0);
                else
-                       __blk_end_bidi_request(rq, 0, 0, 0);
+                       __blk_end_request(rq, 0, 0);
                return;
        }
 
@@ -497,8 +497,7 @@ void blk_insert_flush(struct request *rq)
  * Description:
  *    Issue a flush for the block device in question. Caller can supply
  *    room for storing the error offset in case of a flush error, if they
- *    wish to. If WAIT flag is not passed then caller may check only what
- *    request was pushed in some internal queue for later handling.
+ *    wish to.
  */
 int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
                sector_t *error_sector)
index 9f0ff5b..0f891a9 100644 (file)
@@ -389,7 +389,7 @@ static int blk_integrity_nop_fn(struct blk_integrity_iter *iter)
        return 0;
 }
 
-static struct blk_integrity_profile nop_profile = {
+static const struct blk_integrity_profile nop_profile = {
        .name = "nop",
        .generate_fn = blk_integrity_nop_fn,
        .verify_fn = blk_integrity_nop_fn,
@@ -412,12 +412,13 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
 
        bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE |
                template->flags;
-       bi->interval_exp = ilog2(queue_logical_block_size(disk->queue));
+       bi->interval_exp = template->interval_exp ? :
+               ilog2(queue_logical_block_size(disk->queue));
        bi->profile = template->profile ? template->profile : &nop_profile;
        bi->tuple_size = template->tuple_size;
        bi->tag_size = template->tag_size;
 
-       blk_integrity_revalidate(disk);
+       disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
 }
 EXPORT_SYMBOL(blk_integrity_register);
 
@@ -430,26 +431,11 @@ EXPORT_SYMBOL(blk_integrity_register);
  */
 void blk_integrity_unregister(struct gendisk *disk)
 {
-       blk_integrity_revalidate(disk);
+       disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES;
        memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity));
 }
 EXPORT_SYMBOL(blk_integrity_unregister);
 
-void blk_integrity_revalidate(struct gendisk *disk)
-{
-       struct blk_integrity *bi = &disk->queue->integrity;
-
-       if (!(disk->flags & GENHD_FL_UP))
-               return;
-
-       if (bi->profile)
-               disk->queue->backing_dev_info->capabilities |=
-                       BDI_CAP_STABLE_WRITES;
-       else
-               disk->queue->backing_dev_info->capabilities &=
-                       ~BDI_CAP_STABLE_WRITES;
-}
-
 void blk_integrity_add(struct gendisk *disk)
 {
        if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype,
index ed1e78e..e8caecd 100644 (file)
@@ -37,17 +37,12 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                return -ENXIO;
 
        if (flags & BLKDEV_DISCARD_SECURE) {
-               if (flags & BLKDEV_DISCARD_ZERO)
-                       return -EOPNOTSUPP;
                if (!blk_queue_secure_erase(q))
                        return -EOPNOTSUPP;
                op = REQ_OP_SECURE_ERASE;
        } else {
                if (!blk_queue_discard(q))
                        return -EOPNOTSUPP;
-               if ((flags & BLKDEV_DISCARD_ZERO) &&
-                   !q->limits.discard_zeroes_data)
-                       return -EOPNOTSUPP;
                op = REQ_OP_DISCARD;
        }
 
@@ -109,7 +104,7 @@ EXPORT_SYMBOL(__blkdev_issue_discard);
  * @sector:    start sector
  * @nr_sects:  number of sectors to discard
  * @gfp_mask:  memory allocation flags (for bio_alloc)
- * @flags:     BLKDEV_IFL_* flags to control behaviour
+ * @flags:     BLKDEV_DISCARD_* flags to control behaviour
  *
  * Description:
  *    Issue a discard request for the sectors in question.
@@ -126,7 +121,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                        &bio);
        if (!ret && bio) {
                ret = submit_bio_wait(bio);
-               if (ret == -EOPNOTSUPP && !(flags & BLKDEV_DISCARD_ZERO))
+               if (ret == -EOPNOTSUPP)
                        ret = 0;
                bio_put(bio);
        }
@@ -226,20 +221,9 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL(blkdev_issue_write_same);
 
-/**
- * __blkdev_issue_write_zeroes - generate number of bios with WRITE ZEROES
- * @bdev:      blockdev to issue
- * @sector:    start sector
- * @nr_sects:  number of sectors to write
- * @gfp_mask:  memory allocation flags (for bio_alloc)
- * @biop:      pointer to anchor bio
- *
- * Description:
- *  Generate and issue number of bios(REQ_OP_WRITE_ZEROES) with zerofiled pages.
- */
 static int __blkdev_issue_write_zeroes(struct block_device *bdev,
                sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
-               struct bio **biop)
+               struct bio **biop, unsigned flags)
 {
        struct bio *bio = *biop;
        unsigned int max_write_zeroes_sectors;
@@ -258,7 +242,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
                bio = next_bio(bio, 0, gfp_mask);
                bio->bi_iter.bi_sector = sector;
                bio->bi_bdev = bdev;
-               bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0);
+               bio->bi_opf = REQ_OP_WRITE_ZEROES;
+               if (flags & BLKDEV_ZERO_NOUNMAP)
+                       bio->bi_opf |= REQ_NOUNMAP;
 
                if (nr_sects > max_write_zeroes_sectors) {
                        bio->bi_iter.bi_size = max_write_zeroes_sectors << 9;
@@ -282,14 +268,27 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
  * @nr_sects:  number of sectors to write
  * @gfp_mask:  memory allocation flags (for bio_alloc)
  * @biop:      pointer to anchor bio
- * @discard:   discard flag
+ * @flags:     controls detailed behavior
  *
  * Description:
- *  Generate and issue number of bios with zerofiled pages.
+ *  Zero-fill a block range, either using hardware offload or by explicitly
+ *  writing zeroes to the device.
+ *
+ *  Note that this function may fail with -EOPNOTSUPP if the driver signals
+ *  zeroing offload support, but the device fails to process the command (for
+ *  some devices there is no non-destructive way to verify whether this
+ *  operation is actually supported).  In this case the caller should
+ *  retry the call to blkdev_issue_zeroout() and the fallback path will be used.
+ *
+ *  If a device is using logical block provisioning, the underlying space will
+ *  not be released if @flags contains %BLKDEV_ZERO_NOUNMAP.
+ *
+ *  If @flags contains %BLKDEV_ZERO_NOFALLBACK, the function will return
+ *  -EOPNOTSUPP if no explicit hardware offload for zeroing is provided.
  */
 int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
-               bool discard)
+               unsigned flags)
 {
        int ret;
        int bi_size = 0;
@@ -302,8 +301,8 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
                return -EINVAL;
 
        ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
-                       biop);
-       if (ret == 0 || (ret && ret != -EOPNOTSUPP))
+                       biop, flags);
+       if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK))
                goto out;
 
        ret = 0;
@@ -337,40 +336,23 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout);
  * @sector:    start sector
  * @nr_sects:  number of sectors to write
  * @gfp_mask:  memory allocation flags (for bio_alloc)
- * @discard:   whether to discard the block range
+ * @flags:     controls detailed behavior
  *
  * Description:
- *  Zero-fill a block range.  If the discard flag is set and the block
- *  device guarantees that subsequent READ operations to the block range
- *  in question will return zeroes, the blocks will be discarded. Should
- *  the discard request fail, if the discard flag is not set, or if
- *  discard_zeroes_data is not supported, this function will resort to
- *  zeroing the blocks manually, thus provisioning (allocating,
- *  anchoring) them. If the block device supports WRITE ZEROES or WRITE SAME
- *  command(s), blkdev_issue_zeroout() will use it to optimize the process of
- *  clearing the block range. Otherwise the zeroing will be performed
- *  using regular WRITE calls.
+ *  Zero-fill a block range, either using hardware offload or by explicitly
+ *  writing zeroes to the device.  See __blkdev_issue_zeroout() for the
+ *  valid values for @flags.
  */
 int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-                        sector_t nr_sects, gfp_t gfp_mask, bool discard)
+               sector_t nr_sects, gfp_t gfp_mask, unsigned flags)
 {
        int ret;
        struct bio *bio = NULL;
        struct blk_plug plug;
 
-       if (discard) {
-               if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
-                               BLKDEV_DISCARD_ZERO))
-                       return 0;
-       }
-
-       if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
-                       ZERO_PAGE(0)))
-               return 0;
-
        blk_start_plug(&plug);
        ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
-                       &bio, discard);
+                       &bio, flags);
        if (ret == 0 && bio) {
                ret = submit_bio_wait(bio);
                bio_put(bio);
index 2afa262..3990ae4 100644 (file)
@@ -54,6 +54,20 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
        return bio_split(bio, split_sectors, GFP_NOIO, bs);
 }
 
+static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
+               struct bio *bio, struct bio_set *bs, unsigned *nsegs)
+{
+       *nsegs = 1;
+
+       if (!q->limits.max_write_zeroes_sectors)
+               return NULL;
+
+       if (bio_sectors(bio) <= q->limits.max_write_zeroes_sectors)
+               return NULL;
+
+       return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs);
+}
+
 static struct bio *blk_bio_write_same_split(struct request_queue *q,
                                            struct bio *bio,
                                            struct bio_set *bs,
@@ -200,8 +214,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
                split = blk_bio_discard_split(q, *bio, bs, &nsegs);
                break;
        case REQ_OP_WRITE_ZEROES:
-               split = NULL;
-               nsegs = (*bio)->bi_phys_segments;
+               split = blk_bio_write_zeroes_split(q, *bio, bs, &nsegs);
                break;
        case REQ_OP_WRITE_SAME:
                split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
index f6d9179..bcd2a7d 100644 (file)
@@ -43,11 +43,157 @@ static int blk_mq_debugfs_seq_open(struct inode *inode, struct file *file,
        return ret;
 }
 
+static int blk_flags_show(struct seq_file *m, const unsigned long flags,
+                         const char *const *flag_name, int flag_name_count)
+{
+       bool sep = false;
+       int i;
+
+       for (i = 0; i < sizeof(flags) * BITS_PER_BYTE; i++) {
+               if (!(flags & BIT(i)))
+                       continue;
+               if (sep)
+                       seq_puts(m, " ");
+               sep = true;
+               if (i < flag_name_count && flag_name[i])
+                       seq_puts(m, flag_name[i]);
+               else
+                       seq_printf(m, "%d", i);
+       }
+       return 0;
+}
+
+static const char *const blk_queue_flag_name[] = {
+       [QUEUE_FLAG_QUEUED]      = "QUEUED",
+       [QUEUE_FLAG_STOPPED]     = "STOPPED",
+       [QUEUE_FLAG_SYNCFULL]    = "SYNCFULL",
+       [QUEUE_FLAG_ASYNCFULL]   = "ASYNCFULL",
+       [QUEUE_FLAG_DYING]       = "DYING",
+       [QUEUE_FLAG_BYPASS]      = "BYPASS",
+       [QUEUE_FLAG_BIDI]        = "BIDI",
+       [QUEUE_FLAG_NOMERGES]    = "NOMERGES",
+       [QUEUE_FLAG_SAME_COMP]   = "SAME_COMP",
+       [QUEUE_FLAG_FAIL_IO]     = "FAIL_IO",
+       [QUEUE_FLAG_STACKABLE]   = "STACKABLE",
+       [QUEUE_FLAG_NONROT]      = "NONROT",
+       [QUEUE_FLAG_IO_STAT]     = "IO_STAT",
+       [QUEUE_FLAG_DISCARD]     = "DISCARD",
+       [QUEUE_FLAG_NOXMERGES]   = "NOXMERGES",
+       [QUEUE_FLAG_ADD_RANDOM]  = "ADD_RANDOM",
+       [QUEUE_FLAG_SECERASE]    = "SECERASE",
+       [QUEUE_FLAG_SAME_FORCE]  = "SAME_FORCE",
+       [QUEUE_FLAG_DEAD]        = "DEAD",
+       [QUEUE_FLAG_INIT_DONE]   = "INIT_DONE",
+       [QUEUE_FLAG_NO_SG_MERGE] = "NO_SG_MERGE",
+       [QUEUE_FLAG_POLL]        = "POLL",
+       [QUEUE_FLAG_WC]          = "WC",
+       [QUEUE_FLAG_FUA]         = "FUA",
+       [QUEUE_FLAG_FLUSH_NQ]    = "FLUSH_NQ",
+       [QUEUE_FLAG_DAX]         = "DAX",
+       [QUEUE_FLAG_STATS]       = "STATS",
+       [QUEUE_FLAG_POLL_STATS]  = "POLL_STATS",
+       [QUEUE_FLAG_REGISTERED]  = "REGISTERED",
+};
+
+static int blk_queue_flags_show(struct seq_file *m, void *v)
+{
+       struct request_queue *q = m->private;
+
+       blk_flags_show(m, q->queue_flags, blk_queue_flag_name,
+                      ARRAY_SIZE(blk_queue_flag_name));
+       seq_puts(m, "\n");
+       return 0;
+}
+
+static ssize_t blk_queue_flags_store(struct file *file, const char __user *ubuf,
+                                    size_t len, loff_t *offp)
+{
+       struct request_queue *q = file_inode(file)->i_private;
+       char op[16] = { }, *s;
+
+       len = min(len, sizeof(op) - 1);
+       if (copy_from_user(op, ubuf, len))
+               return -EFAULT;
+       s = op;
+       strsep(&s, " \t\n"); /* strip trailing whitespace */
+       if (strcmp(op, "run") == 0) {
+               blk_mq_run_hw_queues(q, true);
+       } else if (strcmp(op, "start") == 0) {
+               blk_mq_start_stopped_hw_queues(q, true);
+       } else {
+               pr_err("%s: unsupported operation %s. Use either 'run' or 'start'\n",
+                      __func__, op);
+               return -EINVAL;
+       }
+       return len;
+}
+
+static int blk_queue_flags_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, blk_queue_flags_show, inode->i_private);
+}
+
+static const struct file_operations blk_queue_flags_fops = {
+       .open           = blk_queue_flags_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+       .write          = blk_queue_flags_store,
+};
+
+static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
+{
+       if (stat->nr_samples) {
+               seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu",
+                          stat->nr_samples, stat->mean, stat->min, stat->max);
+       } else {
+               seq_puts(m, "samples=0");
+       }
+}
+
+static int queue_poll_stat_show(struct seq_file *m, void *v)
+{
+       struct request_queue *q = m->private;
+       int bucket;
+
+       for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS/2; bucket++) {
+               seq_printf(m, "read  (%d Bytes): ", 1 << (9+bucket));
+               print_stat(m, &q->poll_stat[2*bucket]);
+               seq_puts(m, "\n");
+
+               seq_printf(m, "write (%d Bytes): ",  1 << (9+bucket));
+               print_stat(m, &q->poll_stat[2*bucket+1]);
+               seq_puts(m, "\n");
+       }
+       return 0;
+}
+
+static int queue_poll_stat_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, queue_poll_stat_show, inode->i_private);
+}
+
+static const struct file_operations queue_poll_stat_fops = {
+       .open           = queue_poll_stat_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static const char *const hctx_state_name[] = {
+       [BLK_MQ_S_STOPPED]       = "STOPPED",
+       [BLK_MQ_S_TAG_ACTIVE]    = "TAG_ACTIVE",
+       [BLK_MQ_S_SCHED_RESTART] = "SCHED_RESTART",
+       [BLK_MQ_S_TAG_WAITING]   = "TAG_WAITING",
+
+};
 static int hctx_state_show(struct seq_file *m, void *v)
 {
        struct blk_mq_hw_ctx *hctx = m->private;
 
-       seq_printf(m, "0x%lx\n", hctx->state);
+       blk_flags_show(m, hctx->state, hctx_state_name,
+                      ARRAY_SIZE(hctx_state_name));
+       seq_puts(m, "\n");
        return 0;
 }
 
@@ -63,11 +209,35 @@ static const struct file_operations hctx_state_fops = {
        .release        = single_release,
 };
 
+static const char *const alloc_policy_name[] = {
+       [BLK_TAG_ALLOC_FIFO]    = "fifo",
+       [BLK_TAG_ALLOC_RR]      = "rr",
+};
+
+static const char *const hctx_flag_name[] = {
+       [ilog2(BLK_MQ_F_SHOULD_MERGE)]  = "SHOULD_MERGE",
+       [ilog2(BLK_MQ_F_TAG_SHARED)]    = "TAG_SHARED",
+       [ilog2(BLK_MQ_F_SG_MERGE)]      = "SG_MERGE",
+       [ilog2(BLK_MQ_F_BLOCKING)]      = "BLOCKING",
+       [ilog2(BLK_MQ_F_NO_SCHED)]      = "NO_SCHED",
+};
+
 static int hctx_flags_show(struct seq_file *m, void *v)
 {
        struct blk_mq_hw_ctx *hctx = m->private;
-
-       seq_printf(m, "0x%lx\n", hctx->flags);
+       const int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(hctx->flags);
+
+       seq_puts(m, "alloc_policy=");
+       if (alloc_policy < ARRAY_SIZE(alloc_policy_name) &&
+           alloc_policy_name[alloc_policy])
+               seq_puts(m, alloc_policy_name[alloc_policy]);
+       else
+               seq_printf(m, "%d", alloc_policy);
+       seq_puts(m, " ");
+       blk_flags_show(m,
+                      hctx->flags ^ BLK_ALLOC_POLICY_TO_MQ_FLAG(alloc_policy),
+                      hctx_flag_name, ARRAY_SIZE(hctx_flag_name));
+       seq_puts(m, "\n");
        return 0;
 }
 
@@ -83,13 +253,83 @@ static const struct file_operations hctx_flags_fops = {
        .release        = single_release,
 };
 
+static const char *const op_name[] = {
+       [REQ_OP_READ]           = "READ",
+       [REQ_OP_WRITE]          = "WRITE",
+       [REQ_OP_FLUSH]          = "FLUSH",
+       [REQ_OP_DISCARD]        = "DISCARD",
+       [REQ_OP_ZONE_REPORT]    = "ZONE_REPORT",
+       [REQ_OP_SECURE_ERASE]   = "SECURE_ERASE",
+       [REQ_OP_ZONE_RESET]     = "ZONE_RESET",
+       [REQ_OP_WRITE_SAME]     = "WRITE_SAME",
+       [REQ_OP_WRITE_ZEROES]   = "WRITE_ZEROES",
+       [REQ_OP_SCSI_IN]        = "SCSI_IN",
+       [REQ_OP_SCSI_OUT]       = "SCSI_OUT",
+       [REQ_OP_DRV_IN]         = "DRV_IN",
+       [REQ_OP_DRV_OUT]        = "DRV_OUT",
+};
+
+static const char *const cmd_flag_name[] = {
+       [__REQ_FAILFAST_DEV]            = "FAILFAST_DEV",
+       [__REQ_FAILFAST_TRANSPORT]      = "FAILFAST_TRANSPORT",
+       [__REQ_FAILFAST_DRIVER]         = "FAILFAST_DRIVER",
+       [__REQ_SYNC]                    = "SYNC",
+       [__REQ_META]                    = "META",
+       [__REQ_PRIO]                    = "PRIO",
+       [__REQ_NOMERGE]                 = "NOMERGE",
+       [__REQ_IDLE]                    = "IDLE",
+       [__REQ_INTEGRITY]               = "INTEGRITY",
+       [__REQ_FUA]                     = "FUA",
+       [__REQ_PREFLUSH]                = "PREFLUSH",
+       [__REQ_RAHEAD]                  = "RAHEAD",
+       [__REQ_BACKGROUND]              = "BACKGROUND",
+       [__REQ_NR_BITS]                 = "NR_BITS",
+};
+
+static const char *const rqf_name[] = {
+       [ilog2((__force u32)RQF_SORTED)]                = "SORTED",
+       [ilog2((__force u32)RQF_STARTED)]               = "STARTED",
+       [ilog2((__force u32)RQF_QUEUED)]                = "QUEUED",
+       [ilog2((__force u32)RQF_SOFTBARRIER)]           = "SOFTBARRIER",
+       [ilog2((__force u32)RQF_FLUSH_SEQ)]             = "FLUSH_SEQ",
+       [ilog2((__force u32)RQF_MIXED_MERGE)]           = "MIXED_MERGE",
+       [ilog2((__force u32)RQF_MQ_INFLIGHT)]           = "MQ_INFLIGHT",
+       [ilog2((__force u32)RQF_DONTPREP)]              = "DONTPREP",
+       [ilog2((__force u32)RQF_PREEMPT)]               = "PREEMPT",
+       [ilog2((__force u32)RQF_COPY_USER)]             = "COPY_USER",
+       [ilog2((__force u32)RQF_FAILED)]                = "FAILED",
+       [ilog2((__force u32)RQF_QUIET)]                 = "QUIET",
+       [ilog2((__force u32)RQF_ELVPRIV)]               = "ELVPRIV",
+       [ilog2((__force u32)RQF_IO_STAT)]               = "IO_STAT",
+       [ilog2((__force u32)RQF_ALLOCED)]               = "ALLOCED",
+       [ilog2((__force u32)RQF_PM)]                    = "PM",
+       [ilog2((__force u32)RQF_HASHED)]                = "HASHED",
+       [ilog2((__force u32)RQF_STATS)]                 = "STATS",
+       [ilog2((__force u32)RQF_SPECIAL_PAYLOAD)]       = "SPECIAL_PAYLOAD",
+};
+
 static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v)
 {
        struct request *rq = list_entry_rq(v);
-
-       seq_printf(m, "%p {.cmd_flags=0x%x, .rq_flags=0x%x, .tag=%d, .internal_tag=%d}\n",
-                  rq, rq->cmd_flags, (__force unsigned int)rq->rq_flags,
-                  rq->tag, rq->internal_tag);
+       const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
+       const unsigned int op = rq->cmd_flags & REQ_OP_MASK;
+
+       seq_printf(m, "%p {.op=", rq);
+       if (op < ARRAY_SIZE(op_name) && op_name[op])
+               seq_printf(m, "%s", op_name[op]);
+       else
+               seq_printf(m, "%d", op);
+       seq_puts(m, ", .cmd_flags=");
+       blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name,
+                      ARRAY_SIZE(cmd_flag_name));
+       seq_puts(m, ", .rq_flags=");
+       blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
+                      ARRAY_SIZE(rqf_name));
+       seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
+                  rq->internal_tag);
+       if (mq_ops->show_rq)
+               mq_ops->show_rq(m, rq);
+       seq_puts(m, "}\n");
        return 0;
 }
 
@@ -322,60 +562,6 @@ static const struct file_operations hctx_io_poll_fops = {
        .release        = single_release,
 };
 
-static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
-{
-       seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu",
-                  stat->nr_samples, stat->mean, stat->min, stat->max);
-}
-
-static int hctx_stats_show(struct seq_file *m, void *v)
-{
-       struct blk_mq_hw_ctx *hctx = m->private;
-       struct blk_rq_stat stat[2];
-
-       blk_stat_init(&stat[BLK_STAT_READ]);
-       blk_stat_init(&stat[BLK_STAT_WRITE]);
-
-       blk_hctx_stat_get(hctx, stat);
-
-       seq_puts(m, "read: ");
-       print_stat(m, &stat[BLK_STAT_READ]);
-       seq_puts(m, "\n");
-
-       seq_puts(m, "write: ");
-       print_stat(m, &stat[BLK_STAT_WRITE]);
-       seq_puts(m, "\n");
-       return 0;
-}
-
-static int hctx_stats_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, hctx_stats_show, inode->i_private);
-}
-
-static ssize_t hctx_stats_write(struct file *file, const char __user *buf,
-                               size_t count, loff_t *ppos)
-{
-       struct seq_file *m = file->private_data;
-       struct blk_mq_hw_ctx *hctx = m->private;
-       struct blk_mq_ctx *ctx;
-       int i;
-
-       hctx_for_each_ctx(hctx, ctx, i) {
-               blk_stat_init(&ctx->stat[BLK_STAT_READ]);
-               blk_stat_init(&ctx->stat[BLK_STAT_WRITE]);
-       }
-       return count;
-}
-
-static const struct file_operations hctx_stats_fops = {
-       .open           = hctx_stats_open,
-       .read           = seq_read,
-       .write          = hctx_stats_write,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
 static int hctx_dispatched_show(struct seq_file *m, void *v)
 {
        struct blk_mq_hw_ctx *hctx = m->private;
@@ -636,6 +822,12 @@ static const struct file_operations ctx_completed_fops = {
        .release        = single_release,
 };
 
+static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
+       {"poll_stat", 0400, &queue_poll_stat_fops},
+       {"state", 0600, &blk_queue_flags_fops},
+       {},
+};
+
 static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
        {"state", 0400, &hctx_state_fops},
        {"flags", 0400, &hctx_flags_fops},
@@ -646,7 +838,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
        {"sched_tags", 0400, &hctx_sched_tags_fops},
        {"sched_tags_bitmap", 0400, &hctx_sched_tags_bitmap_fops},
        {"io_poll", 0600, &hctx_io_poll_fops},
-       {"stats", 0600, &hctx_stats_fops},
        {"dispatched", 0600, &hctx_dispatched_fops},
        {"queued", 0600, &hctx_queued_fops},
        {"run", 0600, &hctx_run_fops},
@@ -662,16 +853,17 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
        {},
 };
 
-int blk_mq_debugfs_register(struct request_queue *q, const char *name)
+int blk_mq_debugfs_register(struct request_queue *q)
 {
        if (!blk_debugfs_root)
                return -ENOENT;
 
-       q->debugfs_dir = debugfs_create_dir(name, blk_debugfs_root);
+       q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
+                                           blk_debugfs_root);
        if (!q->debugfs_dir)
                goto err;
 
-       if (blk_mq_debugfs_register_hctxs(q))
+       if (blk_mq_debugfs_register_mq(q))
                goto err;
 
        return 0;
@@ -741,7 +933,7 @@ static int blk_mq_debugfs_register_hctx(struct request_queue *q,
        return 0;
 }
 
-int blk_mq_debugfs_register_hctxs(struct request_queue *q)
+int blk_mq_debugfs_register_mq(struct request_queue *q)
 {
        struct blk_mq_hw_ctx *hctx;
        int i;
@@ -753,6 +945,9 @@ int blk_mq_debugfs_register_hctxs(struct request_queue *q)
        if (!q->mq_debugfs_dir)
                goto err;
 
+       if (!debugfs_create_files(q->mq_debugfs_dir, q, blk_mq_debugfs_queue_attrs))
+               goto err;
+
        queue_for_each_hw_ctx(q, hctx, i) {
                if (blk_mq_debugfs_register_hctx(q, hctx))
                        goto err;
@@ -761,11 +956,11 @@ int blk_mq_debugfs_register_hctxs(struct request_queue *q)
        return 0;
 
 err:
-       blk_mq_debugfs_unregister_hctxs(q);
+       blk_mq_debugfs_unregister_mq(q);
        return -ENOMEM;
 }
 
-void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
+void blk_mq_debugfs_unregister_mq(struct request_queue *q)
 {
        debugfs_remove_recursive(q->mq_debugfs_dir);
        q->mq_debugfs_dir = NULL;
index 966c216..0c3354c 100644 (file)
@@ -23,7 +23,7 @@
  * @pdev:      PCI device associated with @set.
  *
  * This function assumes the PCI device @pdev has at least as many available
- * interrupt vetors as @set has queues.  It will then queuery the vector
+ * interrupt vectors as @set has queues.  It will then query the vector
  * corresponding to each queue for its affinity mask and build a queue mapping
  * that maps a queue to the CPUs that have irq affinity for the corresponding
  * vector.
index 09af8ff..8b361e1 100644 (file)
@@ -30,43 +30,6 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
 
-int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
-                               int (*init)(struct blk_mq_hw_ctx *),
-                               void (*exit)(struct blk_mq_hw_ctx *))
-{
-       struct blk_mq_hw_ctx *hctx;
-       int ret;
-       int i;
-
-       queue_for_each_hw_ctx(q, hctx, i) {
-               hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
-               if (!hctx->sched_data) {
-                       ret = -ENOMEM;
-                       goto error;
-               }
-
-               if (init) {
-                       ret = init(hctx);
-                       if (ret) {
-                               /*
-                                * We don't want to give exit() a partially
-                                * initialized sched_data. init() must clean up
-                                * if it fails.
-                                */
-                               kfree(hctx->sched_data);
-                               hctx->sched_data = NULL;
-                               goto error;
-                       }
-               }
-       }
-
-       return 0;
-error:
-       blk_mq_sched_free_hctx_data(q, exit);
-       return ret;
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data);
-
 static void __blk_mq_sched_assign_ioc(struct request_queue *q,
                                      struct request *rq,
                                      struct bio *bio,
@@ -119,7 +82,11 @@ struct request *blk_mq_sched_get_request(struct request_queue *q,
        if (likely(!data->hctx))
                data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
 
-       if (e) {
+       /*
+        * For a reserved tag, allocate a normal request since we might
+        * have driver dependencies on the value of the internal tag.
+        */
+       if (e && !(data->flags & BLK_MQ_REQ_RESERVED)) {
                data->flags |= BLK_MQ_REQ_INTERNAL;
 
                /*
@@ -171,7 +138,8 @@ void blk_mq_sched_put_request(struct request *rq)
 
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
-       struct elevator_queue *e = hctx->queue->elevator;
+       struct request_queue *q = hctx->queue;
+       struct elevator_queue *e = q->elevator;
        const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
        bool did_work = false;
        LIST_HEAD(rq_list);
@@ -203,10 +171,10 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
         */
        if (!list_empty(&rq_list)) {
                blk_mq_sched_mark_restart_hctx(hctx);
-               did_work = blk_mq_dispatch_rq_list(hctx, &rq_list);
+               did_work = blk_mq_dispatch_rq_list(q, &rq_list);
        } else if (!has_sched_dispatch) {
                blk_mq_flush_busy_ctxs(hctx, &rq_list);
-               blk_mq_dispatch_rq_list(hctx, &rq_list);
+               blk_mq_dispatch_rq_list(q, &rq_list);
        }
 
        /*
@@ -222,26 +190,10 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
                        if (!rq)
                                break;
                        list_add(&rq->queuelist, &rq_list);
-               } while (blk_mq_dispatch_rq_list(hctx, &rq_list));
+               } while (blk_mq_dispatch_rq_list(q, &rq_list));
        }
 }
 
-void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
-                                  struct list_head *rq_list,
-                                  struct request *(*get_rq)(struct blk_mq_hw_ctx *))
-{
-       do {
-               struct request *rq;
-
-               rq = get_rq(hctx);
-               if (!rq)
-                       break;
-
-               list_add_tail(&rq->queuelist, rq_list);
-       } while (1);
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch);
-
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
                            struct request **merged_request)
 {
@@ -317,25 +269,68 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
        return true;
 }
 
-static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
 {
        if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
                clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
-               if (blk_mq_hctx_has_pending(hctx))
+               if (blk_mq_hctx_has_pending(hctx)) {
                        blk_mq_run_hw_queue(hctx, true);
+                       return true;
+               }
        }
+       return false;
 }
 
-void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx)
-{
-       struct request_queue *q = hctx->queue;
-       unsigned int i;
+/**
+ * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
+ * @pos:    loop cursor.
+ * @skip:   the list element that will not be examined. Iteration starts at
+ *          @skip->next.
+ * @head:   head of the list to examine. This list must have at least one
+ *          element, namely @skip.
+ * @member: name of the list_head structure within typeof(*pos).
+ */
+#define list_for_each_entry_rcu_rr(pos, skip, head, member)            \
+       for ((pos) = (skip);                                            \
+            (pos = (pos)->member.next != (head) ? list_entry_rcu(      \
+                       (pos)->member.next, typeof(*pos), member) :     \
+             list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
+            (pos) != (skip); )
 
-       if (test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) {
-               if (test_and_clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) {
-                       queue_for_each_hw_ctx(q, hctx, i)
-                               blk_mq_sched_restart_hctx(hctx);
+/*
+ * Called after a driver tag has been freed to check whether a hctx needs to
+ * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
+ * queues in a round-robin fashion if the tag set of @hctx is shared with other
+ * hardware queues.
+ */
+void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
+{
+       struct blk_mq_tags *const tags = hctx->tags;
+       struct blk_mq_tag_set *const set = hctx->queue->tag_set;
+       struct request_queue *const queue = hctx->queue, *q;
+       struct blk_mq_hw_ctx *hctx2;
+       unsigned int i, j;
+
+       if (set->flags & BLK_MQ_F_TAG_SHARED) {
+               rcu_read_lock();
+               list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
+                                          tag_set_list) {
+                       queue_for_each_hw_ctx(q, hctx2, i)
+                               if (hctx2->tags == tags &&
+                                   blk_mq_sched_restart_hctx(hctx2))
+                                       goto done;
                }
+               j = hctx->queue_num + 1;
+               for (i = 0; i < queue->nr_hw_queues; i++, j++) {
+                       if (j == queue->nr_hw_queues)
+                               j = 0;
+                       hctx2 = queue->queue_hw_ctx[j];
+                       if (hctx2->tags == tags &&
+                           blk_mq_sched_restart_hctx(hctx2))
+                               break;
+               }
+done:
+               rcu_read_unlock();
        } else {
                blk_mq_sched_restart_hctx(hctx);
        }
@@ -431,11 +426,86 @@ static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
        }
 }
 
-int blk_mq_sched_setup(struct request_queue *q)
+static int blk_mq_sched_alloc_tags(struct request_queue *q,
+                                  struct blk_mq_hw_ctx *hctx,
+                                  unsigned int hctx_idx)
 {
        struct blk_mq_tag_set *set = q->tag_set;
+       int ret;
+
+       hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
+                                              set->reserved_tags);
+       if (!hctx->sched_tags)
+               return -ENOMEM;
+
+       ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
+       if (ret)
+               blk_mq_sched_free_tags(set, hctx, hctx_idx);
+
+       return ret;
+}
+
+static void blk_mq_sched_tags_teardown(struct request_queue *q)
+{
+       struct blk_mq_tag_set *set = q->tag_set;
+       struct blk_mq_hw_ctx *hctx;
+       int i;
+
+       queue_for_each_hw_ctx(q, hctx, i)
+               blk_mq_sched_free_tags(set, hctx, i);
+}
+
+int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
+                          unsigned int hctx_idx)
+{
+       struct elevator_queue *e = q->elevator;
+       int ret;
+
+       if (!e)
+               return 0;
+
+       ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
+       if (ret)
+               return ret;
+
+       if (e->type->ops.mq.init_hctx) {
+               ret = e->type->ops.mq.init_hctx(hctx, hctx_idx);
+               if (ret) {
+                       blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
+void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
+                           unsigned int hctx_idx)
+{
+       struct elevator_queue *e = q->elevator;
+
+       if (!e)
+               return;
+
+       if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
+               e->type->ops.mq.exit_hctx(hctx, hctx_idx);
+               hctx->sched_data = NULL;
+       }
+
+       blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
+}
+
+int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
+{
        struct blk_mq_hw_ctx *hctx;
-       int ret, i;
+       struct elevator_queue *eq;
+       unsigned int i;
+       int ret;
+
+       if (!e) {
+               q->elevator = NULL;
+               return 0;
+       }
 
        /*
         * Default to 256, since we don't split into sync/async like the
@@ -443,49 +513,53 @@ int blk_mq_sched_setup(struct request_queue *q)
         */
        q->nr_requests = 2 * BLKDEV_MAX_RQ;
 
-       /*
-        * We're switching to using an IO scheduler, so setup the hctx
-        * scheduler tags and switch the request map from the regular
-        * tags to scheduler tags. First allocate what we need, so we
-        * can safely fail and fallback, if needed.
-        */
-       ret = 0;
        queue_for_each_hw_ctx(q, hctx, i) {
-               hctx->sched_tags = blk_mq_alloc_rq_map(set, i,
-                               q->nr_requests, set->reserved_tags);
-               if (!hctx->sched_tags) {
-                       ret = -ENOMEM;
-                       break;
-               }
-               ret = blk_mq_alloc_rqs(set, hctx->sched_tags, i, q->nr_requests);
+               ret = blk_mq_sched_alloc_tags(q, hctx, i);
                if (ret)
-                       break;
+                       goto err;
        }
 
-       /*
-        * If we failed, free what we did allocate
-        */
-       if (ret) {
+       ret = e->ops.mq.init_sched(q, e);
+       if (ret)
+               goto err;
+
+       if (e->ops.mq.init_hctx) {
                queue_for_each_hw_ctx(q, hctx, i) {
-                       if (!hctx->sched_tags)
-                               continue;
-                       blk_mq_sched_free_tags(set, hctx, i);
+                       ret = e->ops.mq.init_hctx(hctx, i);
+                       if (ret) {
+                               eq = q->elevator;
+                               blk_mq_exit_sched(q, eq);
+                               kobject_put(&eq->kobj);
+                               return ret;
+                       }
                }
-
-               return ret;
        }
 
        return 0;
+
+err:
+       blk_mq_sched_tags_teardown(q);
+       q->elevator = NULL;
+       return ret;
 }
 
-void blk_mq_sched_teardown(struct request_queue *q)
+void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
 {
-       struct blk_mq_tag_set *set = q->tag_set;
        struct blk_mq_hw_ctx *hctx;
-       int i;
+       unsigned int i;
 
-       queue_for_each_hw_ctx(q, hctx, i)
-               blk_mq_sched_free_tags(set, hctx, i);
+       if (e->type->ops.mq.exit_hctx) {
+               queue_for_each_hw_ctx(q, hctx, i) {
+                       if (hctx->sched_data) {
+                               e->type->ops.mq.exit_hctx(hctx, i);
+                               hctx->sched_data = NULL;
+                       }
+               }
+       }
+       if (e->type->ops.mq.exit_sched)
+               e->type->ops.mq.exit_sched(e);
+       blk_mq_sched_tags_teardown(q);
+       q->elevator = NULL;
 }
 
 int blk_mq_sched_init(struct request_queue *q)
index a75b16b..edafb53 100644 (file)
@@ -4,10 +4,6 @@
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 
-int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
-                               int (*init)(struct blk_mq_hw_ctx *),
-                               void (*exit)(struct blk_mq_hw_ctx *));
-
 void blk_mq_sched_free_hctx_data(struct request_queue *q,
                                 void (*exit)(struct blk_mq_hw_ctx *));
 
@@ -19,7 +15,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
                                struct request **merged_request);
 bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
 bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
-void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx);
+void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_sched_insert_request(struct request *rq, bool at_head,
                                 bool run_queue, bool async, bool can_block);
@@ -28,12 +24,14 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
                                  struct list_head *list, bool run_queue_async);
 
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
-void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
-                       struct list_head *rq_list,
-                       struct request *(*get_rq)(struct blk_mq_hw_ctx *));
 
-int blk_mq_sched_setup(struct request_queue *q);
-void blk_mq_sched_teardown(struct request_queue *q);
+int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
+void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
+
+int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
+                          unsigned int hctx_idx);
+void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
+                           unsigned int hctx_idx);
 
 int blk_mq_sched_init(struct request_queue *q);
 
@@ -81,17 +79,12 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
        return true;
 }
 
-static inline void
-blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static inline void blk_mq_sched_completed_request(struct request *rq)
 {
-       struct elevator_queue *e = hctx->queue->elevator;
+       struct elevator_queue *e = rq->q->elevator;
 
        if (e && e->type->ops.mq.completed_request)
-               e->type->ops.mq.completed_request(hctx, rq);
-
-       BUG_ON(rq->internal_tag == -1);
-
-       blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag);
+               e->type->ops.mq.completed_request(rq);
 }
 
 static inline void blk_mq_sched_started_request(struct request *rq)
@@ -131,20 +124,6 @@ static inline void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
                set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
 }
 
-/*
- * Mark a hardware queue and the request queue it belongs to as needing a
- * restart.
- */
-static inline void blk_mq_sched_mark_restart_queue(struct blk_mq_hw_ctx *hctx)
-{
-       struct request_queue *q = hctx->queue;
-
-       if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-               set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
-       if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags))
-               set_bit(QUEUE_FLAG_RESTART, &q->queue_flags);
-}
-
 static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
 {
        return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
index d745ab8..ec0afdf 100644 (file)
@@ -253,10 +253,12 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
        struct blk_mq_hw_ctx *hctx;
        int i;
 
+       lockdep_assert_held(&q->sysfs_lock);
+
        queue_for_each_hw_ctx(q, hctx, i)
                blk_mq_unregister_hctx(hctx);
 
-       blk_mq_debugfs_unregister_hctxs(q);
+       blk_mq_debugfs_unregister_mq(q);
 
        kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
        kobject_del(&q->mq_kobj);
@@ -267,9 +269,9 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
 
 void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
 {
-       blk_mq_disable_hotplug();
+       mutex_lock(&q->sysfs_lock);
        __blk_mq_unregister_dev(dev, q);
-       blk_mq_enable_hotplug();
+       mutex_unlock(&q->sysfs_lock);
 }
 
 void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
@@ -302,12 +304,13 @@ void blk_mq_sysfs_init(struct request_queue *q)
        }
 }
 
-int blk_mq_register_dev(struct device *dev, struct request_queue *q)
+int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
 {
        struct blk_mq_hw_ctx *hctx;
        int ret, i;
 
-       blk_mq_disable_hotplug();
+       WARN_ON_ONCE(!q->kobj.parent);
+       lockdep_assert_held(&q->sysfs_lock);
 
        ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
        if (ret < 0)
@@ -315,20 +318,38 @@ int blk_mq_register_dev(struct device *dev, struct request_queue *q)
 
        kobject_uevent(&q->mq_kobj, KOBJ_ADD);
 
-       blk_mq_debugfs_register(q, kobject_name(&dev->kobj));
+       blk_mq_debugfs_register(q);
 
        queue_for_each_hw_ctx(q, hctx, i) {
                ret = blk_mq_register_hctx(hctx);
                if (ret)
-                       break;
+                       goto unreg;
        }
 
-       if (ret)
-               __blk_mq_unregister_dev(dev, q);
-       else
-               q->mq_sysfs_init_done = true;
+       q->mq_sysfs_init_done = true;
+
 out:
-       blk_mq_enable_hotplug();
+       return ret;
+
+unreg:
+       while (--i >= 0)
+               blk_mq_unregister_hctx(q->queue_hw_ctx[i]);
+
+       blk_mq_debugfs_unregister_mq(q);
+
+       kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
+       kobject_del(&q->mq_kobj);
+       kobject_put(&dev->kobj);
+       return ret;
+}
+
+int blk_mq_register_dev(struct device *dev, struct request_queue *q)
+{
+       int ret;
+
+       mutex_lock(&q->sysfs_lock);
+       ret = __blk_mq_register_dev(dev, q);
+       mutex_unlock(&q->sysfs_lock);
 
        return ret;
 }
@@ -339,13 +360,17 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
        struct blk_mq_hw_ctx *hctx;
        int i;
 
+       mutex_lock(&q->sysfs_lock);
        if (!q->mq_sysfs_init_done)
-               return;
+               goto unlock;
 
-       blk_mq_debugfs_unregister_hctxs(q);
+       blk_mq_debugfs_unregister_mq(q);
 
        queue_for_each_hw_ctx(q, hctx, i)
                blk_mq_unregister_hctx(hctx);
+
+unlock:
+       mutex_unlock(&q->sysfs_lock);
 }
 
 int blk_mq_sysfs_register(struct request_queue *q)
@@ -353,10 +378,11 @@ int blk_mq_sysfs_register(struct request_queue *q)
        struct blk_mq_hw_ctx *hctx;
        int i, ret = 0;
 
+       mutex_lock(&q->sysfs_lock);
        if (!q->mq_sysfs_init_done)
-               return ret;
+               goto unlock;
 
-       blk_mq_debugfs_register_hctxs(q);
+       blk_mq_debugfs_register_mq(q);
 
        queue_for_each_hw_ctx(q, hctx, i) {
                ret = blk_mq_register_hctx(hctx);
@@ -364,5 +390,8 @@ int blk_mq_sysfs_register(struct request_queue *q)
                        break;
        }
 
+unlock:
+       mutex_unlock(&q->sysfs_lock);
+
        return ret;
 }
index 9d97bfc..d0be72c 100644 (file)
@@ -96,7 +96,10 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
        if (!(data->flags & BLK_MQ_REQ_INTERNAL) &&
            !hctx_may_queue(data->hctx, bt))
                return -1;
-       return __sbitmap_queue_get(bt);
+       if (data->shallow_depth)
+               return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
+       else
+               return __sbitmap_queue_get(bt);
 }
 
 unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
index 6b6e7bc..bf90684 100644 (file)
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
 
+static void blk_mq_poll_stats_start(struct request_queue *q);
+static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
+
+static int blk_mq_poll_stats_bkt(const struct request *rq)
+{
+       int ddir, bytes, bucket;
+
+       ddir = rq_data_dir(rq);
+       bytes = blk_rq_bytes(rq);
+
+       bucket = ddir + 2*(ilog2(bytes) - 9);
+
+       if (bucket < 0)
+               return -1;
+       else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
+               return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
+
+       return bucket;
+}
+
 /*
  * Check if any of the ctx's have pending work in this hardware queue
  */
@@ -65,7 +85,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
        sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
 }
 
-void blk_mq_freeze_queue_start(struct request_queue *q)
+void blk_freeze_queue_start(struct request_queue *q)
 {
        int freeze_depth;
 
@@ -75,7 +95,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
                blk_mq_run_hw_queues(q, false);
        }
 }
-EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
+EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
 
 void blk_mq_freeze_queue_wait(struct request_queue *q)
 {
@@ -105,7 +125,7 @@ void blk_freeze_queue(struct request_queue *q)
         * no blk_unfreeze_queue(), and blk_freeze_queue() is not
         * exported to drivers as the only user for unfreeze is blk_mq.
         */
-       blk_mq_freeze_queue_start(q);
+       blk_freeze_queue_start(q);
        blk_mq_freeze_queue_wait(q);
 }
 
@@ -210,7 +230,6 @@ void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 #endif
        rq->special = NULL;
        /* tag was already set */
-       rq->errors = 0;
        rq->extra_len = 0;
 
        INIT_LIST_HEAD(&rq->timeout_list);
@@ -321,7 +340,6 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
 
        rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
 
-       blk_mq_put_ctx(alloc_data.ctx);
        blk_queue_exit(q);
 
        if (!rq)
@@ -348,8 +366,8 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
        if (rq->tag != -1)
                blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
        if (sched_tag != -1)
-               blk_mq_sched_completed_request(hctx, rq);
-       blk_mq_sched_restart_queues(hctx);
+               blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
+       blk_mq_sched_restart(hctx);
        blk_queue_exit(q);
 }
 
@@ -366,6 +384,7 @@ void blk_mq_finish_request(struct request *rq)
 {
        blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
 }
+EXPORT_SYMBOL_GPL(blk_mq_finish_request);
 
 void blk_mq_free_request(struct request *rq)
 {
@@ -403,12 +422,19 @@ static void __blk_mq_complete_request_remote(void *data)
        rq->q->softirq_done_fn(rq);
 }
 
-static void blk_mq_ipi_complete_request(struct request *rq)
+static void __blk_mq_complete_request(struct request *rq)
 {
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        bool shared = false;
        int cpu;
 
+       if (rq->internal_tag != -1)
+               blk_mq_sched_completed_request(rq);
+       if (rq->rq_flags & RQF_STATS) {
+               blk_mq_poll_stats_start(rq->q);
+               blk_stat_add(rq);
+       }
+
        if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
                rq->q->softirq_done_fn(rq);
                return;
@@ -429,33 +455,6 @@ static void blk_mq_ipi_complete_request(struct request *rq)
        put_cpu();
 }
 
-static void blk_mq_stat_add(struct request *rq)
-{
-       if (rq->rq_flags & RQF_STATS) {
-               /*
-                * We could rq->mq_ctx here, but there's less of a risk
-                * of races if we have the completion event add the stats
-                * to the local software queue.
-                */
-               struct blk_mq_ctx *ctx;
-
-               ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id());
-               blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq);
-       }
-}
-
-static void __blk_mq_complete_request(struct request *rq)
-{
-       struct request_queue *q = rq->q;
-
-       blk_mq_stat_add(rq);
-
-       if (!q->softirq_done_fn)
-               blk_mq_end_request(rq, rq->errors);
-       else
-               blk_mq_ipi_complete_request(rq);
-}
-
 /**
  * blk_mq_complete_request - end I/O on a request
  * @rq:                the request being processed
@@ -464,16 +463,14 @@ static void __blk_mq_complete_request(struct request *rq)
  *     Ends all I/O on a request. It does not handle partial completions.
  *     The actual completion happens out-of-order, through a IPI handler.
  **/
-void blk_mq_complete_request(struct request *rq, int error)
+void blk_mq_complete_request(struct request *rq)
 {
        struct request_queue *q = rq->q;
 
        if (unlikely(blk_should_fake_timeout(q)))
                return;
-       if (!blk_mark_rq_complete(rq)) {
-               rq->errors = error;
+       if (!blk_mark_rq_complete(rq))
                __blk_mq_complete_request(rq);
-       }
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
@@ -492,7 +489,7 @@ void blk_mq_start_request(struct request *rq)
        trace_block_rq_issue(q, rq);
 
        if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
-               blk_stat_set_issue_time(&rq->issue_stat);
+               blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq));
                rq->rq_flags |= RQF_STATS;
                wbt_issue(q->rq_wb, &rq->issue_stat);
        }
@@ -527,6 +524,15 @@ void blk_mq_start_request(struct request *rq)
 }
 EXPORT_SYMBOL(blk_mq_start_request);
 
+/*
+ * When we reach here because queue is busy, REQ_ATOM_COMPLETE
+ * flag isn't set yet, so there may be race with timeout handler,
+ * but given rq->deadline is just set in .queue_rq() under
+ * this situation, the race won't be possible in reality because
+ * rq->timeout should be set as big enough to cover the window
+ * between blk_mq_start_request() called from .queue_rq() and
+ * clearing REQ_ATOM_STARTED here.
+ */
 static void __blk_mq_requeue_request(struct request *rq)
 {
        struct request_queue *q = rq->q;
@@ -634,8 +640,7 @@ void blk_mq_abort_requeue_list(struct request_queue *q)
 
                rq = list_first_entry(&rq_list, struct request, queuelist);
                list_del_init(&rq->queuelist);
-               rq->errors = -EIO;
-               blk_mq_end_request(rq, rq->errors);
+               blk_mq_end_request(rq, -EIO);
        }
 }
 EXPORT_SYMBOL(blk_mq_abort_requeue_list);
@@ -667,7 +672,7 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved)
         * just be ignored. This can happen due to the bitflag ordering.
         * Timeout first checks if STARTED is set, and if it is, assumes
         * the request is active. But if we race with completion, then
-        * we both flags will get cleared. So check here again, and ignore
+        * both flags will get cleared. So check here again, and ignore
         * a timeout event with a request that isn't active.
         */
        if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
@@ -700,6 +705,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
        if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
                return;
 
+       /*
+        * The rq being checked may have been freed and reallocated
+        * out already here, we avoid this race by checking rq->deadline
+        * and REQ_ATOM_COMPLETE flag together:
+        *
+        * - if rq->deadline is observed as new value because of
+        *   reusing, the rq won't be timed out because of timing.
+        * - if rq->deadline is observed as previous value,
+        *   REQ_ATOM_COMPLETE flag won't be cleared in reuse path
+        *   because we put a barrier between setting rq->deadline
+        *   and clearing the flag in blk_mq_start_request(), so
+        *   this rq won't be timed out too.
+        */
        if (time_after_eq(jiffies, rq->deadline)) {
                if (!blk_mark_rq_complete(rq))
                        blk_mq_rq_timed_out(rq, reserved);
@@ -728,7 +746,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
         * percpu_ref_tryget directly, because we need to be able to
         * obtain a reference even in the short window between the queue
         * starting to freeze, by dropping the first reference in
-        * blk_mq_freeze_queue_start, and the moment the last request is
+        * blk_freeze_queue_start, and the moment the last request is
         * consumed, marked by the instant q_usage_counter reaches
         * zero.
         */
@@ -846,12 +864,10 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
                .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
        };
 
-       if (rq->tag != -1) {
-done:
-               if (hctx)
-                       *hctx = data.hctx;
-               return true;
-       }
+       might_sleep_if(wait);
+
+       if (rq->tag != -1)
+               goto done;
 
        if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
                data.flags |= BLK_MQ_REQ_RESERVED;
@@ -863,10 +879,12 @@ done:
                        atomic_inc(&data.hctx->nr_active);
                }
                data.hctx->tags->rqs[rq->tag] = rq;
-               goto done;
        }
 
-       return false;
+done:
+       if (hctx)
+               *hctx = data.hctx;
+       return rq->tag != -1;
 }
 
 static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
@@ -963,25 +981,20 @@ static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
        return true;
 }
 
-bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
+bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
 {
-       struct request_queue *q = hctx->queue;
+       struct blk_mq_hw_ctx *hctx;
        struct request *rq;
-       LIST_HEAD(driver_list);
-       struct list_head *dptr;
        int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK;
 
-       /*
-        * Start off with dptr being NULL, so we start the first request
-        * immediately, even if we have more pending.
-        */
-       dptr = NULL;
+       if (list_empty(list))
+               return false;
 
        /*
         * Now process all the entries, sending them to the driver.
         */
        errors = queued = 0;
-       while (!list_empty(list)) {
+       do {
                struct blk_mq_queue_data bd;
 
                rq = list_first_entry(list, struct request, queuelist);
@@ -993,23 +1006,21 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
                         * The initial allocation attempt failed, so we need to
                         * rerun the hardware queue when a tag is freed.
                         */
-                       if (blk_mq_dispatch_wait_add(hctx)) {
-                               /*
-                                * It's possible that a tag was freed in the
-                                * window between the allocation failure and
-                                * adding the hardware queue to the wait queue.
-                                */
-                               if (!blk_mq_get_driver_tag(rq, &hctx, false))
-                                       break;
-                       } else {
+                       if (!blk_mq_dispatch_wait_add(hctx))
+                               break;
+
+                       /*
+                        * It's possible that a tag was freed in the window
+                        * between the allocation failure and adding the
+                        * hardware queue to the wait queue.
+                        */
+                       if (!blk_mq_get_driver_tag(rq, &hctx, false))
                                break;
-                       }
                }
 
                list_del_init(&rq->queuelist);
 
                bd.rq = rq;
-               bd.list = dptr;
 
                /*
                 * Flag last if we have no more requests, or if we have more
@@ -1038,21 +1049,13 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
                        pr_err("blk-mq: bad return on queue: %d\n", ret);
                case BLK_MQ_RQ_QUEUE_ERROR:
                        errors++;
-                       rq->errors = -EIO;
-                       blk_mq_end_request(rq, rq->errors);
+                       blk_mq_end_request(rq, -EIO);
                        break;
                }
 
                if (ret == BLK_MQ_RQ_QUEUE_BUSY)
                        break;
-
-               /*
-                * We've done the first request. If we have more than 1
-                * left in the list, set dptr to defer issue.
-                */
-               if (!dptr && list->next != list->prev)
-                       dptr = &driver_list;
-       }
+       } while (!list_empty(list));
 
        hctx->dispatched[queued_to_index(queued)]++;
 
@@ -1062,8 +1065,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
         */
        if (!list_empty(list)) {
                /*
-                * If we got a driver tag for the next request already,
-                * free it again.
+                * If an I/O scheduler has been configured and we got a driver
+                * tag for the next request already, free it again.
                 */
                rq = list_first_entry(list, struct request, queuelist);
                blk_mq_put_driver_tag(rq);
@@ -1073,16 +1076,24 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
                spin_unlock(&hctx->lock);
 
                /*
-                * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but
-                * it's possible the queue is stopped and restarted again
-                * before this. Queue restart will dispatch requests. And since
-                * requests in rq_list aren't added into hctx->dispatch yet,
-                * the requests in rq_list might get lost.
+                * If SCHED_RESTART was set by the caller of this function and
+                * it is no longer set that means that it was cleared by another
+                * thread and hence that a queue rerun is needed.
                 *
-                * blk_mq_run_hw_queue() already checks the STOPPED bit
+                * If TAG_WAITING is set that means that an I/O scheduler has
+                * been configured and another thread is waiting for a driver
+                * tag. To guarantee fairness, do not rerun this hardware queue
+                * but let the other thread grab the driver tag.
                 *
-                * If RESTART or TAG_WAITING is set, then let completion restart
-                * the queue instead of potentially looping here.
+                * If no I/O scheduler has been configured it is possible that
+                * the hardware queue got stopped and restarted before requests
+                * were pushed back onto the dispatch list. Rerun the queue to
+                * avoid starvation. Notes:
+                * - blk_mq_run_hw_queue() checks whether or not a queue has
+                *   been stopped before rerunning a queue.
+                * - Some but not all block drivers stop a queue before
+                *   returning BLK_MQ_RQ_QUEUE_BUSY. Two exceptions are scsi-mq
+                *   and dm-rq.
                 */
                if (!blk_mq_sched_needs_restart(hctx) &&
                    !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
@@ -1104,6 +1115,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
                blk_mq_sched_dispatch_requests(hctx);
                rcu_read_unlock();
        } else {
+               might_sleep();
+
                srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
                blk_mq_sched_dispatch_requests(hctx);
                srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
@@ -1135,7 +1148,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
        return hctx->next_cpu;
 }
 
-void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
+static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
+                                       unsigned long msecs)
 {
        if (unlikely(blk_mq_hctx_stopped(hctx) ||
                     !blk_mq_hw_queue_mapped(hctx)))
@@ -1152,8 +1166,22 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
                put_cpu();
        }
 
-       kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work);
+       kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+                                        &hctx->run_work,
+                                        msecs_to_jiffies(msecs));
+}
+
+void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
+{
+       __blk_mq_delay_run_hw_queue(hctx, true, msecs);
+}
+EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
+
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
+{
+       __blk_mq_delay_run_hw_queue(hctx, async, 0);
 }
+EXPORT_SYMBOL(blk_mq_run_hw_queue);
 
 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 {
@@ -1192,8 +1220,7 @@ EXPORT_SYMBOL(blk_mq_queue_stopped);
 
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
-       cancel_work(&hctx->run_work);
-       cancel_delayed_work(&hctx->delay_work);
+       cancel_delayed_work_sync(&hctx->run_work);
        set_bit(BLK_MQ_S_STOPPED, &hctx->state);
 }
 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
@@ -1250,29 +1277,40 @@ static void blk_mq_run_work_fn(struct work_struct *work)
 {
        struct blk_mq_hw_ctx *hctx;
 
-       hctx = container_of(work, struct blk_mq_hw_ctx, run_work);
+       hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
 
-       __blk_mq_run_hw_queue(hctx);
-}
-
-static void blk_mq_delay_work_fn(struct work_struct *work)
-{
-       struct blk_mq_hw_ctx *hctx;
+       /*
+        * If we are stopped, don't run the queue. The exception is if
+        * BLK_MQ_S_START_ON_RUN is set. For that case, we auto-clear
+        * the STOPPED bit and run it.
+        */
+       if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) {
+               if (!test_bit(BLK_MQ_S_START_ON_RUN, &hctx->state))
+                       return;
 
-       hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
+               clear_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
+               clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+       }
 
-       if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
-               __blk_mq_run_hw_queue(hctx);
+       __blk_mq_run_hw_queue(hctx);
 }
 
+
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 {
        if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
                return;
 
+       /*
+        * Stop the hw queue, then modify currently delayed work.
+        * This should prevent us from running the queue prematurely.
+        * Mark the queue as auto-clearing STOPPED when it runs.
+        */
        blk_mq_stop_hw_queue(hctx);
-       kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
-                       &hctx->delay_work, msecs_to_jiffies(msecs));
+       set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
+       kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+                                       &hctx->run_work,
+                                       msecs_to_jiffies(msecs));
 }
 EXPORT_SYMBOL(blk_mq_delay_queue);
 
@@ -1381,7 +1419,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 
 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
 {
-       init_request_from_bio(rq, bio);
+       blk_init_request_from_bio(rq, bio);
 
        blk_account_io_start(rq, true);
 }
@@ -1426,14 +1464,13 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
        return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
 }
 
-static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
+static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
                                      bool may_sleep)
 {
        struct request_queue *q = rq->q;
        struct blk_mq_queue_data bd = {
                .rq = rq,
-               .list = NULL,
-               .last = 1
+               .last = true,
        };
        struct blk_mq_hw_ctx *hctx;
        blk_qc_t new_cookie;
@@ -1458,31 +1495,42 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
                return;
        }
 
-       __blk_mq_requeue_request(rq);
-
        if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
                *cookie = BLK_QC_T_NONE;
-               rq->errors = -EIO;
-               blk_mq_end_request(rq, rq->errors);
+               blk_mq_end_request(rq, -EIO);
                return;
        }
 
+       __blk_mq_requeue_request(rq);
 insert:
        blk_mq_sched_insert_request(rq, false, true, false, may_sleep);
 }
 
-/*
- * Multiple hardware queue variant. This will not use per-process plugs,
- * but will attempt to bypass the hctx queueing if we can go straight to
- * hardware for SYNC IO.
- */
+static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
+               struct request *rq, blk_qc_t *cookie)
+{
+       if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
+               rcu_read_lock();
+               __blk_mq_try_issue_directly(rq, cookie, false);
+               rcu_read_unlock();
+       } else {
+               unsigned int srcu_idx;
+
+               might_sleep();
+
+               srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
+               __blk_mq_try_issue_directly(rq, cookie, true);
+               srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
+       }
+}
+
 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
        const int is_sync = op_is_sync(bio->bi_opf);
        const int is_flush_fua = op_is_flush(bio->bi_opf);
        struct blk_mq_alloc_data data = { .flags = 0 };
        struct request *rq;
-       unsigned int request_count = 0, srcu_idx;
+       unsigned int request_count = 0;
        struct blk_plug *plug;
        struct request *same_queue_rq = NULL;
        blk_qc_t cookie;
@@ -1518,147 +1566,21 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
        cookie = request_to_qc_t(data.hctx, rq);
 
-       if (unlikely(is_flush_fua)) {
-               if (q->elevator)
-                       goto elv_insert;
-               blk_mq_bio_to_request(rq, bio);
-               blk_insert_flush(rq);
-               goto run_queue;
-       }
-
        plug = current->plug;
-       /*
-        * If the driver supports defer issued based on 'last', then
-        * queue it up like normal since we can potentially save some
-        * CPU this way.
-        */
-       if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
-           !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
-               struct request *old_rq = NULL;
-
-               blk_mq_bio_to_request(rq, bio);
-
-               /*
-                * We do limited plugging. If the bio can be merged, do that.
-                * Otherwise the existing request in the plug list will be
-                * issued. So the plug list will have one request at most
-                */
-               if (plug) {
-                       /*
-                        * The plug list might get flushed before this. If that
-                        * happens, same_queue_rq is invalid and plug list is
-                        * empty
-                        */
-                       if (same_queue_rq && !list_empty(&plug->mq_list)) {
-                               old_rq = same_queue_rq;
-                               list_del_init(&old_rq->queuelist);
-                       }
-                       list_add_tail(&rq->queuelist, &plug->mq_list);
-               } else /* is_sync */
-                       old_rq = rq;
+       if (unlikely(is_flush_fua)) {
                blk_mq_put_ctx(data.ctx);
-               if (!old_rq)
-                       goto done;
-
-               if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) {
-                       rcu_read_lock();
-                       blk_mq_try_issue_directly(old_rq, &cookie, false);
-                       rcu_read_unlock();
+               blk_mq_bio_to_request(rq, bio);
+               if (q->elevator) {
+                       blk_mq_sched_insert_request(rq, false, true, true,
+                                       true);
                } else {
-                       srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu);
-                       blk_mq_try_issue_directly(old_rq, &cookie, true);
-                       srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx);
+                       blk_insert_flush(rq);
+                       blk_mq_run_hw_queue(data.hctx, true);
                }
-               goto done;
-       }
-
-       if (q->elevator) {
-elv_insert:
-               blk_mq_put_ctx(data.ctx);
-               blk_mq_bio_to_request(rq, bio);
-               blk_mq_sched_insert_request(rq, false, true,
-                                               !is_sync || is_flush_fua, true);
-               goto done;
-       }
-       if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
-               /*
-                * For a SYNC request, send it to the hardware immediately. For
-                * an ASYNC request, just ensure that we run it later on. The
-                * latter allows for merging opportunities and more efficient
-                * dispatching.
-                */
-run_queue:
-               blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
-       }
-       blk_mq_put_ctx(data.ctx);
-done:
-       return cookie;
-}
-
-/*
- * Single hardware queue variant. This will attempt to use any per-process
- * plug for merging and IO deferral.
- */
-static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
-{
-       const int is_sync = op_is_sync(bio->bi_opf);
-       const int is_flush_fua = op_is_flush(bio->bi_opf);
-       struct blk_plug *plug;
-       unsigned int request_count = 0;
-       struct blk_mq_alloc_data data = { .flags = 0 };
-       struct request *rq;
-       blk_qc_t cookie;
-       unsigned int wb_acct;
-
-       blk_queue_bounce(q, &bio);
-
-       if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
-               bio_io_error(bio);
-               return BLK_QC_T_NONE;
-       }
-
-       blk_queue_split(q, &bio, q->bio_split);
-
-       if (!is_flush_fua && !blk_queue_nomerges(q)) {
-               if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
-                       return BLK_QC_T_NONE;
-       } else
-               request_count = blk_plug_queued_count(q);
-
-       if (blk_mq_sched_bio_merge(q, bio))
-               return BLK_QC_T_NONE;
-
-       wb_acct = wbt_wait(q->rq_wb, bio, NULL);
-
-       trace_block_getrq(q, bio, bio->bi_opf);
-
-       rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
-       if (unlikely(!rq)) {
-               __wbt_done(q->rq_wb, wb_acct);
-               return BLK_QC_T_NONE;
-       }
-
-       wbt_track(&rq->issue_stat, wb_acct);
-
-       cookie = request_to_qc_t(data.hctx, rq);
-
-       if (unlikely(is_flush_fua)) {
-               if (q->elevator)
-                       goto elv_insert;
-               blk_mq_bio_to_request(rq, bio);
-               blk_insert_flush(rq);
-               goto run_queue;
-       }
-
-       /*
-        * A task plug currently exists. Since this is completely lockless,
-        * utilize that to temporarily store requests until the task is
-        * either done or scheduled away.
-        */
-       plug = current->plug;
-       if (plug) {
+       } else if (plug && q->nr_hw_queues == 1) {
                struct request *last = NULL;
 
+               blk_mq_put_ctx(data.ctx);
                blk_mq_bio_to_request(rq, bio);
 
                /*
@@ -1667,13 +1589,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
                 */
                if (list_empty(&plug->mq_list))
                        request_count = 0;
+               else if (blk_queue_nomerges(q))
+                       request_count = blk_plug_queued_count(q);
+
                if (!request_count)
                        trace_block_plug(q);
                else
                        last = list_entry_rq(plug->mq_list.prev);
 
-               blk_mq_put_ctx(data.ctx);
-
                if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
                    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
                        blk_flush_plug_list(plug, false);
@@ -1681,30 +1604,41 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
                }
 
                list_add_tail(&rq->queuelist, &plug->mq_list);
-               return cookie;
-       }
-
-       if (q->elevator) {
-elv_insert:
-               blk_mq_put_ctx(data.ctx);
+       } else if (plug && !blk_queue_nomerges(q)) {
                blk_mq_bio_to_request(rq, bio);
-               blk_mq_sched_insert_request(rq, false, true,
-                                               !is_sync || is_flush_fua, true);
-               goto done;
-       }
-       if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
+
                /*
-                * For a SYNC request, send it to the hardware immediately. For
-                * an ASYNC request, just ensure that we run it later on. The
-                * latter allows for merging opportunities and more efficient
-                * dispatching.
+                * We do limited plugging. If the bio can be merged, do that.
+                * Otherwise the existing request in the plug list will be
+                * issued. So the plug list will have one request at most
+                * The plug list might get flushed before this. If that happens,
+                * the plug list is empty, and same_queue_rq is invalid.
                 */
-run_queue:
-               blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
-       }
+               if (list_empty(&plug->mq_list))
+                       same_queue_rq = NULL;
+               if (same_queue_rq)
+                       list_del_init(&same_queue_rq->queuelist);
+               list_add_tail(&rq->queuelist, &plug->mq_list);
+
+               blk_mq_put_ctx(data.ctx);
+
+               if (same_queue_rq)
+                       blk_mq_try_issue_directly(data.hctx, same_queue_rq,
+                                       &cookie);
+       } else if (q->nr_hw_queues > 1 && is_sync) {
+               blk_mq_put_ctx(data.ctx);
+               blk_mq_bio_to_request(rq, bio);
+               blk_mq_try_issue_directly(data.hctx, rq, &cookie);
+       } else if (q->elevator) {
+               blk_mq_put_ctx(data.ctx);
+               blk_mq_bio_to_request(rq, bio);
+               blk_mq_sched_insert_request(rq, false, true, true, true);
+       } else if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
+               blk_mq_put_ctx(data.ctx);
+               blk_mq_run_hw_queue(data.hctx, true);
+       } else
+               blk_mq_put_ctx(data.ctx);
 
-       blk_mq_put_ctx(data.ctx);
-done:
        return cookie;
 }
 
@@ -1924,6 +1858,8 @@ static void blk_mq_exit_hctx(struct request_queue *q,
                                       hctx->fq->flush_rq, hctx_idx,
                                       flush_start_tag + hctx_idx);
 
+       blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
+
        if (set->ops->exit_hctx)
                set->ops->exit_hctx(hctx, hctx_idx);
 
@@ -1959,8 +1895,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
        if (node == NUMA_NO_NODE)
                node = hctx->numa_node = set->numa_node;
 
-       INIT_WORK(&hctx->run_work, blk_mq_run_work_fn);
-       INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
+       INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
        spin_lock_init(&hctx->lock);
        INIT_LIST_HEAD(&hctx->dispatch);
        hctx->queue = q;
@@ -1990,9 +1925,12 @@ static int blk_mq_init_hctx(struct request_queue *q,
            set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
                goto free_bitmap;
 
+       if (blk_mq_sched_init_hctx(q, hctx, hctx_idx))
+               goto exit_hctx;
+
        hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
        if (!hctx->fq)
-               goto exit_hctx;
+               goto sched_exit_hctx;
 
        if (set->ops->init_request &&
            set->ops->init_request(set->driver_data,
@@ -2007,6 +1945,8 @@ static int blk_mq_init_hctx(struct request_queue *q,
 
  free_fq:
        kfree(hctx->fq);
+ sched_exit_hctx:
+       blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
  exit_hctx:
        if (set->ops->exit_hctx)
                set->ops->exit_hctx(hctx, hctx_idx);
@@ -2032,8 +1972,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
                spin_lock_init(&__ctx->lock);
                INIT_LIST_HEAD(&__ctx->rq_list);
                __ctx->queue = q;
-               blk_stat_init(&__ctx->stat[BLK_STAT_READ]);
-               blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]);
 
                /* If the cpu isn't online, the cpu is mapped to first hctx */
                if (!cpu_online(i))
@@ -2180,6 +2118,8 @@ static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared)
 {
        struct request_queue *q;
 
+       lockdep_assert_held(&set->tag_list_lock);
+
        list_for_each_entry(q, &set->tag_list, tag_set_list) {
                blk_mq_freeze_queue(q);
                queue_set_hctx_shared(q, shared);
@@ -2192,7 +2132,8 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
        struct blk_mq_tag_set *set = q->tag_set;
 
        mutex_lock(&set->tag_list_lock);
-       list_del_init(&q->tag_set_list);
+       list_del_rcu(&q->tag_set_list);
+       INIT_LIST_HEAD(&q->tag_set_list);
        if (list_is_singular(&set->tag_list)) {
                /* just transitioned to unshared */
                set->flags &= ~BLK_MQ_F_TAG_SHARED;
@@ -2200,6 +2141,8 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
                blk_mq_update_tag_set_depth(set, false);
        }
        mutex_unlock(&set->tag_list_lock);
+
+       synchronize_rcu();
 }
 
 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
@@ -2217,7 +2160,7 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
        }
        if (set->flags & BLK_MQ_F_TAG_SHARED)
                queue_set_hctx_shared(q, true);
-       list_add_tail(&q->tag_set_list, &set->tag_list);
+       list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
 
        mutex_unlock(&set->tag_list_lock);
 }
@@ -2233,8 +2176,6 @@ void blk_mq_release(struct request_queue *q)
        struct blk_mq_hw_ctx *hctx;
        unsigned int i;
 
-       blk_mq_sched_teardown(q);
-
        /* hctx kobj stays in hctx */
        queue_for_each_hw_ctx(q, hctx, i) {
                if (!hctx)
@@ -2331,6 +2272,12 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
        /* mark the queue as mq asap */
        q->mq_ops = set->ops;
 
+       q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
+                                            blk_mq_poll_stats_bkt,
+                                            BLK_MQ_POLL_STATS_BKTS, q);
+       if (!q->poll_cb)
+               goto err_exit;
+
        q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
        if (!q->queue_ctx)
                goto err_exit;
@@ -2365,10 +2312,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
        INIT_LIST_HEAD(&q->requeue_list);
        spin_lock_init(&q->requeue_lock);
 
-       if (q->nr_hw_queues > 1)
-               blk_queue_make_request(q, blk_mq_make_request);
-       else
-               blk_queue_make_request(q, blk_sq_make_request);
+       blk_queue_make_request(q, blk_mq_make_request);
 
        /*
         * Do this after blk_queue_make_request() overrides it...
@@ -2423,8 +2367,6 @@ void blk_mq_free_queue(struct request_queue *q)
        list_del_init(&q->all_q_node);
        mutex_unlock(&all_q_mutex);
 
-       wbt_exit(q);
-
        blk_mq_del_queue_tag_set(q);
 
        blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
@@ -2469,7 +2411,7 @@ static void blk_mq_queue_reinit_work(void)
         * take place in parallel.
         */
        list_for_each_entry(q, &all_q_list, all_q_node)
-               blk_mq_freeze_queue_start(q);
+               blk_freeze_queue_start(q);
        list_for_each_entry(q, &all_q_list, all_q_node)
                blk_mq_freeze_queue_wait(q);
 
@@ -2565,6 +2507,14 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
        return 0;
 }
 
+static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
+{
+       if (set->ops->map_queues)
+               return set->ops->map_queues(set);
+       else
+               return blk_mq_map_queues(set);
+}
+
 /*
  * Alloc a tag set to be associated with one or more request queues.
  * May fail with EINVAL for various error conditions. May adjust the
@@ -2619,10 +2569,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
        if (!set->mq_map)
                goto out_free_tags;
 
-       if (set->ops->map_queues)
-               ret = set->ops->map_queues(set);
-       else
-               ret = blk_mq_map_queues(set);
+       ret = blk_mq_update_queue_map(set);
        if (ret)
                goto out_free_mq_map;
 
@@ -2705,6 +2652,8 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 {
        struct request_queue *q;
 
+       lockdep_assert_held(&set->tag_list_lock);
+
        if (nr_hw_queues > nr_cpu_ids)
                nr_hw_queues = nr_cpu_ids;
        if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
@@ -2714,18 +2663,9 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
                blk_mq_freeze_queue(q);
 
        set->nr_hw_queues = nr_hw_queues;
+       blk_mq_update_queue_map(set);
        list_for_each_entry(q, &set->tag_list, tag_set_list) {
                blk_mq_realloc_hw_ctxs(set, q);
-
-               /*
-                * Manually set the make_request_fn as blk_queue_make_request
-                * resets a lot of the queue settings.
-                */
-               if (q->nr_hw_queues > 1)
-                       q->make_request_fn = blk_mq_make_request;
-               else
-                       q->make_request_fn = blk_sq_make_request;
-
                blk_mq_queue_reinit(q, cpu_online_mask);
        }
 
@@ -2734,39 +2674,69 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 }
 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 
+/* Enable polling stats and return whether they were already enabled. */
+static bool blk_poll_stats_enable(struct request_queue *q)
+{
+       if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
+           test_and_set_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
+               return true;
+       blk_stat_add_callback(q, q->poll_cb);
+       return false;
+}
+
+static void blk_mq_poll_stats_start(struct request_queue *q)
+{
+       /*
+        * We don't arm the callback if polling stats are not enabled or the
+        * callback is already active.
+        */
+       if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
+           blk_stat_is_active(q->poll_cb))
+               return;
+
+       blk_stat_activate_msecs(q->poll_cb, 100);
+}
+
+static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
+{
+       struct request_queue *q = cb->data;
+       int bucket;
+
+       for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
+               if (cb->stat[bucket].nr_samples)
+                       q->poll_stat[bucket] = cb->stat[bucket];
+       }
+}
+
 static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
                                       struct blk_mq_hw_ctx *hctx,
                                       struct request *rq)
 {
-       struct blk_rq_stat stat[2];
        unsigned long ret = 0;
+       int bucket;
 
        /*
         * If stats collection isn't on, don't sleep but turn it on for
         * future users
         */
-       if (!blk_stat_enable(q))
+       if (!blk_poll_stats_enable(q))
                return 0;
 
        /*
-        * We don't have to do this once per IO, should optimize this
-        * to just use the current window of stats until it changes
-        */
-       memset(&stat, 0, sizeof(stat));
-       blk_hctx_stat_get(hctx, stat);
-
-       /*
         * As an optimistic guess, use half of the mean service time
         * for this type of request. We can (and should) make this smarter.
         * For instance, if the completion latencies are tight, we can
         * get closer than just half the mean. This is especially
         * important on devices where the completion latencies are longer
-        * than ~10 usec.
+        * than ~10 usec. We do use the stats for the relevant IO size
+        * if available which does lead to better estimates.
         */
-       if (req_op(rq) == REQ_OP_READ && stat[BLK_STAT_READ].nr_samples)
-               ret = (stat[BLK_STAT_READ].mean + 1) / 2;
-       else if (req_op(rq) == REQ_OP_WRITE && stat[BLK_STAT_WRITE].nr_samples)
-               ret = (stat[BLK_STAT_WRITE].mean + 1) / 2;
+       bucket = blk_mq_poll_stats_bkt(rq);
+       if (bucket < 0)
+               return ret;
+
+       if (q->poll_stat[bucket].nr_samples)
+               ret = (q->poll_stat[bucket].mean + 1) / 2;
 
        return ret;
 }
@@ -2889,8 +2859,17 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
        hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
        if (!blk_qc_t_is_internal(cookie))
                rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
-       else
+       else {
                rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
+               /*
+                * With scheduling, if the request has completed, we'll
+                * get a NULL return here, as we clear the sched tag when
+                * that happens. The request still remains valid, like always,
+                * so we should be safe with just the NULL check.
+                */
+               if (!rq)
+                       return false;
+       }
 
        return __blk_mq_poll(hctx, rq);
 }
index b79f9a7..2814a14 100644 (file)
@@ -20,7 +20,6 @@ struct blk_mq_ctx {
 
        /* incremented at completion time */
        unsigned long           ____cacheline_aligned_in_smp rq_completed[2];
-       struct blk_rq_stat      stat[2];
 
        struct request_queue    *queue;
        struct kobject          kobj;
@@ -31,7 +30,7 @@ void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
-bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *);
+bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *);
 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
 bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
 bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
@@ -79,6 +78,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
  */
 extern void blk_mq_sysfs_init(struct request_queue *q);
 extern void blk_mq_sysfs_deinit(struct request_queue *q);
+extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
 extern int blk_mq_sysfs_register(struct request_queue *q);
 extern void blk_mq_sysfs_unregister(struct request_queue *q);
 extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
@@ -87,13 +87,12 @@ extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
  * debugfs helpers
  */
 #ifdef CONFIG_BLK_DEBUG_FS
-int blk_mq_debugfs_register(struct request_queue *q, const char *name);
+int blk_mq_debugfs_register(struct request_queue *q);
 void blk_mq_debugfs_unregister(struct request_queue *q);
-int blk_mq_debugfs_register_hctxs(struct request_queue *q);
-void blk_mq_debugfs_unregister_hctxs(struct request_queue *q);
+int blk_mq_debugfs_register_mq(struct request_queue *q);
+void blk_mq_debugfs_unregister_mq(struct request_queue *q);
 #else
-static inline int blk_mq_debugfs_register(struct request_queue *q,
-                                         const char *name)
+static inline int blk_mq_debugfs_register(struct request_queue *q)
 {
        return 0;
 }
@@ -102,12 +101,12 @@ static inline void blk_mq_debugfs_unregister(struct request_queue *q)
 {
 }
 
-static inline int blk_mq_debugfs_register_hctxs(struct request_queue *q)
+static inline int blk_mq_debugfs_register_mq(struct request_queue *q)
 {
        return 0;
 }
 
-static inline void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
+static inline void blk_mq_debugfs_unregister_mq(struct request_queue *q)
 {
 }
 #endif
@@ -142,6 +141,7 @@ struct blk_mq_alloc_data {
        /* input parameter */
        struct request_queue *q;
        unsigned int flags;
+       unsigned int shallow_depth;
 
        /* input & output parameter */
        struct blk_mq_ctx *ctx;
index 1e7174f..4fa81ed 100644 (file)
@@ -103,7 +103,6 @@ void blk_set_default_limits(struct queue_limits *lim)
        lim->discard_granularity = 0;
        lim->discard_alignment = 0;
        lim->discard_misaligned = 0;
-       lim->discard_zeroes_data = 0;
        lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
        lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
        lim->alignment_offset = 0;
@@ -127,7 +126,6 @@ void blk_set_stacking_limits(struct queue_limits *lim)
        blk_set_default_limits(lim);
 
        /* Inherit limits from component devices */
-       lim->discard_zeroes_data = 1;
        lim->max_segments = USHRT_MAX;
        lim->max_discard_segments = 1;
        lim->max_hw_sectors = UINT_MAX;
@@ -609,7 +607,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
        t->io_opt = lcm_not_zero(t->io_opt, b->io_opt);
 
        t->cluster &= b->cluster;
-       t->discard_zeroes_data &= b->discard_zeroes_data;
 
        /* Physical block size a multiple of the logical block size? */
        if (t->physical_block_size & (t->logical_block_size - 1)) {
index 186fcb9..6c2f409 100644 (file)
@@ -4,10 +4,27 @@
  * Copyright (C) 2016 Jens Axboe
  */
 #include <linux/kernel.h>
+#include <linux/rculist.h>
 #include <linux/blk-mq.h>
 
 #include "blk-stat.h"
 #include "blk-mq.h"
+#include "blk.h"
+
+#define BLK_RQ_STAT_BATCH      64
+
+struct blk_queue_stats {
+       struct list_head callbacks;
+       spinlock_t lock;
+       bool enable_accounting;
+};
+
+static void blk_stat_init(struct blk_rq_stat *stat)
+{
+       stat->min = -1ULL;
+       stat->max = stat->nr_samples = stat->mean = 0;
+       stat->batch = stat->nr_batch = 0;
+}
 
 static void blk_stat_flush_batch(struct blk_rq_stat *stat)
 {
@@ -48,209 +65,185 @@ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
        dst->nr_samples += src->nr_samples;
 }
 
-static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+static void __blk_stat_add(struct blk_rq_stat *stat, u64 value)
 {
-       struct blk_mq_hw_ctx *hctx;
-       struct blk_mq_ctx *ctx;
-       uint64_t latest = 0;
-       int i, j, nr;
-
-       blk_stat_init(&dst[BLK_STAT_READ]);
-       blk_stat_init(&dst[BLK_STAT_WRITE]);
-
-       nr = 0;
-       do {
-               uint64_t newest = 0;
-
-               queue_for_each_hw_ctx(q, hctx, i) {
-                       hctx_for_each_ctx(hctx, ctx, j) {
-                               blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]);
-                               blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]);
-
-                               if (!ctx->stat[BLK_STAT_READ].nr_samples &&
-                                   !ctx->stat[BLK_STAT_WRITE].nr_samples)
-                                       continue;
-                               if (ctx->stat[BLK_STAT_READ].time > newest)
-                                       newest = ctx->stat[BLK_STAT_READ].time;
-                               if (ctx->stat[BLK_STAT_WRITE].time > newest)
-                                       newest = ctx->stat[BLK_STAT_WRITE].time;
-                       }
-               }
+       stat->min = min(stat->min, value);
+       stat->max = max(stat->max, value);
 
-               /*
-                * No samples
-                */
-               if (!newest)
-                       break;
-
-               if (newest > latest)
-                       latest = newest;
-
-               queue_for_each_hw_ctx(q, hctx, i) {
-                       hctx_for_each_ctx(hctx, ctx, j) {
-                               if (ctx->stat[BLK_STAT_READ].time == newest) {
-                                       blk_stat_sum(&dst[BLK_STAT_READ],
-                                                    &ctx->stat[BLK_STAT_READ]);
-                                       nr++;
-                               }
-                               if (ctx->stat[BLK_STAT_WRITE].time == newest) {
-                                       blk_stat_sum(&dst[BLK_STAT_WRITE],
-                                                    &ctx->stat[BLK_STAT_WRITE]);
-                                       nr++;
-                               }
-                       }
-               }
-               /*
-                * If we race on finding an entry, just loop back again.
-                * Should be very rare.
-                */
-       } while (!nr);
+       if (stat->batch + value < stat->batch ||
+           stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
+               blk_stat_flush_batch(stat);
 
-       dst[BLK_STAT_READ].time = dst[BLK_STAT_WRITE].time = latest;
+       stat->batch += value;
+       stat->nr_batch++;
 }
 
-void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+void blk_stat_add(struct request *rq)
 {
-       if (q->mq_ops)
-               blk_mq_stat_get(q, dst);
-       else {
-               blk_stat_flush_batch(&q->rq_stats[BLK_STAT_READ]);
-               blk_stat_flush_batch(&q->rq_stats[BLK_STAT_WRITE]);
-               memcpy(&dst[BLK_STAT_READ], &q->rq_stats[BLK_STAT_READ],
-                               sizeof(struct blk_rq_stat));
-               memcpy(&dst[BLK_STAT_WRITE], &q->rq_stats[BLK_STAT_WRITE],
-                               sizeof(struct blk_rq_stat));
+       struct request_queue *q = rq->q;
+       struct blk_stat_callback *cb;
+       struct blk_rq_stat *stat;
+       int bucket;
+       s64 now, value;
+
+       now = __blk_stat_time(ktime_to_ns(ktime_get()));
+       if (now < blk_stat_time(&rq->issue_stat))
+               return;
+
+       value = now - blk_stat_time(&rq->issue_stat);
+
+       blk_throtl_stat_add(rq, value);
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(cb, &q->stats->callbacks, list) {
+               if (blk_stat_is_active(cb)) {
+                       bucket = cb->bucket_fn(rq);
+                       if (bucket < 0)
+                               continue;
+                       stat = &this_cpu_ptr(cb->cpu_stat)[bucket];
+                       __blk_stat_add(stat, value);
+               }
        }
+       rcu_read_unlock();
 }
 
-void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
+static void blk_stat_timer_fn(unsigned long data)
 {
-       struct blk_mq_ctx *ctx;
-       unsigned int i, nr;
+       struct blk_stat_callback *cb = (void *)data;
+       unsigned int bucket;
+       int cpu;
 
-       nr = 0;
-       do {
-               uint64_t newest = 0;
+       for (bucket = 0; bucket < cb->buckets; bucket++)
+               blk_stat_init(&cb->stat[bucket]);
 
-               hctx_for_each_ctx(hctx, ctx, i) {
-                       blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]);
-                       blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]);
+       for_each_online_cpu(cpu) {
+               struct blk_rq_stat *cpu_stat;
 
-                       if (!ctx->stat[BLK_STAT_READ].nr_samples &&
-                           !ctx->stat[BLK_STAT_WRITE].nr_samples)
-                               continue;
-
-                       if (ctx->stat[BLK_STAT_READ].time > newest)
-                               newest = ctx->stat[BLK_STAT_READ].time;
-                       if (ctx->stat[BLK_STAT_WRITE].time > newest)
-                               newest = ctx->stat[BLK_STAT_WRITE].time;
+               cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
+               for (bucket = 0; bucket < cb->buckets; bucket++) {
+                       blk_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]);
+                       blk_stat_init(&cpu_stat[bucket]);
                }
+       }
 
-               if (!newest)
-                       break;
-
-               hctx_for_each_ctx(hctx, ctx, i) {
-                       if (ctx->stat[BLK_STAT_READ].time == newest) {
-                               blk_stat_sum(&dst[BLK_STAT_READ],
-                                               &ctx->stat[BLK_STAT_READ]);
-                               nr++;
-                       }
-                       if (ctx->stat[BLK_STAT_WRITE].time == newest) {
-                               blk_stat_sum(&dst[BLK_STAT_WRITE],
-                                               &ctx->stat[BLK_STAT_WRITE]);
-                               nr++;
-                       }
-               }
-               /*
-                * If we race on finding an entry, just loop back again.
-                * Should be very rare, as the window is only updated
-                * occasionally
-                */
-       } while (!nr);
+       cb->timer_fn(cb);
 }
 
-static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now)
+struct blk_stat_callback *
+blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
+                       int (*bucket_fn)(const struct request *),
+                       unsigned int buckets, void *data)
 {
-       stat->min = -1ULL;
-       stat->max = stat->nr_samples = stat->mean = 0;
-       stat->batch = stat->nr_batch = 0;
-       stat->time = time_now & BLK_STAT_NSEC_MASK;
-}
+       struct blk_stat_callback *cb;
 
-void blk_stat_init(struct blk_rq_stat *stat)
-{
-       __blk_stat_init(stat, ktime_to_ns(ktime_get()));
-}
+       cb = kmalloc(sizeof(*cb), GFP_KERNEL);
+       if (!cb)
+               return NULL;
 
-static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now)
-{
-       return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK);
+       cb->stat = kmalloc_array(buckets, sizeof(struct blk_rq_stat),
+                                GFP_KERNEL);
+       if (!cb->stat) {
+               kfree(cb);
+               return NULL;
+       }
+       cb->cpu_stat = __alloc_percpu(buckets * sizeof(struct blk_rq_stat),
+                                     __alignof__(struct blk_rq_stat));
+       if (!cb->cpu_stat) {
+               kfree(cb->stat);
+               kfree(cb);
+               return NULL;
+       }
+
+       cb->timer_fn = timer_fn;
+       cb->bucket_fn = bucket_fn;
+       cb->data = data;
+       cb->buckets = buckets;
+       setup_timer(&cb->timer, blk_stat_timer_fn, (unsigned long)cb);
+
+       return cb;
 }
+EXPORT_SYMBOL_GPL(blk_stat_alloc_callback);
 
-bool blk_stat_is_current(struct blk_rq_stat *stat)
+void blk_stat_add_callback(struct request_queue *q,
+                          struct blk_stat_callback *cb)
 {
-       return __blk_stat_is_current(stat, ktime_to_ns(ktime_get()));
+       unsigned int bucket;
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct blk_rq_stat *cpu_stat;
+
+               cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
+               for (bucket = 0; bucket < cb->buckets; bucket++)
+                       blk_stat_init(&cpu_stat[bucket]);
+       }
+
+       spin_lock(&q->stats->lock);
+       list_add_tail_rcu(&cb->list, &q->stats->callbacks);
+       set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+       spin_unlock(&q->stats->lock);
 }
+EXPORT_SYMBOL_GPL(blk_stat_add_callback);
 
-void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
+void blk_stat_remove_callback(struct request_queue *q,
+                             struct blk_stat_callback *cb)
 {
-       s64 now, value;
+       spin_lock(&q->stats->lock);
+       list_del_rcu(&cb->list);
+       if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting)
+               clear_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+       spin_unlock(&q->stats->lock);
 
-       now = __blk_stat_time(ktime_to_ns(ktime_get()));
-       if (now < blk_stat_time(&rq->issue_stat))
-               return;
-
-       if (!__blk_stat_is_current(stat, now))
-               __blk_stat_init(stat, now);
+       del_timer_sync(&cb->timer);
+}
+EXPORT_SYMBOL_GPL(blk_stat_remove_callback);
 
-       value = now - blk_stat_time(&rq->issue_stat);
-       if (value > stat->max)
-               stat->max = value;
-       if (value < stat->min)
-               stat->min = value;
+static void blk_stat_free_callback_rcu(struct rcu_head *head)
+{
+       struct blk_stat_callback *cb;
 
-       if (stat->batch + value < stat->batch ||
-           stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
-               blk_stat_flush_batch(stat);
+       cb = container_of(head, struct blk_stat_callback, rcu);
+       free_percpu(cb->cpu_stat);
+       kfree(cb->stat);
+       kfree(cb);
+}
 
-       stat->batch += value;
-       stat->nr_batch++;
+void blk_stat_free_callback(struct blk_stat_callback *cb)
+{
+       if (cb)
+               call_rcu(&cb->rcu, blk_stat_free_callback_rcu);
 }
+EXPORT_SYMBOL_GPL(blk_stat_free_callback);
 
-void blk_stat_clear(struct request_queue *q)
+void blk_stat_enable_accounting(struct request_queue *q)
 {
-       if (q->mq_ops) {
-               struct blk_mq_hw_ctx *hctx;
-               struct blk_mq_ctx *ctx;
-               int i, j;
-
-               queue_for_each_hw_ctx(q, hctx, i) {
-                       hctx_for_each_ctx(hctx, ctx, j) {
-                               blk_stat_init(&ctx->stat[BLK_STAT_READ]);
-                               blk_stat_init(&ctx->stat[BLK_STAT_WRITE]);
-                       }
-               }
-       } else {
-               blk_stat_init(&q->rq_stats[BLK_STAT_READ]);
-               blk_stat_init(&q->rq_stats[BLK_STAT_WRITE]);
-       }
+       spin_lock(&q->stats->lock);
+       q->stats->enable_accounting = true;
+       set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+       spin_unlock(&q->stats->lock);
 }
 
-void blk_stat_set_issue_time(struct blk_issue_stat *stat)
+struct blk_queue_stats *blk_alloc_queue_stats(void)
 {
-       stat->time = (stat->time & BLK_STAT_MASK) |
-                       (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK);
+       struct blk_queue_stats *stats;
+
+       stats = kmalloc(sizeof(*stats), GFP_KERNEL);
+       if (!stats)
+               return NULL;
+
+       INIT_LIST_HEAD(&stats->callbacks);
+       spin_lock_init(&stats->lock);
+       stats->enable_accounting = false;
+
+       return stats;
 }
 
-/*
- * Enable stat tracking, return whether it was enabled
- */
-bool blk_stat_enable(struct request_queue *q)
+void blk_free_queue_stats(struct blk_queue_stats *stats)
 {
-       if (!test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
-               set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
-               return false;
-       }
+       if (!stats)
+               return;
+
+       WARN_ON(!list_empty(&stats->callbacks));
 
-       return true;
+       kfree(stats);
 }
index a2050a0..2fb20d1 100644 (file)
@@ -1,33 +1,85 @@
 #ifndef BLK_STAT_H
 #define BLK_STAT_H
 
-/*
- * ~0.13s window as a power-of-2 (2^27 nsecs)
- */
-#define BLK_STAT_NSEC          134217728ULL
-#define BLK_STAT_NSEC_MASK     ~(BLK_STAT_NSEC - 1)
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/ktime.h>
+#include <linux/rcupdate.h>
+#include <linux/timer.h>
 
 /*
- * Upper 3 bits can be used elsewhere
+ * from upper:
+ * 3 bits: reserved for other usage
+ * 12 bits: size
+ * 49 bits: time
  */
 #define BLK_STAT_RES_BITS      3
-#define BLK_STAT_SHIFT         (64 - BLK_STAT_RES_BITS)
-#define BLK_STAT_TIME_MASK     ((1ULL << BLK_STAT_SHIFT) - 1)
-#define BLK_STAT_MASK          ~BLK_STAT_TIME_MASK
+#define BLK_STAT_SIZE_BITS     12
+#define BLK_STAT_RES_SHIFT     (64 - BLK_STAT_RES_BITS)
+#define BLK_STAT_SIZE_SHIFT    (BLK_STAT_RES_SHIFT - BLK_STAT_SIZE_BITS)
+#define BLK_STAT_TIME_MASK     ((1ULL << BLK_STAT_SIZE_SHIFT) - 1)
+#define BLK_STAT_SIZE_MASK     \
+       (((1ULL << BLK_STAT_SIZE_BITS) - 1) << BLK_STAT_SIZE_SHIFT)
+#define BLK_STAT_RES_MASK      (~((1ULL << BLK_STAT_RES_SHIFT) - 1))
+
+/**
+ * struct blk_stat_callback - Block statistics callback.
+ *
+ * A &struct blk_stat_callback is associated with a &struct request_queue. While
+ * @timer is active, that queue's request completion latencies are sorted into
+ * buckets by @bucket_fn and added to a per-cpu buffer, @cpu_stat. When the
+ * timer fires, @cpu_stat is flushed to @stat and @timer_fn is invoked.
+ */
+struct blk_stat_callback {
+       /*
+        * @list: RCU list of callbacks for a &struct request_queue.
+        */
+       struct list_head list;
+
+       /**
+        * @timer: Timer for the next callback invocation.
+        */
+       struct timer_list timer;
+
+       /**
+        * @cpu_stat: Per-cpu statistics buckets.
+        */
+       struct blk_rq_stat __percpu *cpu_stat;
+
+       /**
+        * @bucket_fn: Given a request, returns which statistics bucket it
+        * should be accounted under. Return -1 for no bucket for this
+        * request.
+        */
+       int (*bucket_fn)(const struct request *);
+
+       /**
+        * @buckets: Number of statistics buckets.
+        */
+       unsigned int buckets;
+
+       /**
+        * @stat: Array of statistics buckets.
+        */
+       struct blk_rq_stat *stat;
+
+       /**
+        * @timer_fn: Callback function.
+        */
+       void (*timer_fn)(struct blk_stat_callback *);
+
+       /**
+        * @data: Private pointer for the user.
+        */
+       void *data;
 
-enum {
-       BLK_STAT_READ   = 0,
-       BLK_STAT_WRITE,
+       struct rcu_head rcu;
 };
 
-void blk_stat_add(struct blk_rq_stat *, struct request *);
-void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *);
-void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *);
-void blk_stat_clear(struct request_queue *);
-void blk_stat_init(struct blk_rq_stat *);
-bool blk_stat_is_current(struct blk_rq_stat *);
-void blk_stat_set_issue_time(struct blk_issue_stat *);
-bool blk_stat_enable(struct request_queue *);
+struct blk_queue_stats *blk_alloc_queue_stats(void);
+void blk_free_queue_stats(struct blk_queue_stats *);
+
+void blk_stat_add(struct request *);
 
 static inline u64 __blk_stat_time(u64 time)
 {
@@ -36,7 +88,117 @@ static inline u64 __blk_stat_time(u64 time)
 
 static inline u64 blk_stat_time(struct blk_issue_stat *stat)
 {
-       return __blk_stat_time(stat->time);
+       return __blk_stat_time(stat->stat);
+}
+
+static inline sector_t blk_capped_size(sector_t size)
+{
+       return size & ((1ULL << BLK_STAT_SIZE_BITS) - 1);
+}
+
+static inline sector_t blk_stat_size(struct blk_issue_stat *stat)
+{
+       return (stat->stat & BLK_STAT_SIZE_MASK) >> BLK_STAT_SIZE_SHIFT;
+}
+
+static inline void blk_stat_set_issue(struct blk_issue_stat *stat,
+       sector_t size)
+{
+       stat->stat = (stat->stat & BLK_STAT_RES_MASK) |
+               (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK) |
+               (((u64)blk_capped_size(size)) << BLK_STAT_SIZE_SHIFT);
+}
+
+/* Record time/size info in requests without adding a callback. */
+void blk_stat_enable_accounting(struct request_queue *q);
+
+/**
+ * blk_stat_alloc_callback() - Allocate a block statistics callback.
+ * @timer_fn: Timer callback function.
+ * @bucket_fn: Bucket callback function.
+ * @buckets: Number of statistics buckets.
+ * @data: Value for the @data field of the &struct blk_stat_callback.
+ *
+ * See &struct blk_stat_callback for details on the callback functions.
+ *
+ * Return: &struct blk_stat_callback on success or NULL on ENOMEM.
+ */
+struct blk_stat_callback *
+blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
+                       int (*bucket_fn)(const struct request *),
+                       unsigned int buckets, void *data);
+
+/**
+ * blk_stat_add_callback() - Add a block statistics callback to be run on a
+ * request queue.
+ * @q: The request queue.
+ * @cb: The callback.
+ *
+ * Note that a single &struct blk_stat_callback can only be added to a single
+ * &struct request_queue.
+ */
+void blk_stat_add_callback(struct request_queue *q,
+                          struct blk_stat_callback *cb);
+
+/**
+ * blk_stat_remove_callback() - Remove a block statistics callback from a
+ * request queue.
+ * @q: The request queue.
+ * @cb: The callback.
+ *
+ * When this returns, the callback is not running on any CPUs and will not be
+ * called again unless readded.
+ */
+void blk_stat_remove_callback(struct request_queue *q,
+                             struct blk_stat_callback *cb);
+
+/**
+ * blk_stat_free_callback() - Free a block statistics callback.
+ * @cb: The callback.
+ *
+ * @cb may be NULL, in which case this does nothing. If it is not NULL, @cb must
+ * not be associated with a request queue. I.e., if it was previously added with
+ * blk_stat_add_callback(), it must also have been removed since then with
+ * blk_stat_remove_callback().
+ */
+void blk_stat_free_callback(struct blk_stat_callback *cb);
+
+/**
+ * blk_stat_is_active() - Check if a block statistics callback is currently
+ * gathering statistics.
+ * @cb: The callback.
+ */
+static inline bool blk_stat_is_active(struct blk_stat_callback *cb)
+{
+       return timer_pending(&cb->timer);
+}
+
+/**
+ * blk_stat_activate_nsecs() - Gather block statistics during a time window in
+ * nanoseconds.
+ * @cb: The callback.
+ * @nsecs: Number of nanoseconds to gather statistics for.
+ *
+ * The timer callback will be called when the window expires.
+ */
+static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb,
+                                          u64 nsecs)
+{
+       mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs));
+}
+
+/**
+ * blk_stat_activate_msecs() - Gather block statistics during a time window in
+ * milliseconds.
+ * @cb: The callback.
+ * @msecs: Number of milliseconds to gather statistics for.
+ *
+ * The timer callback will be called when the window expires.
+ */
+static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb,
+                                          unsigned int msecs)
+{
+       mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs));
 }
 
 #endif
index c44b321..3f37813 100644 (file)
@@ -208,7 +208,7 @@ static ssize_t queue_discard_max_store(struct request_queue *q,
 
 static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
 {
-       return queue_var_show(queue_discard_zeroes_data(q), page);
+       return queue_var_show(0, page);
 }
 
 static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
@@ -503,26 +503,6 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
        return queue_var_show(blk_queue_dax(q), page);
 }
 
-static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
-{
-       return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
-                       pre, (long long) stat->nr_samples,
-                       (long long) stat->mean, (long long) stat->min,
-                       (long long) stat->max);
-}
-
-static ssize_t queue_stats_show(struct request_queue *q, char *page)
-{
-       struct blk_rq_stat stat[2];
-       ssize_t ret;
-
-       blk_queue_stat_get(q, stat);
-
-       ret = print_stat(page, &stat[BLK_STAT_READ], "read :");
-       ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:");
-       return ret;
-}
-
 static struct queue_sysfs_entry queue_requests_entry = {
        .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
        .show = queue_requests_show,
@@ -691,17 +671,20 @@ static struct queue_sysfs_entry queue_dax_entry = {
        .show = queue_dax_show,
 };
 
-static struct queue_sysfs_entry queue_stats_entry = {
-       .attr = {.name = "stats", .mode = S_IRUGO },
-       .show = queue_stats_show,
-};
-
 static struct queue_sysfs_entry queue_wb_lat_entry = {
        .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
        .show = queue_wb_lat_show,
        .store = queue_wb_lat_store,
 };
 
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static struct queue_sysfs_entry throtl_sample_time_entry = {
+       .attr = {.name = "throttle_sample_time", .mode = S_IRUGO | S_IWUSR },
+       .show = blk_throtl_sample_time_show,
+       .store = blk_throtl_sample_time_store,
+};
+#endif
+
 static struct attribute *default_attrs[] = {
        &queue_requests_entry.attr,
        &queue_ra_entry.attr,
@@ -733,9 +716,11 @@ static struct attribute *default_attrs[] = {
        &queue_poll_entry.attr,
        &queue_wc_entry.attr,
        &queue_dax_entry.attr,
-       &queue_stats_entry.attr,
        &queue_wb_lat_entry.attr,
        &queue_poll_delay_entry.attr,
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+       &throtl_sample_time_entry.attr,
+#endif
        NULL,
 };
 
@@ -810,15 +795,19 @@ static void blk_release_queue(struct kobject *kobj)
        struct request_queue *q =
                container_of(kobj, struct request_queue, kobj);
 
-       wbt_exit(q);
+       if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
+               blk_stat_remove_callback(q, q->poll_cb);
+       blk_stat_free_callback(q->poll_cb);
        bdi_put(q->backing_dev_info);
        blkcg_exit_queue(q);
 
        if (q->elevator) {
                ioc_clear_queue(q);
-               elevator_exit(q->elevator);
+               elevator_exit(q, q->elevator);
        }
 
+       blk_free_queue_stats(q->stats);
+
        blk_exit_rl(&q->root_rl);
 
        if (q->queue_tags)
@@ -855,23 +844,6 @@ struct kobj_type blk_queue_ktype = {
        .release        = blk_release_queue,
 };
 
-static void blk_wb_init(struct request_queue *q)
-{
-#ifndef CONFIG_BLK_WBT_MQ
-       if (q->mq_ops)
-               return;
-#endif
-#ifndef CONFIG_BLK_WBT_SQ
-       if (q->request_fn)
-               return;
-#endif
-
-       /*
-        * If this fails, we don't get throttling
-        */
-       wbt_init(q);
-}
-
 int blk_register_queue(struct gendisk *disk)
 {
        int ret;
@@ -881,6 +853,11 @@ int blk_register_queue(struct gendisk *disk)
        if (WARN_ON(!q))
                return -ENXIO;
 
+       WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags),
+                 "%s is registering an already registered queue\n",
+                 kobject_name(&dev->kobj));
+       queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q);
+
        /*
         * SCSI probing may synchronously create and destroy a lot of
         * request_queues for non-existent devices.  Shutting down a fully
@@ -900,9 +877,6 @@ int blk_register_queue(struct gendisk *disk)
        if (ret)
                return ret;
 
-       if (q->mq_ops)
-               blk_mq_register_dev(dev, q);
-
        /* Prevent changes through sysfs until registration is completed. */
        mutex_lock(&q->sysfs_lock);
 
@@ -912,9 +886,14 @@ int blk_register_queue(struct gendisk *disk)
                goto unlock;
        }
 
+       if (q->mq_ops)
+               __blk_mq_register_dev(dev, q);
+
        kobject_uevent(&q->kobj, KOBJ_ADD);
 
-       blk_wb_init(q);
+       wbt_enable_default(q);
+
+       blk_throtl_register_queue(q);
 
        if (q->request_fn || (q->mq_ops && q->elevator)) {
                ret = elv_register_queue(q);
@@ -939,6 +918,11 @@ void blk_unregister_queue(struct gendisk *disk)
        if (WARN_ON(!q))
                return;
 
+       queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q);
+
+       wbt_exit(q);
+
+
        if (q->mq_ops)
                blk_mq_unregister_dev(disk_to_dev(disk), q);
 
index 8fab716..b78db2e 100644 (file)
@@ -18,8 +18,17 @@ static int throtl_grp_quantum = 8;
 /* Total max dispatch from all groups in one round */
 static int throtl_quantum = 32;
 
-/* Throttling is performed over 100ms slice and after that slice is renewed */
-static unsigned long throtl_slice = HZ/10;     /* 100 ms */
+/* Throttling is performed over a slice and after that slice is renewed */
+#define DFL_THROTL_SLICE_HD (HZ / 10)
+#define DFL_THROTL_SLICE_SSD (HZ / 50)
+#define MAX_THROTL_SLICE (HZ)
+#define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */
+#define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */
+#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
+/* default latency target is 0, eg, guarantee IO latency by default */
+#define DFL_LATENCY_TARGET (0)
+
+#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
 
 static struct blkcg_policy blkcg_policy_throtl;
 
@@ -83,6 +92,12 @@ enum tg_state_flags {
 
 #define rb_entry_tg(node)      rb_entry((node), struct throtl_grp, rb_node)
 
+enum {
+       LIMIT_LOW,
+       LIMIT_MAX,
+       LIMIT_CNT,
+};
+
 struct throtl_grp {
        /* must be the first member */
        struct blkg_policy_data pd;
@@ -119,20 +134,54 @@ struct throtl_grp {
        /* are there any throtl rules between this group and td? */
        bool has_rules[2];
 
-       /* bytes per second rate limits */
-       uint64_t bps[2];
+       /* internally used bytes per second rate limits */
+       uint64_t bps[2][LIMIT_CNT];
+       /* user configured bps limits */
+       uint64_t bps_conf[2][LIMIT_CNT];
 
-       /* IOPS limits */
-       unsigned int iops[2];
+       /* internally used IOPS limits */
+       unsigned int iops[2][LIMIT_CNT];
+       /* user configured IOPS limits */
+       unsigned int iops_conf[2][LIMIT_CNT];
 
        /* Number of bytes disptached in current slice */
        uint64_t bytes_disp[2];
        /* Number of bio's dispatched in current slice */
        unsigned int io_disp[2];
 
+       unsigned long last_low_overflow_time[2];
+
+       uint64_t last_bytes_disp[2];
+       unsigned int last_io_disp[2];
+
+       unsigned long last_check_time;
+
+       unsigned long latency_target; /* us */
        /* When did we start a new slice */
        unsigned long slice_start[2];
        unsigned long slice_end[2];
+
+       unsigned long last_finish_time; /* ns / 1024 */
+       unsigned long checked_last_finish_time; /* ns / 1024 */
+       unsigned long avg_idletime; /* ns / 1024 */
+       unsigned long idletime_threshold; /* us */
+
+       unsigned int bio_cnt; /* total bios */
+       unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
+       unsigned long bio_cnt_reset_time;
+};
+
+/* We measure latency for request size from <= 4k to >= 1M */
+#define LATENCY_BUCKET_SIZE 9
+
+struct latency_bucket {
+       unsigned long total_latency; /* ns / 1024 */
+       int samples;
+};
+
+struct avg_latency_bucket {
+       unsigned long latency; /* ns / 1024 */
+       bool valid;
 };
 
 struct throtl_data
@@ -145,8 +194,26 @@ struct throtl_data
        /* Total Number of queued bios on READ and WRITE lists */
        unsigned int nr_queued[2];
 
+       unsigned int throtl_slice;
+
        /* Work for dispatching throttled bios */
        struct work_struct dispatch_work;
+       unsigned int limit_index;
+       bool limit_valid[LIMIT_CNT];
+
+       unsigned long dft_idletime_threshold; /* us */
+
+       unsigned long low_upgrade_time;
+       unsigned long low_downgrade_time;
+
+       unsigned int scale;
+
+       struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
+       struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
+       struct latency_bucket __percpu *latency_buckets;
+       unsigned long last_calculate_time;
+
+       bool track_bio_latency;
 };
 
 static void throtl_pending_timer_fn(unsigned long arg);
@@ -198,6 +265,76 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
                return container_of(sq, struct throtl_data, service_queue);
 }
 
+/*
+ * A cgroup's limit in LIMIT_MAX is scaled if its low limit is set. The scaling
+ * makes the IO dispatch smoother.
+ * Scale up: linearly scale up according to elapsed time since upgrade. For
+ *           every throtl_slice, the limit grows by 1/2 of the .low limit
+ *           until it hits the .max limit.
+ * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit
+ */
+static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td)
+{
+       /* arbitrary value to avoid too big scale */
+       if (td->scale < 4096 && time_after_eq(jiffies,
+           td->low_upgrade_time + td->scale * td->throtl_slice))
+               td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice;
+
+       return low + (low >> 1) * td->scale;
+}
+
+static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
+{
+       struct blkcg_gq *blkg = tg_to_blkg(tg);
+       struct throtl_data *td;
+       uint64_t ret;
+
+       if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
+               return U64_MAX;
+
+       td = tg->td;
+       ret = tg->bps[rw][td->limit_index];
+       if (ret == 0 && td->limit_index == LIMIT_LOW)
+               return tg->bps[rw][LIMIT_MAX];
+
+       if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&
+           tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
+               uint64_t adjusted;
+
+               adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td);
+               ret = min(tg->bps[rw][LIMIT_MAX], adjusted);
+       }
+       return ret;
+}
+
+static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
+{
+       struct blkcg_gq *blkg = tg_to_blkg(tg);
+       struct throtl_data *td;
+       unsigned int ret;
+
+       if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
+               return UINT_MAX;
+       td = tg->td;
+       ret = tg->iops[rw][td->limit_index];
+       if (ret == 0 && tg->td->limit_index == LIMIT_LOW)
+               return tg->iops[rw][LIMIT_MAX];
+
+       if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&
+           tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
+               uint64_t adjusted;
+
+               adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td);
+               if (adjusted > UINT_MAX)
+                       adjusted = UINT_MAX;
+               ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted);
+       }
+       return ret;
+}
+
+#define request_bucket_index(sectors) \
+       clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1)
+
 /**
  * throtl_log - log debug message via blktrace
  * @sq: the service_queue being reported
@@ -334,10 +471,17 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
        }
 
        RB_CLEAR_NODE(&tg->rb_node);
-       tg->bps[READ] = -1;
-       tg->bps[WRITE] = -1;
-       tg->iops[READ] = -1;
-       tg->iops[WRITE] = -1;
+       tg->bps[READ][LIMIT_MAX] = U64_MAX;
+       tg->bps[WRITE][LIMIT_MAX] = U64_MAX;
+       tg->iops[READ][LIMIT_MAX] = UINT_MAX;
+       tg->iops[WRITE][LIMIT_MAX] = UINT_MAX;
+       tg->bps_conf[READ][LIMIT_MAX] = U64_MAX;
+       tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
+       tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
+       tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
+       /* LIMIT_LOW will have default value 0 */
+
+       tg->latency_target = DFL_LATENCY_TARGET;
 
        return &tg->pd;
 }
@@ -366,6 +510,8 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
        if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
                sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
        tg->td = td;
+
+       tg->idletime_threshold = td->dft_idletime_threshold;
 }
 
 /*
@@ -376,20 +522,59 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
 static void tg_update_has_rules(struct throtl_grp *tg)
 {
        struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
+       struct throtl_data *td = tg->td;
        int rw;
 
        for (rw = READ; rw <= WRITE; rw++)
                tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
-                                   (tg->bps[rw] != -1 || tg->iops[rw] != -1);
+                       (td->limit_valid[td->limit_index] &&
+                        (tg_bps_limit(tg, rw) != U64_MAX ||
+                         tg_iops_limit(tg, rw) != UINT_MAX));
 }
 
 static void throtl_pd_online(struct blkg_policy_data *pd)
 {
+       struct throtl_grp *tg = pd_to_tg(pd);
        /*
         * We don't want new groups to escape the limits of its ancestors.
         * Update has_rules[] after a new group is brought online.
         */
-       tg_update_has_rules(pd_to_tg(pd));
+       tg_update_has_rules(tg);
+}
+
+static void blk_throtl_update_limit_valid(struct throtl_data *td)
+{
+       struct cgroup_subsys_state *pos_css;
+       struct blkcg_gq *blkg;
+       bool low_valid = false;
+
+       rcu_read_lock();
+       blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+               struct throtl_grp *tg = blkg_to_tg(blkg);
+
+               if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
+                   tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
+                       low_valid = true;
+       }
+       rcu_read_unlock();
+
+       td->limit_valid[LIMIT_LOW] = low_valid;
+}
+
+static void throtl_upgrade_state(struct throtl_data *td);
+static void throtl_pd_offline(struct blkg_policy_data *pd)
+{
+       struct throtl_grp *tg = pd_to_tg(pd);
+
+       tg->bps[READ][LIMIT_LOW] = 0;
+       tg->bps[WRITE][LIMIT_LOW] = 0;
+       tg->iops[READ][LIMIT_LOW] = 0;
+       tg->iops[WRITE][LIMIT_LOW] = 0;
+
+       blk_throtl_update_limit_valid(tg->td);
+
+       if (!tg->td->limit_valid[tg->td->limit_index])
+               throtl_upgrade_state(tg->td);
 }
 
 static void throtl_pd_free(struct blkg_policy_data *pd)
@@ -499,6 +684,17 @@ static void throtl_dequeue_tg(struct throtl_grp *tg)
 static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
                                          unsigned long expires)
 {
+       unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice;
+
+       /*
+        * The throttle limit is adjusted dynamically, so a sleep time
+        * computed from the previous limit may be stale, and no other cgroup
+        * may have IO running to notify the change. Cap the sleep to avoid a
+        * missed notification. sq_to_td() is used since sq_to_tg() can be
+        * NULL (presumably for the top-level service queue).
+        */
+       if (time_after(expires, max_expire))
+               expires = max_expire;
        mod_timer(&sq->pending_timer, expires);
        throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
                   expires - jiffies, jiffies);
@@ -556,7 +752,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
        if (time_after_eq(start, tg->slice_start[rw]))
                tg->slice_start[rw] = start;
 
-       tg->slice_end[rw] = jiffies + throtl_slice;
+       tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
        throtl_log(&tg->service_queue,
                   "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
                   rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -568,7 +764,7 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
        tg->bytes_disp[rw] = 0;
        tg->io_disp[rw] = 0;
        tg->slice_start[rw] = jiffies;
-       tg->slice_end[rw] = jiffies + throtl_slice;
+       tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
        throtl_log(&tg->service_queue,
                   "[%c] new slice start=%lu end=%lu jiffies=%lu",
                   rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -578,13 +774,13 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
 static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
                                        unsigned long jiffy_end)
 {
-       tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
+       tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
 }
 
 static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
                                       unsigned long jiffy_end)
 {
-       tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
+       tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
        throtl_log(&tg->service_queue,
                   "[%c] extend slice start=%lu end=%lu jiffies=%lu",
                   rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -624,19 +820,20 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
         * is bad because it does not allow new slice to start.
         */
 
-       throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
+       throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
 
        time_elapsed = jiffies - tg->slice_start[rw];
 
-       nr_slices = time_elapsed / throtl_slice;
+       nr_slices = time_elapsed / tg->td->throtl_slice;
 
        if (!nr_slices)
                return;
-       tmp = tg->bps[rw] * throtl_slice * nr_slices;
+       tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices;
        do_div(tmp, HZ);
        bytes_trim = tmp;
 
-       io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
+       io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) /
+               HZ;
 
        if (!bytes_trim && !io_trim)
                return;
@@ -651,7 +848,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
        else
                tg->io_disp[rw] = 0;
 
-       tg->slice_start[rw] += nr_slices * throtl_slice;
+       tg->slice_start[rw] += nr_slices * tg->td->throtl_slice;
 
        throtl_log(&tg->service_queue,
                   "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
@@ -671,9 +868,9 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
 
        /* Slice has just started. Consider one slice interval */
        if (!jiffy_elapsed)
-               jiffy_elapsed_rnd = throtl_slice;
+               jiffy_elapsed_rnd = tg->td->throtl_slice;
 
-       jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
+       jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
 
        /*
         * jiffy_elapsed_rnd should not be a big value as minimum iops can be
@@ -682,7 +879,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
         * have been trimmed.
         */
 
-       tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
+       tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd;
        do_div(tmp, HZ);
 
        if (tmp > UINT_MAX)
@@ -697,7 +894,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
        }
 
        /* Calc approx time to dispatch */
-       jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
+       jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1;
 
        if (jiffy_wait > jiffy_elapsed)
                jiffy_wait = jiffy_wait - jiffy_elapsed;
@@ -720,11 +917,11 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
 
        /* Slice has just started. Consider one slice interval */
        if (!jiffy_elapsed)
-               jiffy_elapsed_rnd = throtl_slice;
+               jiffy_elapsed_rnd = tg->td->throtl_slice;
 
-       jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
+       jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
 
-       tmp = tg->bps[rw] * jiffy_elapsed_rnd;
+       tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd;
        do_div(tmp, HZ);
        bytes_allowed = tmp;
 
@@ -736,7 +933,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
 
        /* Calc approx time to dispatch */
        extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
-       jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
+       jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw));
 
        if (!jiffy_wait)
                jiffy_wait = 1;
@@ -771,7 +968,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
               bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
 
        /* If tg->bps = -1, then BW is unlimited */
-       if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
+       if (tg_bps_limit(tg, rw) == U64_MAX &&
+           tg_iops_limit(tg, rw) == UINT_MAX) {
                if (wait)
                        *wait = 0;
                return true;
@@ -787,8 +985,10 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
        if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
                throtl_start_new_slice(tg, rw);
        else {
-               if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
-                       throtl_extend_slice(tg, rw, jiffies + throtl_slice);
+               if (time_before(tg->slice_end[rw],
+                   jiffies + tg->td->throtl_slice))
+                       throtl_extend_slice(tg, rw,
+                               jiffies + tg->td->throtl_slice);
        }
 
        if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
@@ -816,6 +1016,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
        /* Charge the bio to the group */
        tg->bytes_disp[rw] += bio->bi_iter.bi_size;
        tg->io_disp[rw]++;
+       tg->last_bytes_disp[rw] += bio->bi_iter.bi_size;
+       tg->last_io_disp[rw]++;
 
        /*
         * BIO_THROTTLED is used to prevent the same bio to be throttled
@@ -999,6 +1201,8 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
        return nr_disp;
 }
 
+static bool throtl_can_upgrade(struct throtl_data *td,
+       struct throtl_grp *this_tg);
 /**
  * throtl_pending_timer_fn - timer function for service_queue->pending_timer
  * @arg: the throtl_service_queue being serviced
@@ -1025,6 +1229,9 @@ static void throtl_pending_timer_fn(unsigned long arg)
        int ret;
 
        spin_lock_irq(q->queue_lock);
+       if (throtl_can_upgrade(td, NULL))
+               throtl_upgrade_state(td);
+
 again:
        parent_sq = sq->parent_sq;
        dispatched = false;
@@ -1112,7 +1319,7 @@ static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
        struct throtl_grp *tg = pd_to_tg(pd);
        u64 v = *(u64 *)((void *)tg + off);
 
-       if (v == -1)
+       if (v == U64_MAX)
                return 0;
        return __blkg_prfill_u64(sf, pd, v);
 }
@@ -1123,7 +1330,7 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
        struct throtl_grp *tg = pd_to_tg(pd);
        unsigned int v = *(unsigned int *)((void *)tg + off);
 
-       if (v == -1)
+       if (v == UINT_MAX)
                return 0;
        return __blkg_prfill_u64(sf, pd, v);
 }
@@ -1150,8 +1357,8 @@ static void tg_conf_updated(struct throtl_grp *tg)
 
        throtl_log(&tg->service_queue,
                   "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
-                  tg->bps[READ], tg->bps[WRITE],
-                  tg->iops[READ], tg->iops[WRITE]);
+                  tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE),
+                  tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE));
 
        /*
         * Update has_rules[] flags for the updated tg's subtree.  A tg is
@@ -1197,7 +1404,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
        if (sscanf(ctx.body, "%llu", &v) != 1)
                goto out_finish;
        if (!v)
-               v = -1;
+               v = U64_MAX;
 
        tg = blkg_to_tg(ctx.blkg);
 
@@ -1228,25 +1435,25 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
 static struct cftype throtl_legacy_files[] = {
        {
                .name = "throttle.read_bps_device",
-               .private = offsetof(struct throtl_grp, bps[READ]),
+               .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]),
                .seq_show = tg_print_conf_u64,
                .write = tg_set_conf_u64,
        },
        {
                .name = "throttle.write_bps_device",
-               .private = offsetof(struct throtl_grp, bps[WRITE]),
+               .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]),
                .seq_show = tg_print_conf_u64,
                .write = tg_set_conf_u64,
        },
        {
                .name = "throttle.read_iops_device",
-               .private = offsetof(struct throtl_grp, iops[READ]),
+               .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]),
                .seq_show = tg_print_conf_uint,
                .write = tg_set_conf_uint,
        },
        {
                .name = "throttle.write_iops_device",
-               .private = offsetof(struct throtl_grp, iops[WRITE]),
+               .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]),
                .seq_show = tg_print_conf_uint,
                .write = tg_set_conf_uint,
        },
@@ -1263,48 +1470,87 @@ static struct cftype throtl_legacy_files[] = {
        { }     /* terminate */
 };
 
-static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd,
+static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
                         int off)
 {
        struct throtl_grp *tg = pd_to_tg(pd);
        const char *dname = blkg_dev_name(pd->blkg);
        char bufs[4][21] = { "max", "max", "max", "max" };
+       u64 bps_dft;
+       unsigned int iops_dft;
+       char idle_time[26] = "";
+       char latency_time[26] = "";
 
        if (!dname)
                return 0;
-       if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 &&
-           tg->iops[READ] == -1 && tg->iops[WRITE] == -1)
+
+       if (off == LIMIT_LOW) {
+               bps_dft = 0;
+               iops_dft = 0;
+       } else {
+               bps_dft = U64_MAX;
+               iops_dft = UINT_MAX;
+       }
+
+       if (tg->bps_conf[READ][off] == bps_dft &&
+           tg->bps_conf[WRITE][off] == bps_dft &&
+           tg->iops_conf[READ][off] == iops_dft &&
+           tg->iops_conf[WRITE][off] == iops_dft &&
+           (off != LIMIT_LOW ||
+            (tg->idletime_threshold == tg->td->dft_idletime_threshold &&
+             tg->latency_target == DFL_LATENCY_TARGET)))
                return 0;
 
-       if (tg->bps[READ] != -1)
-               snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]);
-       if (tg->bps[WRITE] != -1)
-               snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]);
-       if (tg->iops[READ] != -1)
-               snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]);
-       if (tg->iops[WRITE] != -1)
-               snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]);
-
-       seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n",
-                  dname, bufs[0], bufs[1], bufs[2], bufs[3]);
+       if (tg->bps_conf[READ][off] != bps_dft)
+               snprintf(bufs[0], sizeof(bufs[0]), "%llu",
+                       tg->bps_conf[READ][off]);
+       if (tg->bps_conf[WRITE][off] != bps_dft)
+               snprintf(bufs[1], sizeof(bufs[1]), "%llu",
+                       tg->bps_conf[WRITE][off]);
+       if (tg->iops_conf[READ][off] != iops_dft)
+               snprintf(bufs[2], sizeof(bufs[2]), "%u",
+                       tg->iops_conf[READ][off]);
+       if (tg->iops_conf[WRITE][off] != iops_dft)
+               snprintf(bufs[3], sizeof(bufs[3]), "%u",
+                       tg->iops_conf[WRITE][off]);
+       if (off == LIMIT_LOW) {
+               if (tg->idletime_threshold == ULONG_MAX)
+                       strcpy(idle_time, " idle=max");
+               else
+                       snprintf(idle_time, sizeof(idle_time), " idle=%lu",
+                               tg->idletime_threshold);
+
+               if (tg->latency_target == ULONG_MAX)
+                       strcpy(latency_time, " latency=max");
+               else
+                       snprintf(latency_time, sizeof(latency_time),
+                               " latency=%lu", tg->latency_target);
+       }
+
+       seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n",
+                  dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time,
+                  latency_time);
        return 0;
 }
 
-static int tg_print_max(struct seq_file *sf, void *v)
+static int tg_print_limit(struct seq_file *sf, void *v)
 {
-       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max,
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit,
                          &blkcg_policy_throtl, seq_cft(sf)->private, false);
        return 0;
 }
 
-static ssize_t tg_set_max(struct kernfs_open_file *of,
+static ssize_t tg_set_limit(struct kernfs_open_file *of,
                          char *buf, size_t nbytes, loff_t off)
 {
        struct blkcg *blkcg = css_to_blkcg(of_css(of));
        struct blkg_conf_ctx ctx;
        struct throtl_grp *tg;
        u64 v[4];
+       unsigned long idle_time;
+       unsigned long latency_time;
        int ret;
+       int index = of_cft(of)->private;
 
        ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
        if (ret)
@@ -1312,15 +1558,17 @@ static ssize_t tg_set_max(struct kernfs_open_file *of,
 
        tg = blkg_to_tg(ctx.blkg);
 
-       v[0] = tg->bps[READ];
-       v[1] = tg->bps[WRITE];
-       v[2] = tg->iops[READ];
-       v[3] = tg->iops[WRITE];
+       v[0] = tg->bps_conf[READ][index];
+       v[1] = tg->bps_conf[WRITE][index];
+       v[2] = tg->iops_conf[READ][index];
+       v[3] = tg->iops_conf[WRITE][index];
 
+       idle_time = tg->idletime_threshold;
+       latency_time = tg->latency_target;
        while (true) {
                char tok[27];   /* wiops=18446744073709551616 */
                char *p;
-               u64 val = -1;
+               u64 val = U64_MAX;
                int len;
 
                if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
@@ -1348,15 +1596,43 @@ static ssize_t tg_set_max(struct kernfs_open_file *of,
                        v[2] = min_t(u64, val, UINT_MAX);
                else if (!strcmp(tok, "wiops"))
                        v[3] = min_t(u64, val, UINT_MAX);
+               /* "idle"/"latency" are valid for the low file only, so test
+                * the cftype index rather than @off (the write offset). */
+               else if (index == LIMIT_LOW && !strcmp(tok, "idle"))
+                       idle_time = val;
+               else if (index == LIMIT_LOW && !strcmp(tok, "latency"))
+                       latency_time = val;
                else
                        goto out_finish;
        }
 
-       tg->bps[READ] = v[0];
-       tg->bps[WRITE] = v[1];
-       tg->iops[READ] = v[2];
-       tg->iops[WRITE] = v[3];
+       tg->bps_conf[READ][index] = v[0];
+       tg->bps_conf[WRITE][index] = v[1];
+       tg->iops_conf[READ][index] = v[2];
+       tg->iops_conf[WRITE][index] = v[3];
 
+       if (index == LIMIT_MAX) {
+               tg->bps[READ][index] = v[0];
+               tg->bps[WRITE][index] = v[1];
+               tg->iops[READ][index] = v[2];
+               tg->iops[WRITE][index] = v[3];
+       }
+       tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW],
+               tg->bps_conf[READ][LIMIT_MAX]);
+       tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW],
+               tg->bps_conf[WRITE][LIMIT_MAX]);
+       tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW],
+               tg->iops_conf[READ][LIMIT_MAX]);
+       tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],
+               tg->iops_conf[WRITE][LIMIT_MAX]);
+
+       if (index == LIMIT_LOW) {
+               blk_throtl_update_limit_valid(tg->td);
+               if (tg->td->limit_valid[LIMIT_LOW])
+                       tg->td->limit_index = LIMIT_LOW;
+               tg->idletime_threshold = idle_time;
+               tg->latency_target = latency_time;
+       }
        tg_conf_updated(tg);
        ret = 0;
 out_finish:
@@ -1365,11 +1641,21 @@ out_finish:
 }
 
 static struct cftype throtl_files[] = {
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+       {
+               .name = "low",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = tg_print_limit,
+               .write = tg_set_limit,
+               .private = LIMIT_LOW,
+       },
+#endif
        {
                .name = "max",
                .flags = CFTYPE_NOT_ON_ROOT,
-               .seq_show = tg_print_max,
-               .write = tg_set_max,
+               .seq_show = tg_print_limit,
+               .write = tg_set_limit,
+               .private = LIMIT_MAX,
        },
        { }     /* terminate */
 };
@@ -1388,9 +1674,376 @@ static struct blkcg_policy blkcg_policy_throtl = {
        .pd_alloc_fn            = throtl_pd_alloc,
        .pd_init_fn             = throtl_pd_init,
        .pd_online_fn           = throtl_pd_online,
+       .pd_offline_fn          = throtl_pd_offline,
        .pd_free_fn             = throtl_pd_free,
 };
 
+static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
+{ /* Earlier of tg's READ/WRITE last_low_overflow_time; a direction with no low limit counts as "now" (jiffies). */
+       unsigned long rtime = jiffies, wtime = jiffies;
+
+       if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])
+               rtime = tg->last_low_overflow_time[READ];
+       if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
+               wtime = tg->last_low_overflow_time[WRITE];
+       return min(rtime, wtime);
+}
+
+/* Latest low-limit overflow time among tg and its ancestors that have a low limit; tg should not be an intermediate node */
+static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
+{
+       struct throtl_service_queue *parent_sq;
+       struct throtl_grp *parent = tg;
+       unsigned long ret = __tg_last_low_overflow_time(tg);
+
+       while (true) {
+               parent_sq = parent->service_queue.parent_sq;
+               parent = sq_to_tg(parent_sq);
+               if (!parent) /* sq_to_tg() returned NULL: stop climbing */
+                       break;
+
+               /*
+                * A parent without any low limit always reaches its low
+                * limit, so its overflow time tells its children nothing.
+                */
+               if (!parent->bps[READ][LIMIT_LOW] &&
+                   !parent->iops[READ][LIMIT_LOW] &&
+                   !parent->bps[WRITE][LIMIT_LOW] &&
+                   !parent->iops[WRITE][LIMIT_LOW])
+                       continue;
+               if (time_after(__tg_last_low_overflow_time(parent), ret)) /* keep the most recent time seen so far */
+                       ret = __tg_last_low_overflow_time(parent);
+       }
+       return ret;
+}
+
+static bool throtl_tg_is_idle(struct throtl_grp *tg)
+{
+       /*
+        * A cgroup is considered idle if any of these holds:
+        * - the current idle period exceeds min(MAX_IDLE_TIME, 4 slices),
+        *   which guards against the user configuring too big a threshold
+        * - the average think time is above the idle-time threshold
+        * - a latency target is set and under 1/5 of counted bios were bad
+        */
+       unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice);
+
+       time = min_t(unsigned long, MAX_IDLE_TIME, time);
+       return (ktime_get_ns() >> 10) - tg->last_finish_time > time || /* ns >> 10: roughly usecs, same unit as time */
+              tg->avg_idletime > tg->idletime_threshold ||
+              (tg->latency_target && tg->bio_cnt &&
+               tg->bad_bio_cnt * 5 < tg->bio_cnt);
+}
+
+static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
+{
+       struct throtl_service_queue *sq = &tg->service_queue;
+       bool read_limit, write_limit;
+
+       /*
+        * If the cgroup reaches its low limit (a low limit of 0 always
+        * counts as reached), it's ok to upgrade to the next limit.
+        */
+       read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
+       write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
+       if (!read_limit && !write_limit) /* no low limit in either direction */
+               return true;
+       if (read_limit && sq->nr_queued[READ] && /* limited reads are queued, and writes too if they are limited */
+           (!write_limit || sq->nr_queued[WRITE]))
+               return true;
+       if (write_limit && sq->nr_queued[WRITE] && /* mirror image of the test above */
+           (!read_limit || sq->nr_queued[READ]))
+               return true;
+
+       if (time_after_eq(jiffies, /* a full slice since the last low-limit overflow, and the group is idle */
+               tg_last_low_overflow_time(tg) + tg->td->throtl_slice) &&
+           throtl_tg_is_idle(tg))
+               return true;
+       return false;
+}
+
+static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
+{ /* True if tg, or an ancestor whose blkg still has a parent, can upgrade. */
+       while (true) {
+               if (throtl_tg_can_upgrade(tg))
+                       return true;
+               tg = sq_to_tg(tg->service_queue.parent_sq);
+               if (!tg || !tg_to_blkg(tg)->parent) /* stop at a group with no parent blkg (presumably the root - confirm) */
+                       return false;
+       }
+       return false; /* not reached: the loop above only exits via return */
+}
+
+static bool throtl_can_upgrade(struct throtl_data *td,
+       struct throtl_grp *this_tg)
+{ /* True if td is at LIMIT_LOW, a slice has passed since the last downgrade, and every checked group can upgrade. */
+       struct cgroup_subsys_state *pos_css;
+       struct blkcg_gq *blkg;
+
+       if (td->limit_index != LIMIT_LOW)
+               return false;
+
+       if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice)) /* downgraded less than a slice ago */
+               return false;
+
+       rcu_read_lock();
+       blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+               struct throtl_grp *tg = blkg_to_tg(blkg);
+
+               if (tg == this_tg) /* the caller's own group is not checked */
+                       continue;
+               if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) /* only groups whose blkcg has no children are checked */
+                       continue;
+               if (!throtl_hierarchy_can_upgrade(tg)) {
+                       rcu_read_unlock();
+                       return false;
+               }
+       }
+       rcu_read_unlock();
+       return true;
+}
+
+static void throtl_upgrade_check(struct throtl_grp *tg)
+{ /* At most once per throtl_slice per group: upgrade td when tg and throtl_can_upgrade() allow it. */
+       unsigned long now = jiffies;
+
+       if (tg->td->limit_index != LIMIT_LOW)
+               return;
+
+       if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) /* last check was less than a slice ago */
+               return;
+
+       tg->last_check_time = now;
+
+       if (!time_after_eq(now, /* tg's last low-limit overflow is less than a slice old */
+            __tg_last_low_overflow_time(tg) + tg->td->throtl_slice))
+               return;
+
+       if (throtl_can_upgrade(tg->td, NULL)) /* NULL: no group is exempted from the check */
+               throtl_upgrade_state(tg->td);
+}
+
+static void throtl_upgrade_state(struct throtl_data *td)
+{ /* Switch td to LIMIT_MAX, reset its scale, and run dispatch selection on every group's service queue. */
+       struct cgroup_subsys_state *pos_css;
+       struct blkcg_gq *blkg;
+
+       td->limit_index = LIMIT_MAX;
+       td->low_upgrade_time = jiffies;
+       td->scale = 0;
+       rcu_read_lock();
+       blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+               struct throtl_grp *tg = blkg_to_tg(blkg);
+               struct throtl_service_queue *sq = &tg->service_queue;
+
+               tg->disptime = jiffies - 1; /* put the dispatch time just in the past */
+               throtl_select_dispatch(sq);
+               throtl_schedule_next_dispatch(sq, false);
+       }
+       rcu_read_unlock();
+       throtl_select_dispatch(&td->service_queue); /* then the same for td's own service queue */
+       throtl_schedule_next_dispatch(&td->service_queue, false);
+       queue_work(kthrotld_workqueue, &td->dispatch_work);
+}
+
+static void throtl_downgrade_state(struct throtl_data *td, int new)
+{ /* Halve td->scale; only once it reaches 0 is limit_index actually switched to new. */
+       td->scale /= 2;
+
+       if (td->scale) { /* still scaled: move low_upgrade_time back instead of downgrading */
+               td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
+               return;
+       }
+
+       td->limit_index = new;
+       td->low_downgrade_time = jiffies;
+}
+
+static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
+{
+       struct throtl_data *td = tg->td;
+       unsigned long now = jiffies;
+
+       /*
+        * If the cgroup is below its low limit, consider downgrading and
+        * throttling other cgroups
+        */
+       if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) && /* a slice since the last upgrade */
+           time_after_eq(now, tg_last_low_overflow_time(tg) + /* and a slice since the last low-limit overflow */
+                                       td->throtl_slice) &&
+           (!throtl_tg_is_idle(tg) || /* and the group is not idle, or its blkcg has children */
+            !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
+               return true;
+       return false;
+}
+
+static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
+{ /* True only if tg and each ancestor checked on the way up can downgrade. */
+       while (true) {
+               if (!throtl_tg_can_downgrade(tg))
+                       return false;
+               tg = sq_to_tg(tg->service_queue.parent_sq);
+               if (!tg || !tg_to_blkg(tg)->parent) /* a group with no parent blkg ends the walk and is not itself checked */
+                       break;
+       }
+       return true;
+}
+
+static void throtl_downgrade_check(struct throtl_grp *tg)
+{ /* At most once per slice: compare tg's recent dispatch rates with its low limits and downgrade td if the hierarchy allows. */
+       uint64_t bps;
+       unsigned int iops;
+       unsigned long elapsed_time;
+       unsigned long now = jiffies;
+
+       if (tg->td->limit_index != LIMIT_MAX || /* only runs at LIMIT_MAX with a valid low limit */
+           !tg->td->limit_valid[LIMIT_LOW])
+               return;
+       if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) /* skip groups whose blkcg has children */
+               return;
+       if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) /* last check was less than a slice ago */
+               return;
+
+       elapsed_time = now - tg->last_check_time; /* at least throtl_slice jiffies because of the guard above */
+       tg->last_check_time = now;
+
+       if (time_before(now, tg_last_low_overflow_time(tg) +
+                       tg->td->throtl_slice))
+               return;
+
+       if (tg->bps[READ][LIMIT_LOW]) {
+               bps = tg->last_bytes_disp[READ] * HZ; /* bytes per second over elapsed_time jiffies */
+               do_div(bps, elapsed_time);
+               if (bps >= tg->bps[READ][LIMIT_LOW])
+                       tg->last_low_overflow_time[READ] = now;
+       }
+
+       if (tg->bps[WRITE][LIMIT_LOW]) {
+               bps = tg->last_bytes_disp[WRITE] * HZ;
+               do_div(bps, elapsed_time);
+               if (bps >= tg->bps[WRITE][LIMIT_LOW])
+                       tg->last_low_overflow_time[WRITE] = now;
+       }
+
+       if (tg->iops[READ][LIMIT_LOW]) {
+               iops = tg->last_io_disp[READ] * HZ / elapsed_time; /* IOs per second over the same window */
+               if (iops >= tg->iops[READ][LIMIT_LOW])
+                       tg->last_low_overflow_time[READ] = now;
+       }
+
+       if (tg->iops[WRITE][LIMIT_LOW]) {
+               iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
+               if (iops >= tg->iops[WRITE][LIMIT_LOW])
+                       tg->last_low_overflow_time[WRITE] = now;
+       }
+
+       /*
+        * If the cgroup is below its low limit, consider downgrading and
+        * throttling other cgroups
+        */
+       if (throtl_hierarchy_can_downgrade(tg))
+               throtl_downgrade_state(tg->td, LIMIT_LOW);
+
+       tg->last_bytes_disp[READ] = 0; /* zero the counters read above, whether or not a downgrade happened */
+       tg->last_bytes_disp[WRITE] = 0;
+       tg->last_io_disp[READ] = 0;
+       tg->last_io_disp[WRITE] = 0;
+}
+
+static void blk_throtl_update_idletime(struct throtl_grp *tg)
+{ /* Fold the time since tg's last recorded finish into avg_idletime (7/8 old value + 1/8 new gap). */
+       unsigned long now = ktime_get_ns() >> 10; /* ns >> 10, the same scaling used for last_finish_time in blk_throtl_bio_endio() */
+       unsigned long last_finish_time = tg->last_finish_time;
+
+       if (now <= last_finish_time || last_finish_time == 0 ||
+           last_finish_time == tg->checked_last_finish_time) /* this finish time was already folded in */
+               return;
+
+       tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;
+       tg->checked_last_finish_time = last_finish_time;
+}
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_update_latency_buckets(struct throtl_data *td)
+{ /* Drain the per-cpu latency buckets into td->tmp_buckets and refresh td->avg_buckets from them. */
+       struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
+       int i, cpu;
+       unsigned long last_latency = 0;
+       unsigned long latency;
+
+       if (!blk_queue_nonrot(td->queue)) /* only done for non-rotational queues */
+               return;
+       if (time_before(jiffies, td->last_calculate_time + HZ)) /* at most once per HZ jiffies (one second) */
+               return;
+       td->last_calculate_time = jiffies;
+
+       memset(avg_latency, 0, sizeof(avg_latency));
+       for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+               struct latency_bucket *tmp = &td->tmp_buckets[i];
+
+               for_each_possible_cpu(cpu) {
+                       struct latency_bucket *bucket;
+
+                       /* this isn't race free, but ok in practice */
+                       bucket = per_cpu_ptr(td->latency_buckets, cpu);
+                       tmp->total_latency += bucket[i].total_latency;
+                       tmp->samples += bucket[i].samples;
+                       bucket[i].total_latency = 0;
+                       bucket[i].samples = 0;
+               }
+
+               if (tmp->samples >= 32) { /* average only once 32 samples have accumulated; otherwise tmp keeps accumulating */
+                       int samples = tmp->samples;
+
+                       latency = tmp->total_latency;
+
+                       tmp->total_latency = 0;
+                       tmp->samples = 0;
+                       latency /= samples;
+                       if (latency == 0)
+                               continue;
+                       avg_latency[i].latency = latency;
+               }
+       }
+
+       for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+               if (!avg_latency[i].latency) { /* no new average for this bucket: only raise it to the previous updated bucket's value */
+                       if (td->avg_buckets[i].latency < last_latency)
+                               td->avg_buckets[i].latency = last_latency;
+                       continue;
+               }
+
+               if (!td->avg_buckets[i].valid)
+                       latency = avg_latency[i].latency;
+               else
+                       latency = (td->avg_buckets[i].latency * 7 + /* 7/8 old value + 1/8 new sample */
+                               avg_latency[i].latency) >> 3;
+
+               td->avg_buckets[i].latency = max(latency, last_latency); /* never below the previous updated bucket */
+               td->avg_buckets[i].valid = true;
+               last_latency = td->avg_buckets[i].latency;
+       }
+}
+#else
+static inline void throtl_update_latency_buckets(struct throtl_data *td)
+{ /* empty stub when CONFIG_BLK_DEV_THROTTLING_LOW is not defined */
+}
+#endif
+
+static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
+{ /* Call bio_associate_current(); with THROTTLING_LOW also tag the bio with tg and set its issue stat. */
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+       int ret;
+
+       ret = bio_associate_current(bio);
+       if (ret == 0 || ret == -EBUSY) /* NOTE(review): -EBUSY presumably means the bio was already associated - confirm */
+               bio->bi_cg_private = tg;
+       blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
+#else
+       bio_associate_current(bio);
+#endif
+}
+
 bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
                    struct bio *bio)
 {
@@ -1399,6 +2052,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
        struct throtl_service_queue *sq;
        bool rw = bio_data_dir(bio);
        bool throttled = false;
+       struct throtl_data *td = tg->td;
 
        WARN_ON_ONCE(!rcu_read_lock_held());
 
@@ -1408,19 +2062,35 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 
        spin_lock_irq(q->queue_lock);
 
+       throtl_update_latency_buckets(td);
+
        if (unlikely(blk_queue_bypass(q)))
                goto out_unlock;
 
+       blk_throtl_assoc_bio(tg, bio);
+       blk_throtl_update_idletime(tg);
+
        sq = &tg->service_queue;
 
+again:
        while (true) {
+               if (tg->last_low_overflow_time[rw] == 0)
+                       tg->last_low_overflow_time[rw] = jiffies;
+               throtl_downgrade_check(tg);
+               throtl_upgrade_check(tg);
                /* throtl is FIFO - if bios are already queued, should queue */
                if (sq->nr_queued[rw])
                        break;
 
                /* if above limits, break to queue */
-               if (!tg_may_dispatch(tg, bio, NULL))
+               if (!tg_may_dispatch(tg, bio, NULL)) {
+                       tg->last_low_overflow_time[rw] = jiffies;
+                       if (throtl_can_upgrade(td, tg)) {
+                               throtl_upgrade_state(td);
+                               goto again;
+                       }
                        break;
+               }
 
                /* within limits, let's charge and dispatch directly */
                throtl_charge_bio(tg, bio);
@@ -1453,12 +2123,14 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
        /* out-of-limit, queue to @tg */
        throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
                   rw == READ ? 'R' : 'W',
-                  tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw],
-                  tg->io_disp[rw], tg->iops[rw],
+                  tg->bytes_disp[rw], bio->bi_iter.bi_size,
+                  tg_bps_limit(tg, rw),
+                  tg->io_disp[rw], tg_iops_limit(tg, rw),
                   sq->nr_queued[READ], sq->nr_queued[WRITE]);
 
-       bio_associate_current(bio);
-       tg->td->nr_queued[rw]++;
+       tg->last_low_overflow_time[rw] = jiffies;
+
+       td->nr_queued[rw]++;
        throtl_add_bio_tg(bio, qn, tg);
        throttled = true;
 
@@ -1483,9 +2155,94 @@ out:
         */
        if (!throttled)
                bio_clear_flag(bio, BIO_THROTTLED);
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+       if (throttled || !td->track_bio_latency)
+               bio->bi_issue_stat.stat |= SKIP_LATENCY;
+#endif
        return throttled;
 }
 
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_track_latency(struct throtl_data *td, sector_t size,
+       int op, unsigned long time)
+{ /* Add one latency sample (time) to this CPU's bucket selected by size. */
+       struct latency_bucket *latency;
+       int index;
+
+       if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ || /* only reads at LIMIT_LOW on non-rotational queues are sampled */
+           !blk_queue_nonrot(td->queue))
+               return;
+
+       index = request_bucket_index(size);
+
+       latency = get_cpu_ptr(td->latency_buckets);
+       latency[index].total_latency += time;
+       latency[index].samples++;
+       put_cpu_ptr(td->latency_buckets);
+}
+
+void blk_throtl_stat_add(struct request *rq, u64 time_ns)
+{ /* Feed rq's size, op and latency (time_ns >> 10) to throtl_track_latency() for rq's queue. */
+       struct request_queue *q = rq->q;
+       struct throtl_data *td = q->td;
+
+       throtl_track_latency(td, blk_stat_size(&rq->issue_stat),
+               req_op(rq), time_ns >> 10);
+}
+
+void blk_throtl_bio_endio(struct bio *bio)
+{ /* For a bio tagged with a throtl_grp: stamp last_finish_time, maybe sample latency, update bad/total bio counts. */
+       struct throtl_grp *tg;
+       u64 finish_time_ns;
+       unsigned long finish_time;
+       unsigned long start_time;
+       unsigned long lat;
+
+       tg = bio->bi_cg_private;
+       if (!tg) /* no throtl_grp was attached to this bio */
+               return;
+       bio->bi_cg_private = NULL;
+
+       finish_time_ns = ktime_get_ns();
+       tg->last_finish_time = finish_time_ns >> 10;
+
+       start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
+       finish_time = __blk_stat_time(finish_time_ns) >> 10;
+       if (!start_time || finish_time <= start_time) /* no usable latency: the counters below are left untouched */
+               return;
+
+       lat = finish_time - start_time;
+       /* this is only for bio-based drivers */
+       if (!(bio->bi_issue_stat.stat & SKIP_LATENCY))
+               throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
+                       bio_op(bio), lat);
+
+       if (tg->latency_target) {
+               int bucket;
+               unsigned int threshold;
+
+               bucket = request_bucket_index(
+                       blk_stat_size(&bio->bi_issue_stat));
+               threshold = tg->td->avg_buckets[bucket].latency + /* a bio is "bad" when slower than bucket average + target */
+                       tg->latency_target;
+               if (lat > threshold)
+                       tg->bad_bio_cnt++;
+               /*
+                * Not race free: the count could be wrong, which means
+                * cgroups will be throttled
+                */
+               tg->bio_cnt++;
+       }
+
+       if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) { /* halve both counters once the reset time passes or bio_cnt exceeds 1024 */
+               tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;
+               tg->bio_cnt /= 2;
+               tg->bad_bio_cnt /= 2;
+       }
+}
+#endif
+
 /*
  * Dispatch all bios from all children tg's queued on @parent_sq.  On
  * return, @parent_sq is guaranteed to not have any active children tg's
@@ -1558,6 +2315,12 @@ int blk_throtl_init(struct request_queue *q)
        td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
        if (!td)
                return -ENOMEM;
+       td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
+               LATENCY_BUCKET_SIZE, __alignof__(u64));
+       if (!td->latency_buckets) {
+               kfree(td);
+               return -ENOMEM;
+       }
 
        INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
        throtl_service_queue_init(&td->service_queue);
@@ -1565,10 +2328,17 @@ int blk_throtl_init(struct request_queue *q)
        q->td = td;
        td->queue = q;
 
+       td->limit_valid[LIMIT_MAX] = true;
+       td->limit_index = LIMIT_MAX;
+       td->low_upgrade_time = jiffies;
+       td->low_downgrade_time = jiffies;
+
        /* activate policy */
        ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
-       if (ret)
+       if (ret) {
+               free_percpu(td->latency_buckets);
                kfree(td);
+       }
        return ret;
 }
 
@@ -1577,9 +2347,74 @@ void blk_throtl_exit(struct request_queue *q)
        BUG_ON(!q->td);
        throtl_shutdown_wq(q);
        blkcg_deactivate_policy(q, &blkcg_policy_throtl);
+       free_percpu(q->td->latency_buckets);
        kfree(q->td);
 }
 
+void blk_throtl_register_queue(struct request_queue *q)
+{ /* Pick slice and idle-threshold defaults from the queue type, then push the threshold to every existing tg. */
+       struct throtl_data *td;
+       struct cgroup_subsys_state *pos_css;
+       struct blkcg_gq *blkg;
+
+       td = q->td;
+       BUG_ON(!td);
+
+       if (blk_queue_nonrot(q)) {
+               td->throtl_slice = DFL_THROTL_SLICE_SSD;
+               td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_SSD;
+       } else {
+               td->throtl_slice = DFL_THROTL_SLICE_HD;
+               td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_HD;
+       }
+#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
+       /* if no low limit, use previous default */
+       td->throtl_slice = DFL_THROTL_SLICE_HD;
+#endif
+
+       td->track_bio_latency = !q->mq_ops && !q->request_fn; /* neither mq_ops nor request_fn: presumably a bio-based driver - confirm */
+       if (!td->track_bio_latency)
+               blk_stat_enable_accounting(q);
+
+       /*
+        * Some tgs are created before the queue is fully initialized, e.g.
+        * before nonrot is set, so apply the default threshold to them now
+        */
+       rcu_read_lock();
+       blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
+               struct throtl_grp *tg = blkg_to_tg(blkg);
+
+               tg->idletime_threshold = td->dft_idletime_threshold;
+       }
+       rcu_read_unlock();
+}
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
+{ /* Print q->td->throtl_slice in milliseconds into page; -EINVAL if the queue has no throtl_data. */
+       if (!q->td)
+               return -EINVAL;
+       return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
+}
+
+ssize_t blk_throtl_sample_time_store(struct request_queue *q,
+       const char *page, size_t count)
+{ /* Parse a decimal millisecond value from page and store it, in jiffies, as q->td->throtl_slice. */
+       unsigned long v;
+       unsigned long t;
+
+       if (!q->td)
+               return -EINVAL;
+       if (kstrtoul(page, 10, &v))
+               return -EINVAL;
+       t = msecs_to_jiffies(v);
+       if (t == 0 || t > MAX_THROTL_SLICE) /* reject values that convert to 0 jiffies or exceed MAX_THROTL_SLICE */
+               return -EINVAL;
+       q->td->throtl_slice = t;
+       return count; /* on success the whole input is reported as consumed */
+}
+#endif
+
 static int __init throtl_init(void)
 {
        kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
index a30441a..cbff183 100644 (file)
@@ -89,7 +89,6 @@ static void blk_rq_timed_out(struct request *req)
                ret = q->rq_timed_out_fn(req);
        switch (ret) {
        case BLK_EH_HANDLED:
-               /* Can we use req->errors here? */
                __blk_complete_request(req);
                break;
        case BLK_EH_RESET_TIMER:
index 1aedb1f..17676f4 100644 (file)
@@ -255,8 +255,8 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat)
         * that it's writes impacting us, and not just some sole read on
         * a device that is in a lower power state.
         */
-       return stat[BLK_STAT_READ].nr_samples >= 1 &&
-               stat[BLK_STAT_WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES;
+       return (stat[READ].nr_samples >= 1 &&
+               stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
 }
 
 static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
@@ -277,7 +277,7 @@ enum {
        LAT_EXCEEDED,
 };
 
-static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
+static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 {
        struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
        u64 thislat;
@@ -293,7 +293,7 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
         */
        thislat = rwb_sync_issue_lat(rwb);
        if (thislat > rwb->cur_win_nsec ||
-           (thislat > rwb->min_lat_nsec && !stat[BLK_STAT_READ].nr_samples)) {
+           (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
                trace_wbt_lat(bdi, thislat);
                return LAT_EXCEEDED;
        }
@@ -308,8 +308,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
                 * waited or still has writes in flights, consider us doing
                 * just writes as well.
                 */
-               if ((stat[BLK_STAT_WRITE].nr_samples && blk_stat_is_current(stat)) ||
-                   wb_recent_wait(rwb) || wbt_inflight(rwb))
+               if (stat[WRITE].nr_samples || wb_recent_wait(rwb) ||
+                   wbt_inflight(rwb))
                        return LAT_UNKNOWN_WRITES;
                return LAT_UNKNOWN;
        }
@@ -317,8 +317,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
        /*
         * If the 'min' latency exceeds our target, step down.
         */
-       if (stat[BLK_STAT_READ].min > rwb->min_lat_nsec) {
-               trace_wbt_lat(bdi, stat[BLK_STAT_READ].min);
+       if (stat[READ].min > rwb->min_lat_nsec) {
+               trace_wbt_lat(bdi, stat[READ].min);
                trace_wbt_stat(bdi, stat);
                return LAT_EXCEEDED;
        }
@@ -329,14 +329,6 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
        return LAT_OK;
 }
 
-static int latency_exceeded(struct rq_wb *rwb)
-{
-       struct blk_rq_stat stat[2];
-
-       blk_queue_stat_get(rwb->queue, stat);
-       return __latency_exceeded(rwb, stat);
-}
-
 static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
 {
        struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
@@ -355,7 +347,6 @@ static void scale_up(struct rq_wb *rwb)
 
        rwb->scale_step--;
        rwb->unknown_cnt = 0;
-       blk_stat_clear(rwb->queue);
 
        rwb->scaled_max = calc_wb_limits(rwb);
 
@@ -385,15 +376,12 @@ static void scale_down(struct rq_wb *rwb, bool hard_throttle)
 
        rwb->scaled_max = false;
        rwb->unknown_cnt = 0;
-       blk_stat_clear(rwb->queue);
        calc_wb_limits(rwb);
        rwb_trace_step(rwb, "step down");
 }
 
 static void rwb_arm_timer(struct rq_wb *rwb)
 {
-       unsigned long expires;
-
        if (rwb->scale_step > 0) {
                /*
                 * We should speed this up, using some variant of a fast
@@ -411,17 +399,16 @@ static void rwb_arm_timer(struct rq_wb *rwb)
                rwb->cur_win_nsec = rwb->win_nsec;
        }
 
-       expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
-       mod_timer(&rwb->window_timer, expires);
+       blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
 }
 
-static void wb_timer_fn(unsigned long data)
+static void wb_timer_fn(struct blk_stat_callback *cb)
 {
-       struct rq_wb *rwb = (struct rq_wb *) data;
+       struct rq_wb *rwb = cb->data;
        unsigned int inflight = wbt_inflight(rwb);
        int status;
 
-       status = latency_exceeded(rwb);
+       status = latency_exceeded(rwb, cb->stat);
 
        trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
                        inflight);
@@ -614,7 +601,7 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
 
        __wbt_wait(rwb, bio->bi_opf, lock);
 
-       if (!timer_pending(&rwb->window_timer))
+       if (!blk_stat_is_active(rwb->cb))
                rwb_arm_timer(rwb);
 
        if (current_is_kswapd())
@@ -666,22 +653,37 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
                rwb->wc = write_cache_on;
 }
 
- /*
- * Disable wbt, if enabled by default. Only called from CFQ, if we have
- * cgroups enabled
+/*
+ * Disable wbt, if enabled by default. Only called from CFQ.
  */
 void wbt_disable_default(struct request_queue *q)
 {
        struct rq_wb *rwb = q->rq_wb;
 
-       if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) {
-               del_timer_sync(&rwb->window_timer);
-               rwb->win_nsec = rwb->min_lat_nsec = 0;
-               wbt_update_limits(rwb);
-       }
+       if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT)
+               wbt_exit(q);
 }
 EXPORT_SYMBOL_GPL(wbt_disable_default);
 
+/*
+ * Enable wbt, via wbt_init(), if the config enables it for this queue type
+ */
+void wbt_enable_default(struct request_queue *q)
+{
+       /* Throttling already enabled? */
+       if (q->rq_wb)
+               return;
+
+       /* Queue not registered? Maybe shutting down... */
+       if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+               return;
+
+       if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) || /* mq queue with BLK_WBT_MQ, or request_fn queue with BLK_WBT_SQ */
+           (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ)))
+               wbt_init(q); /* the return value of wbt_init() is not checked here */
+}
+EXPORT_SYMBOL_GPL(wbt_enable_default);
+
 u64 wbt_default_latency_nsec(struct request_queue *q)
 {
        /*
@@ -694,29 +696,33 @@ u64 wbt_default_latency_nsec(struct request_queue *q)
                return 75000000ULL;
 }
 
+static int wbt_data_dir(const struct request *rq)
+{ /* Thin wrapper around rq_data_dir(); wbt_init() passes it to blk_stat_alloc_callback(). */
+       return rq_data_dir(rq);
+}
+
 int wbt_init(struct request_queue *q)
 {
        struct rq_wb *rwb;
        int i;
 
-       /*
-        * For now, we depend on the stats window being larger than
-        * our monitoring window. Ensure that this isn't inadvertently
-        * violated.
-        */
-       BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC);
        BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
 
        rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
        if (!rwb)
                return -ENOMEM;
 
+       rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
+       if (!rwb->cb) {
+               kfree(rwb);
+               return -ENOMEM;
+       }
+
        for (i = 0; i < WBT_NUM_RWQ; i++) {
                atomic_set(&rwb->rq_wait[i].inflight, 0);
                init_waitqueue_head(&rwb->rq_wait[i].wait);
        }
 
-       setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
        rwb->wc = 1;
        rwb->queue_depth = RWB_DEF_DEPTH;
        rwb->last_comp = rwb->last_issue = jiffies;
@@ -726,10 +732,10 @@ int wbt_init(struct request_queue *q)
        wbt_update_limits(rwb);
 
        /*
-        * Assign rwb, and turn on stats tracking for this queue
+        * Assign rwb and add the stats callback.
         */
        q->rq_wb = rwb;
-       blk_stat_enable(q);
+       blk_stat_add_callback(q, rwb->cb);
 
        rwb->min_lat_nsec = wbt_default_latency_nsec(q);
 
@@ -744,7 +750,8 @@ void wbt_exit(struct request_queue *q)
        struct rq_wb *rwb = q->rq_wb;
 
        if (rwb) {
-               del_timer_sync(&rwb->window_timer);
+               blk_stat_remove_callback(q, rwb->cb);
+               blk_stat_free_callback(rwb->cb);
                q->rq_wb = NULL;
                kfree(rwb);
        }
index 65f1de5..df6de50 100644 (file)
@@ -32,27 +32,27 @@ enum {
 
 static inline void wbt_clear_state(struct blk_issue_stat *stat)
 {
-       stat->time &= BLK_STAT_TIME_MASK;
+       stat->stat &= ~BLK_STAT_RES_MASK;
 }
 
 static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat)
 {
-       return (stat->time & BLK_STAT_MASK) >> BLK_STAT_SHIFT;
+       return (stat->stat & BLK_STAT_RES_MASK) >> BLK_STAT_RES_SHIFT;
 }
 
 static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct)
 {
-       stat->time |= ((u64) wb_acct) << BLK_STAT_SHIFT;
+       stat->stat |= ((u64) wb_acct) << BLK_STAT_RES_SHIFT;
 }
 
 static inline bool wbt_is_tracked(struct blk_issue_stat *stat)
 {
-       return (stat->time >> BLK_STAT_SHIFT) & WBT_TRACKED;
+       return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_TRACKED;
 }
 
 static inline bool wbt_is_read(struct blk_issue_stat *stat)
 {
-       return (stat->time >> BLK_STAT_SHIFT) & WBT_READ;
+       return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_READ;
 }
 
 struct rq_wait {
@@ -81,7 +81,7 @@ struct rq_wb {
        u64 win_nsec;                           /* default window size */
        u64 cur_win_nsec;                       /* current window size */
 
-       struct timer_list window_timer;
+       struct blk_stat_callback *cb;
 
        s64 sync_issue;
        void *sync_cookie;
@@ -117,6 +117,7 @@ void wbt_update_limits(struct rq_wb *);
 void wbt_requeue(struct rq_wb *, struct blk_issue_stat *);
 void wbt_issue(struct rq_wb *, struct blk_issue_stat *);
 void wbt_disable_default(struct request_queue *);
+void wbt_enable_default(struct request_queue *);
 
 void wbt_set_queue_depth(struct rq_wb *, unsigned int);
 void wbt_set_write_cache(struct rq_wb *, bool);
@@ -155,6 +156,9 @@ static inline void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
 static inline void wbt_disable_default(struct request_queue *q)
 {
 }
+static inline void wbt_enable_default(struct request_queue *q)
+{
+}
 static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
 {
 }
index d1ea4bd..2ed7022 100644 (file)
@@ -60,15 +60,12 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
 int blk_init_rl(struct request_list *rl, struct request_queue *q,
                gfp_t gfp_mask);
 void blk_exit_rl(struct request_list *rl);
-void init_request_from_bio(struct request *req, struct bio *bio);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
                        struct bio *bio);
 void blk_queue_bypass_start(struct request_queue *q);
 void blk_queue_bypass_end(struct request_queue *q);
 void blk_dequeue_request(struct request *rq);
 void __blk_queue_free_tags(struct request_queue *q);
-bool __blk_end_bidi_request(struct request *rq, int error,
-                           unsigned int nr_bytes, unsigned int bidi_bytes);
 void blk_freeze_queue(struct request_queue *q);
 
 static inline void blk_queue_enter_live(struct request_queue *q)
@@ -319,10 +316,22 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
 extern void blk_throtl_drain(struct request_queue *q);
 extern int blk_throtl_init(struct request_queue *q);
 extern void blk_throtl_exit(struct request_queue *q);
+extern void blk_throtl_register_queue(struct request_queue *q);
 #else /* CONFIG_BLK_DEV_THROTTLING */
 static inline void blk_throtl_drain(struct request_queue *q) { }
 static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 static inline void blk_throtl_exit(struct request_queue *q) { }
+static inline void blk_throtl_register_queue(struct request_queue *q) { }
 #endif /* CONFIG_BLK_DEV_THROTTLING */
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
+extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
+       const char *page, size_t count);
+extern void blk_throtl_bio_endio(struct bio *bio);
+extern void blk_throtl_stat_add(struct request *rq, u64 time);
+#else
+static inline void blk_throtl_bio_endio(struct bio *bio) { }
+static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
+#endif
 
 #endif /* BLK_INTERNAL_H */
index cd15f9d..0a23dbb 100644 (file)
@@ -37,7 +37,7 @@ static void bsg_destroy_job(struct kref *kref)
        struct bsg_job *job = container_of(kref, struct bsg_job, kref);
        struct request *rq = job->req;
 
-       blk_end_request_all(rq, rq->errors);
+       blk_end_request_all(rq, scsi_req(rq)->result);
 
        put_device(job->dev);   /* release reference for the request */
 
@@ -74,7 +74,7 @@ void bsg_job_done(struct bsg_job *job, int result,
        struct scsi_request *rq = scsi_req(req);
        int err;
 
-       err = job->req->errors = result;
+       err = scsi_req(job->req)->result = result;
        if (err < 0)
                /* we're only returning the result field in the reply */
                rq->sense_len = sizeof(u32);
@@ -177,7 +177,7 @@ failjob_rls_job:
  * @q: request queue to manage
  *
  * On error the create_bsg_job function should return a -Exyz error value
- * that will be set to the req->errors.
+ * that will be set to ->result.
  *
  * Drivers/subsys should pass this to the queue init function.
  */
@@ -201,7 +201,7 @@ static void bsg_request_fn(struct request_queue *q)
 
                ret = bsg_create_job(dev, req);
                if (ret) {
-                       req->errors = ret;
+                       scsi_req(req)->result = ret;
                        blk_end_request_all(req, ret);
                        spin_lock_irq(q->queue_lock);
                        continue;
index 74835db..6fd0854 100644 (file)
@@ -391,13 +391,13 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
        struct scsi_request *req = scsi_req(rq);
        int ret = 0;
 
-       dprintk("rq %p bio %p 0x%x\n", rq, bio, rq->errors);
+       dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result);
        /*
         * fill in all the output members
         */
-       hdr->device_status = rq->errors & 0xff;
-       hdr->transport_status = host_byte(rq->errors);
-       hdr->driver_status = driver_byte(rq->errors);
+       hdr->device_status = req->result & 0xff;
+       hdr->transport_status = host_byte(req->result);
+       hdr->driver_status = driver_byte(req->result);
        hdr->info = 0;
        if (hdr->device_status || hdr->transport_status || hdr->driver_status)
                hdr->info |= SG_INFO_CHECK;
@@ -431,8 +431,8 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
         * just a protocol response (i.e. non negative), that gets
         * processed above.
         */
-       if (!ret && rq->errors < 0)
-               ret = rq->errors;
+       if (!ret && req->result < 0)
+               ret = req->result;
 
        blk_rq_unmap_user(bio);
        scsi_req_free_cmd(req);
@@ -650,7 +650,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
 
        dprintk("%s: write %zd bytes\n", bd->name, count);
 
-       if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
+       if (unlikely(uaccess_kernel()))
                return -EINVAL;
 
        bsg_set_block(bd, file);
index 440b95e..da69b07 100644 (file)
@@ -3761,16 +3761,14 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
+static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
 {
        struct cfq_data *cfqd = cic_to_cfqd(cic);
        struct cfq_queue *cfqq;
        uint64_t serial_nr;
-       bool nonroot_cg;
 
        rcu_read_lock();
        serial_nr = bio_blkcg(bio)->css.serial_nr;
-       nonroot_cg = bio_blkcg(bio) != &blkcg_root;
        rcu_read_unlock();
 
        /*
@@ -3778,7 +3776,7 @@ static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
         * spuriously on a newly created cic but there's no harm.
         */
        if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
-               return nonroot_cg;
+               return;
 
        /*
         * Drop reference to queues.  New queues will be assigned in new
@@ -3799,12 +3797,10 @@ static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
        }
 
        cic->blkcg_serial_nr = serial_nr;
-       return nonroot_cg;
 }
 #else
-static inline bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
+static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
 {
-       return false;
 }
 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
 
@@ -4449,12 +4445,11 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
        const int rw = rq_data_dir(rq);
        const bool is_sync = rq_is_sync(rq);
        struct cfq_queue *cfqq;
-       bool disable_wbt;
 
        spin_lock_irq(q->queue_lock);
 
        check_ioprio_changed(cic, bio);
-       disable_wbt = check_blkcg_changed(cic, bio);
+       check_blkcg_changed(cic, bio);
 new_queue:
        cfqq = cic_to_cfqq(cic, is_sync);
        if (!cfqq || cfqq == &cfqd->oom_cfqq) {
@@ -4491,9 +4486,6 @@ new_queue:
        rq->elv.priv[1] = cfqq->cfqg;
        spin_unlock_irq(q->queue_lock);
 
-       if (disable_wbt)
-               wbt_disable_default(q);
-
        return 0;
 }
 
@@ -4706,6 +4698,7 @@ static void cfq_registered_queue(struct request_queue *q)
         */
        if (blk_queue_nonrot(q))
                cfqd->cfq_slice_idle = 0;
+       wbt_disable_default(q);
 }
 
 /*
index 570021a..04325b8 100644 (file)
@@ -685,7 +685,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        case BLKALIGNOFF:
                return compat_put_int(arg, bdev_alignment_offset(bdev));
        case BLKDISCARDZEROES:
-               return compat_put_uint(arg, bdev_discard_zeroes_data(bdev));
+               return compat_put_uint(arg, 0);
        case BLKFLSBUF:
        case BLKROSET:
        case BLKDISCARD:
index 01139f5..bf11e70 100644 (file)
@@ -41,6 +41,7 @@
 
 #include "blk.h"
 #include "blk-mq-sched.h"
+#include "blk-wbt.h"
 
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
@@ -242,26 +243,21 @@ int elevator_init(struct request_queue *q, char *name)
                }
        }
 
-       if (e->uses_mq) {
-               err = blk_mq_sched_setup(q);
-               if (!err)
-                       err = e->ops.mq.init_sched(q, e);
-       } else
+       if (e->uses_mq)
+               err = blk_mq_init_sched(q, e);
+       else
                err = e->ops.sq.elevator_init_fn(q, e);
-       if (err) {
-               if (e->uses_mq)
-                       blk_mq_sched_teardown(q);
+       if (err)
                elevator_put(e);
-       }
        return err;
 }
 EXPORT_SYMBOL(elevator_init);
 
-void elevator_exit(struct elevator_queue *e)
+void elevator_exit(struct request_queue *q, struct elevator_queue *e)
 {
        mutex_lock(&e->sysfs_lock);
        if (e->uses_mq && e->type->ops.mq.exit_sched)
-               e->type->ops.mq.exit_sched(e);
+               blk_mq_exit_sched(q, e);
        else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn)
                e->type->ops.sq.elevator_exit_fn(e);
        mutex_unlock(&e->sysfs_lock);
@@ -882,6 +878,8 @@ void elv_unregister_queue(struct request_queue *q)
                kobject_uevent(&e->kobj, KOBJ_REMOVE);
                kobject_del(&e->kobj);
                e->registered = 0;
+               /* Re-enable throttling in case elevator disabled it */
+               wbt_enable_default(q);
        }
 }
 EXPORT_SYMBOL(elv_unregister_queue);
@@ -946,6 +944,45 @@ void elv_unregister(struct elevator_type *e)
 }
 EXPORT_SYMBOL_GPL(elv_unregister);
 
+/*
+ * Switch the I/O scheduler of a blk-mq queue to new_e.  A NULL new_e is
+ * traced as a switch to "none" and skips sysfs registration.
+ *
+ * The queue is frozen and quiesced for the whole operation.  The currently
+ * attached elevator, if any, is unregistered and exited before the new one
+ * is initialised, so on failure the old scheduler is not restored.
+ *
+ * Returns 0 on success, otherwise the error reported by blk_mq_init_sched()
+ * or elv_register_queue().
+ */
+static int elevator_switch_mq(struct request_queue *q,
+                             struct elevator_type *new_e)
+{
+       int err;
+
+       blk_mq_freeze_queue(q);
+       blk_mq_quiesce_queue(q);
+
+       /* Tear down whatever scheduler is attached right now. */
+       if (q->elevator) {
+               if (q->elevator->registered)
+                       elv_unregister_queue(q);
+               ioc_clear_queue(q);
+               elevator_exit(q, q->elevator);
+       }
+
+       err = blk_mq_init_sched(q, new_e);
+       if (err)
+               goto thaw;
+
+       /* Nothing to register when no scheduler was requested. */
+       if (!new_e) {
+               blk_add_trace_msg(q, "elv switch: none");
+               goto thaw;
+       }
+
+       err = elv_register_queue(q);
+       if (err) {
+               elevator_exit(q, q->elevator);
+               goto thaw;
+       }
+       blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+
+thaw:
+       blk_mq_unfreeze_queue(q);
+       blk_mq_start_stopped_hw_queues(q, true);
+       return err;
+}
+
 /*
  * switch to new_e io scheduler. be careful not to introduce deadlocks -
  * we don't free the old io scheduler, before we have allocated what we
@@ -958,10 +995,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
        bool old_registered = false;
        int err;
 
-       if (q->mq_ops) {
-               blk_mq_freeze_queue(q);
-               blk_mq_quiesce_queue(q);
-       }
+       if (q->mq_ops)
+               return elevator_switch_mq(q, new_e);
 
        /*
         * Turn on BYPASS and drain all requests w/ elevator private data.
@@ -973,11 +1008,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
        if (old) {
                old_registered = old->registered;
 
-               if (old->uses_mq)
-                       blk_mq_sched_teardown(q);
-
-               if (!q->mq_ops)
-                       blk_queue_bypass_start(q);
+               blk_queue_bypass_start(q);
 
                /* unregister and clear all auxiliary data of the old elevator */
                if (old_registered)
@@ -987,56 +1018,32 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
        }
 
        /* allocate, init and register new elevator */
-       if (new_e) {
-               if (new_e->uses_mq) {
-                       err = blk_mq_sched_setup(q);
-                       if (!err)
-                               err = new_e->ops.mq.init_sched(q, new_e);
-               } else
-                       err = new_e->ops.sq.elevator_init_fn(q, new_e);
-               if (err)
-                       goto fail_init;
+       err = new_e->ops.sq.elevator_init_fn(q, new_e);
+       if (err)
+               goto fail_init;
 
-               err = elv_register_queue(q);
-               if (err)
-                       goto fail_register;
-       } else
-               q->elevator = NULL;
+       err = elv_register_queue(q);
+       if (err)
+               goto fail_register;
 
        /* done, kill the old one and finish */
        if (old) {
-               elevator_exit(old);
-               if (!q->mq_ops)
-                       blk_queue_bypass_end(q);
+               elevator_exit(q, old);
+               blk_queue_bypass_end(q);
        }
 
-       if (q->mq_ops) {
-               blk_mq_unfreeze_queue(q);
-               blk_mq_start_stopped_hw_queues(q, true);
-       }
-
-       if (new_e)
-               blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
-       else
-               blk_add_trace_msg(q, "elv switch: none");
+       blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
 
        return 0;
 
 fail_register:
-       if (q->mq_ops)
-               blk_mq_sched_teardown(q);
-       elevator_exit(q->elevator);
+       elevator_exit(q, q->elevator);
 fail_init:
        /* switch failed, restore and re-register old elevator */
        if (old) {
                q->elevator = old;
                elv_register_queue(q);
-               if (!q->mq_ops)
-                       blk_queue_bypass_end(q);
-       }
-       if (q->mq_ops) {
-               blk_mq_unfreeze_queue(q);
-               blk_mq_start_stopped_hw_queues(q, true);
+               blk_queue_bypass_end(q);
        }
 
        return err;
@@ -1094,12 +1101,20 @@ int elevator_change(struct request_queue *q, const char *name)
 }
 EXPORT_SYMBOL(elevator_change);
 
+/*
+ * Tell whether an I/O scheduler may be used on @q.  The only case refused
+ * is a blk-mq queue whose tag set has BLK_MQ_F_NO_SCHED set.
+ */
+static inline bool elv_support_iosched(struct request_queue *q)
+{
+       const bool no_sched = q->mq_ops && q->tag_set &&
+                             (q->tag_set->flags & BLK_MQ_F_NO_SCHED);
+
+       return !no_sched;
+}
+
 ssize_t elv_iosched_store(struct request_queue *q, const char *name,
                          size_t count)
 {
        int ret;
 
-       if (!(q->mq_ops || q->request_fn))
+       if (!(q->mq_ops || q->request_fn) || !elv_support_iosched(q))
                return count;
 
        ret = __elevator_change(q, name);
@@ -1131,7 +1146,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
                        len += sprintf(name+len, "[%s] ", elv->elevator_name);
                        continue;
                }
-               if (__e->uses_mq && q->mq_ops)
+               if (__e->uses_mq && q->mq_ops && elv_support_iosched(q))
                        len += sprintf(name+len, "%s ", __e->elevator_name);
                else if (!__e->uses_mq && !q->mq_ops)
                        len += sprintf(name+len, "%s ", __e->elevator_name);
index a9c516a..9a2d01a 100644 (file)
@@ -1060,8 +1060,19 @@ static struct attribute *disk_attrs[] = {
        NULL
 };
 
+/*
+ * is_visible callback for disk_attr_group.  Every attribute keeps its own
+ * mode, except dev_attr_badblocks, which is hidden (mode 0) when the disk
+ * has no ->bb.
+ */
+static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+       struct device *dev = container_of(kobj, struct device, kobj);
+       struct gendisk *disk = dev_to_disk(dev);
+
+       if (a != &dev_attr_badblocks.attr)
+               return a->mode;
+
+       return disk->bb ? a->mode : 0;
+}
+
 static struct attribute_group disk_attr_group = {
        .attrs = disk_attrs,
+       .is_visible = disk_visible,
 };
 
 static const struct attribute_group *disk_attr_groups[] = {
@@ -1352,7 +1363,7 @@ struct kobject *get_disk(struct gendisk *disk)
        owner = disk->fops->owner;
        if (owner && !try_module_get(owner))
                return NULL;
-       kobj = kobject_get(&disk_to_dev(disk)->kobj);
+       kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj);
        if (kobj == NULL) {
                module_put(owner);
                return NULL;
index 7b88820..0de02ee 100644 (file)
@@ -255,7 +255,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
        truncate_inode_pages_range(mapping, start, end);
 
        return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL,
-                                   false);
+                       BLKDEV_ZERO_NOUNMAP);
 }
 
 static int put_ushort(unsigned long arg, unsigned short val)
@@ -547,7 +547,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
        case BLKALIGNOFF:
                return put_int(arg, bdev_alignment_offset(bdev));
        case BLKDISCARDZEROES:
-               return put_uint(arg, bdev_discard_zeroes_data(bdev));
+               return put_uint(arg, 0);
        case BLKSECTGET:
                max_sectors = min_t(unsigned int, USHRT_MAX,
                                    queue_max_sectors(bdev_get_queue(bdev)));
index 0c47a00..4b120c9 100644 (file)
@@ -163,22 +163,12 @@ out:
 
 int ioprio_best(unsigned short aprio, unsigned short bprio)
 {
-       unsigned short aclass;
-       unsigned short bclass;
-
        if (!ioprio_valid(aprio))
                aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
        if (!ioprio_valid(bprio))
                bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
 
-       aclass = IOPRIO_PRIO_CLASS(aprio);
-       bclass = IOPRIO_PRIO_CLASS(bprio);
-       if (aclass == bclass)
-               return min(aprio, bprio);
-       if (aclass > bclass)
-               return bprio;
-       else
-               return aprio;
+       return min(aprio, bprio);
 }
 
 SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
new file mode 100644 (file)
index 0000000..3b0090b
--- /dev/null
@@ -0,0 +1,719 @@
+/*
+ * The Kyber I/O scheduler. Controls latency by throttling queue depths using
+ * scalable techniques.
+ *
+ * Copyright (C) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/elevator.h>
+#include <linux/module.h>
+#include <linux/sbitmap.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-sched.h"
+#include "blk-mq-tag.h"
+#include "blk-stat.h"
+
+/*
+ * Scheduling domains. rq_sched_domain() maps every request onto exactly one
+ * of these, and the values are used as indices into the per-domain arrays.
+ */
+enum {
+       KYBER_READ, /* REQ_OP_READ requests */
+       KYBER_SYNC_WRITE, /* REQ_OP_WRITE requests for which op_is_sync() holds */
+       KYBER_OTHER, /* Async writes, discard, etc. */
+       KYBER_NUM_DOMAINS, /* domain count; sizes the per-domain arrays */
+};
+
+enum {
+       /*
+        * Lower bound on the size each domain's token bitmap is allocated
+        * with; see max_tokens in kyber_queue_data_alloc().
+        */
+       KYBER_MIN_DEPTH = 256,
+
+       /*
+        * In order to prevent starvation of synchronous requests by a flood of
+        * asynchronous requests, we reserve 25% of requests for synchronous
+        * operations.
+        */
+       KYBER_ASYNC_PERCENT = 75,
+};
+
+/*
+ * Initial device-wide depths for each scheduling domain.
+ *
+ * Even for fast devices with lots of tags like NVMe, you can saturate
+ * the device with only a fraction of the maximum possible queue depth.
+ * So, we cap these to a reasonable value.
+ *
+ * The depth heuristics (kyber_adjust_rw_depth(), kyber_adjust_other_depth())
+ * clamp each domain between 1 and the value listed here, so these are also
+ * the maximum depths.
+ */
+static const unsigned int kyber_depth[] = {
+       [KYBER_READ] = 256,
+       [KYBER_SYNC_WRITE] = 128,
+       [KYBER_OTHER] = 64,
+};
+
+/*
+ * Scheduling domain batch sizes. We favor reads.
+ *
+ * kyber_queue_data_alloc() warns if any entry is zero.
+ * NOTE(review): presumably the number of requests dispatched from a domain
+ * before moving on to the next one -- confirm against the dispatch path.
+ */
+static const unsigned int kyber_batch_size[] = {
+       [KYBER_READ] = 16,
+       [KYBER_SYNC_WRITE] = 8,
+       [KYBER_OTHER] = 8,
+};
+
+/*
+ * Per-request_queue scheduler state, created by kyber_queue_data_alloc() and
+ * stored in the elevator's elevator_data.
+ */
+struct kyber_queue_data {
+       /* The request queue this state was allocated for. */
+       struct request_queue *q;
+
+       /*
+        * Latency statistics callback. It is allocated with
+        * kyber_stat_timer_fn() as the handler and rq_sched_domain() as the
+        * bucketing function, giving one bucket per scheduling domain.
+        */
+       struct blk_stat_callback *cb;
+
+       /*
+        * The device is divided into multiple scheduling domains based on the
+        * request type. Each domain has a fixed number of in-flight requests of
+        * that type device-wide, limited by these tokens.
+        */
+       struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
+
+       /*
+        * Async request percentage, converted to per-word depth for
+        * sbitmap_get_shallow().
+        */
+       unsigned int async_depth;
+
+       /* Target latencies in nanoseconds. */
+       u64 read_lat_nsec, write_lat_nsec;
+};
+
+/*
+ * Per-hardware-queue scheduler state. Allocated by kyber_init_hctx(), stored
+ * in hctx->sched_data and freed by kyber_exit_hctx().
+ */
+struct kyber_hctx_data {
+       /* NOTE(review): presumably protects the fields below -- confirm against users. */
+       spinlock_t lock;
+       /* One request list per scheduling domain. */
+       struct list_head rqs[KYBER_NUM_DOMAINS];
+       /* Starts at 0. NOTE(review): presumably the domain being served -- confirm. */
+       unsigned int cur_domain;
+       /* Starts at 0. NOTE(review): presumably progress through the current batch -- confirm. */
+       unsigned int batching;
+       /* One wait queue entry per scheduling domain. */
+       wait_queue_t domain_wait[KYBER_NUM_DOMAINS];
+       /* One counter per scheduling domain, initialised to 0. */
+       atomic_t wait_index[KYBER_NUM_DOMAINS];
+};
+
+/*
+ * Classify a request into one of the Kyber scheduling domains based on its
+ * cmd_flags: reads, synchronous writes, or everything else.
+ */
+static int rq_sched_domain(const struct request *rq)
+{
+       const unsigned int flags = rq->cmd_flags;
+       const unsigned int opcode = flags & REQ_OP_MASK;
+
+       if (opcode == REQ_OP_READ)
+               return KYBER_READ;
+       if (opcode == REQ_OP_WRITE && op_is_sync(flags))
+               return KYBER_SYNC_WRITE;
+       return KYBER_OTHER;
+}
+
+/*
+ * Latency verdicts returned by kyber_lat_status(). The sign is what
+ * IS_GOOD()/IS_BAD() test: positive values met the target, negative values
+ * missed it, and NONE (0) means the domain had no samples.
+ */
+enum {
+       NONE = 0,
+       GOOD = 1,
+       GREAT = 2,
+       BAD = -1,
+       AWFUL = -2,
+};
+
+#define IS_GOOD(status) ((status) > 0)
+#define IS_BAD(status) ((status) < 0)
+
+/*
+ * Grade the mean latency recorded for @sched_domain against @target.
+ *
+ * Returns NONE when the domain has no samples, AWFUL at twice the target or
+ * more, BAD above the target, GREAT at half the target or less, and GOOD
+ * otherwise.
+ */
+static int kyber_lat_status(struct blk_stat_callback *cb,
+                           unsigned int sched_domain, u64 target)
+{
+       u64 mean;
+
+       if (cb->stat[sched_domain].nr_samples == 0)
+               return NONE;
+
+       mean = cb->stat[sched_domain].mean;
+
+       /* Tested from worst to best; the order of these checks matters. */
+       if (mean >= 2 * target)
+               return AWFUL;
+       if (mean > target)
+               return BAD;
+       if (mean <= target / 2)
+               return GREAT;
+       return GOOD;
+}
+
+/*
+ * Resize the read or synchronous-write token pool from the latest latency
+ * verdicts. The aim is fairness between the two domains: when one of them is
+ * meeting its target and the other is not, the healthy one gives up depth
+ * and the struggling one gains some.
+ */
+static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
+                                 unsigned int sched_domain, int this_status,
+                                 int other_status)
+{
+       struct sbitmap_queue *tokens = &kqd->domain_tokens[sched_domain];
+       unsigned int old_depth, new_depth;
+
+       /* No samples for this domain: nothing to base a decision on. */
+       if (this_status == NONE)
+               return;
+       /* Both domains on the same side of their targets: leave them alone. */
+       if (IS_GOOD(this_status) && IS_GOOD(other_status))
+               return;
+       if (IS_BAD(this_status) && IS_BAD(other_status))
+               return;
+
+       old_depth = tokens->sb.depth;
+       new_depth = old_depth;
+
+       if (other_status == NONE || this_status == BAD) {
+               new_depth++;
+       } else if (this_status == GOOD) {
+               if (other_status == AWFUL)
+                       new_depth -= max(new_depth / 4, 1U);
+               else
+                       new_depth -= max(new_depth / 8, 1U);
+       } else if (this_status == GREAT) {
+               if (other_status == AWFUL)
+                       new_depth /= 2;
+               else
+                       new_depth -= max(new_depth / 4, 1U);
+       } else if (this_status == AWFUL) {
+               if (other_status == GREAT)
+                       new_depth += 2;
+               else
+                       new_depth++;
+       }
+
+       /* Stay within one token and the domain's initial depth. */
+       new_depth = clamp(new_depth, 1U, kyber_depth[sched_domain]);
+       if (new_depth != old_depth)
+               sbitmap_queue_resize(tokens, new_depth);
+}
+
+/*
+ * Resize the token pool of the "other" domain from the read and
+ * synchronous-write verdicts. While at least one of those domains is doing
+ * fine the pool grows; when both are doing badly it is cut back hard.
+ */
+static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
+                                    int read_status, int write_status,
+                                    bool have_samples)
+{
+       struct sbitmap_queue *tokens = &kqd->domain_tokens[KYBER_OTHER];
+       const unsigned int old_depth = tokens->sb.depth;
+       unsigned int new_depth = old_depth;
+
+       if (read_status == NONE && write_status == NONE) {
+               /* No read or sync-write samples at all. */
+               new_depth += 2;
+       } else if (have_samples) {
+               int verdict;
+
+               /* Take the better verdict, ignoring a domain without samples. */
+               if (write_status == NONE)
+                       verdict = read_status;
+               else if (read_status == NONE)
+                       verdict = write_status;
+               else
+                       verdict = max(read_status, write_status);
+
+               if (verdict == GREAT)
+                       new_depth += 2;
+               else if (verdict == GOOD)
+                       new_depth++;
+               else if (verdict == BAD)
+                       new_depth -= max(new_depth / 4, 1U);
+               else if (verdict == AWFUL)
+                       new_depth /= 2;
+       }
+
+       /* Stay within one token and the domain's initial depth. */
+       new_depth = clamp(new_depth, 1U, kyber_depth[KYBER_OTHER]);
+       if (new_depth != old_depth)
+               sbitmap_queue_resize(tokens, new_depth);
+}
+
+/*
+ * Statistics callback: turn the latency statistics carried by @cb into new
+ * queue depths for every scheduling domain, then decide whether to keep
+ * sampling.
+ */
+static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
+{
+       struct kyber_queue_data *kqd = cb->data;
+       const int rd = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
+       const int wr = kyber_lat_status(cb, KYBER_SYNC_WRITE,
+                                       kqd->write_lat_nsec);
+
+       kyber_adjust_rw_depth(kqd, KYBER_READ, rd, wr);
+       kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, wr, rd);
+       kyber_adjust_other_depth(kqd, rd, wr,
+                                cb->stat[KYBER_OTHER].nr_samples != 0);
+
+       /* Already collecting again: nothing to re-arm. */
+       if (blk_stat_is_active(kqd->cb))
+               return;
+
+       /*
+        * Keep monitoring while a latency target is being missed or the
+        * "other" domain is still below its full depth.
+        */
+       if (IS_BAD(rd) || IS_BAD(wr) ||
+           kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])
+               blk_stat_activate_msecs(kqd->cb, 100);
+}
+
+/*
+ * Return the sbitmap shift of the scheduler tags belonging to the queue's
+ * first hardware context. kyber_queue_data_alloc() uses (1U << shift) as the
+ * base from which the async depth is derived.
+ */
+static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
+{
+       /*
+        * All of the hardware queues have the same depth, so we can just grab
+        * the shift of the first one.
+        */
+       return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
+}
+
+/*
+ * Allocate and initialise the per-queue Kyber state for @q.
+ *
+ * Sets up the statistics callback, one token bitmap per scheduling domain
+ * (allocated with max_tokens bits, then resized to the domain's initial
+ * depth), the async depth and the default latency targets. The structure
+ * comes from kmalloc_node() and is not zeroed; every member is assigned
+ * explicitly below.
+ *
+ * Returns the new structure, or an ERR_PTR() on failure, in which case
+ * everything set up so far has been released again.
+ */
+static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
+{
+       struct kyber_queue_data *kqd;
+       unsigned int max_tokens;
+       unsigned int shift;
+       int ret = -ENOMEM;
+       int i;
+
+       kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
+       if (!kqd)
+               goto err;
+       kqd->q = q;
+
+       kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain,
+                                         KYBER_NUM_DOMAINS, kqd);
+       if (!kqd->cb)
+               goto err_kqd;
+
+       /*
+        * The maximum number of tokens for any scheduling domain is at least
+        * the queue depth of a single hardware queue. If the hardware doesn't
+        * have many tags, still provide a reasonable number.
+        */
+       max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
+                          KYBER_MIN_DEPTH);
+       for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
+               WARN_ON(!kyber_depth[i]);
+               WARN_ON(!kyber_batch_size[i]);
+               ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
+                                             max_tokens, -1, false, GFP_KERNEL,
+                                             q->node);
+               if (ret) {
+                       /* Undo the bitmaps of the domains already set up. */
+                       while (--i >= 0)
+                               sbitmap_queue_free(&kqd->domain_tokens[i]);
+                       goto err_cb;
+               }
+               /* Start each domain at its initial depth, not at max_tokens. */
+               sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
+       }
+
+       /* KYBER_ASYNC_PERCENT of the (1 << shift) base, as a per-word depth. */
+       shift = kyber_sched_tags_shift(kqd);
+       kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
+
+       /* Default targets: 2 ms for reads, 10 ms for synchronous writes. */
+       kqd->read_lat_nsec = 2000000ULL;
+       kqd->write_lat_nsec = 10000000ULL;
+
+       return kqd;
+
+err_cb:
+       blk_stat_free_callback(kqd->cb);
+err_kqd:
+       kfree(kqd);
+err:
+       return ERR_PTR(ret);
+}
+
+/*
+ * Attach Kyber to @q: allocate the elevator queue and the per-queue Kyber
+ * data, publish them through q->elevator, and register the statistics
+ * callback with the queue.
+ *
+ * Returns 0 on success, -ENOMEM if the elevator queue cannot be allocated,
+ * or the error from kyber_queue_data_alloc().
+ */
+static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
+{
+       struct elevator_queue *eq = elevator_alloc(q, e);
+       struct kyber_queue_data *kqd;
+
+       if (!eq)
+               return -ENOMEM;
+
+       kqd = kyber_queue_data_alloc(q);
+       if (!IS_ERR(kqd)) {
+               eq->elevator_data = kqd;
+               q->elevator = eq;
+               blk_stat_add_callback(q, kqd->cb);
+               return 0;
+       }
+
+       /* Drop the reference on the elevator queue allocated above. */
+       kobject_put(&eq->kobj);
+       return PTR_ERR(kqd);
+}
+
+/*
+ * Release everything kyber_init_sched() set up for this elevator: the
+ * statistics callback, the per-domain token bitmaps and the Kyber data.
+ */
+static void kyber_exit_sched(struct elevator_queue *e)
+{
+       struct kyber_queue_data *kqd = e->elevator_data;
+       unsigned int domain;
+
+       /* Detach the statistics callback from the queue before freeing. */
+       blk_stat_remove_callback(kqd->q, kqd->cb);
+
+       for (domain = 0; domain < KYBER_NUM_DOMAINS; domain++)
+               sbitmap_queue_free(&kqd->domain_tokens[domain]);
+
+       blk_stat_free_callback(kqd->cb);
+       kfree(kqd);
+}
+
+/*
+ * Allocate and initialise the Kyber state of one hardware queue and store it
+ * in hctx->sched_data. Returns 0 on success or -ENOMEM.
+ */
+static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
+       struct kyber_hctx_data *khd;
+       unsigned int domain;
+
+       khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
+       if (khd == NULL)
+               return -ENOMEM;
+
+       spin_lock_init(&khd->lock);
+       khd->cur_domain = 0;
+       khd->batching = 0;
+
+       /* Empty request list, idle wait entry and zeroed index per domain. */
+       for (domain = 0; domain < KYBER_NUM_DOMAINS; domain++) {
+               INIT_LIST_HEAD(&khd->rqs[domain]);
+               INIT_LIST_HEAD(&khd->domain_wait[domain].task_list);
+               atomic_set(&khd->wait_index[domain], 0);
+       }
+
+       hctx->sched_data = khd;
+       return 0;
+}
+
+/* Free the per-hardware-queue state installed by kyber_init_hctx(). */
+static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
+       struct kyber_hctx_data *khd = hctx->sched_data;
+
+       kfree(khd);
+}
+
+/* Read back the domain token number stashed in the request's elv.priv[0]. */
+static int rq_get_domain_token(struct request *rq)
+{
+       const long stored = (long)rq->elv.priv[0];
+
+       return (int)stored;
+}
+
+/* Stash @token in the request's elv.priv[0], widened through long. */
+static void rq_set_domain_token(struct request *rq, int token)
+{
+       const long widened = token;
+
+       rq->elv.priv[0] = (void *)widened;
+}
+
+/*
+ * Give back the domain token recorded on @rq, if there is one. A stored
+ * value of -1 means the request holds no token and nothing is released.
+ */
+static void rq_clear_domain_token(struct kyber_queue_data *kqd,
+                                 struct request *rq)
+{
+       const int token = rq_get_domain_token(rq);
+       unsigned int domain;
+
+       if (token == -1)
+               return;
+
+       domain = rq_sched_domain(rq);
+       sbitmap_queue_clear(&kqd->domain_tokens[domain], token,
+                           rq->mq_ctx->cpu);
+}
+
+/*
+ * Allocate a request for @op. The scheduler tags act as per-hardware-queue
+ * queueing tokens, and non-sync operations are limited here by allocating
+ * them with the shallow async depth. A new request starts without a domain
+ * token (-1).
+ */
+static struct request *kyber_get_request(struct request_queue *q,
+                                        unsigned int op,
+                                        struct blk_mq_alloc_data *data)
+{
+       struct kyber_queue_data *kqd = q->elevator->elevator_data;
+       struct request *rq;
+
+       if (!op_is_sync(op))
+               data->shallow_depth = kqd->async_depth;
+
+       rq = __blk_mq_alloc_request(data, op);
+       if (!rq)
+               return NULL;
+
+       rq_set_domain_token(rq, -1);
+       return rq;
+}
+
+/*
+ * Release any domain token still held by @rq, then hand the request to
+ * blk_mq_finish_request().
+ */
+static void kyber_put_request(struct request *rq)
+{
+       struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
+
+       rq_clear_domain_token(kqd, rq);
+       blk_mq_finish_request(rq);
+}
+
+/*
+ * Completion hook: compare the request's latency with the target for its
+ * scheduling domain and, if the target was missed, activate the statistics
+ * callback for 10 msecs so that latencies get sampled.
+ */
+static void kyber_completed_request(struct request *rq)
+{
+       struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
+       unsigned int sched_domain = rq_sched_domain(rq);
+       u64 now, issued, target;
+
+       /* Only reads and sync writes have a latency target to check. */
+       if (sched_domain == KYBER_READ)
+               target = kqd->read_lat_nsec;
+       else if (sched_domain == KYBER_SYNC_WRITE)
+               target = kqd->write_lat_nsec;
+       else
+               return;
+
+       /* Latencies are already being monitored; don't check again. */
+       if (blk_stat_is_active(kqd->cb))
+               return;
+
+       now = __blk_stat_time(ktime_to_ns(ktime_get()));
+       issued = blk_stat_time(&rq->issue_stat);
+
+       /* Bail out if the issue time appears to lie in the future. */
+       if (now < issued)
+               return;
+
+       if (now - issued > target)
+               blk_stat_activate_msecs(kqd->cb, 10);
+}
+
+/*
+ * Pull the pending requests off the software queues of @hctx and append each
+ * one, in order, to the per-domain list in @khd that matches its scheduling
+ * domain.
+ */
+static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
+                                 struct blk_mq_hw_ctx *hctx)
+{
+       LIST_HEAD(flushed);
+       struct request *rq;
+
+       blk_mq_flush_busy_ctxs(hctx, &flushed);
+
+       /* Each iteration moves the head entry away, so this drains the list. */
+       while (!list_empty(&flushed)) {
+               rq = list_first_entry(&flushed, struct request, queuelist);
+               list_move_tail(&rq->queuelist, &khd->rqs[rq_sched_domain(rq)]);
+       }
+}
+
+/*
+ * Wait queue callback installed by kyber_get_domain_token(): take the entry
+ * off its wait queue and kick an asynchronous run of the hardware queue that
+ * was stashed in wait->private. @mode, @flags and @key are not used.
+ * Always returns 1.
+ */
+static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags,
+                            void *key)
+{
+       struct blk_mq_hw_ctx *hw_queue = READ_ONCE(wait->private);
+
+       list_del_init(&wait->task_list);
+       blk_mq_run_hw_queue(hw_queue, true);
+       return 1;
+}
+
+/*
+ * Try to take a token for the current domain of @khd. Returns the token
+ * number (>= 0) on success, or the negative value from
+ * __sbitmap_queue_get() if none could be obtained.
+ *
+ * On failure, the domain's wait entry is queued (unless it already is) so
+ * that the hardware queue gets run when a token becomes available.
+ */
+static int kyber_get_domain_token(struct kyber_queue_data *kqd,
+                                 struct kyber_hctx_data *khd,
+                                 struct blk_mq_hw_ctx *hctx)
+{
+       unsigned int domain = khd->cur_domain;
+       struct sbitmap_queue *tokens = &kqd->domain_tokens[domain];
+       wait_queue_t *waiter = &khd->domain_wait[domain];
+       struct sbq_wait_state *ws;
+       int nr;
+
+       nr = __sbitmap_queue_get(tokens);
+       if (nr >= 0)
+               return nr;
+
+       /*
+        * No token. This path is serialized on khd->lock, but the waker can
+        * still run concurrently, hence the careful emptiness check. If the
+        * entry is already on a wait queue there is nothing more to arrange.
+        */
+       if (!list_empty_careful(&waiter->task_list))
+               return nr;
+
+       init_waitqueue_func_entry(waiter, kyber_domain_wake);
+       waiter->private = hctx;
+       ws = sbq_wait_ptr(tokens, &khd->wait_index[domain]);
+       add_wait_queue(&ws->wait, waiter);
+
+       /*
+        * A token may have been freed before we got onto the wait queue, so
+        * try once more.
+        */
+       return __sbitmap_queue_get(tokens);
+}
+
+/*
+ * Try to dispatch the first pending request of the current domain.
+ *
+ * If the domain's list is empty and the software queues have not been
+ * flushed yet in this dispatch pass (*@flushed is false), they are flushed
+ * once and *@flushed is set. A request is only returned when a domain token
+ * could be obtained for it; in that case the batch counter is bumped, the
+ * token is recorded in the request and the request is unlinked.
+ *
+ * Returns the request, or NULL if there was no pending request or no token.
+ */
+static struct request *
+kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
+                         struct kyber_hctx_data *khd,
+                         struct blk_mq_hw_ctx *hctx,
+                         bool *flushed)
+{
+       struct list_head *pending = &khd->rqs[khd->cur_domain];
+       struct request *rq;
+       int token;
+
+       rq = list_first_entry_or_null(pending, struct request, queuelist);
+       if (!rq && !*flushed) {
+               /* Nothing queued here yet: pull in the software queues. */
+               kyber_flush_busy_ctxs(khd, hctx);
+               *flushed = true;
+               rq = list_first_entry_or_null(pending, struct request,
+                                             queuelist);
+       }
+
+       if (!rq)
+               return NULL;
+
+       token = kyber_get_domain_token(kqd, khd, hctx);
+       if (token < 0)
+               return NULL;
+
+       khd->batching++;
+       rq_set_domain_token(rq, token);
+       list_del_init(&rq->queuelist);
+       return rq;
+}
+
+/*
+ * Pick the next request to dispatch on @hctx, under khd->lock.
+ *
+ * The current domain is tried first as long as its batch size has not been
+ * reached. Otherwise, or if that yields nothing, the batch counter is reset
+ * and the domains are tried round-robin starting after the current one.
+ *
+ * Returns the request to dispatch, or NULL if no domain produced one.
+ */
+static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+       struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
+       struct kyber_hctx_data *khd = hctx->sched_data;
+       struct request *rq = NULL;
+       bool flushed = false;
+       int tries;
+
+       spin_lock(&khd->lock);
+
+       /* Still entitled to batch: keep dispatching from the same domain. */
+       if (khd->batching < kyber_batch_size[khd->cur_domain])
+               rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
+
+       if (!rq) {
+               /*
+                * The batch is used up, or the batched domain had no requests
+                * or no tokens. Start a new batch; after KYBER_NUM_DOMAINS
+                * steps this wraps back around to the original domain.
+                */
+               khd->batching = 0;
+               for (tries = 0; tries < KYBER_NUM_DOMAINS && !rq; tries++) {
+                       if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
+                               khd->cur_domain = 0;
+                       else
+                               khd->cur_domain++;
+
+                       rq = kyber_dispatch_cur_domain(kqd, khd, hctx,
+                                                      &flushed);
+               }
+       }
+
+       spin_unlock(&khd->lock);
+       return rq;
+}
+
+/* Report whether any per-domain request list of @hctx is non-empty. */
+static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
+{
+       struct kyber_hctx_data *khd = hctx->sched_data;
+       int domain = 0;
+
+       while (domain < KYBER_NUM_DOMAINS) {
+               if (!list_empty_careful(&khd->rqs[domain]))
+                       return true;
+               domain++;
+       }
+
+       return false;
+}
+
+/*
+ * KYBER_LAT_SHOW_STORE(op) generates kyber_<op>_lat_show() and
+ * kyber_<op>_lat_store() for the kqd-><op>_lat_nsec field.
+ *
+ * show:  prints the current value as an unsigned decimal followed by a
+ *        newline into @page and returns sprintf()'s result.
+ * store: parses @page as a base-10 unsigned value with kstrtoull(); on a
+ *        parse error that error is returned, otherwise the value is stored
+ *        and @count is returned.
+ *
+ * The macro is instantiated for "read" and "write" and then undefined.
+ */
+#define KYBER_LAT_SHOW_STORE(op)                                       \
+static ssize_t kyber_##op##_lat_show(struct elevator_queue *e,         \
+                                    char *page)                        \
+{                                                                      \
+       struct kyber_queue_data *kqd = e->elevator_data;                \
+                                                                       \
+       return sprintf(page, "%llu\n", kqd->op##_lat_nsec);             \
+}                                                                      \
+                                                                       \
+static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,                \
+                                     const char *page, size_t count)   \
+{                                                                      \
+       struct kyber_queue_data *kqd = e->elevator_data;                \
+       unsigned long long nsec;                                        \
+       int ret;                                                        \
+                                                                       \
+       ret = kstrtoull(page, 10, &nsec);                               \
+       if (ret)                                                        \
+               return ret;                                             \
+                                                                       \
+       kqd->op##_lat_nsec = nsec;                                      \
+                                                                       \
+       return count;                                                   \
+}
+KYBER_LAT_SHOW_STORE(read);
+KYBER_LAT_SHOW_STORE(write);
+#undef KYBER_LAT_SHOW_STORE
+
+/*
+ * Attribute table for the scheduler: one mode-0644 entry per latency target
+ * (read_lat_nsec and write_lat_nsec), each wired to its kyber_<op>_lat_show()
+ * and kyber_<op>_lat_store() handlers, terminated by __ATTR_NULL.
+ */
+#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
+static struct elevator_type kyber_sched_attrs_placeholder_unused;
+
+/*
+ * Elevator type descriptor for the "kyber" scheduler. It is marked as a
+ * blk-mq scheduler (uses_mq), plugs the callbacks defined in this file into
+ * the mq operations table and exposes kyber_sched_attrs as its attributes.
+ */
+static struct elevator_type kyber_sched = {
+       .ops.mq = {
+               .init_sched = kyber_init_sched,
+               .exit_sched = kyber_exit_sched,
+               .init_hctx = kyber_init_hctx,
+               .exit_hctx = kyber_exit_hctx,
+               .get_request = kyber_get_request,
+               .put_request = kyber_put_request,
+               .completed_request = kyber_completed_request,
+               .dispatch_request = kyber_dispatch_request,
+               .has_work = kyber_has_work,
+       },
+       .uses_mq = true,
+       .elevator_attrs = kyber_sched_attrs,
+       .elevator_name = "kyber",
+       .elevator_owner = THIS_MODULE,
+};
+
+/* Module entry point: register kyber_sched; returns elv_register()'s result. */
+static int __init kyber_init(void)
+{
+       return elv_register(&kyber_sched);
+}
+
+/* Module exit point: unregister kyber_sched. */
+static void __exit kyber_exit(void)
+{
+       elv_unregister(&kyber_sched);
+}
+
+module_init(kyber_init);
+module_exit(kyber_exit);
+
+MODULE_AUTHOR("Omar Sandoval");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Kyber I/O scheduler");
index 7afb990..0171a2f 100644 (file)
@@ -497,7 +497,6 @@ rescan:
 
        if (disk->fops->revalidate_disk)
                disk->fops->revalidate_disk(disk);
-       blk_integrity_revalidate(disk);
        check_disk_size_change(disk, bdev);
        bdev->bd_invalidated = 0;
        if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
index 2a2fc76..4a294a5 100644 (file)
@@ -262,11 +262,11 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
        /*
         * fill in all the output members
         */
-       hdr->status = rq->errors & 0xff;
-       hdr->masked_status = status_byte(rq->errors);
-       hdr->msg_status = msg_byte(rq->errors);
-       hdr->host_status = host_byte(rq->errors);
-       hdr->driver_status = driver_byte(rq->errors);
+       hdr->status = req->result & 0xff;
+       hdr->masked_status = status_byte(req->result);
+       hdr->msg_status = msg_byte(req->result);
+       hdr->host_status = host_byte(req->result);
+       hdr->driver_status = driver_byte(req->result);
        hdr->info = 0;
        if (hdr->masked_status || hdr->host_status || hdr->driver_status)
                hdr->info |= SG_INFO_CHECK;
@@ -362,7 +362,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
                goto out_free_cdb;
 
        bio = rq->bio;
-       rq->retries = 0;
+       req->retries = 0;
 
        start_time = jiffies;
 
@@ -476,13 +476,13 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
                goto error;
 
        /* default.  possible overriden later */
-       rq->retries = 5;
+       req->retries = 5;
 
        switch (opcode) {
        case SEND_DIAGNOSTIC:
        case FORMAT_UNIT:
                rq->timeout = FORMAT_UNIT_TIMEOUT;
-               rq->retries = 1;
+               req->retries = 1;
                break;
        case START_STOP:
                rq->timeout = START_STOP_TIMEOUT;
@@ -495,7 +495,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
                break;
        case READ_DEFECT_DATA:
                rq->timeout = READ_DEFECT_DATA_TIMEOUT;
-               rq->retries = 1;
+               req->retries = 1;
                break;
        default:
                rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
@@ -509,7 +509,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 
        blk_execute_rq(q, disk, rq, 0);
 
-       err = rq->errors & 0xff;        /* only 8 bit SCSI status */
+       err = req->result & 0xff;       /* only 8 bit SCSI status */
        if (err) {
                if (req->sense_len && req->sense) {
                        bytes = (OMAX_SB_LEN > req->sense_len) ?
@@ -547,7 +547,8 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
        scsi_req(rq)->cmd[0] = cmd;
        scsi_req(rq)->cmd[4] = data;
        scsi_req(rq)->cmd_len = 6;
-       err = blk_execute_rq(q, bd_disk, rq, 0);
+       blk_execute_rq(q, bd_disk, rq, 0);
+       err = scsi_req(rq)->result ? -EIO : 0;
        blk_put_request(rq);
 
        return err;
index 14035f8..9b30ae5 100644 (file)
@@ -275,8 +275,8 @@ static bool check_tper(const void *data)
        u8 flags = tper->supported_features;
 
        if (!(flags & TPER_SYNC_SUPPORTED)) {
-               pr_err("TPer sync not supported. flags = %d\n",
-                      tper->supported_features);
+               pr_debug("TPer sync not supported. flags = %d\n",
+                        tper->supported_features);
                return false;
        }
 
@@ -289,7 +289,7 @@ static bool check_sum(const void *data)
        u32 nlo = be32_to_cpu(sum->num_locking_objects);
 
        if (nlo == 0) {
-               pr_err("Need at least one locking object.\n");
+               pr_debug("Need at least one locking object.\n");
                return false;
        }
 
@@ -385,9 +385,9 @@ static int next(struct opal_dev *dev)
 
                error = step->fn(dev, step->data);
                if (error) {
-                       pr_err("Error on step function: %d with error %d: %s\n",
-                              state, error,
-                              opal_error_to_human(error));
+                       pr_debug("Error on step function: %d with error %d: %s\n",
+                                state, error,
+                                opal_error_to_human(error));
 
                        /* For each OPAL command we do a discovery0 then we
                         * start some sort of session.
@@ -419,8 +419,8 @@ static int opal_discovery0_end(struct opal_dev *dev)
        print_buffer(dev->resp, hlen);
 
        if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
-               pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n",
-                       sizeof(*hdr), hlen, IO_BUFFER_LENGTH);
+               pr_debug("Discovery length overflows buffer (%zu+%u)/%u\n",
+                        sizeof(*hdr), hlen, IO_BUFFER_LENGTH);
                return -EFAULT;
        }
 
@@ -503,7 +503,7 @@ static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok)
        if (*err)
                return;
        if (cmd->pos >= IO_BUFFER_LENGTH - 1) {
-               pr_err("Error adding u8: end of buffer.\n");
+               pr_debug("Error adding u8: end of buffer.\n");
                *err = -ERANGE;
                return;
        }
@@ -553,7 +553,7 @@ static void add_token_u64(int *err, struct opal_dev *cmd, u64 number)
        len = DIV_ROUND_UP(msb, 4);
 
        if (cmd->pos >= IO_BUFFER_LENGTH - len - 1) {
-               pr_err("Error adding u64: end of buffer.\n");
+               pr_debug("Error adding u64: end of buffer.\n");
                *err = -ERANGE;
                return;
        }
@@ -579,7 +579,7 @@ static void add_token_bytestring(int *err, struct opal_dev *cmd,
        }
 
        if (len >= IO_BUFFER_LENGTH - cmd->pos - header_len) {
-               pr_err("Error adding bytestring: end of buffer.\n");
+               pr_debug("Error adding bytestring: end of buffer.\n");
                *err = -ERANGE;
                return;
        }
@@ -597,7 +597,7 @@ static void add_token_bytestring(int *err, struct opal_dev *cmd,
 static int build_locking_range(u8 *buffer, size_t length, u8 lr)
 {
        if (length > OPAL_UID_LENGTH) {
-               pr_err("Can't build locking range. Length OOB\n");
+               pr_debug("Can't build locking range. Length OOB\n");
                return -ERANGE;
        }
 
@@ -614,7 +614,7 @@ static int build_locking_range(u8 *buffer, size_t length, u8 lr)
 static int build_locking_user(u8 *buffer, size_t length, u8 lr)
 {
        if (length > OPAL_UID_LENGTH) {
-               pr_err("Can't build locking range user, Length OOB\n");
+               pr_debug("Can't build locking range user, Length OOB\n");
                return -ERANGE;
        }
 
@@ -648,7 +648,7 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
        add_token_u8(&err, cmd, OPAL_ENDLIST);
 
        if (err) {
-               pr_err("Error finalizing command.\n");
+               pr_debug("Error finalizing command.\n");
                return -EFAULT;
        }
 
@@ -660,7 +660,7 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
        hdr->subpkt.length = cpu_to_be32(cmd->pos - sizeof(*hdr));
        while (cmd->pos % 4) {
                if (cmd->pos >= IO_BUFFER_LENGTH) {
-                       pr_err("Error: Buffer overrun\n");
+                       pr_debug("Error: Buffer overrun\n");
                        return -ERANGE;
                }
                cmd->cmd[cmd->pos++] = 0;
@@ -679,14 +679,14 @@ static const struct opal_resp_tok *response_get_token(
        const struct opal_resp_tok *tok;
 
        if (n >= resp->num) {
-               pr_err("Token number doesn't exist: %d, resp: %d\n",
-                      n, resp->num);
+               pr_debug("Token number doesn't exist: %d, resp: %d\n",
+                        n, resp->num);
                return ERR_PTR(-EINVAL);
        }
 
        tok = &resp->toks[n];
        if (tok->len == 0) {
-               pr_err("Token length must be non-zero\n");
+               pr_debug("Token length must be non-zero\n");
                return ERR_PTR(-EINVAL);
        }
 
@@ -727,7 +727,7 @@ static ssize_t response_parse_short(struct opal_resp_tok *tok,
 
                tok->type = OPAL_DTA_TOKENID_UINT;
                if (tok->len > 9) {
-                       pr_warn("uint64 with more than 8 bytes\n");
+                       pr_debug("uint64 with more than 8 bytes\n");
                        return -EINVAL;
                }
                for (i = tok->len - 1; i > 0; i--) {
@@ -814,8 +814,8 @@ static int response_parse(const u8 *buf, size_t length,
 
        if (clen == 0 || plen == 0 || slen == 0 ||
            slen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
-               pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n",
-                      clen, plen, slen);
+               pr_debug("Bad header length. cp: %u, pkt: %u, subpkt: %u\n",
+                        clen, plen, slen);
                print_buffer(pos, sizeof(*hdr));
                return -EINVAL;
        }
@@ -848,7 +848,7 @@ static int response_parse(const u8 *buf, size_t length,
        }
 
        if (num_entries == 0) {
-               pr_err("Couldn't parse response.\n");
+               pr_debug("Couldn't parse response.\n");
                return -EINVAL;
        }
        resp->num = num_entries;
@@ -861,18 +861,18 @@ static size_t response_get_string(const struct parsed_resp *resp, int n,
 {
        *store = NULL;
        if (!resp) {
-               pr_err("Response is NULL\n");
+               pr_debug("Response is NULL\n");
                return 0;
        }
 
        if (n > resp->num) {
-               pr_err("Response has %d tokens. Can't access %d\n",
-                      resp->num, n);
+               pr_debug("Response has %d tokens. Can't access %d\n",
+                        resp->num, n);
                return 0;
        }
 
        if (resp->toks[n].type != OPAL_DTA_TOKENID_BYTESTRING) {
-               pr_err("Token is not a byte string!\n");
+               pr_debug("Token is not a byte string!\n");
                return 0;
        }
 
@@ -883,26 +883,26 @@ static size_t response_get_string(const struct parsed_resp *resp, int n,
 static u64 response_get_u64(const struct parsed_resp *resp, int n)
 {
        if (!resp) {
-               pr_err("Response is NULL\n");
+               pr_debug("Response is NULL\n");
                return 0;
        }
 
        if (n > resp->num) {
-               pr_err("Response has %d tokens. Can't access %d\n",
-                      resp->num, n);
+               pr_debug("Response has %d tokens. Can't access %d\n",
+                        resp->num, n);
                return 0;
        }
 
        if (resp->toks[n].type != OPAL_DTA_TOKENID_UINT) {
-               pr_err("Token is not unsigned it: %d\n",
-                      resp->toks[n].type);
+               pr_debug("Token is not unsigned it: %d\n",
+                        resp->toks[n].type);
                return 0;
        }
 
        if (!(resp->toks[n].width == OPAL_WIDTH_TINY ||
              resp->toks[n].width == OPAL_WIDTH_SHORT)) {
-               pr_err("Atom is not short or tiny: %d\n",
-                      resp->toks[n].width);
+               pr_debug("Atom is not short or tiny: %d\n",
+                        resp->toks[n].width);
                return 0;
        }
 
@@ -949,7 +949,7 @@ static int parse_and_check_status(struct opal_dev *dev)
 
        error = response_parse(dev->resp, IO_BUFFER_LENGTH, &dev->parsed);
        if (error) {
-               pr_err("Couldn't parse response.\n");
+               pr_debug("Couldn't parse response.\n");
                return error;
        }
 
@@ -975,7 +975,7 @@ static int start_opal_session_cont(struct opal_dev *dev)
        tsn = response_get_u64(&dev->parsed, 5);
 
        if (hsn == 0 && tsn == 0) {
-               pr_err("Couldn't authenticate session\n");
+               pr_debug("Couldn't authenticate session\n");
                return -EPERM;
        }
 
@@ -1012,7 +1012,7 @@ static int finalize_and_send(struct opal_dev *dev, cont_fn cont)
 
        ret = cmd_finalize(dev, dev->hsn, dev->tsn);
        if (ret) {
-               pr_err("Error finalizing command buffer: %d\n", ret);
+               pr_debug("Error finalizing command buffer: %d\n", ret);
                return ret;
        }
 
@@ -1041,7 +1041,7 @@ static int gen_key(struct opal_dev *dev, void *data)
        add_token_u8(&err, dev, OPAL_ENDLIST);
 
        if (err) {
-               pr_err("Error building gen key command\n");
+               pr_debug("Error building gen key command\n");
                return err;
 
        }
@@ -1059,8 +1059,8 @@ static int get_active_key_cont(struct opal_dev *dev)
                return error;
        keylen = response_get_string(&dev->parsed, 4, &activekey);
        if (!activekey) {
-               pr_err("%s: Couldn't extract the Activekey from the response\n",
-                      __func__);
+               pr_debug("%s: Couldn't extract the Activekey from the response\n",
+                        __func__);
                return OPAL_INVAL_PARAM;
        }
        dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL);
@@ -1103,7 +1103,7 @@ static int get_active_key(struct opal_dev *dev, void *data)
        add_token_u8(&err, dev, OPAL_ENDLIST);
        add_token_u8(&err, dev, OPAL_ENDLIST);
        if (err) {
-               pr_err("Error building get active key command\n");
+               pr_debug("Error building get active key command\n");
                return err;
        }
 
@@ -1159,7 +1159,7 @@ static inline int enable_global_lr(struct opal_dev *dev, u8 *uid,
        err = generic_lr_enable_disable(dev, uid, !!setup->RLE, !!setup->WLE,
                                        0, 0);
        if (err)
-               pr_err("Failed to create enable global lr command\n");
+               pr_debug("Failed to create enable global lr command\n");
        return err;
 }
 
@@ -1217,7 +1217,7 @@ static int setup_locking_range(struct opal_dev *dev, void *data)
 
        }
        if (err) {
-               pr_err("Error building Setup Locking range command.\n");
+               pr_debug("Error building Setup Locking range command.\n");
                return err;
 
        }
@@ -1234,11 +1234,8 @@ static int start_generic_opal_session(struct opal_dev *dev,
        u32 hsn;
        int err = 0;
 
-       if (key == NULL && auth != OPAL_ANYBODY_UID) {
-               pr_err("%s: Attempted to open ADMIN_SP Session without a Host" \
-                      "Challenge, and not as the Anybody UID\n", __func__);
+       if (key == NULL && auth != OPAL_ANYBODY_UID)
                return OPAL_INVAL_PARAM;
-       }
 
        clear_opal_cmd(dev);
 
@@ -1273,12 +1270,12 @@ static int start_generic_opal_session(struct opal_dev *dev,
                add_token_u8(&err, dev, OPAL_ENDLIST);
                break;
        default:
-               pr_err("Cannot start Admin SP session with auth %d\n", auth);
+               pr_debug("Cannot start Admin SP session with auth %d\n", auth);
                return OPAL_INVAL_PARAM;
        }
 
        if (err) {
-               pr_err("Error building start adminsp session command.\n");
+               pr_debug("Error building start adminsp session command.\n");
                return err;
        }
 
@@ -1369,7 +1366,7 @@ static int start_auth_opal_session(struct opal_dev *dev, void *data)
        add_token_u8(&err, dev, OPAL_ENDLIST);
 
        if (err) {
-               pr_err("Error building STARTSESSION command.\n");
+               pr_debug("Error building STARTSESSION command.\n");
                return err;
        }
 
@@ -1391,7 +1388,7 @@ static int revert_tper(struct opal_dev *dev, void *data)
        add_token_u8(&err, dev, OPAL_STARTLIST);
        add_token_u8(&err, dev, OPAL_ENDLIST);
        if (err) {
-               pr_err("Error building REVERT TPER command.\n");
+               pr_debug("Error building REVERT TPER command.\n");
                return err;
        }
 
@@ -1426,7 +1423,7 @@ static int internal_activate_user(struct opal_dev *dev, void *data)
        add_token_u8(&err, dev, OPAL_ENDLIST);
 
        if (err) {
-               pr_err("Error building Activate UserN command.\n");
+               pr_debug("Error building Activate UserN command.\n");
                return err;
        }
 
@@ -1453,7 +1450,7 @@ static int erase_locking_range(struct opal_dev *dev, void *data)
        add_token_u8(&err, dev, OPAL_ENDLIST);
 
        if (err) {
-               pr_err("Error building Erase Locking Range Command.\n");
+               pr_debug("Error building Erase Locking Range Command.\n");
                return err;
        }
        return finalize_and_send(dev, parse_and_check_status);
@@ -1484,7 +1481,7 @@ static int set_mbr_done(struct opal_dev *dev, void *data)
        add_token_u8(&err, dev, OPAL_ENDLIST);
 
        if (err) {
-               pr_err("Error Building set MBR Done command\n");
+               pr_debug("Error Building set MBR Done command\n");
                return err;
        }
 
@@ -1516,7 +1513,7 @@ static int set_mbr_enable_disable(struct opal_dev *dev, void *data)
        add_token_u8(&err, dev, OPAL_ENDLIST);
 
        if (err) {
-               pr_err("Error Building set MBR done command\n");
+               pr_debug("Error Building set MBR done command\n");
                return err;
        }
 
@@ -1567,7 +1564,7 @@ static int set_new_pw(struct opal_dev *dev, void *data)
 
        if (generic_pw_cmd(usr->opal_key.key, usr->opal_key.key_len,
                           cpin_uid, dev)) {
-               pr_err("Error building set password command.\n");
+               pr_debug("Error building set password command.\n");
                return -ERANGE;
        }
 
@@ -1582,7 +1579,7 @@ static int set_sid_cpin_pin(struct opal_dev *dev, void *data)
        memcpy(cpin_uid, opaluid[OPAL_C_PIN_SID], OPAL_UID_LENGTH);
 
        if (generic_pw_cmd(key->key, key->key_len, cpin_uid, dev)) {
-               pr_err("Error building Set SID cpin\n");
+               pr_debug("Error building Set SID cpin\n");
                return -ERANGE;
        }
        return finalize_and_send(dev, parse_and_check_status);
@@ -1657,7 +1654,7 @@ static int add_user_to_lr(struct opal_dev *dev, void *data)
        add_token_u8(&err, dev, OPAL_ENDLIST);
 
        if (err) {
-               pr_err("Error building add user to locking range command.\n");
+               pr_debug("Error building add user to locking range command.\n");
                return err;
        }
 
@@ -1691,7 +1688,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
                /* vars are initalized to locked */
                break;
        default:
-               pr_err("Tried to set an invalid locking state... returning to uland\n");
+               pr_debug("Tried to set an invalid locking state... returning to uland\n");
                return OPAL_INVAL_PARAM;
        }
 
@@ -1718,7 +1715,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
        add_token_u8(&err, dev, OPAL_ENDLIST);
 
        if (err) {
-               pr_err("Error building SET command.\n");
+               pr_debug("Error building SET command.\n");
                return err;
        }
        return finalize_and_send(dev, parse_and_check_status);
@@ -1752,14 +1749,14 @@ static int lock_unlock_locking_range_sum(struct opal_dev *dev, void *data)
                /* vars are initalized to locked */
                break;
        default:
-               pr_err("Tried to set an invalid locking state.\n");
+               pr_debug("Tried to set an invalid locking state.\n");
                return OPAL_INVAL_PARAM;
        }
        ret = generic_lr_enable_disable(dev, lr_buffer, 1, 1,
                                        read_locked, write_locked);
 
        if (ret < 0) {
-               pr_err("Error building SET command.\n");
+               pr_debug("Error building SET command.\n");
                return ret;
        }
        return finalize_and_send(dev, parse_and_check_status);
@@ -1811,7 +1808,7 @@ static int activate_lsp(struct opal_dev *dev, void *data)
        }
 
        if (err) {
-               pr_err("Error building Activate LockingSP command.\n");
+               pr_debug("Error building Activate LockingSP command.\n");
                return err;
        }
 
@@ -1831,7 +1828,7 @@ static int get_lsp_lifecycle_cont(struct opal_dev *dev)
        /* 0x08 is Manufacured Inactive */
        /* 0x09 is Manufactured */
        if (lc_status != OPAL_MANUFACTURED_INACTIVE) {
-               pr_err("Couldn't determine the status of the Lifcycle state\n");
+               pr_debug("Couldn't determine the status of the Lifecycle state\n");
                return -ENODEV;
        }
 
@@ -1868,7 +1865,7 @@ static int get_lsp_lifecycle(struct opal_dev *dev, void *data)
        add_token_u8(&err, dev, OPAL_ENDLIST);
 
        if (err) {
-               pr_err("Error Building GET Lifecycle Status command\n");
+               pr_debug("Error Building GET Lifecycle Status command\n");
                return err;
        }
 
@@ -1887,7 +1884,7 @@ static int get_msid_cpin_pin_cont(struct opal_dev *dev)
 
        strlen = response_get_string(&dev->parsed, 4, &msid_pin);
        if (!msid_pin) {
-               pr_err("%s: Couldn't extract PIN from response\n", __func__);
+               pr_debug("%s: Couldn't extract PIN from response\n", __func__);
                return OPAL_INVAL_PARAM;
        }
 
@@ -1929,7 +1926,7 @@ static int get_msid_cpin_pin(struct opal_dev *dev, void *data)
        add_token_u8(&err, dev, OPAL_ENDLIST);
 
        if (err) {
-               pr_err("Error building Get MSID CPIN PIN command.\n");
+               pr_debug("Error building Get MSID CPIN PIN command.\n");
                return err;
        }
 
@@ -2124,18 +2121,18 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
 
        if (lk_unlk->l_state != OPAL_RO &&
            lk_unlk->l_state != OPAL_RW) {
-               pr_err("Locking state was not RO or RW\n");
+               pr_debug("Locking state was not RO or RW\n");
                return -EINVAL;
        }
        if (lk_unlk->session.who < OPAL_USER1 ||
            lk_unlk->session.who > OPAL_USER9) {
-               pr_err("Authority was not within the range of users: %d\n",
-                      lk_unlk->session.who);
+               pr_debug("Authority was not within the range of users: %d\n",
+                        lk_unlk->session.who);
                return -EINVAL;
        }
        if (lk_unlk->session.sum) {
-               pr_err("%s not supported in sum. Use setup locking range\n",
-                      __func__);
+               pr_debug("%s not supported in sum. Use setup locking range\n",
+                        __func__);
                return -EINVAL;
        }
 
@@ -2312,7 +2309,7 @@ static int opal_activate_user(struct opal_dev *dev,
        /* We can't activate Admin1 it's active as manufactured */
        if (opal_session->who < OPAL_USER1 ||
            opal_session->who > OPAL_USER9) {
-               pr_err("Who was not a valid user: %d\n", opal_session->who);
+               pr_debug("Who was not a valid user: %d\n", opal_session->who);
                return -EINVAL;
        }
 
@@ -2343,9 +2340,9 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
 
                ret = __opal_lock_unlock(dev, &suspend->unlk);
                if (ret) {
-                       pr_warn("Failed to unlock LR %hhu with sum %d\n",
-                               suspend->unlk.session.opal_key.lr,
-                               suspend->unlk.session.sum);
+                       pr_debug("Failed to unlock LR %hhu with sum %d\n",
+                                suspend->unlk.session.opal_key.lr,
+                                suspend->unlk.session.sum);
                        was_failure = true;
                }
        }
@@ -2363,10 +2360,8 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
                return -EACCES;
        if (!dev)
                return -ENOTSUPP;
-       if (!dev->supported) {
-               pr_err("Not supported\n");
+       if (!dev->supported)
                return -ENOTSUPP;
-       }
 
        p = memdup_user(arg, _IOC_SIZE(cmd));
        if (IS_ERR(p))
@@ -2410,7 +2405,7 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
                ret = opal_secure_erase_locking_range(dev, p);
                break;
        default:
-               pr_warn("No such Opal Ioctl %u\n", cmd);
+               break;
        }
 
        kfree(p);
index 2c97912..680c6d6 100644 (file)
@@ -160,28 +160,28 @@ static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
        return t10_pi_verify(iter, t10_pi_ip_fn, 3);
 }
 
-struct blk_integrity_profile t10_pi_type1_crc = {
+const struct blk_integrity_profile t10_pi_type1_crc = {
        .name                   = "T10-DIF-TYPE1-CRC",
        .generate_fn            = t10_pi_type1_generate_crc,
        .verify_fn              = t10_pi_type1_verify_crc,
 };
 EXPORT_SYMBOL(t10_pi_type1_crc);
 
-struct blk_integrity_profile t10_pi_type1_ip = {
+const struct blk_integrity_profile t10_pi_type1_ip = {
        .name                   = "T10-DIF-TYPE1-IP",
        .generate_fn            = t10_pi_type1_generate_ip,
        .verify_fn              = t10_pi_type1_verify_ip,
 };
 EXPORT_SYMBOL(t10_pi_type1_ip);
 
-struct blk_integrity_profile t10_pi_type3_crc = {
+const struct blk_integrity_profile t10_pi_type3_crc = {
        .name                   = "T10-DIF-TYPE3-CRC",
        .generate_fn            = t10_pi_type3_generate_crc,
        .verify_fn              = t10_pi_type3_verify_crc,
 };
 EXPORT_SYMBOL(t10_pi_type3_crc);
 
-struct blk_integrity_profile t10_pi_type3_ip = {
+const struct blk_integrity_profile t10_pi_type3_ip = {
        .name                   = "T10-DIF-TYPE3-IP",
        .generate_fn            = t10_pi_type3_generate_ip,
        .verify_fn              = t10_pi_type3_verify_ip,
index e58c497..826cd7a 100644 (file)
@@ -32,6 +32,7 @@ struct ahash_request_priv {
        crypto_completion_t complete;
        void *data;
        u8 *result;
+       u32 flags;
        void *ubuf[] CRYPTO_MINALIGN_ATTR;
 };
 
@@ -253,6 +254,8 @@ static int ahash_save_req(struct ahash_request *req, crypto_completion_t cplt)
        priv->result = req->result;
        priv->complete = req->base.complete;
        priv->data = req->base.data;
+       priv->flags = req->base.flags;
+
        /*
         * WARNING: We do not backup req->priv here! The req->priv
         *          is for internal use of the Crypto API and the
@@ -267,38 +270,44 @@ static int ahash_save_req(struct ahash_request *req, crypto_completion_t cplt)
        return 0;
 }
 
-static void ahash_restore_req(struct ahash_request *req)
+static void ahash_restore_req(struct ahash_request *req, int err)
 {
        struct ahash_request_priv *priv = req->priv;
 
+       if (!err)
+               memcpy(priv->result, req->result,
+                      crypto_ahash_digestsize(crypto_ahash_reqtfm(req)));
+
        /* Restore the original crypto request. */
        req->result = priv->result;
-       req->base.complete = priv->complete;
-       req->base.data = priv->data;
+
+       ahash_request_set_callback(req, priv->flags,
+                                  priv->complete, priv->data);
        req->priv = NULL;
 
        /* Free the req->priv.priv from the ADJUSTED request. */
        kzfree(priv);
 }
 
-static void ahash_op_unaligned_finish(struct ahash_request *req, int err)
+static void ahash_notify_einprogress(struct ahash_request *req)
 {
        struct ahash_request_priv *priv = req->priv;
+       struct crypto_async_request oreq;
 
-       if (err == -EINPROGRESS)
-               return;
-
-       if (!err)
-               memcpy(priv->result, req->result,
-                      crypto_ahash_digestsize(crypto_ahash_reqtfm(req)));
+       oreq.data = priv->data;
 
-       ahash_restore_req(req);
+       priv->complete(&oreq, -EINPROGRESS);
 }
 
 static void ahash_op_unaligned_done(struct crypto_async_request *req, int err)
 {
        struct ahash_request *areq = req->data;
 
+       if (err == -EINPROGRESS) {
+               ahash_notify_einprogress(areq);
+               return;
+       }
+
        /*
         * Restore the original request, see ahash_op_unaligned() for what
         * goes where.
@@ -309,7 +318,7 @@ static void ahash_op_unaligned_done(struct crypto_async_request *req, int err)
         */
 
        /* First copy req->result into req->priv.result */
-       ahash_op_unaligned_finish(areq, err);
+       ahash_restore_req(areq, err);
 
        /* Complete the ORIGINAL request. */
        areq->base.complete(&areq->base, err);
@@ -325,7 +334,12 @@ static int ahash_op_unaligned(struct ahash_request *req,
                return err;
 
        err = op(req);
-       ahash_op_unaligned_finish(req, err);
+       if (err == -EINPROGRESS ||
+           (err == -EBUSY && (ahash_request_flags(req) &
+                              CRYPTO_TFM_REQ_MAY_BACKLOG)))
+               return err;
+
+       ahash_restore_req(req, err);
 
        return err;
 }
@@ -360,25 +374,14 @@ int crypto_ahash_digest(struct ahash_request *req)
 }
 EXPORT_SYMBOL_GPL(crypto_ahash_digest);
 
-static void ahash_def_finup_finish2(struct ahash_request *req, int err)
+static void ahash_def_finup_done2(struct crypto_async_request *req, int err)
 {
-       struct ahash_request_priv *priv = req->priv;
+       struct ahash_request *areq = req->data;
 
        if (err == -EINPROGRESS)
                return;
 
-       if (!err)
-               memcpy(priv->result, req->result,
-                      crypto_ahash_digestsize(crypto_ahash_reqtfm(req)));
-
-       ahash_restore_req(req);
-}
-
-static void ahash_def_finup_done2(struct crypto_async_request *req, int err)
-{
-       struct ahash_request *areq = req->data;
-
-       ahash_def_finup_finish2(areq, err);
+       ahash_restore_req(areq, err);
 
        areq->base.complete(&areq->base, err);
 }
@@ -389,11 +392,15 @@ static int ahash_def_finup_finish1(struct ahash_request *req, int err)
                goto out;
 
        req->base.complete = ahash_def_finup_done2;
-       req->base.flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
        err = crypto_ahash_reqtfm(req)->final(req);
+       if (err == -EINPROGRESS ||
+           (err == -EBUSY && (ahash_request_flags(req) &
+                              CRYPTO_TFM_REQ_MAY_BACKLOG)))
+               return err;
 
 out:
-       ahash_def_finup_finish2(req, err);
+       ahash_restore_req(req, err);
        return err;
 }
 
@@ -401,7 +408,16 @@ static void ahash_def_finup_done1(struct crypto_async_request *req, int err)
 {
        struct ahash_request *areq = req->data;
 
+       if (err == -EINPROGRESS) {
+               ahash_notify_einprogress(areq);
+               return;
+       }
+
+       areq->base.flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
        err = ahash_def_finup_finish1(areq, err);
+       if (areq->priv)
+               return;
 
        areq->base.complete(&areq->base, err);
 }
@@ -416,6 +432,11 @@ static int ahash_def_finup(struct ahash_request *req)
                return err;
 
        err = tfm->update(req);
+       if (err == -EINPROGRESS ||
+           (err == -EBUSY && (ahash_request_flags(req) &
+                              CRYPTO_TFM_REQ_MAY_BACKLOG)))
+               return err;
+
        return ahash_def_finup_finish1(req, err);
 }
 
index 5a80537..ef59d99 100644 (file)
@@ -40,6 +40,7 @@ struct aead_async_req {
        struct aead_async_rsgl first_rsgl;
        struct list_head list;
        struct kiocb *iocb;
+       struct sock *sk;
        unsigned int tsgls;
        char iv[];
 };
@@ -379,12 +380,10 @@ unlock:
 
 static void aead_async_cb(struct crypto_async_request *_req, int err)
 {
-       struct sock *sk = _req->data;
-       struct alg_sock *ask = alg_sk(sk);
-       struct aead_ctx *ctx = ask->private;
-       struct crypto_aead *tfm = crypto_aead_reqtfm(&ctx->aead_req);
-       struct aead_request *req = aead_request_cast(_req);
+       struct aead_request *req = _req->data;
+       struct crypto_aead *tfm = crypto_aead_reqtfm(req);
        struct aead_async_req *areq = GET_ASYM_REQ(req, tfm);
+       struct sock *sk = areq->sk;
        struct scatterlist *sg = areq->tsgl;
        struct aead_async_rsgl *rsgl;
        struct kiocb *iocb = areq->iocb;
@@ -447,11 +446,12 @@ static int aead_recvmsg_async(struct socket *sock, struct msghdr *msg,
        memset(&areq->first_rsgl, '\0', sizeof(areq->first_rsgl));
        INIT_LIST_HEAD(&areq->list);
        areq->iocb = msg->msg_iocb;
+       areq->sk = sk;
        memcpy(areq->iv, ctx->iv, crypto_aead_ivsize(tfm));
        aead_request_set_tfm(req, tfm);
        aead_request_set_ad(req, ctx->aead_assoclen);
        aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
-                                 aead_async_cb, sk);
+                                 aead_async_cb, req);
        used -= ctx->aead_assoclen;
 
        /* take over all tx sgls from ctx */
index 3ea095a..a8bfae4 100644 (file)
@@ -345,6 +345,13 @@ static void encrypt_done(struct crypto_async_request *areq, int err)
        struct rctx *rctx;
 
        rctx = skcipher_request_ctx(req);
+
+       if (err == -EINPROGRESS) {
+               if (rctx->left != req->cryptlen)
+                       return;
+               goto out;
+       }
+
        subreq = &rctx->subreq;
        subreq->base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG;
 
@@ -352,6 +359,7 @@ static void encrypt_done(struct crypto_async_request *areq, int err)
        if (rctx->left)
                return;
 
+out:
        skcipher_request_complete(req, err);
 }
 
@@ -389,6 +397,13 @@ static void decrypt_done(struct crypto_async_request *areq, int err)
        struct rctx *rctx;
 
        rctx = skcipher_request_ctx(req);
+
+       if (err == -EINPROGRESS) {
+               if (rctx->left != req->cryptlen)
+                       return;
+               goto out;
+       }
+
        subreq = &rctx->subreq;
        subreq->base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG;
 
@@ -396,6 +411,7 @@ static void decrypt_done(struct crypto_async_request *areq, int err)
        if (rctx->left)
                return;
 
+out:
        skcipher_request_complete(req, err);
 }
 
index c976bfa..89ace5e 100644 (file)
@@ -286,6 +286,13 @@ static void encrypt_done(struct crypto_async_request *areq, int err)
        struct rctx *rctx;
 
        rctx = skcipher_request_ctx(req);
+
+       if (err == -EINPROGRESS) {
+               if (rctx->left != req->cryptlen)
+                       return;
+               goto out;
+       }
+
        subreq = &rctx->subreq;
        subreq->base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG;
 
@@ -293,6 +300,7 @@ static void encrypt_done(struct crypto_async_request *areq, int err)
        if (rctx->left)
                return;
 
+out:
        skcipher_request_complete(req, err);
 }
 
@@ -330,6 +338,13 @@ static void decrypt_done(struct crypto_async_request *areq, int err)
        struct rctx *rctx;
 
        rctx = skcipher_request_ctx(req);
+
+       if (err == -EINPROGRESS) {
+               if (rctx->left != req->cryptlen)
+                       return;
+               goto out;
+       }
+
        subreq = &rctx->subreq;
        subreq->base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG;
 
@@ -337,6 +352,7 @@ static void decrypt_done(struct crypto_async_request *areq, int err)
        if (rctx->left)
                return;
 
+out:
        skcipher_request_complete(req, err);
 }
 
index 83e5f7e..18f3036 100644 (file)
@@ -256,7 +256,7 @@ config ACPI_PROCESSOR
 
 config ACPI_IPMI
        tristate "IPMI"
-       depends on IPMI_SI
+       depends on IPMI_HANDLER
        default n
        help
          This driver enables the ACPI to access the BMC controller. And it
@@ -440,7 +440,7 @@ config ACPI_CUSTOM_METHOD
 
 config ACPI_BGRT
        bool "Boottime Graphics Resource Table support"
-       depends on EFI && X86
+       depends on EFI && (X86 || ARM64)
         help
          This driver adds support for exposing the ACPI Boottime Graphics
          Resource Table, which allows the operating system to obtain
@@ -469,9 +469,8 @@ config ACPI_WATCHDOG
 
 config ACPI_EXTLOG
        tristate "Extended Error Log support"
-       depends on X86_MCE && X86_LOCAL_APIC
+       depends on X86_MCE && X86_LOCAL_APIC && EDAC
        select UEFI_CPER
-       select RAS
        default n
        help
          Certain usages such as Predictive Failure Analysis (PFA) require
@@ -506,7 +505,7 @@ config CRC_PMIC_OPREGION
 
 config XPOWER_PMIC_OPREGION
        bool "ACPI operation region support for XPower AXP288 PMIC"
-       depends on AXP288_ADC = y
+       depends on MFD_AXP20X_I2C
        help
          This config adds ACPI operation region support for XPower AXP288 PMIC.
 
@@ -516,6 +515,12 @@ config BXT_WC_PMIC_OPREGION
        help
          This config adds ACPI operation region support for BXT WhiskeyCove PMIC.
 
+config CHT_WC_PMIC_OPREGION
+       bool "ACPI operation region support for CHT Whiskey Cove PMIC"
+       depends on INTEL_SOC_PMIC_CHTWC
+       help
+         This config adds ACPI operation region support for CHT Whiskey Cove PMIC.
+
 endif
 
 config ACPI_CONFIGFS
index d94f92f..d78065c 100644 (file)
@@ -101,6 +101,7 @@ obj-$(CONFIG_PMIC_OPREGION) += pmic/intel_pmic.o
 obj-$(CONFIG_CRC_PMIC_OPREGION) += pmic/intel_pmic_crc.o
 obj-$(CONFIG_XPOWER_PMIC_OPREGION) += pmic/intel_pmic_xpower.o
 obj-$(CONFIG_BXT_WC_PMIC_OPREGION) += pmic/intel_pmic_bxtwc.o
+obj-$(CONFIG_CHT_WC_PMIC_OPREGION) += pmic/intel_pmic_chtwc.o
 
 obj-$(CONFIG_ACPI_CONFIGFS)    += acpi_configfs.o
 
index f71b756..8f52483 100644 (file)
@@ -57,12 +57,23 @@ static int acpi_ac_add(struct acpi_device *device);
 static int acpi_ac_remove(struct acpi_device *device);
 static void acpi_ac_notify(struct acpi_device *device, u32 event);
 
+struct acpi_ac_bl {
+       const char *hid;
+       int hrv;
+};
+
 static const struct acpi_device_id ac_device_ids[] = {
        {"ACPI0003", 0},
        {"", 0},
 };
 MODULE_DEVICE_TABLE(acpi, ac_device_ids);
 
+/* Lists of PMIC ACPI HIDs with an (often better) native charger driver */
+static const struct acpi_ac_bl acpi_ac_blacklist[] = {
+       { "INT33F4", -1 }, /* X-Powers AXP288 PMIC */
+       { "INT34D3",  3 }, /* Intel Cherrytrail Whiskey Cove PMIC */
+};
+
 #ifdef CONFIG_PM_SLEEP
 static int acpi_ac_resume(struct device *dev);
 #endif
@@ -424,11 +435,20 @@ static int acpi_ac_remove(struct acpi_device *device)
 
 static int __init acpi_ac_init(void)
 {
+       unsigned int i;
        int result;
 
        if (acpi_disabled)
                return -ENODEV;
 
+       for (i = 0; i < ARRAY_SIZE(acpi_ac_blacklist); i++)
+               if (acpi_dev_present(acpi_ac_blacklist[i].hid, "1",
+                                    acpi_ac_blacklist[i].hrv)) {
+                       pr_info(PREFIX "AC: found native %s PMIC, not loading\n",
+                               acpi_ac_blacklist[i].hid);
+                       return -ENODEV;
+               }
+
 #ifdef CONFIG_ACPI_PROCFS_POWER
        acpi_ac_dir = acpi_lock_ac_dir();
        if (!acpi_ac_dir)
index a15270a..502ea4d 100644 (file)
@@ -229,7 +229,7 @@ static int __init extlog_init(void)
        if (!(cap & MCG_ELOG_P) || !extlog_get_l1addr())
                return -ENODEV;
 
-       if (get_edac_report_status() == EDAC_REPORTING_FORCE) {
+       if (edac_get_report_status() == EDAC_REPORTING_FORCE) {
                pr_warn("Not loading eMCA, error reporting force-enabled through EDAC.\n");
                return -EPERM;
        }
@@ -285,8 +285,8 @@ static int __init extlog_init(void)
         * eMCA event report method has higher priority than EDAC method,
         * unless EDAC event report method is mandatory.
         */
-       old_edac_report_status = get_edac_report_status();
-       set_edac_report_status(EDAC_REPORTING_DISABLED);
+       old_edac_report_status = edac_get_report_status();
+       edac_set_report_status(EDAC_REPORTING_DISABLED);
        mce_register_decode_chain(&extlog_mce_dec);
        /* enable OS to be involved to take over management from BIOS */
        ((struct extlog_l1_head *)extlog_l1_addr)->flags |= FLAG_OS_OPTIN;
@@ -308,7 +308,7 @@ err:
 
 static void __exit extlog_exit(void)
 {
-       set_edac_report_status(old_edac_report_status);
+       edac_set_report_status(old_edac_report_status);
        mce_unregister_decode_chain(&extlog_mce_dec);
        ((struct extlog_l1_head *)extlog_l1_addr)->flags &= ~FLAG_OS_OPTIN;
        if (extlog_l1_addr)
index 747c2ba..1b64419 100644 (file)
@@ -429,8 +429,7 @@ static void ipmi_msg_handler(struct ipmi_recv_msg *msg, void *user_msg_data)
        if (msg->recv_type == IPMI_RESPONSE_RECV_TYPE &&
            msg->msg.data_len == 1) {
                if (msg->msg.data[0] == IPMI_TIMEOUT_COMPLETION_CODE) {
-                       dev_WARN_ONCE(dev, true,
-                                     "Unexpected response (timeout).\n");
+                       dev_dbg_once(dev, "Unexpected response (timeout).\n");
                        tx_msg->msg_done = ACPI_IPMI_TIMEOUT;
                }
                goto out_comp;
index 03250e1..88cd949 100644 (file)
@@ -121,11 +121,14 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev,
        if (IS_ERR(pdev))
                dev_err(&adev->dev, "platform device creation failed: %ld\n",
                        PTR_ERR(pdev));
-       else
+       else {
+               set_dev_node(&pdev->dev, acpi_get_node(adev->handle));
                dev_dbg(&adev->dev, "created platform device %s\n",
                        dev_name(&pdev->dev));
+       }
 
        kfree(resources);
+
        return pdev;
 }
 EXPORT_SYMBOL_GPL(acpi_create_platform_device);
index 0143135..f098e25 100644 (file)
@@ -388,11 +388,6 @@ static int acpi_processor_add(struct acpi_device *device,
        if (result) /* Processor is not physically present or unavailable */
                return 0;
 
-#ifdef CONFIG_SMP
-       if (pr->id >= setup_max_cpus && pr->id != 0)
-               return 0;
-#endif
-
        BUG_ON(pr->id >= nr_cpu_ids);
 
        /*
index d00bc0e..e88fe36 100644 (file)
@@ -73,6 +73,10 @@ module_param(report_key_events, int, 0644);
 MODULE_PARM_DESC(report_key_events,
        "0: none, 1: output changes, 2: brightness changes, 3: all");
 
+/*
+ * Whether the struct acpi_video_device_attrib::device_id_scheme bit should be
+ * assumed even if not actually set.
+ */
 static bool device_id_scheme = false;
 module_param(device_id_scheme, bool, 0444);
 
@@ -88,6 +92,18 @@ static int acpi_video_bus_remove(struct acpi_device *device);
 static void acpi_video_bus_notify(struct acpi_device *device, u32 event);
 void acpi_video_detect_exit(void);
 
+/*
+ * Indices in the _BCL method response: the first two items are special,
+ * the rest are all supported levels.
+ *
+ * See page 575 of the ACPI spec 3.0
+ */
+enum acpi_video_level_idx {
+       ACPI_VIDEO_AC_LEVEL,            /* level when machine has full power */
+       ACPI_VIDEO_BATTERY_LEVEL,       /* level when machine is on batteries */
+       ACPI_VIDEO_FIRST_LEVEL,         /* actual supported levels begin here */
+};
+
 static const struct acpi_device_id video_device_ids[] = {
        {ACPI_VIDEO_HID, 0},
        {"", 0},
@@ -132,7 +148,15 @@ struct acpi_video_device_attrib {
                                   the VGA device. */
        u32 pipe_id:3;          /* For VGA multiple-head devices. */
        u32 reserved:10;        /* Must be 0 */
-       u32 device_id_scheme:1; /* Device ID Scheme */
+
+       /*
+        * The device ID might not actually follow the scheme described by this
+        * struct acpi_video_device_attrib. If it does, then this bit
+        * device_id_scheme is set; otherwise, other fields should be ignored.
+        *
+        * (but also see the global flag device_id_scheme)
+        */
+       u32 device_id_scheme:1;
 };
 
 struct acpi_video_enumerated_device {
@@ -217,20 +241,16 @@ static int acpi_video_get_brightness(struct backlight_device *bd)
 
        if (acpi_video_device_lcd_get_level_current(vd, &cur_level, false))
                return -EINVAL;
-       for (i = 2; i < vd->brightness->count; i++) {
+       for (i = ACPI_VIDEO_FIRST_LEVEL; i < vd->brightness->count; i++) {
                if (vd->brightness->levels[i] == cur_level)
-                       /*
-                        * The first two entries are special - see page 575
-                        * of the ACPI spec 3.0
-                        */
-                       return i - 2;
+                       return i - ACPI_VIDEO_FIRST_LEVEL;
        }
        return 0;
 }
 
 static int acpi_video_set_brightness(struct backlight_device *bd)
 {
-       int request_level = bd->props.brightness + 2;
+       int request_level = bd->props.brightness + ACPI_VIDEO_FIRST_LEVEL;
        struct acpi_video_device *vd = bl_get_data(bd);
 
        cancel_delayed_work(&vd->switch_brightness_work);
@@ -244,18 +264,18 @@ static const struct backlight_ops acpi_backlight_ops = {
 };
 
 /* thermal cooling device callbacks */
-static int video_get_max_state(struct thermal_cooling_device *cooling_dev, unsigned
-                              long *state)
+static int video_get_max_state(struct thermal_cooling_device *cooling_dev,
+                              unsigned long *state)
 {
        struct acpi_device *device = cooling_dev->devdata;
        struct acpi_video_device *video = acpi_driver_data(device);
 
-       *state = video->brightness->count - 3;
+       *state = video->brightness->count - ACPI_VIDEO_FIRST_LEVEL - 1;
        return 0;
 }
 
-static int video_get_cur_state(struct thermal_cooling_device *cooling_dev, unsigned
-                              long *state)
+static int video_get_cur_state(struct thermal_cooling_device *cooling_dev,
+                              unsigned long *state)
 {
        struct acpi_device *device = cooling_dev->devdata;
        struct acpi_video_device *video = acpi_driver_data(device);
@@ -264,7 +284,8 @@ static int video_get_cur_state(struct thermal_cooling_device *cooling_dev, unsig
 
        if (acpi_video_device_lcd_get_level_current(video, &level, false))
                return -EINVAL;
-       for (offset = 2; offset < video->brightness->count; offset++)
+       for (offset = ACPI_VIDEO_FIRST_LEVEL; offset < video->brightness->count;
+            offset++)
                if (level == video->brightness->levels[offset]) {
                        *state = video->brightness->count - offset - 1;
                        return 0;
@@ -280,7 +301,7 @@ video_set_cur_state(struct thermal_cooling_device *cooling_dev, unsigned long st
        struct acpi_video_device *video = acpi_driver_data(device);
        int level;
 
-       if (state >= video->brightness->count - 2)
+       if (state >= video->brightness->count - ACPI_VIDEO_FIRST_LEVEL)
                return -EINVAL;
 
        state = video->brightness->count - state;
@@ -345,10 +366,12 @@ acpi_video_device_lcd_set_level(struct acpi_video_device *device, int level)
        }
 
        device->brightness->curr = level;
-       for (state = 2; state < device->brightness->count; state++)
+       for (state = ACPI_VIDEO_FIRST_LEVEL; state < device->brightness->count;
+            state++)
                if (level == device->brightness->levels[state]) {
                        if (device->backlight)
-                               device->backlight->props.brightness = state - 2;
+                               device->backlight->props.brightness =
+                                       state - ACPI_VIDEO_FIRST_LEVEL;
                        return 0;
                }
 
@@ -530,14 +553,16 @@ acpi_video_bqc_value_to_level(struct acpi_video_device *device,
 
        if (device->brightness->flags._BQC_use_index) {
                /*
-                * _BQC returns an index that doesn't account for
-                * the first 2 items with special meaning, so we need
-                * to compensate for that by offsetting ourselves
+                * _BQC returns an index that doesn't account for the first 2
+                * items with special meaning (see enum acpi_video_level_idx),
+                * so we need to compensate for that by offsetting ourselves
                 */
                if (device->brightness->flags._BCL_reversed)
-                       bqc_value = device->brightness->count - 3 - bqc_value;
+                       bqc_value = device->brightness->count -
+                               ACPI_VIDEO_FIRST_LEVEL - 1 - bqc_value;
 
-               level = device->brightness->levels[bqc_value + 2];
+               level = device->brightness->levels[bqc_value +
+                                                  ACPI_VIDEO_FIRST_LEVEL];
        } else {
                level = bqc_value;
        }
@@ -571,7 +596,8 @@ acpi_video_device_lcd_get_level_current(struct acpi_video_device *device,
 
                        *level = acpi_video_bqc_value_to_level(device, *level);
 
-                       for (i = 2; i < device->brightness->count; i++)
+                       for (i = ACPI_VIDEO_FIRST_LEVEL;
+                            i < device->brightness->count; i++)
                                if (device->brightness->levels[i] == *level) {
                                        device->brightness->curr = *level;
                                        return 0;
@@ -714,9 +740,37 @@ static int acpi_video_bqc_quirk(struct acpi_video_device *device,
 
        /*
         * Some systems always report current brightness level as maximum
-        * through _BQC, we need to test another value for them.
+        * through _BQC, we need to test another value for them. However,
+        * there is a subtlety:
+        *
+        * If the _BCL package ordering is descending, the first level
+        * (br->levels[2]) is likely to be 0, and if the number of levels
+        * matches the number of steps, we might confuse a returned level to
+        * mean the index.
+        *
+        * For example:
+        *
+        *     current_level = max_level = 100
+        *     test_level = 0
+        *     returned level = 100
+        *
+        * In this case 100 means the level, not the index, and _BCM failed.
+        * Still, if the _BCL package ordering is descending, the index of
+        * level 0 is also 100, so we assume _BQC is indexed, when it's not.
+        *
+        * This causes all _BQC calls to return bogus values causing weird
+        * behavior from the user's perspective.  For example:
+        *
+        * xbacklight -set 10; xbacklight -set 20;
+        *
+        * would flash to 90% and then slowly down to the desired level (20).
+        *
+        * The solution is simple; test anything other than the first level
+        * (e.g. 1).
         */
-       test_level = current_level == max_level ? br->levels[3] : max_level;
+       test_level = current_level == max_level
+               ? br->levels[ACPI_VIDEO_FIRST_LEVEL + 1]
+               : max_level;
 
        result = acpi_video_device_lcd_set_level(device, test_level);
        if (result)
@@ -730,8 +784,8 @@ static int acpi_video_bqc_quirk(struct acpi_video_device *device,
                /* buggy _BQC found, need to find out if it uses index */
                if (level < br->count) {
                        if (br->flags._BCL_reversed)
-                               level = br->count - 3 - level;
-                       if (br->levels[level + 2] == test_level)
+                               level = br->count - ACPI_VIDEO_FIRST_LEVEL - 1 - level;
+                       if (br->levels[level + ACPI_VIDEO_FIRST_LEVEL] == test_level)
                                br->flags._BQC_use_index = 1;
                }
 
@@ -761,7 +815,7 @@ int acpi_video_get_levels(struct acpi_device *device,
                goto out;
        }
 
-       if (obj->package.count < 2) {
+       if (obj->package.count < ACPI_VIDEO_FIRST_LEVEL) {
                result = -EINVAL;
                goto out;
        }
@@ -773,8 +827,13 @@ int acpi_video_get_levels(struct acpi_device *device,
                goto out;
        }
 
-       br->levels = kmalloc((obj->package.count + 2) * sizeof *(br->levels),
-                               GFP_KERNEL);
+       /*
+        * Note that we have to reserve 2 extra items (ACPI_VIDEO_FIRST_LEVEL),
+        * in order to account for buggy BIOS which don't export the first two
+        * special levels (see below)
+        */
+       br->levels = kmalloc((obj->package.count + ACPI_VIDEO_FIRST_LEVEL) *
+                            sizeof(*br->levels), GFP_KERNEL);
        if (!br->levels) {
                result = -ENOMEM;
                goto out_free;
@@ -788,7 +847,8 @@ int acpi_video_get_levels(struct acpi_device *device,
                }
                value = (u32) o->integer.value;
                /* Skip duplicate entries */
-               if (count > 2 && br->levels[count - 1] == value)
+               if (count > ACPI_VIDEO_FIRST_LEVEL
+                   && br->levels[count - 1] == value)
                        continue;
 
                br->levels[count] = value;
@@ -804,27 +864,30 @@ int acpi_video_get_levels(struct acpi_device *device,
         * In this case, the first two elements in _BCL packages
         * are also supported brightness levels that OS should take care of.
         */
-       for (i = 2; i < count; i++) {
-               if (br->levels[i] == br->levels[0])
+       for (i = ACPI_VIDEO_FIRST_LEVEL; i < count; i++) {
+               if (br->levels[i] == br->levels[ACPI_VIDEO_AC_LEVEL])
                        level_ac_battery++;
-               if (br->levels[i] == br->levels[1])
+               if (br->levels[i] == br->levels[ACPI_VIDEO_BATTERY_LEVEL])
                        level_ac_battery++;
        }
 
-       if (level_ac_battery < 2) {
-               level_ac_battery = 2 - level_ac_battery;
+       if (level_ac_battery < ACPI_VIDEO_FIRST_LEVEL) {
+               level_ac_battery = ACPI_VIDEO_FIRST_LEVEL - level_ac_battery;
                br->flags._BCL_no_ac_battery_levels = 1;
-               for (i = (count - 1 + level_ac_battery); i >= 2; i--)
+               for (i = (count - 1 + level_ac_battery);
+                    i >= ACPI_VIDEO_FIRST_LEVEL; i--)
                        br->levels[i] = br->levels[i - level_ac_battery];
                count += level_ac_battery;
-       } else if (level_ac_battery > 2)
+       } else if (level_ac_battery > ACPI_VIDEO_FIRST_LEVEL)
                ACPI_ERROR((AE_INFO, "Too many duplicates in _BCL package"));
 
        /* Check if the _BCL package is in a reversed order */
-       if (max_level == br->levels[2]) {
+       if (max_level == br->levels[ACPI_VIDEO_FIRST_LEVEL]) {
                br->flags._BCL_reversed = 1;
-               sort(&br->levels[2], count - 2, sizeof(br->levels[2]),
-                       acpi_video_cmp_level, NULL);
+               sort(&br->levels[ACPI_VIDEO_FIRST_LEVEL],
+                    count - ACPI_VIDEO_FIRST_LEVEL,
+                    sizeof(br->levels[ACPI_VIDEO_FIRST_LEVEL]),
+                    acpi_video_cmp_level, NULL);
        } else if (max_level != br->levels[count - 1])
                ACPI_ERROR((AE_INFO,
                            "Found unordered _BCL package"));
@@ -894,7 +957,7 @@ acpi_video_init_brightness(struct acpi_video_device *device)
         * level_old is invalid (no matter whether it's a level
         * or an index). Set the backlight to max_level in this case.
         */
-       for (i = 2; i < br->count; i++)
+       for (i = ACPI_VIDEO_FIRST_LEVEL; i < br->count; i++)
                if (level == br->levels[i])
                        break;
        if (i == br->count || !level)
@@ -906,7 +969,8 @@ set_level:
                goto out_free_levels;
 
        ACPI_DEBUG_PRINT((ACPI_DB_INFO,
-                         "found %d brightness levels\n", br->count - 2));
+                         "found %d brightness levels\n",
+                         br->count - ACPI_VIDEO_FIRST_LEVEL));
        return 0;
 
 out_free_levels:
@@ -1297,7 +1361,7 @@ acpi_video_get_next_level(struct acpi_video_device *device,
        max = max_below = 0;
        min = min_above = 255;
        /* Find closest level to level_current */
-       for (i = 2; i < device->brightness->count; i++) {
+       for (i = ACPI_VIDEO_FIRST_LEVEL; i < device->brightness->count; i++) {
                l = device->brightness->levels[i];
                if (abs(l - level_current) < abs(delta)) {
                        delta = l - level_current;
@@ -1307,7 +1371,7 @@ acpi_video_get_next_level(struct acpi_video_device *device,
        }
        /* Ajust level_current to closest available level */
        level_current += delta;
-       for (i = 2; i < device->brightness->count; i++) {
+       for (i = ACPI_VIDEO_FIRST_LEVEL; i < device->brightness->count; i++) {
                l = device->brightness->levels[i];
                if (l < min)
                        min = l;
@@ -1680,7 +1744,8 @@ static void acpi_video_dev_register_backlight(struct acpi_video_device *device)
 
        memset(&props, 0, sizeof(struct backlight_properties));
        props.type = BACKLIGHT_FIRMWARE;
-       props.max_brightness = device->brightness->count - 3;
+       props.max_brightness =
+               device->brightness->count - ACPI_VIDEO_FIRST_LEVEL - 1;
        device->backlight = backlight_device_register(name,
                                                      parent,
                                                      device,
index c86bae7..ff096d9 100644 (file)
@@ -421,10 +421,8 @@ acpi_ut_walk_aml_resources(struct acpi_walk_state *walk_state,
 
        ACPI_FUNCTION_TRACE(ut_walk_aml_resources);
 
-       /*
-        * The absolute minimum resource template is one end_tag descriptor.
-        * However, we will treat a lone end_tag as just a simple buffer.
-        */
+       /* The absolute minimum resource template is one end_tag descriptor */
+
        if (aml_length < sizeof(struct aml_resource_end_tag)) {
                return_ACPI_STATUS(AE_AML_NO_RESOURCE_END_TAG);
        }
@@ -456,8 +454,9 @@ acpi_ut_walk_aml_resources(struct acpi_walk_state *walk_state,
                /* Invoke the user function */
 
                if (user_function) {
-                       status = user_function(aml, length, offset,
-                                              resource_index, context);
+                       status =
+                           user_function(aml, length, offset, resource_index,
+                                         context);
                        if (ACPI_FAILURE(status)) {
                                return_ACPI_STATUS(status);
                        }
@@ -481,12 +480,6 @@ acpi_ut_walk_aml_resources(struct acpi_walk_state *walk_state,
                                *context = aml;
                        }
 
-                       /* Check if buffer is defined to be longer than the resource length */
-
-                       if (aml_length > (offset + length)) {
-                               return_ACPI_STATUS(AE_AML_NO_RESOURCE_END_TAG);
-                       }
-
                        /* Normal exit */
 
                        return_ACPI_STATUS(AE_OK);
index 79b3c9c..d0855c0 100644 (file)
@@ -1005,9 +1005,8 @@ static int ghes_probe(struct platform_device *ghes_dev)
 
        switch (generic->notify.type) {
        case ACPI_HEST_NOTIFY_POLLED:
-               ghes->timer.function = ghes_poll_func;
-               ghes->timer.data = (unsigned long)ghes;
-               init_timer_deferrable(&ghes->timer);
+               setup_deferrable_timer(&ghes->timer, ghes_poll_func,
+                                      (unsigned long)ghes);
                ghes_add_timer(ghes);
                break;
        case ACPI_HEST_NOTIFY_EXTERNAL:
index 4616da4..5a6f80f 100644 (file)
@@ -4,3 +4,6 @@
 
 config ACPI_IORT
        bool
+
+config ACPI_GTDT
+       bool
index 72331f2..1017def 100644 (file)
@@ -1 +1,2 @@
 obj-$(CONFIG_ACPI_IORT)        += iort.o
+obj-$(CONFIG_ACPI_GTDT)        += gtdt.o
diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c
new file mode 100644 (file)
index 0000000..597a737
--- /dev/null
@@ -0,0 +1,417 @@
+/*
+ * ARM Specific GTDT table Support
+ *
+ * Copyright (C) 2016, Linaro Ltd.
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *         Fu Wei <fu.wei@linaro.org>
+ *         Hanjun Guo <hanjun.guo@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/acpi.h>
+#include <linux/init.h>
+#include <linux/irqdomain.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+
+#include <clocksource/arm_arch_timer.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "ACPI GTDT: " fmt
+
+/**
+ * struct acpi_gtdt_descriptor - Store the key info of GTDT for all functions
+ * @gtdt:      The pointer to the struct acpi_table_gtdt of GTDT table.
+ * @gtdt_end:  The pointer to the end of GTDT table.
+ * @platform_timer:    The pointer to the start of Platform Timer Structure
+ *
+ * The struct store the key info of GTDT table, it should be initialized by
+ * acpi_gtdt_init.
+ */
+struct acpi_gtdt_descriptor {
+       struct acpi_table_gtdt *gtdt;
+       void *gtdt_end;
+       void *platform_timer;
+};
+
+static struct acpi_gtdt_descriptor acpi_gtdt_desc __initdata;
+
+static inline void *next_platform_timer(void *platform_timer)
+{
+       struct acpi_gtdt_header *gh = platform_timer;
+
+       platform_timer += gh->length;
+       if (platform_timer < acpi_gtdt_desc.gtdt_end)
+               return platform_timer;
+
+       return NULL;
+}
+
+#define for_each_platform_timer(_g)                            \
+       for (_g = acpi_gtdt_desc.platform_timer; _g;    \
+            _g = next_platform_timer(_g))
+
+static inline bool is_timer_block(void *platform_timer)
+{
+       struct acpi_gtdt_header *gh = platform_timer;
+
+       return gh->type == ACPI_GTDT_TYPE_TIMER_BLOCK;
+}
+
+static inline bool is_non_secure_watchdog(void *platform_timer)
+{
+       struct acpi_gtdt_header *gh = platform_timer;
+       struct acpi_gtdt_watchdog *wd = platform_timer;
+
+       if (gh->type != ACPI_GTDT_TYPE_WATCHDOG)
+               return false;
+
+       return !(wd->timer_flags & ACPI_GTDT_WATCHDOG_SECURE);
+}
+
+static int __init map_gt_gsi(u32 interrupt, u32 flags)
+{
+       int trigger, polarity;
+
+       trigger = (flags & ACPI_GTDT_INTERRUPT_MODE) ? ACPI_EDGE_SENSITIVE
+                       : ACPI_LEVEL_SENSITIVE;
+
+       polarity = (flags & ACPI_GTDT_INTERRUPT_POLARITY) ? ACPI_ACTIVE_LOW
+                       : ACPI_ACTIVE_HIGH;
+
+       return acpi_register_gsi(NULL, interrupt, trigger, polarity);
+}
+
+/**
+ * acpi_gtdt_map_ppi() - Map the PPIs of per-cpu arch_timer.
+ * @type:      the type of PPI.
+ *
+ * Note: Secure state is not managed by the kernel on ARM64 systems.
+ * So we only handle the non-secure timer PPIs,
+ * ARCH_TIMER_PHYS_SECURE_PPI is treated as invalid type.
+ *
+ * Return: the mapped PPI value, 0 if error.
+ */
+int __init acpi_gtdt_map_ppi(int type)
+{
+       struct acpi_table_gtdt *gtdt = acpi_gtdt_desc.gtdt;
+
+       switch (type) {
+       case ARCH_TIMER_PHYS_NONSECURE_PPI:
+               return map_gt_gsi(gtdt->non_secure_el1_interrupt,
+                                 gtdt->non_secure_el1_flags);
+       case ARCH_TIMER_VIRT_PPI:
+               return map_gt_gsi(gtdt->virtual_timer_interrupt,
+                                 gtdt->virtual_timer_flags);
+
+       case ARCH_TIMER_HYP_PPI:
+               return map_gt_gsi(gtdt->non_secure_el2_interrupt,
+                                 gtdt->non_secure_el2_flags);
+       default:
+               pr_err("Failed to map timer interrupt: invalid type.\n");
+       }
+
+       return 0;
+}
+
+/**
+ * acpi_gtdt_c3stop() - Got c3stop info from GTDT according to the type of PPI.
+ * @type:      the type of PPI.
+ *
+ * Return: true if the timer HW state is lost when a CPU enters an idle state,
+ * false otherwise
+ */
+bool __init acpi_gtdt_c3stop(int type)
+{
+       struct acpi_table_gtdt *gtdt = acpi_gtdt_desc.gtdt;
+
+       switch (type) {
+       case ARCH_TIMER_PHYS_NONSECURE_PPI:
+               return !(gtdt->non_secure_el1_flags & ACPI_GTDT_ALWAYS_ON);
+
+       case ARCH_TIMER_VIRT_PPI:
+               return !(gtdt->virtual_timer_flags & ACPI_GTDT_ALWAYS_ON);
+
+       case ARCH_TIMER_HYP_PPI:
+               return !(gtdt->non_secure_el2_flags & ACPI_GTDT_ALWAYS_ON);
+
+       default:
+               pr_err("Failed to get c3stop info: invalid type.\n");
+       }
+
+       return false;
+}
+
+/**
+ * acpi_gtdt_init() - Get the info of GTDT table to prepare for further init.
+ * @table:                     The pointer to GTDT table.
+ * @platform_timer_count:      It points to a integer variable which is used
+ *                             for storing the number of platform timers.
+ *                             This pointer could be NULL, if the caller
+ *                             doesn't need this info.
+ *
+ * Return: 0 if success, -EINVAL if error.
+ */
+int __init acpi_gtdt_init(struct acpi_table_header *table,
+                         int *platform_timer_count)
+{
+       void *platform_timer;
+       struct acpi_table_gtdt *gtdt;
+
+       gtdt = container_of(table, struct acpi_table_gtdt, header);
+       acpi_gtdt_desc.gtdt = gtdt;
+       acpi_gtdt_desc.gtdt_end = (void *)table + table->length;
+       acpi_gtdt_desc.platform_timer = NULL;
+       if (platform_timer_count)
+               *platform_timer_count = 0;
+
+       if (table->revision < 2) {
+               pr_warn("Revision:%d doesn't support Platform Timers.\n",
+                       table->revision);
+               return 0;
+       }
+
+       if (!gtdt->platform_timer_count) {
+               pr_debug("No Platform Timer.\n");
+               return 0;
+       }
+
+       platform_timer = (void *)gtdt + gtdt->platform_timer_offset;
+       if (platform_timer < (void *)table + sizeof(struct acpi_table_gtdt)) {
+               pr_err(FW_BUG "invalid timer data.\n");
+               return -EINVAL;
+       }
+       acpi_gtdt_desc.platform_timer = platform_timer;
+       if (platform_timer_count)
+               *platform_timer_count = gtdt->platform_timer_count;
+
+       return 0;
+}
+
+static int __init gtdt_parse_timer_block(struct acpi_gtdt_timer_block *block,
+                                        struct arch_timer_mem *timer_mem)
+{
+       int i;
+       struct arch_timer_mem_frame *frame;
+       struct acpi_gtdt_timer_entry *gtdt_frame;
+
+       if (!block->timer_count) {
+               pr_err(FW_BUG "GT block present, but frame count is zero.");
+               return -ENODEV;
+       }
+
+       if (block->timer_count > ARCH_TIMER_MEM_MAX_FRAMES) {
+               pr_err(FW_BUG "GT block lists %d frames, ACPI spec only allows 8\n",
+                      block->timer_count);
+               return -EINVAL;
+       }
+
+       timer_mem->cntctlbase = (phys_addr_t)block->block_address;
+       /*
+        * The CNTCTLBase frame is 4KB (register offsets 0x000 - 0xFFC).
+        * See ARM DDI 0487A.k_iss10775, page I1-5129, Table I1-3
+        * "CNTCTLBase memory map".
+        */
+       timer_mem->size = SZ_4K;
+
+       gtdt_frame = (void *)block + block->timer_offset;
+       if (gtdt_frame + block->timer_count != (void *)block + block->header.length)
+               return -EINVAL;
+
+       /*
+        * Get the GT timer Frame data for every GT Block Timer
+        */
+       for (i = 0; i < block->timer_count; i++, gtdt_frame++) {
+               if (gtdt_frame->common_flags & ACPI_GTDT_GT_IS_SECURE_TIMER)
+                       continue;
+               if (gtdt_frame->frame_number >= ARCH_TIMER_MEM_MAX_FRAMES ||
+                   !gtdt_frame->base_address || !gtdt_frame->timer_interrupt)
+                       goto error;
+
+               frame = &timer_mem->frame[gtdt_frame->frame_number];
+
+               /* duplicate frame */
+               if (frame->valid)
+                       goto error;
+
+               frame->phys_irq = map_gt_gsi(gtdt_frame->timer_interrupt,
+                                            gtdt_frame->timer_flags);
+               if (frame->phys_irq <= 0) {
+                       pr_warn("failed to map physical timer irq in frame %d.\n",
+                               gtdt_frame->frame_number);
+                       goto error;
+               }
+
+               if (gtdt_frame->virtual_timer_interrupt) {
+                       frame->virt_irq =
+                               map_gt_gsi(gtdt_frame->virtual_timer_interrupt,
+                                          gtdt_frame->virtual_timer_flags);
+                       if (frame->virt_irq <= 0) {
+                               pr_warn("failed to map virtual timer irq in frame %d.\n",
+                                       gtdt_frame->frame_number);
+                               goto error;
+                       }
+               } else {
+                       pr_debug("virtual timer in frame %d not implemented.\n",
+                                gtdt_frame->frame_number);
+               }
+
+               frame->cntbase = gtdt_frame->base_address;
+               /*
+                * The CNTBaseN frame is 4KB (register offsets 0x000 - 0xFFC).
+                * See ARM DDI 0487A.k_iss10775, page I1-5130, Table I1-4
+                * "CNTBaseN memory map".
+                */
+               frame->size = SZ_4K;
+               frame->valid = true;
+       }
+
+       return 0;
+
+error:
+       do {
+               if (gtdt_frame->common_flags & ACPI_GTDT_GT_IS_SECURE_TIMER ||
+                   gtdt_frame->frame_number >= ARCH_TIMER_MEM_MAX_FRAMES)
+                       continue;
+
+               frame = &timer_mem->frame[gtdt_frame->frame_number];
+
+               if (frame->phys_irq > 0)
+                       acpi_unregister_gsi(gtdt_frame->timer_interrupt);
+               frame->phys_irq = 0;
+
+               if (frame->virt_irq > 0)
+                       acpi_unregister_gsi(gtdt_frame->virtual_timer_interrupt);
+               frame->virt_irq = 0;
+       } while (i-- >= 0 && gtdt_frame--);
+
+       return -EINVAL;
+}
+
+/**
+ * acpi_arch_timer_mem_init() - Get the info of all GT blocks in GTDT table.
+ * @timer_mem: The pointer to the array of struct arch_timer_mem for returning
+ *             the result of parsing. The element number of this array should
+ *             be platform_timer_count(the total number of platform timers).
+ * @timer_count: It points to a integer variable which is used for storing the
+ *             number of GT blocks we have parsed.
+ *
+ * Return: 0 if success, -EINVAL/-ENODEV if error.
+ */
+int __init acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem,
+                                   int *timer_count)
+{
+       int ret;
+       void *platform_timer;
+
+       *timer_count = 0;
+       for_each_platform_timer(platform_timer) {
+               if (is_timer_block(platform_timer)) {
+                       ret = gtdt_parse_timer_block(platform_timer, timer_mem);
+                       if (ret)
+                               return ret;
+                       timer_mem++;
+                       (*timer_count)++;
+               }
+       }
+
+       if (*timer_count)
+               pr_info("found %d memory-mapped timer block(s).\n",
+                       *timer_count);
+
+       return 0;
+}
+
+/*
+ * Initialize a SBSA generic Watchdog platform device info from GTDT
+ */
+static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd,
+                                       int index)
+{
+       struct platform_device *pdev;
+       int irq = map_gt_gsi(wd->timer_interrupt, wd->timer_flags);
+
+       /*
+        * According to SBSA specification the size of refresh and control
+        * frames of SBSA Generic Watchdog is SZ_4K(Offset 0x000 – 0xFFF).
+        */
+       struct resource res[] = {
+               DEFINE_RES_MEM(wd->control_frame_address, SZ_4K),
+               DEFINE_RES_MEM(wd->refresh_frame_address, SZ_4K),
+               DEFINE_RES_IRQ(irq),
+       };
+       int nr_res = ARRAY_SIZE(res);
+
+       pr_debug("found a Watchdog (0x%llx/0x%llx gsi:%u flags:0x%x).\n",
+                wd->refresh_frame_address, wd->control_frame_address,
+                wd->timer_interrupt, wd->timer_flags);
+
+       if (!(wd->refresh_frame_address && wd->control_frame_address)) {
+               pr_err(FW_BUG "failed to get the Watchdog base address.\n");
+               acpi_unregister_gsi(wd->timer_interrupt);
+               return -EINVAL;
+       }
+
+       if (irq <= 0) {
+               pr_warn("failed to map the Watchdog interrupt.\n");
+               nr_res--;
+       }
+
+       /*
+        * Add a platform device named "sbsa-gwdt" to match the platform driver.
+        * "sbsa-gwdt": SBSA(Server Base System Architecture) Generic Watchdog
+        * The platform driver can get device info below by matching this name.
+        */
+       pdev = platform_device_register_simple("sbsa-gwdt", index, res, nr_res);
+       if (IS_ERR(pdev)) {
+               acpi_unregister_gsi(wd->timer_interrupt);
+               return PTR_ERR(pdev);
+       }
+
+       return 0;
+}
+
+static int __init gtdt_sbsa_gwdt_init(void)
+{
+       void *platform_timer;
+       struct acpi_table_header *table;
+       int ret, timer_count, gwdt_count = 0;
+
+       if (acpi_disabled)
+               return 0;
+
+       if (ACPI_FAILURE(acpi_get_table(ACPI_SIG_GTDT, 0, &table)))
+               return -EINVAL;
+
+       /*
+        * Note: Even though the global variable acpi_gtdt_desc has been
+        * initialized by acpi_gtdt_init() while initializing the arch timers,
+        * when we call this function to get SBSA watchdogs info from GTDT, the
+        * pointers stashed in it are stale (since they are early temporary
+        * mappings carried out before acpi_permanent_mmap is set) and we need
+        * to re-initialize them with permanent mapped pointer values to let the
+        * GTDT parsing possible.
+        */
+       ret = acpi_gtdt_init(table, &timer_count);
+       if (ret || !timer_count)
+               return ret;
+
+       for_each_platform_timer(platform_timer) {
+               if (is_non_secure_watchdog(platform_timer)) {
+                       ret = gtdt_import_sbsa_gwdt(platform_timer, gwdt_count);
+                       if (ret)
+                               break;
+                       gwdt_count++;
+               }
+       }
+
+       if (gwdt_count)
+               pr_info("found %d SBSA generic Watchdog(s).\n", gwdt_count);
+
+       return ret;
+}
+
+device_initcall(gtdt_sbsa_gwdt_init);
index 4ef1e46..d42eeef 100644 (file)
@@ -67,6 +67,7 @@ MODULE_DESCRIPTION("ACPI Battery Driver");
 MODULE_LICENSE("GPL");
 
 static async_cookie_t async_cookie;
+static bool battery_driver_registered;
 static int battery_bix_broken_package;
 static int battery_notification_delay_ms;
 static unsigned int cache_time = 1000;
@@ -93,6 +94,11 @@ static const struct acpi_device_id battery_device_ids[] = {
 
 MODULE_DEVICE_TABLE(acpi, battery_device_ids);
 
+/* Lists of PMIC ACPI HIDs with an (often better) native battery driver */
+static const char * const acpi_battery_blacklist[] = {
+       "INT33F4", /* X-Powers AXP288 PMIC */
+};
+
 enum {
        ACPI_BATTERY_ALARM_PRESENT,
        ACPI_BATTERY_XINFO_PRESENT,
@@ -1315,8 +1321,17 @@ static struct acpi_driver acpi_battery_driver = {
 
 static void __init acpi_battery_init_async(void *unused, async_cookie_t cookie)
 {
+       unsigned int i;
        int result;
 
+       for (i = 0; i < ARRAY_SIZE(acpi_battery_blacklist); i++)
+               if (acpi_dev_present(acpi_battery_blacklist[i], "1", -1)) {
+                       pr_info(PREFIX ACPI_BATTERY_DEVICE_NAME
+                               ": found native %s PMIC, not loading\n",
+                               acpi_battery_blacklist[i]);
+                       return;
+               }
+
        dmi_check_system(bat_dmi_table);
 
 #ifdef CONFIG_ACPI_PROCFS_POWER
@@ -1329,6 +1344,7 @@ static void __init acpi_battery_init_async(void *unused, async_cookie_t cookie)
        if (result < 0)
                acpi_unlock_battery_dir(acpi_battery_dir);
 #endif
+       battery_driver_registered = (result == 0);
 }
 
 static int __init acpi_battery_init(void)
@@ -1343,9 +1359,11 @@ static int __init acpi_battery_init(void)
 static void __exit acpi_battery_exit(void)
 {
        async_synchronize_cookie(async_cookie + 1);
-       acpi_bus_unregister_driver(&acpi_battery_driver);
+       if (battery_driver_registered)
+               acpi_bus_unregister_driver(&acpi_battery_driver);
 #ifdef CONFIG_ACPI_PROCFS_POWER
-       acpi_unlock_battery_dir(acpi_battery_dir);
+       if (acpi_battery_dir)
+               acpi_unlock_battery_dir(acpi_battery_dir);
 #endif
 }
 
index ca28aa5..df1c629 100644 (file)
@@ -81,6 +81,12 @@ static struct attribute_group bgrt_attribute_group = {
        .bin_attrs = bgrt_bin_attributes,
 };
 
+int __init acpi_parse_bgrt(struct acpi_table_header *table)
+{
+       efi_bgrt_init(table);
+       return 0;
+}
+
 static int __init bgrt_init(void)
 {
        int ret;
index 4421f7c..bb542ac 100644 (file)
@@ -188,6 +188,14 @@ static struct dmi_system_id acpi_rev_dmi_table[] __initdata = {
                      DMI_MATCH(DMI_PRODUCT_NAME, "Latitude 3350"),
                },
        },
+       {
+        .callback = dmi_enable_rev_override,
+        .ident = "DELL Inspiron 7537",
+        .matches = {
+                     DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+                     DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 7537"),
+               },
+       },
 #endif
        {}
 };
index 3ca0729..6cbe603 100644 (file)
@@ -132,49 +132,54 @@ __ATTR(_name, 0444, show_##_name, NULL)
 
 #define to_cpc_desc(a) container_of(a, struct cpc_desc, kobj)
 
+#define show_cppc_data(access_fn, struct_name, member_name)            \
+       static ssize_t show_##member_name(struct kobject *kobj,         \
+                                       struct attribute *attr, char *buf) \
+       {                                                               \
+               struct cpc_desc *cpc_ptr = to_cpc_desc(kobj);           \
+               struct struct_name st_name = {0};                       \
+               int ret;                                                \
+                                                                       \
+               ret = access_fn(cpc_ptr->cpu_id, &st_name);             \
+               if (ret)                                                \
+                       return ret;                                     \
+                                                                       \
+               return scnprintf(buf, PAGE_SIZE, "%llu\n",              \
+                               (u64)st_name.member_name);              \
+       }                                                               \
+       define_one_cppc_ro(member_name)
+
+show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, highest_perf);
+show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, lowest_perf);
+show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, nominal_perf);
+show_cppc_data(cppc_get_perf_caps, cppc_perf_caps, lowest_nonlinear_perf);
+show_cppc_data(cppc_get_perf_ctrs, cppc_perf_fb_ctrs, reference_perf);
+show_cppc_data(cppc_get_perf_ctrs, cppc_perf_fb_ctrs, wraparound_time);
+
 static ssize_t show_feedback_ctrs(struct kobject *kobj,
                struct attribute *attr, char *buf)
 {
        struct cpc_desc *cpc_ptr = to_cpc_desc(kobj);
        struct cppc_perf_fb_ctrs fb_ctrs = {0};
+       int ret;
 
-       cppc_get_perf_ctrs(cpc_ptr->cpu_id, &fb_ctrs);
+       ret = cppc_get_perf_ctrs(cpc_ptr->cpu_id, &fb_ctrs);
+       if (ret)
+               return ret;
 
        return scnprintf(buf, PAGE_SIZE, "ref:%llu del:%llu\n",
                        fb_ctrs.reference, fb_ctrs.delivered);
 }
 define_one_cppc_ro(feedback_ctrs);
 
-static ssize_t show_reference_perf(struct kobject *kobj,
-               struct attribute *attr, char *buf)
-{
-       struct cpc_desc *cpc_ptr = to_cpc_desc(kobj);
-       struct cppc_perf_fb_ctrs fb_ctrs = {0};
-
-       cppc_get_perf_ctrs(cpc_ptr->cpu_id, &fb_ctrs);
-
-       return scnprintf(buf, PAGE_SIZE, "%llu\n",
-                       fb_ctrs.reference_perf);
-}
-define_one_cppc_ro(reference_perf);
-
-static ssize_t show_wraparound_time(struct kobject *kobj,
-                               struct attribute *attr, char *buf)
-{
-       struct cpc_desc *cpc_ptr = to_cpc_desc(kobj);
-       struct cppc_perf_fb_ctrs fb_ctrs = {0};
-
-       cppc_get_perf_ctrs(cpc_ptr->cpu_id, &fb_ctrs);
-
-       return scnprintf(buf, PAGE_SIZE, "%llu\n", fb_ctrs.ctr_wrap_time);
-
-}
-define_one_cppc_ro(wraparound_time);
-
 static struct attribute *cppc_attrs[] = {
        &feedback_ctrs.attr,
        &reference_perf.attr,
        &wraparound_time.attr,
+       &highest_perf.attr,
+       &lowest_perf.attr,
+       &lowest_nonlinear_perf.attr,
+       &nominal_perf.attr,
        NULL
 };
 
@@ -972,9 +977,9 @@ static int cpc_write(int cpu, struct cpc_register_resource *reg_res, u64 val)
 int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps)
 {
        struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpunum);
-       struct cpc_register_resource *highest_reg, *lowest_reg, *ref_perf,
-                                                                *nom_perf;
-       u64 high, low, nom;
+       struct cpc_register_resource *highest_reg, *lowest_reg,
+               *lowest_non_linear_reg, *nominal_reg;
+       u64 high, low, nom, min_nonlinear;
        int ret = 0, regs_in_pcc = 0;
 
        if (!cpc_desc) {
@@ -984,12 +989,12 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps)
 
        highest_reg = &cpc_desc->cpc_regs[HIGHEST_PERF];
        lowest_reg = &cpc_desc->cpc_regs[LOWEST_PERF];
-       ref_perf = &cpc_desc->cpc_regs[REFERENCE_PERF];
-       nom_perf = &cpc_desc->cpc_regs[NOMINAL_PERF];
+       lowest_non_linear_reg = &cpc_desc->cpc_regs[LOW_NON_LINEAR_PERF];
+       nominal_reg = &cpc_desc->cpc_regs[NOMINAL_PERF];
 
        /* Are any of the regs PCC ?*/
        if (CPC_IN_PCC(highest_reg) || CPC_IN_PCC(lowest_reg) ||
-               CPC_IN_PCC(ref_perf) || CPC_IN_PCC(nom_perf)) {
+               CPC_IN_PCC(lowest_non_linear_reg) || CPC_IN_PCC(nominal_reg)) {
                regs_in_pcc = 1;
                down_write(&pcc_data.pcc_lock);
                /* Ring doorbell once to update PCC subspace */
@@ -1005,10 +1010,13 @@ int cppc_get_perf_caps(int cpunum, struct cppc_perf_caps *perf_caps)
        cpc_read(cpunum, lowest_reg, &low);
        perf_caps->lowest_perf = low;
 
-       cpc_read(cpunum, nom_perf, &nom);
+       cpc_read(cpunum, nominal_reg, &nom);
        perf_caps->nominal_perf = nom;
 
-       if (!high || !low || !nom)
+       cpc_read(cpunum, lowest_non_linear_reg, &min_nonlinear);
+       perf_caps->lowest_nonlinear_perf = min_nonlinear;
+
+       if (!high || !low || !nom || !min_nonlinear)
                ret = -EFAULT;
 
 out_err:
@@ -1083,7 +1091,7 @@ int cppc_get_perf_ctrs(int cpunum, struct cppc_perf_fb_ctrs *perf_fb_ctrs)
        perf_fb_ctrs->delivered = delivered;
        perf_fb_ctrs->reference = reference;
        perf_fb_ctrs->reference_perf = ref_perf;
-       perf_fb_ctrs->ctr_wrap_time = ctr_wrap_time;
+       perf_fb_ctrs->wraparound_time = ctr_wrap_time;
 out_err:
        if (regs_in_pcc)
                up_write(&pcc_data.pcc_lock);
index fb19e1c..edc8663 100644 (file)
@@ -99,13 +99,13 @@ static int find_child_checks(struct acpi_device *adev, bool check_children)
                return -ENODEV;
 
        /*
-        * If the device has a _HID (or _CID) returning a valid ACPI/PNP
-        * device ID, it is better to make it look less attractive here, so that
-        * the other device with the same _ADR value (that may not have a valid
-        * device ID) can be matched going forward.  [This means a second spec
-        * violation in a row, so whatever we do here is best effort anyway.]
+        * If the device has a _HID returning a valid ACPI/PNP device ID, it is
+        * better to make it look less attractive here, so that the other device
+        * with the same _ADR value (that may not have a valid device ID) can be
+        * matched going forward.  [This means a second spec violation in a row,
+        * so whatever we do here is best effort anyway.]
         */
-       return sta_present && list_empty(&adev->pnp.ids) ?
+       return sta_present && !adev->pnp.type.platform_id ?
                        FIND_CHILD_MAX_SCORE : FIND_CHILD_MIN_SCORE;
 }
 
index f159001..66229ff 100644 (file)
@@ -65,8 +65,6 @@ static inline void acpi_cmos_rtc_init(void) {}
 #endif
 int acpi_rev_override_setup(char *str);
 
-extern bool acpi_force_hot_remove;
-
 void acpi_sysfs_add_hotplug_profile(struct acpi_hotplug_profile *hotplug,
                                    const char *name);
 int acpi_scan_add_handler_with_hotplug(struct acpi_scan_handler *handler,
index 662036b..c8ea9d6 100644 (file)
@@ -1617,7 +1617,11 @@ static int cmp_map(const void *m0, const void *m1)
        const struct nfit_set_info_map *map0 = m0;
        const struct nfit_set_info_map *map1 = m1;
 
-       return map0->region_offset - map1->region_offset;
+       if (map0->region_offset < map1->region_offset)
+               return -1;
+       else if (map0->region_offset > map1->region_offset)
+               return 1;
+       return 0;
 }
 
 /* Retrieve the nth entry referencing this spa */
diff --git a/drivers/acpi/pmic/intel_pmic_chtwc.c b/drivers/acpi/pmic/intel_pmic_chtwc.c
new file mode 100644 (file)
index 0000000..85636d7
--- /dev/null
@@ -0,0 +1,280 @@
+/*
+ * Intel CHT Whiskey Cove PMIC operation region driver
+ * Copyright (C) 2017 Hans de Goede <hdegoede@redhat.com>
+ *
+ * Based on various non-upstream patches to support the CHT Whiskey Cove PMIC:
+ * Copyright (C) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/acpi.h>
+#include <linux/init.h>
+#include <linux/mfd/intel_soc_pmic.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include "intel_pmic.h"
+
+#define CHT_WC_V1P05A_CTRL             0x6e3b
+#define CHT_WC_V1P15_CTRL              0x6e3c
+#define CHT_WC_V1P05A_VSEL             0x6e3d
+#define CHT_WC_V1P15_VSEL              0x6e3e
+#define CHT_WC_V1P8A_CTRL              0x6e56
+#define CHT_WC_V1P8SX_CTRL             0x6e57
+#define CHT_WC_VDDQ_CTRL               0x6e58
+#define CHT_WC_V1P2A_CTRL              0x6e59
+#define CHT_WC_V1P2SX_CTRL             0x6e5a
+#define CHT_WC_V1P8A_VSEL              0x6e5b
+#define CHT_WC_VDDQ_VSEL               0x6e5c
+#define CHT_WC_V2P8SX_CTRL             0x6e5d
+#define CHT_WC_V3P3A_CTRL              0x6e5e
+#define CHT_WC_V3P3SD_CTRL             0x6e5f
+#define CHT_WC_VSDIO_CTRL              0x6e67
+#define CHT_WC_V3P3A_VSEL              0x6e68
+#define CHT_WC_VPROG1A_CTRL            0x6e90
+#define CHT_WC_VPROG1B_CTRL            0x6e91
+#define CHT_WC_VPROG1F_CTRL            0x6e95
+#define CHT_WC_VPROG2D_CTRL            0x6e99
+#define CHT_WC_VPROG3A_CTRL            0x6e9a
+#define CHT_WC_VPROG3B_CTRL            0x6e9b
+#define CHT_WC_VPROG4A_CTRL            0x6e9c
+#define CHT_WC_VPROG4B_CTRL            0x6e9d
+#define CHT_WC_VPROG4C_CTRL            0x6e9e
+#define CHT_WC_VPROG4D_CTRL            0x6e9f
+#define CHT_WC_VPROG5A_CTRL            0x6ea0
+#define CHT_WC_VPROG5B_CTRL            0x6ea1
+#define CHT_WC_VPROG6A_CTRL            0x6ea2
+#define CHT_WC_VPROG6B_CTRL            0x6ea3
+#define CHT_WC_VPROG1A_VSEL            0x6ec0
+#define CHT_WC_VPROG1B_VSEL            0x6ec1
+#define CHT_WC_V1P8SX_VSEL             0x6ec2
+#define CHT_WC_V1P2SX_VSEL             0x6ec3
+#define CHT_WC_V1P2A_VSEL              0x6ec4
+#define CHT_WC_VPROG1F_VSEL            0x6ec5
+#define CHT_WC_VSDIO_VSEL              0x6ec6
+#define CHT_WC_V2P8SX_VSEL             0x6ec7
+#define CHT_WC_V3P3SD_VSEL             0x6ec8
+#define CHT_WC_VPROG2D_VSEL            0x6ec9
+#define CHT_WC_VPROG3A_VSEL            0x6eca
+#define CHT_WC_VPROG3B_VSEL            0x6ecb
+#define CHT_WC_VPROG4A_VSEL            0x6ecc
+#define CHT_WC_VPROG4B_VSEL            0x6ecd
+#define CHT_WC_VPROG4C_VSEL            0x6ece
+#define CHT_WC_VPROG4D_VSEL            0x6ecf
+#define CHT_WC_VPROG5A_VSEL            0x6ed0
+#define CHT_WC_VPROG5B_VSEL            0x6ed1
+#define CHT_WC_VPROG6A_VSEL            0x6ed2
+#define CHT_WC_VPROG6B_VSEL            0x6ed3
+
+/*
+ * Regulator support is based on the non-upstream patch:
+ * "regulator: whiskey_cove: implements Whiskey Cove pmic VRF support"
+ * https://github.com/intel-aero/meta-intel-aero/blob/master/recipes-kernel/linux/linux-yocto/0019-regulator-whiskey_cove-implements-WhiskeyCove-pmic-V.patch
+ */
+static struct pmic_table power_table[] = {
+       {
+               .address = 0x0,
+               .reg = CHT_WC_V1P8A_CTRL,
+               .bit = 0x01,
+       }, /* V18A */
+       {
+               .address = 0x04,
+               .reg = CHT_WC_V1P8SX_CTRL,
+               .bit = 0x07,
+       }, /* V18X */
+       {
+               .address = 0x08,
+               .reg = CHT_WC_VDDQ_CTRL,
+               .bit = 0x01,
+       }, /* VDDQ */
+       {
+               .address = 0x0c,
+               .reg = CHT_WC_V1P2A_CTRL,
+               .bit = 0x07,
+       }, /* V12A */
+       {
+               .address = 0x10,
+               .reg = CHT_WC_V1P2SX_CTRL,
+               .bit = 0x07,
+       }, /* V12X */
+       {
+               .address = 0x14,
+               .reg = CHT_WC_V2P8SX_CTRL,
+               .bit = 0x07,
+       }, /* V28X */
+       {
+               .address = 0x18,
+               .reg = CHT_WC_V3P3A_CTRL,
+               .bit = 0x01,
+       }, /* V33A */
+       {
+               .address = 0x1c,
+               .reg = CHT_WC_V3P3SD_CTRL,
+               .bit = 0x07,
+       }, /* V3SD */
+       {
+               .address = 0x20,
+               .reg = CHT_WC_VSDIO_CTRL,
+               .bit = 0x07,
+       }, /* VSD */
+/*     {
+               .address = 0x24,
+               .reg = ??,
+               .bit = ??,
+       }, ** VSW2 */
+/*     {
+               .address = 0x28,
+               .reg = ??,
+               .bit = ??,
+       }, ** VSW1 */
+/*     {
+               .address = 0x2c,
+               .reg = ??,
+               .bit = ??,
+       }, ** VUPY */
+/*     {
+               .address = 0x30,
+               .reg = ??,
+               .bit = ??,
+       }, ** VRSO */
+       {
+               .address = 0x34,
+               .reg = CHT_WC_VPROG1A_CTRL,
+               .bit = 0x07,
+       }, /* VP1A */
+       {
+               .address = 0x38,
+               .reg = CHT_WC_VPROG1B_CTRL,
+               .bit = 0x07,
+       }, /* VP1B */
+       {
+               .address = 0x3c,
+               .reg = CHT_WC_VPROG1F_CTRL,
+               .bit = 0x07,
+       }, /* VP1F */
+       {
+               .address = 0x40,
+               .reg = CHT_WC_VPROG2D_CTRL,
+               .bit = 0x07,
+       }, /* VP2D */
+       {
+               .address = 0x44,
+               .reg = CHT_WC_VPROG3A_CTRL,
+               .bit = 0x07,
+       }, /* VP3A */
+       {
+               .address = 0x48,
+               .reg = CHT_WC_VPROG3B_CTRL,
+               .bit = 0x07,
+       }, /* VP3B */
+       {
+               .address = 0x4c,
+               .reg = CHT_WC_VPROG4A_CTRL,
+               .bit = 0x07,
+       }, /* VP4A */
+       {
+               .address = 0x50,
+               .reg = CHT_WC_VPROG4B_CTRL,
+               .bit = 0x07,
+       }, /* VP4B */
+       {
+               .address = 0x54,
+               .reg = CHT_WC_VPROG4C_CTRL,
+               .bit = 0x07,
+       }, /* VP4C */
+       {
+               .address = 0x58,
+               .reg = CHT_WC_VPROG4D_CTRL,
+               .bit = 0x07,
+       }, /* VP4D */
+       {
+               .address = 0x5c,
+               .reg = CHT_WC_VPROG5A_CTRL,
+               .bit = 0x07,
+       }, /* VP5A */
+       {
+               .address = 0x60,
+               .reg = CHT_WC_VPROG5B_CTRL,
+               .bit = 0x07,
+       }, /* VP5B */
+       {
+               .address = 0x64,
+               .reg = CHT_WC_VPROG6A_CTRL,
+               .bit = 0x07,
+       }, /* VP6A */
+       {
+               .address = 0x68,
+               .reg = CHT_WC_VPROG6B_CTRL,
+               .bit = 0x07,
+       }, /* VP6B */
+/*     {
+               .address = 0x6c,
+               .reg = ??,
+               .bit = ??,
+       }  ** VP7A */
+};
+
+static int intel_cht_wc_pmic_get_power(struct regmap *regmap, int reg,
+               int bit, u64 *value)
+{
+       int data;
+
+       if (regmap_read(regmap, reg, &data))
+               return -EIO;
+
+       *value = (data & bit) ? 1 : 0;
+       return 0;
+}
+
+static int intel_cht_wc_pmic_update_power(struct regmap *regmap, int reg,
+               int bitmask, bool on)
+{
+       return regmap_update_bits(regmap, reg, bitmask, on ? 1 : 0);
+}
+
+/*
+ * The thermal table and ops are empty; we do not support the Thermal opregion
+ * (DPTF) because documentation for it is lacking.
+ */
+static struct intel_pmic_opregion_data intel_cht_wc_pmic_opregion_data = {
+       .get_power              = intel_cht_wc_pmic_get_power,
+       .update_power           = intel_cht_wc_pmic_update_power,
+       .power_table            = power_table,
+       .power_table_count      = ARRAY_SIZE(power_table),
+};
+
+static int intel_cht_wc_pmic_opregion_probe(struct platform_device *pdev)
+{
+       struct intel_soc_pmic *pmic = dev_get_drvdata(pdev->dev.parent);
+
+       return intel_pmic_install_opregion_handler(&pdev->dev,
+                       ACPI_HANDLE(pdev->dev.parent),
+                       pmic->regmap,
+                       &intel_cht_wc_pmic_opregion_data);
+}
+
+static struct platform_device_id cht_wc_opregion_id_table[] = {
+       { .name = "cht_wcove_region" },
+       {},
+};
+MODULE_DEVICE_TABLE(platform, cht_wc_opregion_id_table);
+
+static struct platform_driver intel_cht_wc_pmic_opregion_driver = {
+       .probe = intel_cht_wc_pmic_opregion_probe,
+       .driver = {
+               .name = "cht_whiskey_cove_pmic",
+       },
+       .id_table = cht_wc_opregion_id_table,
+};
+module_platform_driver(intel_cht_wc_pmic_opregion_driver);
+
+MODULE_DESCRIPTION("Intel CHT Whiskey Cove PMIC operation region driver");
+MODULE_AUTHOR("Hans de Goede <hdegoede@redhat.com>");
+MODULE_LICENSE("GPL");
index e6e991a..55f5111 100644 (file)
@@ -18,7 +18,6 @@
 #include <linux/mfd/axp20x.h>
 #include <linux/regmap.h>
 #include <linux/platform_device.h>
-#include <linux/iio/consumer.h>
 #include "intel_pmic.h"
 
 #define XPOWER_GPADC_LOW       0x5b
@@ -186,28 +185,16 @@ static int intel_xpower_pmic_update_power(struct regmap *regmap, int reg,
  * @regmap: regmap of the PMIC device
  * @reg: register to get the reading
  *
- * We could get the sensor value by manipulating the HW regs here, but since
- * the axp288 IIO driver may also access the same regs at the same time, the
- * APIs provided by IIO subsystem are used here instead to avoid problems. As
- * a result, the two passed in params are of no actual use.
- *
  * Return a positive value on success, errno on failure.
  */
 static int intel_xpower_pmic_get_raw_temp(struct regmap *regmap, int reg)
 {
-       struct iio_channel *gpadc_chan;
-       int ret, val;
-
-       gpadc_chan = iio_channel_get(NULL, "axp288-system-temp");
-       if (IS_ERR_OR_NULL(gpadc_chan))
-               return -EACCES;
+       u8 buf[2];
 
-       ret = iio_read_channel_raw(gpadc_chan, &val);
-       if (ret < 0)
-               val = ret;
+       if (regmap_bulk_read(regmap, AXP288_GP_ADC_H, buf, 2))
+               return -EIO;
 
-       iio_channel_release(gpadc_chan);
-       return val;
+       return (buf[0] << 4) + ((buf[1] >> 4) & 0x0F);
 }
 
 static struct intel_pmic_opregion_data intel_xpower_pmic_opregion_data = {
index fcd4ce6..1c2b846 100644 (file)
@@ -200,6 +200,7 @@ static int acpi_power_get_list_state(struct list_head *list, int *state)
                return -EINVAL;
 
        /* The state of the list is 'on' IFF all resources are 'on'. */
+       cur_state = 0;
        list_for_each_entry(entry, list, node) {
                struct acpi_power_resource *resource = entry->resource;
                acpi_handle handle = resource->device.handle;
index 9d5f0c7..8697a82 100644 (file)
@@ -251,6 +251,9 @@ static int __acpi_processor_start(struct acpi_device *device)
        if (ACPI_SUCCESS(status))
                return 0;
 
+       result = -ENODEV;
+       acpi_pss_perf_exit(pr, device);
+
 err_power_exit:
        acpi_processor_power_exit(pr);
        return result;
@@ -259,11 +262,16 @@ err_power_exit:
 static int acpi_processor_start(struct device *dev)
 {
        struct acpi_device *device = ACPI_COMPANION(dev);
+       int ret;
 
        if (!device)
                return -ENODEV;
 
-       return __acpi_processor_start(device);
+       /* Protect against concurrent CPU hotplug operations */
+       get_online_cpus();
+       ret = __acpi_processor_start(device);
+       put_online_cpus();
+       return ret;
 }
 
 static int acpi_processor_stop(struct device *dev)
index a12f96c..3de3463 100644 (file)
@@ -62,8 +62,8 @@ struct acpi_processor_throttling_arg {
 #define THROTTLING_POSTCHANGE      (2)
 
 static int acpi_processor_get_throttling(struct acpi_processor *pr);
-int acpi_processor_set_throttling(struct acpi_processor *pr,
-                                               int state, bool force);
+static int __acpi_processor_set_throttling(struct acpi_processor *pr,
+                                          int state, bool force, bool direct);
 
 static int acpi_processor_update_tsd_coord(void)
 {
@@ -891,7 +891,8 @@ static int acpi_processor_get_throttling_ptc(struct acpi_processor *pr)
                        ACPI_DEBUG_PRINT((ACPI_DB_INFO,
                                "Invalid throttling state, reset\n"));
                        state = 0;
-                       ret = acpi_processor_set_throttling(pr, state, true);
+                       ret = __acpi_processor_set_throttling(pr, state, true,
+                                                             true);
                        if (ret)
                                return ret;
                }
@@ -901,36 +902,31 @@ static int acpi_processor_get_throttling_ptc(struct acpi_processor *pr)
        return 0;
 }
 
-static int acpi_processor_get_throttling(struct acpi_processor *pr)
+static long __acpi_processor_get_throttling(void *data)
 {
-       cpumask_var_t saved_mask;
-       int ret;
+       struct acpi_processor *pr = data;
+
+       return pr->throttling.acpi_processor_get_throttling(pr);
+}
 
+static int acpi_processor_get_throttling(struct acpi_processor *pr)
+{
        if (!pr)
                return -EINVAL;
 
        if (!pr->flags.throttling)
                return -ENODEV;
 
-       if (!alloc_cpumask_var(&saved_mask, GFP_KERNEL))
-               return -ENOMEM;
-
        /*
-        * Migrate task to the cpu pointed by pr.
+        * This is either called from the CPU hotplug callback of
+        * processor_driver or via the ACPI probe function. In the latter
+        * case the CPU is not guaranteed to be online. Both call sites are
+        * protected against CPU hotplug.
         */
-       cpumask_copy(saved_mask, &current->cpus_allowed);
-       /* FIXME: use work_on_cpu() */
-       if (set_cpus_allowed_ptr(current, cpumask_of(pr->id))) {
-               /* Can't migrate to the target pr->id CPU. Exit */
-               free_cpumask_var(saved_mask);
+       if (!cpu_online(pr->id))
                return -ENODEV;
-       }
-       ret = pr->throttling.acpi_processor_get_throttling(pr);
-       /* restore the previous state */
-       set_cpus_allowed_ptr(current, saved_mask);
-       free_cpumask_var(saved_mask);
 
-       return ret;
+       return work_on_cpu(pr->id, __acpi_processor_get_throttling, pr);
 }
 
 static int acpi_processor_get_fadt_info(struct acpi_processor *pr)
@@ -1080,8 +1076,15 @@ static long acpi_processor_throttling_fn(void *data)
                        arg->target_state, arg->force);
 }
 
-int acpi_processor_set_throttling(struct acpi_processor *pr,
-                                               int state, bool force)
+static int call_on_cpu(int cpu, long (*fn)(void *), void *arg, bool direct)
+{
+       if (direct)
+               return fn(arg);
+       return work_on_cpu(cpu, fn, arg);
+}
+
+static int __acpi_processor_set_throttling(struct acpi_processor *pr,
+                                          int state, bool force, bool direct)
 {
        int ret = 0;
        unsigned int i;
@@ -1130,7 +1133,8 @@ int acpi_processor_set_throttling(struct acpi_processor *pr,
                arg.pr = pr;
                arg.target_state = state;
                arg.force = force;
-               ret = work_on_cpu(pr->id, acpi_processor_throttling_fn, &arg);
+               ret = call_on_cpu(pr->id, acpi_processor_throttling_fn, &arg,
+                                 direct);
        } else {
                /*
                 * When the T-state coordination is SW_ALL or HW_ALL,
@@ -1163,8 +1167,8 @@ int acpi_processor_set_throttling(struct acpi_processor *pr,
                        arg.pr = match_pr;
                        arg.target_state = state;
                        arg.force = force;
-                       ret = work_on_cpu(pr->id, acpi_processor_throttling_fn,
-                               &arg);
+                       ret = call_on_cpu(pr->id, acpi_processor_throttling_fn,
+                                         &arg, direct);
                }
        }
        /*
@@ -1182,6 +1186,12 @@ int acpi_processor_set_throttling(struct acpi_processor *pr,
        return ret;
 }
 
+int acpi_processor_set_throttling(struct acpi_processor *pr, int state,
+                                 bool force)
+{
+       return __acpi_processor_set_throttling(pr, state, force, false);
+}
+
 int acpi_processor_get_throttling_info(struct acpi_processor *pr)
 {
        int result = 0;
index 3afddcd..9364398 100644 (file)
@@ -37,14 +37,16 @@ static const u8 ads_uuid[16] = {
 
 static bool acpi_enumerate_nondev_subnodes(acpi_handle scope,
                                           const union acpi_object *desc,
-                                          struct acpi_device_data *data);
+                                          struct acpi_device_data *data,
+                                          struct fwnode_handle *parent);
 static bool acpi_extract_properties(const union acpi_object *desc,
                                    struct acpi_device_data *data);
 
 static bool acpi_nondev_subnode_extract(const union acpi_object *desc,
                                        acpi_handle handle,
                                        const union acpi_object *link,
-                                       struct list_head *list)
+                                       struct list_head *list,
+                                       struct fwnode_handle *parent)
 {
        struct acpi_data_node *dn;
        bool result;
@@ -55,6 +57,7 @@ static bool acpi_nondev_subnode_extract(const union acpi_object *desc,
 
        dn->name = link->package.elements[0].string.pointer;
        dn->fwnode.type = FWNODE_ACPI_DATA;
+       dn->parent = parent;
        INIT_LIST_HEAD(&dn->data.subnodes);
 
        result = acpi_extract_properties(desc, &dn->data);
@@ -71,9 +74,11 @@ static bool acpi_nondev_subnode_extract(const union acpi_object *desc,
                 */
                status = acpi_get_parent(handle, &scope);
                if (ACPI_SUCCESS(status)
-                   && acpi_enumerate_nondev_subnodes(scope, desc, &dn->data))
+                   && acpi_enumerate_nondev_subnodes(scope, desc, &dn->data,
+                                                     &dn->fwnode))
                        result = true;
-       } else if (acpi_enumerate_nondev_subnodes(NULL, desc, &dn->data)) {
+       } else if (acpi_enumerate_nondev_subnodes(NULL, desc, &dn->data,
+                                                 &dn->fwnode)) {
                result = true;
        }
 
@@ -91,7 +96,8 @@ static bool acpi_nondev_subnode_extract(const union acpi_object *desc,
 
 static bool acpi_nondev_subnode_data_ok(acpi_handle handle,
                                        const union acpi_object *link,
-                                       struct list_head *list)
+                                       struct list_head *list,
+                                       struct fwnode_handle *parent)
 {
        struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER };
        acpi_status status;
@@ -101,7 +107,8 @@ static bool acpi_nondev_subnode_data_ok(acpi_handle handle,
        if (ACPI_FAILURE(status))
                return false;
 
-       if (acpi_nondev_subnode_extract(buf.pointer, handle, link, list))
+       if (acpi_nondev_subnode_extract(buf.pointer, handle, link, list,
+                                       parent))
                return true;
 
        ACPI_FREE(buf.pointer);
@@ -110,7 +117,8 @@ static bool acpi_nondev_subnode_data_ok(acpi_handle handle,
 
 static bool acpi_nondev_subnode_ok(acpi_handle scope,
                                   const union acpi_object *link,
-                                  struct list_head *list)
+                                  struct list_head *list,
+                                  struct fwnode_handle *parent)
 {
        acpi_handle handle;
        acpi_status status;
@@ -123,12 +131,13 @@ static bool acpi_nondev_subnode_ok(acpi_handle scope,
        if (ACPI_FAILURE(status))
                return false;
 
-       return acpi_nondev_subnode_data_ok(handle, link, list);
+       return acpi_nondev_subnode_data_ok(handle, link, list, parent);
 }
 
 static int acpi_add_nondev_subnodes(acpi_handle scope,
                                    const union acpi_object *links,
-                                   struct list_head *list)
+                                   struct list_head *list,
+                                   struct fwnode_handle *parent)
 {
        bool ret = false;
        int i;
@@ -150,15 +159,18 @@ static int acpi_add_nondev_subnodes(acpi_handle scope,
                /* The second one may be a string, a reference or a package. */
                switch (link->package.elements[1].type) {
                case ACPI_TYPE_STRING:
-                       result = acpi_nondev_subnode_ok(scope, link, list);
+                       result = acpi_nondev_subnode_ok(scope, link, list,
+                                                        parent);
                        break;
                case ACPI_TYPE_LOCAL_REFERENCE:
                        handle = link->package.elements[1].reference.handle;
-                       result = acpi_nondev_subnode_data_ok(handle, link, list);
+                       result = acpi_nondev_subnode_data_ok(handle, link, list,
+                                                            parent);
                        break;
                case ACPI_TYPE_PACKAGE:
                        desc = &link->package.elements[1];
-                       result = acpi_nondev_subnode_extract(desc, NULL, link, list);
+                       result = acpi_nondev_subnode_extract(desc, NULL, link,
+                                                            list, parent);
                        break;
                default:
                        result = false;
@@ -172,7 +184,8 @@ static int acpi_add_nondev_subnodes(acpi_handle scope,
 
 static bool acpi_enumerate_nondev_subnodes(acpi_handle scope,
                                           const union acpi_object *desc,
-                                          struct acpi_device_data *data)
+                                          struct acpi_device_data *data,
+                                          struct fwnode_handle *parent)
 {
        int i;
 
@@ -194,7 +207,8 @@ static bool acpi_enumerate_nondev_subnodes(acpi_handle scope,
                if (memcmp(uuid->buffer.pointer, ads_uuid, sizeof(ads_uuid)))
                        continue;
 
-               return acpi_add_nondev_subnodes(scope, links, &data->subnodes);
+               return acpi_add_nondev_subnodes(scope, links, &data->subnodes,
+                                               parent);
        }
 
        return false;
@@ -345,7 +359,8 @@ void acpi_init_properties(struct acpi_device *adev)
                if (acpi_of)
                        acpi_init_of_compatible(adev);
        }
-       if (acpi_enumerate_nondev_subnodes(adev->handle, buf.pointer, &adev->data))
+       if (acpi_enumerate_nondev_subnodes(adev->handle, buf.pointer,
+                                       &adev->data, acpi_fwnode_handle(adev)))
                adev->data.pointer = buf.pointer;
 
        if (!adev->data.pointer) {
@@ -699,6 +714,8 @@ static int acpi_data_prop_read_single(struct acpi_device_data *data,
                        return ret;
 
                *(char **)val = obj->string.pointer;
+
+               return 1;
        } else {
                ret = -EINVAL;
        }
@@ -708,7 +725,15 @@ static int acpi_data_prop_read_single(struct acpi_device_data *data,
 int acpi_dev_prop_read_single(struct acpi_device *adev, const char *propname,
                              enum dev_prop_type proptype, void *val)
 {
-       return adev ? acpi_data_prop_read_single(&adev->data, propname, proptype, val) : -EINVAL;
+       int ret;
+
+       if (!adev)
+               return -EINVAL;
+
+       ret = acpi_data_prop_read_single(&adev->data, propname, proptype, val);
+       if (ret < 0 || proptype != ACPI_TYPE_STRING)
+               return ret;
+       return 0;
 }
 
 static int acpi_copy_property_array_u8(const union acpi_object *items, u8 *val,
@@ -784,7 +809,7 @@ static int acpi_copy_property_array_string(const union acpi_object *items,
 
                val[i] = items[i].string.pointer;
        }
-       return 0;
+       return nval;
 }
 
 static int acpi_data_prop_read(struct acpi_device_data *data,
@@ -798,7 +823,7 @@ static int acpi_data_prop_read(struct acpi_device_data *data,
 
        if (val && nval == 1) {
                ret = acpi_data_prop_read_single(data, propname, proptype, val);
-               if (!ret)
+               if (ret >= 0)
                        return ret;
        }
 
@@ -809,7 +834,7 @@ static int acpi_data_prop_read(struct acpi_device_data *data,
        if (!val)
                return obj->package.count;
 
-       if (nval > obj->package.count)
+       if (proptype != DEV_PROP_STRING && nval > obj->package.count)
                return -EOVERFLOW;
        else if (nval <= 0)
                return -EINVAL;
@@ -830,7 +855,9 @@ static int acpi_data_prop_read(struct acpi_device_data *data,
                ret = acpi_copy_property_array_u64(items, (u64 *)val, nval);
                break;
        case DEV_PROP_STRING:
-               ret = acpi_copy_property_array_string(items, (char **)val, nval);
+               ret = acpi_copy_property_array_string(
+                       items, (char **)val,
+                       min_t(u32, nval, obj->package.count));
                break;
        default:
                ret = -EINVAL;
@@ -865,21 +892,22 @@ int acpi_node_prop_read(struct fwnode_handle *fwnode,  const char *propname,
 }
 
 /**
- * acpi_get_next_subnode - Return the next child node handle for a device.
- * @dev: Device to find the next child node for.
+ * acpi_get_next_subnode - Return the next child node handle for a fwnode
+ * @fwnode: Firmware node to find the next child node for.
  * @child: Handle to one of the device's child nodes or a null handle.
  */
-struct fwnode_handle *acpi_get_next_subnode(struct device *dev,
+struct fwnode_handle *acpi_get_next_subnode(struct fwnode_handle *fwnode,
                                            struct fwnode_handle *child)
 {
-       struct acpi_device *adev = ACPI_COMPANION(dev);
+       struct acpi_device *adev = to_acpi_device_node(fwnode);
        struct list_head *head, *next;
 
-       if (!adev)
-               return NULL;
-
        if (!child || child->type == FWNODE_ACPI) {
-               head = &adev->children;
+               if (adev)
+                       head = &adev->children;
+               else
+                       goto nondev;
+
                if (list_empty(head))
                        goto nondev;
 
@@ -888,7 +916,6 @@ struct fwnode_handle *acpi_get_next_subnode(struct device *dev,
                        next = adev->node.next;
                        if (next == head) {
                                child = NULL;
-                               adev = ACPI_COMPANION(dev);
                                goto nondev;
                        }
                        adev = list_entry(next, struct acpi_device, node);
@@ -900,9 +927,16 @@ struct fwnode_handle *acpi_get_next_subnode(struct device *dev,
 
  nondev:
        if (!child || child->type == FWNODE_ACPI_DATA) {
+               struct acpi_data_node *data = to_acpi_data_node(fwnode);
                struct acpi_data_node *dn;
 
-               head = &adev->data.subnodes;
+               if (adev)
+                       head = &adev->data.subnodes;
+               else if (data)
+                       head = &data->data.subnodes;
+               else
+                       return NULL;
+
                if (list_empty(head))
                        return NULL;
 
@@ -920,3 +954,168 @@ struct fwnode_handle *acpi_get_next_subnode(struct device *dev,
        }
        return NULL;
 }
+
+/**
+ * acpi_node_get_parent - Return parent fwnode of this fwnode
+ * @fwnode: Firmware node whose parent to get
+ *
+ * Returns parent node of an ACPI device or data firmware node or %NULL if
+ * not available.
+ */
+struct fwnode_handle *acpi_node_get_parent(struct fwnode_handle *fwnode)
+{
+       if (is_acpi_data_node(fwnode)) {
+               /* All data nodes have parent pointer so just return that */
+               return to_acpi_data_node(fwnode)->parent;
+       } else if (is_acpi_device_node(fwnode)) {
+               acpi_handle handle, parent_handle;
+
+               handle = to_acpi_device_node(fwnode)->handle;
+               if (ACPI_SUCCESS(acpi_get_parent(handle, &parent_handle))) {
+                       struct acpi_device *adev;
+
+                       if (!acpi_bus_get_device(parent_handle, &adev))
+                               return acpi_fwnode_handle(adev);
+               }
+       }
+
+       return NULL;
+}
+
+/**
+ * acpi_graph_get_next_endpoint - Get next endpoint ACPI firmware node
+ * @fwnode: Pointer to the parent firmware node
+ * @prev: Previous endpoint node or %NULL to get the first
+ *
+ * Looks up next endpoint ACPI firmware node below a given @fwnode. Returns
+ * %NULL if there is no next endpoint, ERR_PTR() in case of error. In case
+ * of success the next endpoint is returned.
+ */
+struct fwnode_handle *acpi_graph_get_next_endpoint(struct fwnode_handle *fwnode,
+                                                  struct fwnode_handle *prev)
+{
+       struct fwnode_handle *port = NULL;
+       struct fwnode_handle *endpoint;
+
+       if (!prev) {
+               do {
+                       port = fwnode_get_next_child_node(fwnode, port);
+                       /* Ports must have port property */
+                       if (fwnode_property_present(port, "port"))
+                               break;
+               } while (port);
+       } else {
+               port = fwnode_get_parent(prev);
+       }
+
+       if (!port)
+               return NULL;
+
+       endpoint = fwnode_get_next_child_node(port, prev);
+       while (!endpoint) {
+               port = fwnode_get_next_child_node(fwnode, port);
+               if (!port)
+                       break;
+               if (fwnode_property_present(port, "port"))
+                       endpoint = fwnode_get_next_child_node(port, NULL);
+       }
+
+       if (endpoint) {
+               /* Endpoints must have "endpoint" property */
+               if (!fwnode_property_present(endpoint, "endpoint"))
+                       return ERR_PTR(-EPROTO);
+       }
+
+       return endpoint;
+}
+
+/**
+ * acpi_graph_get_child_prop_value - Return a child with a given property value
+ * @fwnode: device fwnode
+ * @prop_name: The name of the property to look for
+ * @val: the desired property value
+ *
+ * Return the first child of @fwnode whose u32 property @prop_name equals
+ * @val; children where reading it fails are skipped. NULL if none matches.
+ */
+static struct fwnode_handle *acpi_graph_get_child_prop_value(
+       struct fwnode_handle *fwnode, const char *prop_name, unsigned int val)
+{
+       struct fwnode_handle *child;
+
+       fwnode_for_each_child_node(fwnode, child) {
+               u32 nr;
+
+               if (fwnode_property_read_u32(child, prop_name, &nr))
+                       continue;
+
+               if (val == nr)
+                       return child;
+       }
+
+       return NULL;
+}
+
+
+/**
+ * acpi_graph_get_remote_endpoint - Parses and returns remote end of an endpoint
+ * @fwnode: Endpoint firmware node pointing to a remote device
+ * @parent: Firmware node of remote port parent is filled here if not %NULL
+ * @port: Firmware node of remote port is filled here if not %NULL
+ * @endpoint: Firmware node of remote endpoint is filled here if not %NULL
+ *
+ * Function parses remote end of ACPI firmware remote endpoint and fills in
+ * fields requested by the caller. Returns %0 in case of success and
+ * negative errno otherwise.
+ */
+int acpi_graph_get_remote_endpoint(struct fwnode_handle *fwnode,
+                                  struct fwnode_handle **parent,
+                                  struct fwnode_handle **port,
+                                  struct fwnode_handle **endpoint)
+{
+       unsigned int port_nr, endpoint_nr;
+       struct acpi_reference_args args;
+       int ret;
+
+       memset(&args, 0, sizeof(args));
+       ret = acpi_node_get_property_reference(fwnode, "remote-endpoint", 0,
+                                              &args);
+       if (ret)
+               return ret;
+
+       /*
+        * Always require two arguments with the reference: port and
+        * endpoint indices.
+        */
+       if (args.nargs != 2)
+               return -EPROTO;
+
+       fwnode = acpi_fwnode_handle(args.adev);
+       port_nr = args.args[0];
+       endpoint_nr = args.args[1];
+
+       if (parent)
+               *parent = fwnode;
+
+       if (!port && !endpoint)
+               return 0;
+
+       fwnode = acpi_graph_get_child_prop_value(fwnode, "port", port_nr);
+       if (!fwnode)
+               return -EPROTO;
+
+       if (port)
+               *port = fwnode;
+
+       if (!endpoint)
+               return 0;
+
+       fwnode = acpi_graph_get_child_prop_value(fwnode, "endpoint",
+                                                endpoint_nr);
+       if (!fwnode)
+               return -EPROTO;
+
+       *endpoint = fwnode;
+
+       return 0;
+}
index 1926918..c269310 100644 (file)
@@ -30,12 +30,6 @@ extern struct acpi_device *acpi_root;
 
 #define INVALID_ACPI_HANDLE    ((acpi_handle)empty_zero_page)
 
-/*
- * If set, devices will be hot-removed even if they cannot be put offline
- * gracefully (from the kernel's standpoint).
- */
-bool acpi_force_hot_remove;
-
 static const char *dummy_hid = "device";
 
 static LIST_HEAD(acpi_dep_list);
@@ -170,9 +164,6 @@ static acpi_status acpi_bus_offline(acpi_handle handle, u32 lvl, void *data,
                        pn->put_online = false;
                }
                ret = device_offline(pn->dev);
-               if (acpi_force_hot_remove)
-                       continue;
-
                if (ret >= 0) {
                        pn->put_online = !ret;
                } else {
@@ -241,11 +232,11 @@ static int acpi_scan_try_to_offline(struct acpi_device *device)
                acpi_walk_namespace(ACPI_TYPE_ANY, handle, ACPI_UINT32_MAX,
                                    NULL, acpi_bus_offline, (void *)true,
                                    (void **)&errdev);
-               if (!errdev || acpi_force_hot_remove)
+               if (!errdev)
                        acpi_bus_offline(handle, 0, (void *)true,
                                         (void **)&errdev);
 
-               if (errdev && !acpi_force_hot_remove) {
+               if (errdev) {
                        dev_warn(errdev, "Offline failed.\n");
                        acpi_bus_online(handle, 0, NULL, NULL);
                        acpi_walk_namespace(ACPI_TYPE_ANY, handle,
@@ -263,8 +254,7 @@ static int acpi_scan_hot_remove(struct acpi_device *device)
        unsigned long long sta;
        acpi_status status;
 
-       if (device->handler && device->handler->hotplug.demand_offline
-           && !acpi_force_hot_remove) {
+       if (device->handler && device->handler->hotplug.demand_offline) {
                if (!acpi_scan_is_offline(device, true))
                        return -EBUSY;
        } else {
@@ -1850,6 +1840,8 @@ static void acpi_bus_attach(struct acpi_device *device)
                        device->flags.power_manageable = 0;
 
                device->flags.initialized = true;
+       } else if (device->flags.visited) {
+               goto ok;
        }
 
        ret = acpi_scan_attach_handler(device);
@@ -1857,15 +1849,20 @@ static void acpi_bus_attach(struct acpi_device *device)
                return;
 
        device->flags.match_driver = true;
-       if (!ret) {
-               ret = device_attach(&device->dev);
-               if (ret < 0)
-                       return;
-
-               if (!ret && device->pnp.type.platform_id)
-                       acpi_default_enumeration(device);
+       if (ret > 0) {
+               acpi_device_set_enumerated(device);
+               goto ok;
        }
 
+       ret = device_attach(&device->dev);
+       if (ret < 0)
+               return;
+
+       if (device->pnp.type.platform_id)
+               acpi_default_enumeration(device);
+       else
+               acpi_device_set_enumerated(device);
+
  ok:
        list_for_each_entry(child, &device->children, node)
                acpi_bus_attach(child);
index cf05ae9..1b5ee1e 100644 (file)
@@ -921,7 +921,7 @@ void acpi_sysfs_add_hotplug_profile(struct acpi_hotplug_profile *hotplug,
 static ssize_t force_remove_show(struct kobject *kobj,
                                 struct kobj_attribute *attr, char *buf)
 {
-       return sprintf(buf, "%d\n", !!acpi_force_hot_remove);
+       return sprintf(buf, "%d\n", 0);
 }
 
 static ssize_t force_remove_store(struct kobject *kobj,
@@ -935,9 +935,10 @@ static ssize_t force_remove_store(struct kobject *kobj,
        if (ret < 0)
                return ret;
 
-       lock_device_hotplug();
-       acpi_force_hot_remove = val;
-       unlock_device_hotplug();
+       if (val) {
+               pr_err("Enabling force_remove is not supported anymore. Please report to linux-acpi@vger.kernel.org if you depend on this functionality\n");
+               return -EINVAL;
+       }
        return size;
 }
 
index 2604189..ff42539 100644 (file)
@@ -311,22 +311,6 @@ acpi_parse_entries_array(char *id, unsigned long table_size,
 }
 
 int __init
-acpi_parse_entries(char *id,
-                       unsigned long table_size,
-                       acpi_tbl_entry_handler handler,
-                       struct acpi_table_header *table_header,
-                       int entry_id, unsigned int max_entries)
-{
-       struct acpi_subtable_proc proc = {
-               .id             = entry_id,
-               .handler        = handler,
-       };
-
-       return acpi_parse_entries_array(id, table_size, table_header,
-                       &proc, 1, max_entries);
-}
-
-int __init
 acpi_table_parse_entries_array(char *id,
                         unsigned long table_size,
                         struct acpi_subtable_proc *proc, int proc_num,
@@ -556,7 +540,7 @@ void __init acpi_table_upgrade(void)
         * But it's not enough on X86 because ioremap will
         * complain later (used by acpi_os_map_memory) that the pages
         * that should get mapped are not marked "reserved".
-        * Both memblock_reserve and e820_add_region (via arch_reserve_mem_area)
+        * Both memblock_reserve and e820__range_add (via arch_reserve_mem_area)
         * works fine.
         */
        memblock_reserve(acpi_tables_addr, all_tables_size);
index 22c0995..27d0dcf 100644 (file)
@@ -736,6 +736,72 @@ bool acpi_dev_found(const char *hid)
 }
 EXPORT_SYMBOL(acpi_dev_found);
 
+struct acpi_dev_present_info {
+       struct acpi_device_id hid[2];
+       const char *uid;
+       s64 hrv;
+};
+
+static int acpi_dev_present_cb(struct device *dev, void *data)
+{
+       struct acpi_device *adev = to_acpi_device(dev);
+       struct acpi_dev_present_info *match = data;
+       unsigned long long hrv;
+       acpi_status status;
+
+       if (acpi_match_device_ids(adev, match->hid))
+               return 0;
+
+       if (match->uid && (!adev->pnp.unique_id ||
+           strcmp(adev->pnp.unique_id, match->uid)))
+               return 0;
+
+       if (match->hrv == -1)
+               return 1;
+
+       status = acpi_evaluate_integer(adev->handle, "_HRV", NULL, &hrv);
+       if (ACPI_FAILURE(status))
+               return 0;
+
+       return hrv == match->hrv;
+}
+
+/**
+ * acpi_dev_present - Detect that a given ACPI device is present
+ * @hid: Hardware ID of the device.
+ * @uid: Unique ID of the device, pass NULL to not check _UID
+ * @hrv: Hardware Revision of the device, pass -1 to not check _HRV
+ *
+ * Return %true if a matching device was present at the moment of invocation.
+ * Note that if the device is pluggable, it may since have disappeared.
+ *
+ * Note that unlike acpi_dev_found() this function checks the status
+ * of the device. So for devices which are present in the DSDT, but
+ * which are disabled (their _STA callback returns 0) this function
+ * will return false.
+ *
+ * For this function to work, acpi_bus_scan() must have been executed
+ * which happens in the subsys_initcall() subsection. Hence, do not
+ * call from a subsys_initcall() or earlier (use acpi_get_devices()
+ * instead). Calling from module_init() is fine (which is synonymous
+ * with device_initcall()).
+ */
+bool acpi_dev_present(const char *hid, const char *uid, s64 hrv)
+{
+       struct acpi_dev_present_info match = {};
+       struct device *dev;
+
+       strlcpy(match.hid[0].id, hid, sizeof(match.hid[0].id));
+       match.uid = uid;
+       match.hrv = hrv;
+
+       dev = bus_find_device(&acpi_bus_type, NULL, &match,
+                             acpi_dev_present_cb);
+
+       return !!dev;
+}
+EXPORT_SYMBOL(acpi_dev_present);
+
 /*
  * acpi_backlight= handling, this is done here rather then in video_detect.c
  * because __setup cannot be used in modules.
index 70b57d2..ff6cb9e 100644 (file)
@@ -14,7 +14,6 @@ menuconfig ATA
        tristate "Serial ATA and Parallel ATA drivers (libata)"
        depends on HAS_IOMEM
        depends on BLOCK
-       depends on !(M32R || S390) || BROKEN
        select SCSI
        select GLOB
        ---help---
@@ -118,6 +117,15 @@ config AHCI_DA850
 
          If unsure, say N.
 
+config AHCI_DM816
+       tristate "DaVinci DM816 AHCI SATA support"
+       depends on ARCH_OMAP2PLUS
+       help
+         This option enables support for the DaVinci DM816 SoC's
+         onboard AHCI SATA controller.
+
+         If unsure, say N.
+
 config AHCI_ST
        tristate "ST AHCI SATA support"
        depends on ARCH_STI
@@ -885,14 +893,6 @@ config PATA_AT32
 
          If unsure, say N.
 
-config PATA_AT91
-       tristate "PATA support for AT91SAM9260"
-       depends on ARM && SOC_AT91SAM9
-       help
-         This option enables support for IDE devices on the Atmel AT91SAM9260 SoC.
-
-         If unsure, say N.
-
 config PATA_CMD640_PCI
        tristate "CMD640 PCI PATA support (Experimental)"
        depends on PCI
index 89a0a19..3048cc1 100644 (file)
@@ -14,6 +14,7 @@ obj-$(CONFIG_SATA_HIGHBANK)   += sata_highbank.o libahci.o
 obj-$(CONFIG_AHCI_BRCM)                += ahci_brcm.o libahci.o libahci_platform.o
 obj-$(CONFIG_AHCI_CEVA)                += ahci_ceva.o libahci.o libahci_platform.o
 obj-$(CONFIG_AHCI_DA850)       += ahci_da850.o libahci.o libahci_platform.o
+obj-$(CONFIG_AHCI_DM816)       += ahci_dm816.o libahci.o libahci_platform.o
 obj-$(CONFIG_AHCI_IMX)         += ahci_imx.o libahci.o libahci_platform.o
 obj-$(CONFIG_AHCI_MVEBU)       += ahci_mvebu.o libahci.o libahci_platform.o
 obj-$(CONFIG_AHCI_OCTEON)      += ahci_octeon.o
@@ -91,7 +92,6 @@ obj-$(CONFIG_PATA_WINBOND)    += pata_sl82c105.o
 
 # SFF PIO only
 obj-$(CONFIG_PATA_AT32)                += pata_at32.o
-obj-$(CONFIG_PATA_AT91)                += pata_at91.o
 obj-$(CONFIG_PATA_CMD640_PCI)  += pata_cmd640.o
 obj-$(CONFIG_PATA_FALCON)      += pata_falcon.o
 obj-$(CONFIG_PATA_ISAPNP)      += pata_isapnp.o
diff --git a/drivers/ata/ahci_dm816.c b/drivers/ata/ahci_dm816.c
new file mode 100644 (file)
index 0000000..fbd827c
--- /dev/null
@@ -0,0 +1,200 @@
+/*
+ * DaVinci DM816 AHCI SATA platform driver
+ *
+ * Copyright (C) 2017 BayLibre SAS
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/pm.h>
+#include <linux/platform_device.h>
+#include <linux/libata.h>
+#include <linux/ahci_platform.h>
+
+#include "ahci.h"
+
+#define AHCI_DM816_DRV_NAME            "ahci-dm816"
+
+#define AHCI_DM816_PHY_ENPLL(x)                ((x) << 0)
+#define AHCI_DM816_PHY_MPY(x)          ((x) << 1)
+#define AHCI_DM816_PHY_LOS(x)          ((x) << 12)
+#define AHCI_DM816_PHY_RXCDR(x)                ((x) << 13)
+#define AHCI_DM816_PHY_RXEQ(x)         ((x) << 16)
+#define AHCI_DM816_PHY_TXSWING(x)      ((x) << 23)
+
+#define AHCI_DM816_P0PHYCR_REG         0x178
+#define AHCI_DM816_P1PHYCR_REG         0x1f8
+
+#define AHCI_DM816_PLL_OUT             1500000000LU
+
+static const unsigned long pll_mpy_table[] = {
+         400,  500,  600,  800,  825, 1000, 1200,
+        1250, 1500, 1600, 1650, 2000, 2200, 2500
+};
+
+static int ahci_dm816_get_mpy_bits(unsigned long refclk_rate)
+{
+       unsigned long pll_multiplier;
+       int i;
+
+       /*
+        * We need to determine the value of the multiplier (MPY) bits.
+        * In order to include the 8.25 multiplier we need to first divide
+        * the refclk rate by 100.
+        */
+       pll_multiplier = AHCI_DM816_PLL_OUT / (refclk_rate / 100);
+
+       for (i = 0; i < ARRAY_SIZE(pll_mpy_table); i++) {
+               if (pll_mpy_table[i] == pll_multiplier)
+                       return i;
+       }
+
+       /*
+        * We should have divided evenly - if not, return an invalid
+        * value.
+        */
+       return -1;
+}
+
+static int ahci_dm816_phy_init(struct ahci_host_priv *hpriv, struct device *dev)
+{
+       unsigned long refclk_rate;
+       int mpy;
+       u32 val;
+
+       /*
+        * We should have been supplied two clocks: the functional and
+        * keep-alive clock and the external reference clock. We need the
+        * rate of the latter to calculate the correct value of MPY bits.
+        */
+       if (!hpriv->clks[1]) {
+               dev_err(dev, "reference clock not supplied\n");
+               return -EINVAL;
+       }
+
+       refclk_rate = clk_get_rate(hpriv->clks[1]);
+       if ((refclk_rate % 100) != 0) {
+               dev_err(dev, "reference clock rate must be divisible by 100\n");
+               return -EINVAL;
+       }
+
+       mpy = ahci_dm816_get_mpy_bits(refclk_rate);
+       if (mpy < 0) {
+               dev_err(dev, "can't calculate the MPY bits value\n");
+               return -EINVAL;
+       }
+
+       /* Enable the PHY and configure the first HBA port. */
+       val = AHCI_DM816_PHY_MPY(mpy) | AHCI_DM816_PHY_LOS(1) |
+             AHCI_DM816_PHY_RXCDR(4) | AHCI_DM816_PHY_RXEQ(1) |
+             AHCI_DM816_PHY_TXSWING(3) | AHCI_DM816_PHY_ENPLL(1);
+       writel(val, hpriv->mmio + AHCI_DM816_P0PHYCR_REG);
+
+       /* Configure the second HBA port. */
+       val = AHCI_DM816_PHY_LOS(1) | AHCI_DM816_PHY_RXCDR(4) |
+             AHCI_DM816_PHY_RXEQ(1) | AHCI_DM816_PHY_TXSWING(3);
+       writel(val, hpriv->mmio + AHCI_DM816_P1PHYCR_REG);
+
+       return 0;
+}
+
+static int ahci_dm816_softreset(struct ata_link *link,
+                               unsigned int *class, unsigned long deadline)
+{
+       int pmp, ret;
+
+       pmp = sata_srst_pmp(link);
+
+       /*
+        * There's an issue with the SATA controller on DM816 SoC: if we
+        * enable Port Multiplier support, but the drive is connected directly
+        * to the board, it can't be detected. As a workaround: if PMP is
+        * enabled, we first call ahci_do_softreset() and pass it the result of
+        * sata_srst_pmp(). If this call fails, we retry with pmp = 0.
+        */
+       ret = ahci_do_softreset(link, class, pmp, deadline, ahci_check_ready);
+       if (pmp && ret == -EBUSY)
+               return ahci_do_softreset(link, class, 0,
+                                        deadline, ahci_check_ready);
+
+       return ret;
+}
+
+static struct ata_port_operations ahci_dm816_port_ops = {
+       .inherits = &ahci_platform_ops,
+       .softreset = ahci_dm816_softreset,
+};
+
+static const struct ata_port_info ahci_dm816_port_info = {
+       .flags          = AHCI_FLAG_COMMON,
+       .pio_mask       = ATA_PIO4,
+       .udma_mask      = ATA_UDMA6,
+       .port_ops       = &ahci_dm816_port_ops,
+};
+
+static struct scsi_host_template ahci_dm816_platform_sht = {
+       AHCI_SHT(AHCI_DM816_DRV_NAME),
+};
+
+static int ahci_dm816_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct ahci_host_priv *hpriv;
+       int rc;
+
+       hpriv = ahci_platform_get_resources(pdev);
+       if (IS_ERR(hpriv))
+               return PTR_ERR(hpriv);
+
+       rc = ahci_platform_enable_resources(hpriv);
+       if (rc)
+               return rc;
+
+       rc = ahci_dm816_phy_init(hpriv, dev);
+       if (rc)
+               goto disable_resources;
+
+       rc = ahci_platform_init_host(pdev, hpriv,
+                                    &ahci_dm816_port_info,
+                                    &ahci_dm816_platform_sht);
+       if (rc)
+               goto disable_resources;
+
+       return 0;
+
+disable_resources:
+       ahci_platform_disable_resources(hpriv);
+
+       return rc;
+}
+
+static SIMPLE_DEV_PM_OPS(ahci_dm816_pm_ops,
+                        ahci_platform_suspend,
+                        ahci_platform_resume);
+
+static const struct of_device_id ahci_dm816_of_match[] = {
+       { .compatible = "ti,dm816-ahci", },
+       { },
+};
+MODULE_DEVICE_TABLE(of, ahci_dm816_of_match);
+
+static struct platform_driver ahci_dm816_driver = {
+       .probe = ahci_dm816_probe,
+       .remove = ata_platform_remove_one,
+       .driver = {
+               .name = AHCI_DM816_DRV_NAME,
+               .of_match_table = ahci_dm816_of_match,
+               .pm = &ahci_dm816_pm_ops,
+       },
+};
+module_platform_driver(ahci_dm816_driver);
+
+MODULE_DESCRIPTION("DaVinci DM816 AHCI SATA platform driver");
+MODULE_AUTHOR("Bartosz Golaszewski <bgolaszewski@baylibre.com>");
+MODULE_LICENSE("GPL");
index ea865fe..5a44e08 100644 (file)
@@ -38,11 +38,6 @@ static int ahci_octeon_probe(struct platform_device *pdev)
        int ret;
 
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       if (!res) {
-               dev_err(&pdev->dev, "Platform resource[0] is missing\n");
-               return -ENODEV;
-       }
-
        base = devm_ioremap_resource(&pdev->dev, res);
        if (IS_ERR(base))
                return PTR_ERR(base);
index ca75823..2d83b8c 100644 (file)
@@ -4910,7 +4910,7 @@ void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg,
  *     LOCKING:
  *     spin_lock_irqsave(host lock)
  */
-void ata_sg_clean(struct ata_queued_cmd *qc)
+static void ata_sg_clean(struct ata_queued_cmd *qc)
 {
        struct ata_port *ap = qc->ap;
        struct scatterlist *sg = qc->sg;
@@ -5902,9 +5902,9 @@ struct ata_port *ata_port_alloc(struct ata_host *host)
        INIT_LIST_HEAD(&ap->eh_done_q);
        init_waitqueue_head(&ap->eh_wait_q);
        init_completion(&ap->park_req_pending);
-       init_timer_deferrable(&ap->fastdrain_timer);
-       ap->fastdrain_timer.function = ata_eh_fastdrain_timerfn;
-       ap->fastdrain_timer.data = (unsigned long)ap;
+       setup_deferrable_timer(&ap->fastdrain_timer,
+                              ata_eh_fastdrain_timerfn,
+                              (unsigned long)ap);
 
        ap->cbl = ATA_CBL_NONE;
 
index 1ac7074..49ba983 100644 (file)
@@ -3393,46 +3393,6 @@ static size_t ata_format_dsm_trim_descr(struct scsi_cmnd *cmd, u32 trmax,
 }
 
 /**
- * ata_format_dsm_trim_descr() - SATL Write Same to ATA SCT Write Same
- * @cmd: SCSI command being translated
- * @lba: Starting sector
- * @num: Number of sectors to be zero'd.
- *
- * Rewrite the WRITE SAME payload to be an SCT Write Same formatted
- * descriptor.
- * NOTE: Writes a pattern (0's) in the foreground.
- *
- * Return: Number of bytes copied into sglist.
- */
-static size_t ata_format_sct_write_same(struct scsi_cmnd *cmd, u64 lba, u64 num)
-{
-       struct scsi_device *sdp = cmd->device;
-       size_t len = sdp->sector_size;
-       size_t r;
-       u16 *buf;
-       unsigned long flags;
-
-       spin_lock_irqsave(&ata_scsi_rbuf_lock, flags);
-       buf = ((void *)ata_scsi_rbuf);
-
-       put_unaligned_le16(0x0002,  &buf[0]); /* SCT_ACT_WRITE_SAME */
-       put_unaligned_le16(0x0101,  &buf[1]); /* WRITE PTRN FG */
-       put_unaligned_le64(lba,     &buf[2]);
-       put_unaligned_le64(num,     &buf[6]);
-       put_unaligned_le32(0u,      &buf[10]); /* pattern */
-
-       WARN_ON(len > ATA_SCSI_RBUF_SIZE);
-
-       if (len > ATA_SCSI_RBUF_SIZE)
-               len = ATA_SCSI_RBUF_SIZE;
-
-       r = sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), buf, len);
-       spin_unlock_irqrestore(&ata_scsi_rbuf_lock, flags);
-
-       return r;
-}
-
-/**
  * ata_scsi_write_same_xlat() - SATL Write Same to ATA SCT Write Same
  * @qc: Command to be translated
  *
@@ -3462,32 +3422,31 @@ static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc)
        if (unlikely(!dev->dma_mode))
                goto invalid_opcode;
 
+       /*
+        * We only allow sending this command through the block layer,
+        * as it modifies the DATA OUT buffer, which would corrupt user
+        * memory for SG_IO commands.
+        */
+       if (unlikely(blk_rq_is_passthrough(scmd->request)))
+               goto invalid_opcode;
+
        if (unlikely(scmd->cmd_len < 16)) {
                fp = 15;
                goto invalid_fld;
        }
        scsi_16_lba_len(cdb, &block, &n_block);
 
-       if (unmap) {
-               /* If trim is not enabled the cmd is invalid. */
-               if ((dev->horkage & ATA_HORKAGE_NOTRIM) ||
-                   !ata_id_has_trim(dev->id)) {
-                       fp = 1;
-                       bp = 3;
-                       goto invalid_fld;
-               }
-               /* If the request is too large the cmd is invalid */
-               if (n_block > 0xffff * trmax) {
-                       fp = 2;
-                       goto invalid_fld;
-               }
-       } else {
-               /* If write same is not available the cmd is invalid */
-               if (!ata_id_sct_write_same(dev->id)) {
-                       fp = 1;
-                       bp = 3;
-                       goto invalid_fld;
-               }
+       if (!unmap ||
+           (dev->horkage & ATA_HORKAGE_NOTRIM) ||
+           !ata_id_has_trim(dev->id)) {
+               fp = 1;
+               bp = 3;
+               goto invalid_fld;
+       }
+       /* If the request is too large the cmd is invalid */
+       if (n_block > 0xffff * trmax) {
+               fp = 2;
+               goto invalid_fld;
        }
 
        /*
@@ -3502,49 +3461,28 @@ static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc)
         * For DATA SET MANAGEMENT TRIM in ACS-2 nsect (aka count)
         * is defined as number of 512 byte blocks to be transferred.
         */
-       if (unmap) {
-               size = ata_format_dsm_trim_descr(scmd, trmax, block, n_block);
-               if (size != len)
-                       goto invalid_param_len;
 
-               if (ata_ncq_enabled(dev) && ata_fpdma_dsm_supported(dev)) {
-                       /* Newer devices support queued TRIM commands */
-                       tf->protocol = ATA_PROT_NCQ;
-                       tf->command = ATA_CMD_FPDMA_SEND;
-                       tf->hob_nsect = ATA_SUBCMD_FPDMA_SEND_DSM & 0x1f;
-                       tf->nsect = qc->tag << 3;
-                       tf->hob_feature = (size / 512) >> 8;
-                       tf->feature = size / 512;
+       size = ata_format_dsm_trim_descr(scmd, trmax, block, n_block);
+       if (size != len)
+               goto invalid_param_len;
 
-                       tf->auxiliary = 1;
-               } else {
-                       tf->protocol = ATA_PROT_DMA;
-                       tf->hob_feature = 0;
-                       tf->feature = ATA_DSM_TRIM;
-                       tf->hob_nsect = (size / 512) >> 8;
-                       tf->nsect = size / 512;
-                       tf->command = ATA_CMD_DSM;
-               }
-       } else {
-               size = ata_format_sct_write_same(scmd, block, n_block);
-               if (size != len)
-                       goto invalid_param_len;
+       if (ata_ncq_enabled(dev) && ata_fpdma_dsm_supported(dev)) {
+               /* Newer devices support queued TRIM commands */
+               tf->protocol = ATA_PROT_NCQ;
+               tf->command = ATA_CMD_FPDMA_SEND;
+               tf->hob_nsect = ATA_SUBCMD_FPDMA_SEND_DSM & 0x1f;
+               tf->nsect = qc->tag << 3;
+               tf->hob_feature = (size / 512) >> 8;
+               tf->feature = size / 512;
 
-               tf->hob_feature = 0;
-               tf->feature = 0;
-               tf->hob_nsect = 0;
-               tf->nsect = 1;
-               tf->lbah = 0;
-               tf->lbam = 0;
-               tf->lbal = ATA_CMD_STANDBYNOW1;
-               tf->hob_lbah = 0;
-               tf->hob_lbam = 0;
-               tf->hob_lbal = 0;
-               tf->device = ATA_CMD_STANDBYNOW1;
+               tf->auxiliary = 1;
+       } else {
                tf->protocol = ATA_PROT_DMA;
-               tf->command = ATA_CMD_WRITE_LOG_DMA_EXT;
-               if (unlikely(dev->flags & ATA_DFLAG_PIO))
-                       tf->command = ATA_CMD_WRITE_LOG_EXT;
+               tf->hob_feature = 0;
+               tf->feature = ATA_DSM_TRIM;
+               tf->hob_nsect = (size / 512) >> 8;
+               tf->nsect = size / 512;
+               tf->command = ATA_CMD_DSM;
        }
 
        tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE | ATA_TFLAG_LBA48 |
@@ -3619,10 +3557,6 @@ static unsigned int ata_scsiop_maint_in(struct ata_scsi_args *args, u8 *rbuf)
        case START_STOP:
                supported = 3;
                break;
-       case WRITE_SAME_16:
-               if (!ata_id_sct_write_same(dev->id))
-                       break;
-               /* fallthrough: if SCT ... only enable for ZBC */
        case ZBC_IN:
        case ZBC_OUT:
                if (ata_id_zoned_cap(dev->id) ||
diff --git a/drivers/ata/pata_at91.c b/drivers/ata/pata_at91.c
deleted file mode 100644 (file)
index fd5b34f..0000000
+++ /dev/null
@@ -1,503 +0,0 @@
-/*
- * PATA driver for AT91SAM9260 Static Memory Controller
- * with CompactFlash interface in True IDE mode
- *
- * Copyright (C) 2009 Matyukevich Sergey
- *               2011 Igor Plyatov
- *
- * Based on:
- *      * generic platform driver by Paul Mundt: drivers/ata/pata_platform.c
- *      * pata_at32 driver by Kristoffer Nyborg Gregertsen
- *      * at91_ide driver by Stanislaw Gruszka
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/blkdev.h>
-#include <linux/gfp.h>
-#include <scsi/scsi_host.h>
-#include <linux/ata.h>
-#include <linux/clk.h>
-#include <linux/libata.h>
-#include <linux/mfd/syscon.h>
-#include <linux/mfd/syscon/atmel-smc.h>
-#include <linux/platform_device.h>
-#include <linux/ata_platform.h>
-#include <linux/platform_data/atmel.h>
-#include <linux/regmap.h>
-#include <linux/gpio.h>
-
-#define DRV_NAME               "pata_at91"
-#define DRV_VERSION            "0.3"
-
-#define CF_IDE_OFFSET          0x00c00000
-#define CF_ALT_IDE_OFFSET      0x00e00000
-#define CF_IDE_RES_SIZE                0x08
-#define CS_PULSE_MAXIMUM       319
-#define ER_SMC_CALC            1
-#define ER_SMC_RECALC          2
-
-struct at91_ide_info {
-       unsigned long mode;
-       unsigned int cs;
-       struct clk *mck;
-       void __iomem *ide_addr;
-       void __iomem *alt_addr;
-};
-
-/**
- * struct smc_range - range of valid values for SMC register.
- */
-struct smc_range {
-       int min;
-       int max;
-};
-
-struct regmap *smc;
-
-struct at91sam9_smc_generic_fields {
-       struct regmap_field *setup;
-       struct regmap_field *pulse;
-       struct regmap_field *cycle;
-       struct regmap_field *mode;
-} fields;
-
-/**
- * adjust_smc_value - adjust value for one of SMC registers.
- * @value: adjusted value
- * @range: array of SMC ranges with valid values
- * @size: SMC ranges array size
- *
- * This returns the difference between input and output value or negative
- * in case of invalid input value.
- * If negative returned, then output value = maximal possible from ranges.
- */
-static int adjust_smc_value(int *value, struct smc_range *range, int size)
-{
-       int maximum = (range + size - 1)->max;
-       int remainder;
-
-       do {
-               if (*value < range->min) {
-                       remainder = range->min - *value;
-                       *value = range->min; /* nearest valid value */
-                       return remainder;
-               } else if ((range->min <= *value) && (*value <= range->max))
-                       return 0;
-
-               range++;
-       } while (--size);
-       *value = maximum;
-
-       return -1; /* invalid value */
-}
-
-/**
- * calc_smc_vals - calculate SMC register values
- * @dev: ATA device
- * @setup: SMC_SETUP register value
- * @pulse: SMC_PULSE register value
- * @cycle: SMC_CYCLE register value
- *
- * This returns negative in case of invalid values for SMC registers:
- * -ER_SMC_RECALC - recalculation required for SMC values,
- * -ER_SMC_CALC - calculation failed (invalid input values).
- *
- * SMC use special coding scheme, see "Coding and Range of Timing
- * Parameters" table from AT91SAM9 datasheets.
- *
- *     SMC_SETUP = 128*setup[5] + setup[4:0]
- *     SMC_PULSE = 256*pulse[6] + pulse[5:0]
- *     SMC_CYCLE = 256*cycle[8:7] + cycle[6:0]
- */
-static int calc_smc_vals(struct device *dev,
-               int *setup, int *pulse, int *cycle, int *cs_pulse)
-{
-       int ret_val;
-       int err = 0;
-       struct smc_range range_setup[] = {      /* SMC_SETUP valid values */
-               {.min = 0,      .max = 31},     /* first  range */
-               {.min = 128,    .max = 159}     /* second range */
-       };
-       struct smc_range range_pulse[] = {      /* SMC_PULSE valid values */
-               {.min = 0,      .max = 63},     /* first  range */
-               {.min = 256,    .max = 319}     /* second range */
-       };
-       struct smc_range range_cycle[] = {      /* SMC_CYCLE valid values */
-               {.min = 0,      .max = 127},    /* first  range */
-               {.min = 256,    .max = 383},    /* second range */
-               {.min = 512,    .max = 639},    /* third  range */
-               {.min = 768,    .max = 895}     /* fourth range */
-       };
-
-       ret_val = adjust_smc_value(setup, range_setup, ARRAY_SIZE(range_setup));
-       if (ret_val < 0)
-               dev_warn(dev, "maximal SMC Setup value\n");
-       else
-               *cycle += ret_val;
-
-       ret_val = adjust_smc_value(pulse, range_pulse, ARRAY_SIZE(range_pulse));
-       if (ret_val < 0)
-               dev_warn(dev, "maximal SMC Pulse value\n");
-       else
-               *cycle += ret_val;
-
-       ret_val = adjust_smc_value(cycle, range_cycle, ARRAY_SIZE(range_cycle));
-       if (ret_val < 0)
-               dev_warn(dev, "maximal SMC Cycle value\n");
-
-       *cs_pulse = *cycle;
-       if (*cs_pulse > CS_PULSE_MAXIMUM) {
-               dev_err(dev, "unable to calculate valid SMC settings\n");
-               return -ER_SMC_CALC;
-       }
-
-       ret_val = adjust_smc_value(cs_pulse, range_pulse,
-                                       ARRAY_SIZE(range_pulse));
-       if (ret_val < 0) {
-               dev_warn(dev, "maximal SMC CS Pulse value\n");
-       } else if (ret_val != 0) {
-               *cycle = *cs_pulse;
-               dev_warn(dev, "SMC Cycle extended\n");
-               err = -ER_SMC_RECALC;
-       }
-
-       return err;
-}
-
-/**
- * to_smc_format - convert values into SMC format
- * @setup: SETUP value of SMC Setup Register
- * @pulse: PULSE value of SMC Pulse Register
- * @cycle: CYCLE value of SMC Cycle Register
- * @cs_pulse: NCS_PULSE value of SMC Pulse Register
- */
-static void to_smc_format(int *setup, int *pulse, int *cycle, int *cs_pulse)
-{
-       *setup = (*setup & 0x1f) | ((*setup & 0x80) >> 2);
-       *pulse = (*pulse & 0x3f) | ((*pulse & 0x100) >> 2);
-       *cycle = (*cycle & 0x7f) | ((*cycle & 0x300) >> 1);
-       *cs_pulse = (*cs_pulse & 0x3f) | ((*cs_pulse & 0x100) >> 2);
-}
-
-static unsigned long calc_mck_cycles(unsigned long ns, unsigned long mck_hz)
-{
-       unsigned long mul;
-
-       /*
-       * cycles = x [nsec] * f [Hz] / 10^9 [ns in sec] =
-       *     x * (f / 1_000_000_000) =
-       *     x * ((f * 65536) / 1_000_000_000) / 65536 =
-       *     x * (((f / 10_000) * 65536) / 100_000) / 65536 =
-       */
-
-       mul = (mck_hz / 10000) << 16;
-       mul /= 100000;
-
-       return (ns * mul + 65536) >> 16;    /* rounding */
-}
-
-/**
- * set_smc_timing - SMC timings setup.
- * @dev: device
- * @info: AT91 IDE info
- * @ata: ATA timings
- *
- * Its assumed that write timings are same as read timings,
- * cs_setup = 0 and cs_pulse = cycle.
- */
-static void set_smc_timing(struct device *dev, struct ata_device *adev,
-               struct at91_ide_info *info, const struct ata_timing *ata)
-{
-       int ret = 0;
-       int use_iordy;
-       unsigned int t6z;         /* data tristate time in ns */
-       unsigned int cycle;       /* SMC Cycle width in MCK ticks */
-       unsigned int setup;       /* SMC Setup width in MCK ticks */
-       unsigned int pulse;       /* CFIOR and CFIOW pulse width in MCK ticks */
-       unsigned int cs_pulse;    /* CS4 or CS5 pulse width in MCK ticks*/
-       unsigned int tdf_cycles;  /* SMC TDF MCK ticks */
-       unsigned long mck_hz;     /* MCK frequency in Hz */
-
-       t6z = (ata->mode < XFER_PIO_5) ? 30 : 20;
-       mck_hz = clk_get_rate(info->mck);
-       cycle = calc_mck_cycles(ata->cyc8b, mck_hz);
-       setup = calc_mck_cycles(ata->setup, mck_hz);
-       pulse = calc_mck_cycles(ata->act8b, mck_hz);
-       tdf_cycles = calc_mck_cycles(t6z, mck_hz);
-
-       do {
-               ret = calc_smc_vals(dev, &setup, &pulse, &cycle, &cs_pulse);
-       } while (ret == -ER_SMC_RECALC);
-
-       if (ret == -ER_SMC_CALC)
-               dev_err(dev, "Interface may not operate correctly\n");
-
-       dev_dbg(dev, "SMC Setup=%u, Pulse=%u, Cycle=%u, CS Pulse=%u\n",
-               setup, pulse, cycle, cs_pulse);
-       to_smc_format(&setup, &pulse, &cycle, &cs_pulse);
-       /* disable or enable waiting for IORDY signal */
-       use_iordy = ata_pio_need_iordy(adev);
-       if (use_iordy)
-               info->mode |= AT91_SMC_EXNWMODE_READY;
-
-       if (tdf_cycles > 15) {
-               tdf_cycles = 15;
-               dev_warn(dev, "maximal SMC TDF Cycles value\n");
-       }
-
-       dev_dbg(dev, "Use IORDY=%u, TDF Cycles=%u\n", use_iordy, tdf_cycles);
-
-       regmap_fields_write(fields.setup, info->cs,
-                           AT91SAM9_SMC_NRDSETUP(setup) |
-                           AT91SAM9_SMC_NWESETUP(setup) |
-                           AT91SAM9_SMC_NCS_NRDSETUP(0) |
-                           AT91SAM9_SMC_NCS_WRSETUP(0));
-       regmap_fields_write(fields.pulse, info->cs,
-                           AT91SAM9_SMC_NRDPULSE(pulse) |
-                           AT91SAM9_SMC_NWEPULSE(pulse) |
-                           AT91SAM9_SMC_NCS_NRDPULSE(cs_pulse) |
-                           AT91SAM9_SMC_NCS_WRPULSE(cs_pulse));
-       regmap_fields_write(fields.cycle, info->cs,
-                           AT91SAM9_SMC_NRDCYCLE(cycle) |
-                           AT91SAM9_SMC_NWECYCLE(cycle));
-       regmap_fields_write(fields.mode, info->cs, info->mode |
-                           AT91_SMC_TDF_(tdf_cycles));
-}
-
-static void pata_at91_set_piomode(struct ata_port *ap, struct ata_device *adev)
-{
-       struct at91_ide_info *info = ap->host->private_data;
-       struct ata_timing timing;
-       int ret;
-
-       /* Compute ATA timing and set it to SMC */
-       ret = ata_timing_compute(adev, adev->pio_mode, &timing, 1000, 0);
-       if (ret) {
-               dev_warn(ap->dev, "Failed to compute ATA timing %d, "
-                        "set PIO_0 timing\n", ret);
-               timing = *ata_timing_find_mode(XFER_PIO_0);
-       }
-       set_smc_timing(ap->dev, adev, info, &timing);
-}
-
-static unsigned int pata_at91_data_xfer_noirq(struct ata_queued_cmd *qc,
-               unsigned char *buf, unsigned int buflen, int rw)
-{
-       struct at91_ide_info *info = qc->dev->link->ap->host->private_data;
-       unsigned int consumed;
-       unsigned int mode;
-       unsigned long flags;
-
-       local_irq_save(flags);
-       regmap_fields_read(fields.mode, info->cs, &mode);
-
-       /* set 16bit mode before writing data */
-       regmap_fields_write(fields.mode, info->cs, (mode & ~AT91_SMC_DBW) |
-                           AT91_SMC_DBW_16);
-
-       consumed = ata_sff_data_xfer(qc, buf, buflen, rw);
-
-       /* restore 8bit mode after data is written */
-       regmap_fields_write(fields.mode, info->cs, (mode & ~AT91_SMC_DBW) |
-                           AT91_SMC_DBW_8);
-
-       local_irq_restore(flags);
-       return consumed;
-}
-
-static struct scsi_host_template pata_at91_sht = {
-       ATA_PIO_SHT(DRV_NAME),
-};
-
-static struct ata_port_operations pata_at91_port_ops = {
-       .inherits       = &ata_sff_port_ops,
-
-       .sff_data_xfer  = pata_at91_data_xfer_noirq,
-       .set_piomode    = pata_at91_set_piomode,
-       .cable_detect   = ata_cable_40wire,
-};
-
-static int at91sam9_smc_fields_init(struct device *dev)
-{
-       struct reg_field field = REG_FIELD(0, 0, 31);
-
-       field.id_size = 8;
-       field.id_offset = AT91SAM9_SMC_GENERIC_BLK_SZ;
-
-       field.reg = AT91SAM9_SMC_SETUP(AT91SAM9_SMC_GENERIC);
-       fields.setup = devm_regmap_field_alloc(dev, smc, field);
-       if (IS_ERR(fields.setup))
-               return PTR_ERR(fields.setup);
-
-       field.reg = AT91SAM9_SMC_PULSE(AT91SAM9_SMC_GENERIC);
-       fields.pulse = devm_regmap_field_alloc(dev, smc, field);
-       if (IS_ERR(fields.pulse))
-               return PTR_ERR(fields.pulse);
-
-       field.reg = AT91SAM9_SMC_CYCLE(AT91SAM9_SMC_GENERIC);
-       fields.cycle = devm_regmap_field_alloc(dev, smc, field);
-       if (IS_ERR(fields.cycle))
-               return PTR_ERR(fields.cycle);
-
-       field.reg = AT91SAM9_SMC_MODE(AT91SAM9_SMC_GENERIC);
-       fields.mode = devm_regmap_field_alloc(dev, smc, field);
-
-       return PTR_ERR_OR_ZERO(fields.mode);
-}
-
-static int pata_at91_probe(struct platform_device *pdev)
-{
-       struct at91_cf_data *board = dev_get_platdata(&pdev->dev);
-       struct device *dev = &pdev->dev;
-       struct at91_ide_info *info;
-       struct resource *mem_res;
-       struct ata_host *host;
-       struct ata_port *ap;
-
-       int irq_flags = 0;
-       int irq = 0;
-       int ret;
-
-       /*  get platform resources: IO/CTL memories and irq/rst pins */
-
-       if (pdev->num_resources != 1) {
-               dev_err(&pdev->dev, "invalid number of resources\n");
-               return -EINVAL;
-       }
-
-       mem_res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-
-       if (!mem_res) {
-               dev_err(dev, "failed to get mem resource\n");
-               return -EINVAL;
-       }
-
-       irq = board->irq_pin;
-
-       smc = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, "atmel,smc");
-       if (IS_ERR(smc))
-               return PTR_ERR(smc);
-
-       ret = at91sam9_smc_fields_init(dev);
-       if (ret < 0)
-               return ret;
-
-       /* init ata host */
-
-       host = ata_host_alloc(dev, 1);
-
-       if (!host)
-               return -ENOMEM;
-
-       ap = host->ports[0];
-       ap->ops = &pata_at91_port_ops;
-       ap->flags |= ATA_FLAG_SLAVE_POSS;
-       ap->pio_mask = ATA_PIO4;
-
-       if (!gpio_is_valid(irq)) {
-               ap->flags |= ATA_FLAG_PIO_POLLING;
-               ata_port_desc(ap, "no IRQ, using PIO polling");
-       }
-
-       info = devm_kzalloc(dev, sizeof(*info), GFP_KERNEL);
-
-       if (!info) {
-               dev_err(dev, "failed to allocate memory for private data\n");
-               return -ENOMEM;
-       }
-
-       info->mck = clk_get(NULL, "mck");
-
-       if (IS_ERR(info->mck)) {
-               dev_err(dev, "failed to get access to mck clock\n");
-               return -ENODEV;
-       }
-
-       info->cs    = board->chipselect;
-       info->mode  = AT91_SMC_READMODE | AT91_SMC_WRITEMODE |
-               AT91_SMC_EXNWMODE_READY | AT91_SMC_BAT_SELECT |
-               AT91_SMC_DBW_8 | AT91_SMC_TDF_(0);
-
-       info->ide_addr = devm_ioremap(dev,
-                       mem_res->start + CF_IDE_OFFSET, CF_IDE_RES_SIZE);
-
-       if (!info->ide_addr) {
-               dev_err(dev, "failed to map IO base\n");
-               ret = -ENOMEM;
-               goto err_put;
-       }
-
-       info->alt_addr = devm_ioremap(dev,
-                       mem_res->start + CF_ALT_IDE_OFFSET, CF_IDE_RES_SIZE);
-
-       if (!info->alt_addr) {
-               dev_err(dev, "failed to map CTL base\n");
-               ret = -ENOMEM;
-               goto err_put;
-       }
-
-       ap->ioaddr.cmd_addr = info->ide_addr;
-       ap->ioaddr.ctl_addr = info->alt_addr + 0x06;
-       ap->ioaddr.altstatus_addr = ap->ioaddr.ctl_addr;
-
-       ata_sff_std_ports(&ap->ioaddr);
-
-       ata_port_desc(ap, "mmio cmd 0x%llx ctl 0x%llx",
-                       (unsigned long long)mem_res->start + CF_IDE_OFFSET,
-                       (unsigned long long)mem_res->start + CF_ALT_IDE_OFFSET);
-
-       host->private_data = info;
-
-       ret = ata_host_activate(host, gpio_is_valid(irq) ? gpio_to_irq(irq) : 0,
-                               gpio_is_valid(irq) ? ata_sff_interrupt : NULL,
-                               irq_flags, &pata_at91_sht);
-       if (ret)
-               goto err_put;
-
-       return 0;
-
-err_put:
-       clk_put(info->mck);
-       return ret;
-}
-
-static int pata_at91_remove(struct platform_device *pdev)
-{
-       struct ata_host *host = platform_get_drvdata(pdev);
-       struct at91_ide_info *info;
-
-       if (!host)
-               return 0;
-       info = host->private_data;
-
-       ata_host_detach(host);
-
-       if (!info)
-               return 0;
-
-       clk_put(info->mck);
-
-       return 0;
-}
-
-static struct platform_driver pata_at91_driver = {
-       .probe          = pata_at91_probe,
-       .remove         = pata_at91_remove,
-       .driver         = {
-               .name           = DRV_NAME,
-       },
-};
-
-module_platform_driver(pata_at91_driver);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Driver for CF in True IDE mode on AT91SAM9260 SoC");
-MODULE_AUTHOR("Matyukevich Sergey");
-MODULE_VERSION(DRV_VERSION);
-
index 6c9aa95..49d705c 100644 (file)
@@ -278,11 +278,6 @@ static int atiixp_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
        };
        const struct ata_port_info *ppi[] = { &info, &info };
 
-       /* SB600/700 don't have secondary port wired */
-       if ((pdev->device == PCI_DEVICE_ID_ATI_IXP600_IDE) ||
-               (pdev->device == PCI_DEVICE_ID_ATI_IXP700_IDE))
-               ppi[1] = &ata_dummy_port_info;
-
        return ata_pci_bmdma_init_one(pdev, ppi, &atiixp_sht, NULL,
                                      ATA_HOST_PARALLEL_SCAN);
 }
index e347e7a..0adcb40 100644 (file)
@@ -1328,7 +1328,7 @@ static int pata_macio_pci_resume(struct pci_dev *pdev)
 }
 #endif /* CONFIG_PM_SLEEP */
 
-static struct of_device_id pata_macio_match[] =
+static const struct of_device_id pata_macio_match[] =
 {
        {
        .name           = "IDE",
index 252ba27..9730125 100644 (file)
@@ -847,7 +847,7 @@ mpc52xx_ata_resume(struct platform_device *op)
 }
 #endif
 
-static struct of_device_id mpc52xx_ata_of_match[] = {
+static const struct of_device_id mpc52xx_ata_of_match[] = {
        { .compatible = "fsl,mpc5200-ata", },
        { .compatible = "mpc5200-ata", },
        {},
index 201a32d..01161c1 100644 (file)
@@ -67,7 +67,7 @@ static int pata_of_platform_probe(struct platform_device *ofdev)
                                     reg_shift, pio_mask, &pata_platform_sht);
 }
 
-static struct of_device_id pata_of_platform_match[] = {
+static const struct of_device_id pata_of_platform_match[] = {
        { .compatible = "ata-generic", },
        { },
 };
index a723ae9..01734d5 100644 (file)
@@ -1612,7 +1612,7 @@ static int sata_fsl_resume(struct platform_device *op)
 }
 #endif
 
-static struct of_device_id fsl_sata_match[] = {
+static const struct of_device_id fsl_sata_match[] = {
        {
                .compatible = "fsl,pq-sata",
        },
index 00ce26d..b66bcda 100644 (file)
@@ -4286,7 +4286,7 @@ static int mv_platform_resume(struct platform_device *pdev)
 #endif
 
 #ifdef CONFIG_OF
-static struct of_device_id mv_sata_dt_ids[] = {
+static const struct of_device_id mv_sata_dt_ids[] = {
        { .compatible = "marvell,armada-370-sata", },
        { .compatible = "marvell,orion-sata", },
        {},
index 0636d84..f3f538e 100644 (file)
@@ -644,14 +644,16 @@ static void svia_configure(struct pci_dev *pdev, int board_id,
                pci_write_config_byte(pdev, SATA_NATIVE_MODE, tmp8);
        }
 
-       /* enable IRQ on hotplug */
-       pci_read_config_byte(pdev, SVIA_MISC_3, &tmp8);
-       if ((tmp8 & SATA_HOTPLUG) != SATA_HOTPLUG) {
-               dev_dbg(&pdev->dev,
-                       "enabling SATA hotplug (0x%x)\n",
-                       (int) tmp8);
-               tmp8 |= SATA_HOTPLUG;
-               pci_write_config_byte(pdev, SVIA_MISC_3, tmp8);
+       if (board_id == vt6421) {
+               /* enable IRQ on hotplug */
+               pci_read_config_byte(pdev, SVIA_MISC_3, &tmp8);
+               if ((tmp8 & SATA_HOTPLUG) != SATA_HOTPLUG) {
+                       dev_dbg(&pdev->dev,
+                               "enabling SATA hotplug (0x%x)\n",
+                               (int) tmp8);
+                       tmp8 |= SATA_HOTPLUG;
+                       pci_write_config_byte(pdev, SVIA_MISC_3, tmp8);
+               }
        }
 
        /*
index 0fc7c4d..d35e9a2 100644 (file)
@@ -345,8 +345,7 @@ platform_msi_create_device_domain(struct device *dev,
 
        data->host_data = host_data;
        domain = irq_domain_create_hierarchy(dev->msi_domain, 0, nvec,
-                                            of_node_to_fwnode(dev->of_node),
-                                            ops, data);
+                                            dev->fwnode, ops, data);
        if (!domain)
                goto free_priv;
 
index e697dec..ad19642 100644 (file)
@@ -121,7 +121,9 @@ static const struct genpd_lock_ops genpd_spin_ops = {
 #define genpd_lock_interruptible(p)    p->lock_ops->lock_interruptible(p)
 #define genpd_unlock(p)                        p->lock_ops->unlock(p)
 
+#define genpd_status_on(genpd)         (genpd->status == GPD_STATE_ACTIVE)
 #define genpd_is_irq_safe(genpd)       (genpd->flags & GENPD_FLAG_IRQ_SAFE)
+#define genpd_is_always_on(genpd)      (genpd->flags & GENPD_FLAG_ALWAYS_ON)
 
 static inline bool irq_safe_dev_in_no_sleep_domain(struct device *dev,
                struct generic_pm_domain *genpd)
@@ -130,8 +132,12 @@ static inline bool irq_safe_dev_in_no_sleep_domain(struct device *dev,
 
        ret = pm_runtime_is_irq_safe(dev) && !genpd_is_irq_safe(genpd);
 
-       /* Warn once if IRQ safe dev in no sleep domain */
-       if (ret)
+       /*
+        * Warn once if an IRQ safe device is attached to a no sleep domain, to
+        * indicate a suboptimal configuration for PM. For an always on domain
+        * this isn't the case, thus don't warn.
+        */
+       if (ret && !genpd_is_always_on(genpd))
                dev_warn_once(dev, "PM domain %s will not be powered off\n",
                                genpd->name);
 
@@ -296,11 +302,15 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool one_dev_on,
         * (1) The domain is already in the "power off" state.
         * (2) System suspend is in progress.
         */
-       if (genpd->status == GPD_STATE_POWER_OFF
-           || genpd->prepared_count > 0)
+       if (!genpd_status_on(genpd) || genpd->prepared_count > 0)
                return 0;
 
-       if (atomic_read(&genpd->sd_count) > 0)
+       /*
+        * Abort power off for the PM domain in the following situations:
+        * (1) The domain is configured as always on.
+        * (2) When the domain has a subdomain being powered on.
+        */
+       if (genpd_is_always_on(genpd) || atomic_read(&genpd->sd_count) > 0)
                return -EBUSY;
 
        list_for_each_entry(pdd, &genpd->dev_list, list_node) {
@@ -373,7 +383,7 @@ static int genpd_power_on(struct generic_pm_domain *genpd, unsigned int depth)
        struct gpd_link *link;
        int ret = 0;
 
-       if (genpd->status == GPD_STATE_ACTIVE)
+       if (genpd_status_on(genpd))
                return 0;
 
        /*
@@ -752,7 +762,7 @@ static void genpd_sync_power_off(struct generic_pm_domain *genpd, bool use_lock,
 {
        struct gpd_link *link;
 
-       if (genpd->status == GPD_STATE_POWER_OFF)
+       if (!genpd_status_on(genpd) || genpd_is_always_on(genpd))
                return;
 
        if (genpd->suspended_count != genpd->device_count
@@ -761,7 +771,8 @@ static void genpd_sync_power_off(struct generic_pm_domain *genpd, bool use_lock,
 
        /* Choose the deepest state when suspending */
        genpd->state_idx = genpd->state_count - 1;
-       _genpd_power_off(genpd, false);
+       if (_genpd_power_off(genpd, false))
+               return;
 
        genpd->status = GPD_STATE_POWER_OFF;
 
@@ -793,7 +804,7 @@ static void genpd_sync_power_on(struct generic_pm_domain *genpd, bool use_lock,
 {
        struct gpd_link *link;
 
-       if (genpd->status == GPD_STATE_ACTIVE)
+       if (genpd_status_on(genpd))
                return;
 
        list_for_each_entry(link, &genpd->slave_links, slave_node) {
@@ -1329,8 +1340,7 @@ static int genpd_add_subdomain(struct generic_pm_domain *genpd,
        genpd_lock(subdomain);
        genpd_lock_nested(genpd, SINGLE_DEPTH_NESTING);
 
-       if (genpd->status == GPD_STATE_POWER_OFF
-           &&  subdomain->status != GPD_STATE_POWER_OFF) {
+       if (!genpd_status_on(genpd) && genpd_status_on(subdomain)) {
                ret = -EINVAL;
                goto out;
        }
@@ -1346,7 +1356,7 @@ static int genpd_add_subdomain(struct generic_pm_domain *genpd,
        list_add_tail(&link->master_node, &genpd->master_links);
        link->slave = subdomain;
        list_add_tail(&link->slave_node, &subdomain->slave_links);
-       if (subdomain->status != GPD_STATE_POWER_OFF)
+       if (genpd_status_on(subdomain))
                genpd_sd_counter_inc(genpd);
 
  out:
@@ -1406,7 +1416,7 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
                list_del(&link->master_node);
                list_del(&link->slave_node);
                kfree(link);
-               if (subdomain->status != GPD_STATE_POWER_OFF)
+               if (genpd_status_on(subdomain))
                        genpd_sd_counter_dec(genpd);
 
                ret = 0;
@@ -1492,6 +1502,10 @@ int pm_genpd_init(struct generic_pm_domain *genpd,
                genpd->dev_ops.start = pm_clk_resume;
        }
 
+       /* Always-on domains must be powered on at initialization. */
+       if (genpd_is_always_on(genpd) && !genpd_status_on(genpd))
+               return -EINVAL;
+
        /* Use only one "off" state if there were no states declared */
        if (genpd->state_count == 0) {
                ret = genpd_set_default_power_state(genpd);
@@ -1700,12 +1714,12 @@ int of_genpd_add_provider_simple(struct device_node *np,
 
        mutex_lock(&gpd_list_lock);
 
-       if (pm_genpd_present(genpd))
+       if (pm_genpd_present(genpd)) {
                ret = genpd_add_provider(np, genpd_xlate_simple, genpd);
-
-       if (!ret) {
-               genpd->provider = &np->fwnode;
-               genpd->has_provider = true;
+               if (!ret) {
+                       genpd->provider = &np->fwnode;
+                       genpd->has_provider = true;
+               }
        }
 
        mutex_unlock(&gpd_list_lock);
@@ -2079,11 +2093,6 @@ static int genpd_parse_state(struct genpd_power_state *genpd_state,
        int err;
        u32 residency;
        u32 entry_latency, exit_latency;
-       const struct of_device_id *match_id;
-
-       match_id = of_match_node(idle_state_match, state_node);
-       if (!match_id)
-               return -EINVAL;
 
        err = of_property_read_u32(state_node, "entry-latency-us",
                                                &entry_latency);
@@ -2132,6 +2141,7 @@ int of_genpd_parse_idle_states(struct device_node *dn,
        int err, ret;
        int count;
        struct of_phandle_iterator it;
+       const struct of_device_id *match_id;
 
        count = of_count_phandle_with_args(dn, "domain-idle-states", NULL);
        if (count <= 0)
@@ -2144,6 +2154,9 @@ int of_genpd_parse_idle_states(struct device_node *dn,
        /* Loop over the phandles until all the requested entry is found */
        of_for_each_phandle(&it, err, dn, "domain-idle-states", NULL, 0) {
                np = it.node;
+               match_id = of_match_node(idle_state_match, np);
+               if (!match_id)
+                       continue;
                ret = genpd_parse_state(&st[i++], np);
                if (ret) {
                        pr_err
@@ -2155,8 +2168,11 @@ int of_genpd_parse_idle_states(struct device_node *dn,
                }
        }
 
-       *n = count;
-       *states = st;
+       *n = i;
+       if (!i)
+               kfree(st);
+       else
+               *states = st;
 
        return 0;
 }
@@ -2221,7 +2237,7 @@ static int pm_genpd_summary_one(struct seq_file *s,
 
        if (WARN_ON(genpd->status >= ARRAY_SIZE(status_lookup)))
                goto exit;
-       if (genpd->status == GPD_STATE_POWER_OFF)
+       if (!genpd_status_on(genpd))
                snprintf(state, sizeof(state), "%s-%u",
                         status_lookup[genpd->status], genpd->state_idx);
        else
index c458c63..149de31 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/kernel.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
+#include <linux/of_graph.h>
 #include <linux/property.h>
 #include <linux/etherdevice.h>
 #include <linux/phy.h>
@@ -146,47 +147,45 @@ static int pset_prop_read_string_array(struct property_set *pset,
                                       const char *propname,
                                       const char **strings, size_t nval)
 {
+       const struct property_entry *prop;
        const void *pointer;
-       size_t length = nval * sizeof(*strings);
+       size_t array_len, length;
+
+       /* Find out the array length. */
+       prop = pset_prop_get(pset, propname);
+       if (!prop)
+               return -EINVAL;
+
+       if (!prop->is_array)
+               /* The array length for a non-array string property is 1. */
+               array_len = 1;
+       else
+               /* Find the length of an array. */
+               array_len = pset_prop_count_elems_of_size(pset, propname,
+                                                         sizeof(const char *));
+
+       /* Return how many there are if strings is NULL. */
+       if (!strings)
+               return array_len;
+
+       array_len = min(nval, array_len);
+       length = array_len * sizeof(*strings);
 
        pointer = pset_prop_find(pset, propname, length);
        if (IS_ERR(pointer))
                return PTR_ERR(pointer);
 
        memcpy(strings, pointer, length);
-       return 0;
-}
 
-static int pset_prop_read_string(struct property_set *pset,
-                                const char *propname, const char **strings)
-{
-       const struct property_entry *prop;
-       const char * const *pointer;
-
-       prop = pset_prop_get(pset, propname);
-       if (!prop)
-               return -EINVAL;
-       if (!prop->is_string)
-               return -EILSEQ;
-       if (prop->is_array) {
-               pointer = prop->pointer.str;
-               if (!pointer)
-                       return -ENODATA;
-       } else {
-               pointer = &prop->value.str;
-               if (*pointer && strnlen(*pointer, prop->length) >= prop->length)
-                       return -EILSEQ;
-       }
-
-       *strings = *pointer;
-       return 0;
+       return array_len;
 }
 
-static inline struct fwnode_handle *dev_fwnode(struct device *dev)
+struct fwnode_handle *dev_fwnode(struct device *dev)
 {
        return IS_ENABLED(CONFIG_OF) && dev->of_node ?
                &dev->of_node->fwnode : dev->fwnode;
 }
+EXPORT_SYMBOL_GPL(dev_fwnode);
 
 /**
  * device_property_present - check if a property of a device is present
@@ -340,8 +339,8 @@ EXPORT_SYMBOL_GPL(device_property_read_u64_array);
  * Function reads an array of string properties with @propname from the device
  * firmware description and stores them to @val if found.
  *
- * Return: number of values if @val was %NULL,
- *         %0 if the property was found (success),
+ * Return: number of values read on success if @val is non-NULL,
+ *        number of values available on success if @val is NULL,
  *        %-EINVAL if given arguments are not valid,
  *        %-ENODATA if the property does not have a value,
  *        %-EPROTO or %-EILSEQ if the property is not an array of strings,
@@ -553,25 +552,8 @@ static int __fwnode_property_read_string_array(struct fwnode_handle *fwnode,
                return acpi_node_prop_read(fwnode, propname, DEV_PROP_STRING,
                                           val, nval);
        else if (is_pset_node(fwnode))
-               return val ?
-                       pset_prop_read_string_array(to_pset_node(fwnode),
-                                                   propname, val, nval) :
-                       pset_prop_count_elems_of_size(to_pset_node(fwnode),
-                                                     propname,
-                                                     sizeof(const char *));
-       return -ENXIO;
-}
-
-static int __fwnode_property_read_string(struct fwnode_handle *fwnode,
-                                        const char *propname, const char **val)
-{
-       if (is_of_node(fwnode))
-               return of_property_read_string(to_of_node(fwnode), propname, val);
-       else if (is_acpi_node(fwnode))
-               return acpi_node_prop_read(fwnode, propname, DEV_PROP_STRING,
-                                          val, 1);
-       else if (is_pset_node(fwnode))
-               return pset_prop_read_string(to_pset_node(fwnode), propname, val);
+               return pset_prop_read_string_array(to_pset_node(fwnode),
+                                                  propname, val, nval);
        return -ENXIO;
 }
 
@@ -585,11 +567,11 @@ static int __fwnode_property_read_string(struct fwnode_handle *fwnode,
  * Read an string list property @propname from the given firmware node and store
  * them to @val if found.
  *
- * Return: number of values if @val was %NULL,
- *         %0 if the property was found (success),
+ * Return: number of values read on success if @val is non-NULL,
+ *        number of values available on success if @val is NULL,
  *        %-EINVAL if given arguments are not valid,
  *        %-ENODATA if the property does not have a value,
- *        %-EPROTO if the property is not an array of strings,
+ *        %-EPROTO or %-EILSEQ if the property is not an array of strings,
  *        %-EOVERFLOW if the size of the property is not as expected,
  *        %-ENXIO if no suitable firmware interface is present.
  */
@@ -626,14 +608,9 @@ EXPORT_SYMBOL_GPL(fwnode_property_read_string_array);
 int fwnode_property_read_string(struct fwnode_handle *fwnode,
                                const char *propname, const char **val)
 {
-       int ret;
+       int ret = fwnode_property_read_string_array(fwnode, propname, val, 1);
 
-       ret = __fwnode_property_read_string(fwnode, propname, val);
-       if (ret == -EINVAL && !IS_ERR_OR_NULL(fwnode) &&
-           !IS_ERR_OR_NULL(fwnode->secondary))
-               ret = __fwnode_property_read_string(fwnode->secondary,
-                                                   propname, val);
-       return ret;
+       return ret < 0 ? ret : 0;
 }
 EXPORT_SYMBOL_GPL(fwnode_property_read_string);
 
@@ -932,41 +909,109 @@ int device_add_properties(struct device *dev,
 EXPORT_SYMBOL_GPL(device_add_properties);
 
 /**
- * device_get_next_child_node - Return the next child node handle for a device
- * @dev: Device to find the next child node for.
- * @child: Handle to one of the device's child nodes or a null handle.
+ * fwnode_get_next_parent - Iterate to the node's parent
+ * @fwnode: Firmware node whose parent is retrieved
+ *
+ * This is like fwnode_get_parent() except that it drops the refcount
+ * on the passed node, making it suitable for iterating through a
+ * node's parents.
+ *
+ * Returns a node pointer with refcount incremented, use
+ * fwnode_handle_put() on it when done.
  */
-struct fwnode_handle *device_get_next_child_node(struct device *dev,
+struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode)
+{
+       struct fwnode_handle *parent = fwnode_get_parent(fwnode);
+
+       fwnode_handle_put(fwnode);
+
+       return parent;
+}
+EXPORT_SYMBOL_GPL(fwnode_get_next_parent);
+
+/**
+ * fwnode_get_parent - Return parent firmware node
+ * @fwnode: Firmware node whose parent is retrieved
+ *
+ * Return parent firmware node of the given node if possible or %NULL if no
+ * parent was available.
+ */
+struct fwnode_handle *fwnode_get_parent(struct fwnode_handle *fwnode)
+{
+       struct fwnode_handle *parent = NULL;
+
+       if (is_of_node(fwnode)) {
+               struct device_node *node;
+
+               node = of_get_parent(to_of_node(fwnode));
+               if (node)
+                       parent = &node->fwnode;
+       } else if (is_acpi_node(fwnode)) {
+               parent = acpi_node_get_parent(fwnode);
+       }
+
+       return parent;
+}
+EXPORT_SYMBOL_GPL(fwnode_get_parent);
+
+/**
+ * fwnode_get_next_child_node - Return the next child node handle for a node
+ * @fwnode: Firmware node to find the next child node for.
+ * @child: Handle to one of the node's child nodes or a %NULL handle.
+ */
+struct fwnode_handle *fwnode_get_next_child_node(struct fwnode_handle *fwnode,
                                                 struct fwnode_handle *child)
 {
-       if (IS_ENABLED(CONFIG_OF) && dev->of_node) {
+       if (is_of_node(fwnode)) {
                struct device_node *node;
 
-               node = of_get_next_available_child(dev->of_node, to_of_node(child));
+               node = of_get_next_available_child(to_of_node(fwnode),
+                                                  to_of_node(child));
                if (node)
                        return &node->fwnode;
-       } else if (IS_ENABLED(CONFIG_ACPI)) {
-               return acpi_get_next_subnode(dev, child);
+       } else if (is_acpi_node(fwnode)) {
+               return acpi_get_next_subnode(fwnode, child);
        }
+
        return NULL;
 }
+EXPORT_SYMBOL_GPL(fwnode_get_next_child_node);
+
+/**
+ * device_get_next_child_node - Return the next child node handle for a device
+ * @dev: Device to find the next child node for.
+ * @child: Handle to one of the device's child nodes or a null handle.
+ */
+struct fwnode_handle *device_get_next_child_node(struct device *dev,
+                                                struct fwnode_handle *child)
+{
+       struct acpi_device *adev = ACPI_COMPANION(dev);
+       struct fwnode_handle *fwnode = NULL;
+
+       if (dev->of_node)
+               fwnode = &dev->of_node->fwnode;
+       else if (adev)
+               fwnode = acpi_fwnode_handle(adev);
+
+       return fwnode_get_next_child_node(fwnode, child);
+}
 EXPORT_SYMBOL_GPL(device_get_next_child_node);
 
 /**
- * device_get_named_child_node - Return first matching named child node handle
- * @dev: Device to find the named child node for.
+ * fwnode_get_named_child_node - Return first matching named child node handle
+ * @fwnode: Firmware node to find the named child node for.
  * @childname: String to match child node name against.
  */
-struct fwnode_handle *device_get_named_child_node(struct device *dev,
+struct fwnode_handle *fwnode_get_named_child_node(struct fwnode_handle *fwnode,
                                                  const char *childname)
 {
        struct fwnode_handle *child;
 
        /*
-        * Find first matching named child node of this device.
+        * Find first matching named child node of this fwnode.
         * For ACPI this will be a data only sub-node.
         */
-       device_for_each_child_node(dev, child) {
+       fwnode_for_each_child_node(fwnode, child) {
                if (is_of_node(child)) {
                        if (!of_node_cmp(to_of_node(child)->name, childname))
                                return child;
@@ -978,9 +1023,32 @@ struct fwnode_handle *device_get_named_child_node(struct device *dev,
 
        return NULL;
 }
+EXPORT_SYMBOL_GPL(fwnode_get_named_child_node);
+
+/**
+ * device_get_named_child_node - Return first matching named child node handle
+ * @dev: Device to find the named child node for.
+ * @childname: String to match child node name against.
+ */
+struct fwnode_handle *device_get_named_child_node(struct device *dev,
+                                                 const char *childname)
+{
+       return fwnode_get_named_child_node(dev_fwnode(dev), childname);
+}
 EXPORT_SYMBOL_GPL(device_get_named_child_node);
 
 /**
+ * fwnode_handle_get - Obtain a reference to a device node
+ * @fwnode: Pointer to the device node to obtain the reference to.
+ */
+void fwnode_handle_get(struct fwnode_handle *fwnode)
+{
+       if (is_of_node(fwnode))
+               of_node_get(to_of_node(fwnode));
+}
+EXPORT_SYMBOL_GPL(fwnode_handle_get);
+
+/**
  * fwnode_handle_put - Drop reference to a device node
  * @fwnode: Pointer to the device node to drop the reference to.
  *
@@ -1117,3 +1185,157 @@ void *device_get_mac_address(struct device *dev, char *addr, int alen)
        return device_get_mac_addr(dev, "address", addr, alen);
 }
 EXPORT_SYMBOL(device_get_mac_address);
+
+/**
+ * fwnode_graph_get_next_endpoint - Get next endpoint firmware node
+ * @fwnode: Pointer to the parent firmware node
+ * @prev: Previous endpoint node or %NULL to get the first
+ *
+ * Returns an endpoint firmware node pointer or %NULL if no more endpoints
+ * are available.
+ */
+struct fwnode_handle *
+fwnode_graph_get_next_endpoint(struct fwnode_handle *fwnode,
+                              struct fwnode_handle *prev)
+{
+       struct fwnode_handle *endpoint = NULL;
+
+       if (is_of_node(fwnode)) {
+               struct device_node *node;
+
+               node = of_graph_get_next_endpoint(to_of_node(fwnode),
+                                                 to_of_node(prev));
+
+               if (node)
+                       endpoint = &node->fwnode;
+       } else if (is_acpi_node(fwnode)) {
+               endpoint = acpi_graph_get_next_endpoint(fwnode, prev);
+               if (IS_ERR(endpoint))
+                       endpoint = NULL;
+       }
+
+       return endpoint;
+
+}
+EXPORT_SYMBOL_GPL(fwnode_graph_get_next_endpoint);
+
+/**
+ * fwnode_graph_get_remote_port_parent - Return fwnode of a remote device
+ * @fwnode: Endpoint firmware node pointing to the remote endpoint
+ *
+ * Extracts firmware node of a remote device the @fwnode points to.
+ */
+struct fwnode_handle *
+fwnode_graph_get_remote_port_parent(struct fwnode_handle *fwnode)
+{
+       struct fwnode_handle *parent = NULL;
+
+       if (is_of_node(fwnode)) {
+               struct device_node *node;
+
+               node = of_graph_get_remote_port_parent(to_of_node(fwnode));
+               if (node)
+                       parent = &node->fwnode;
+       } else if (is_acpi_node(fwnode)) {
+               int ret;
+
+               ret = acpi_graph_get_remote_endpoint(fwnode, &parent, NULL,
+                                                    NULL);
+               if (ret)
+                       return NULL;
+       }
+
+       return parent;
+}
+EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port_parent);
+
+/**
+ * fwnode_graph_get_remote_port - Return fwnode of a remote port
+ * @fwnode: Endpoint firmware node pointing to the remote endpoint
+ *
+ * Extracts firmware node of a remote port the @fwnode points to.
+ */
+struct fwnode_handle *fwnode_graph_get_remote_port(struct fwnode_handle *fwnode)
+{
+       struct fwnode_handle *port = NULL;
+
+       if (is_of_node(fwnode)) {
+               struct device_node *node;
+
+               node = of_graph_get_remote_port(to_of_node(fwnode));
+               if (node)
+                       port = &node->fwnode;
+       } else if (is_acpi_node(fwnode)) {
+               int ret;
+
+               ret = acpi_graph_get_remote_endpoint(fwnode, NULL, &port, NULL);
+               if (ret)
+                       return NULL;
+       }
+
+       return port;
+}
+EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port);
+
+/**
+ * fwnode_graph_get_remote_endpoint - Return fwnode of a remote endpoint
+ * @fwnode: Endpoint firmware node pointing to the remote endpoint
+ *
+ * Extracts firmware node of a remote endpoint the @fwnode points to.
+ */
+struct fwnode_handle *
+fwnode_graph_get_remote_endpoint(struct fwnode_handle *fwnode)
+{
+       struct fwnode_handle *endpoint = NULL;
+
+       if (is_of_node(fwnode)) {
+               struct device_node *node;
+
+               node = of_parse_phandle(to_of_node(fwnode), "remote-endpoint",
+                                       0);
+               if (node)
+                       endpoint = &node->fwnode;
+       } else if (is_acpi_node(fwnode)) {
+               int ret;
+
+               ret = acpi_graph_get_remote_endpoint(fwnode, NULL, NULL,
+                                                    &endpoint);
+               if (ret)
+                       return NULL;
+       }
+
+       return endpoint;
+}
+EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_endpoint);
+
+/**
+ * fwnode_graph_parse_endpoint - parse common endpoint node properties
+ * @fwnode: pointer to endpoint fwnode_handle
+ * @endpoint: pointer to the fwnode endpoint data structure
+ *
+ * Parse @fwnode representing a graph endpoint node and store the
+ * information in @endpoint. The caller must hold a reference to
+ * @fwnode.
+ */
+int fwnode_graph_parse_endpoint(struct fwnode_handle *fwnode,
+                               struct fwnode_endpoint *endpoint)
+{
+       struct fwnode_handle *port_fwnode = fwnode_get_parent(fwnode);
+
+       memset(endpoint, 0, sizeof(*endpoint));
+
+       endpoint->local_fwnode = fwnode;
+
+       if (is_acpi_node(port_fwnode)) {
+               fwnode_property_read_u32(port_fwnode, "port", &endpoint->port);
+               fwnode_property_read_u32(fwnode, "endpoint", &endpoint->id);
+       } else {
+               fwnode_property_read_u32(port_fwnode, "reg", &endpoint->port);
+               fwnode_property_read_u32(fwnode, "reg", &endpoint->id);
+       }
+
+       fwnode_handle_put(port_fwnode);
+
+       return 0;
+}
+EXPORT_SYMBOL(fwnode_graph_parse_endpoint);
index f744de7..19df491 100644 (file)
@@ -312,22 +312,6 @@ config BLK_DEV_SKD
 
        Use device /dev/skd$N amd /dev/skd$Np$M.
 
-config BLK_DEV_OSD
-       tristate "OSD object-as-blkdev support"
-       depends on SCSI_OSD_ULD
-       ---help---
-         Saying Y or M here will allow the exporting of a single SCSI
-         OSD (object-based storage) object as a Linux block device.
-
-         For example, if you create a 2G object on an OSD device,
-         you can then use this module to present that 2G object as
-         a Linux block device.
-
-         To compile this driver as a module, choose M here: the
-         module will be called osdblk.
-
-         If unsure, say N.
-
 config BLK_DEV_SX8
        tristate "Promise SATA SX8 support"
        depends on PCI
@@ -434,23 +418,6 @@ config ATA_OVER_ETH
        This driver provides Support for ATA over Ethernet block
        devices like the Coraid EtherDrive (R) Storage Blade.
 
-config MG_DISK
-       tristate "mGine mflash, gflash support"
-       depends on ARM && GPIOLIB
-       help
-         mGine mFlash(gFlash) block device driver
-
-config MG_DISK_RES
-       int "Size of reserved area before MBR"
-       depends on MG_DISK
-       default 0
-       help
-         Define size of reserved area that usually used for boot. Unit is KB.
-         All of the block device operation will be taken this value as start
-         offset
-         Examples:
-                       1024 => 1 MB
-
 config SUNVDC
        tristate "Sun Virtual Disk Client support"
        depends on SUN_LDOMS
@@ -512,19 +479,7 @@ config VIRTIO_BLK_SCSI
          Enable support for SCSI passthrough (e.g. the SG_IO ioctl) on
          virtio-blk devices.  This is only supported for the legacy
          virtio protocol and not enabled by default by any hypervisor.
-         Your probably want to virtio-scsi instead.
-
-config BLK_DEV_HD
-       bool "Very old hard disk (MFM/RLL/IDE) driver"
-       depends on HAVE_IDE
-       depends on !ARM || ARCH_RPC || BROKEN
-       help
-         This is a very old hard disk driver that lacks the enhanced
-         functionality of the newer ones.
-
-         It is required for systems with ancient MFM/RLL/ESDI drives.
-
-         If unsure, say N.
+         You probably want to use virtio-scsi instead.
 
 config BLK_DEV_RBD
        tristate "Rados block device (RBD)"
index 1e9661e..ec8c368 100644 (file)
@@ -19,10 +19,8 @@ obj-$(CONFIG_BLK_CPQ_CISS_DA)  += cciss.o
 obj-$(CONFIG_BLK_DEV_DAC960)   += DAC960.o
 obj-$(CONFIG_XILINX_SYSACE)    += xsysace.o
 obj-$(CONFIG_CDROM_PKTCDVD)    += pktcdvd.o
-obj-$(CONFIG_MG_DISK)          += mg_disk.o
 obj-$(CONFIG_SUNVDC)           += sunvdc.o
 obj-$(CONFIG_BLK_DEV_SKD)      += skd.o
-obj-$(CONFIG_BLK_DEV_OSD)      += osdblk.o
 
 obj-$(CONFIG_BLK_DEV_UMEM)     += umem.o
 obj-$(CONFIG_BLK_DEV_NBD)      += nbd.o
@@ -30,7 +28,6 @@ obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o
 obj-$(CONFIG_VIRTIO_BLK)       += virtio_blk.o
 
 obj-$(CONFIG_BLK_DEV_SX8)      += sx8.o
-obj-$(CONFIG_BLK_DEV_HD)       += hd.o
 
 obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      += xen-blkfront.o
 obj-$(CONFIG_XEN_BLKDEV_BACKEND)       += xen-blkback/
index 2104b1b..fa69ecd 100644 (file)
@@ -617,12 +617,12 @@ static void fd_error( void )
        if (!fd_request)
                return;
 
-       fd_request->errors++;
-       if (fd_request->errors >= MAX_ERRORS) {
+       fd_request->error_count++;
+       if (fd_request->error_count >= MAX_ERRORS) {
                printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive );
                fd_end_request_cur(-EIO);
        }
-       else if (fd_request->errors == RECALIBRATE_ERRORS) {
+       else if (fd_request->error_count == RECALIBRATE_ERRORS) {
                printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive );
                if (SelectedDrive != -1)
                        SUD.track = -1;
@@ -1386,7 +1386,7 @@ static void setup_req_params( int drive )
        ReqData = ReqBuffer + 512 * ReqCnt;
 
        if (UseTrackbuffer)
-               read_track = (ReqCmd == READ && fd_request->errors == 0);
+               read_track = (ReqCmd == READ && fd_request->error_count == 0);
        else
                read_track = 0;
 
@@ -1409,8 +1409,10 @@ static struct request *set_next_request(void)
                        fdc_queue = 0;
                if (q) {
                        rq = blk_fetch_request(q);
-                       if (rq)
+                       if (rq) {
+                               rq->error_count = 0;
                                break;
+                       }
                }
        } while (fdc_queue != old_pos);
 
index 3adc32a..4ec84d5 100644 (file)
@@ -134,28 +134,6 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
        return page;
 }
 
-static void brd_free_page(struct brd_device *brd, sector_t sector)
-{
-       struct page *page;
-       pgoff_t idx;
-
-       spin_lock(&brd->brd_lock);
-       idx = sector >> PAGE_SECTORS_SHIFT;
-       page = radix_tree_delete(&brd->brd_pages, idx);
-       spin_unlock(&brd->brd_lock);
-       if (page)
-               __free_page(page);
-}
-
-static void brd_zero_page(struct brd_device *brd, sector_t sector)
-{
-       struct page *page;
-
-       page = brd_lookup_page(brd, sector);
-       if (page)
-               clear_highpage(page);
-}
-
 /*
  * Free all backing store pages and radix tree. This must only be called when
  * there are no other users of the device.
@@ -212,24 +190,6 @@ static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
        return 0;
 }
 
-static void discard_from_brd(struct brd_device *brd,
-                       sector_t sector, size_t n)
-{
-       while (n >= PAGE_SIZE) {
-               /*
-                * Don't want to actually discard pages here because
-                * re-allocating the pages can result in writeback
-                * deadlocks under heavy load.
-                */
-               if (0)
-                       brd_free_page(brd, sector);
-               else
-                       brd_zero_page(brd, sector);
-               sector += PAGE_SIZE >> SECTOR_SHIFT;
-               n -= PAGE_SIZE;
-       }
-}
-
 /*
  * Copy n bytes from src to the brd starting at sector. Does not sleep.
  */
@@ -338,14 +298,6 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
        if (bio_end_sector(bio) > get_capacity(bdev->bd_disk))
                goto io_error;
 
-       if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
-               if (sector & ((PAGE_SIZE >> SECTOR_SHIFT) - 1) ||
-                   bio->bi_iter.bi_size & ~PAGE_MASK)
-                       goto io_error;
-               discard_from_brd(brd, sector, bio->bi_iter.bi_size);
-               goto out;
-       }
-
        bio_for_each_segment(bvec, bio, iter) {
                unsigned int len = bvec.bv_len;
                int err;
@@ -357,7 +309,6 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
                sector += len >> SECTOR_SHIFT;
        }
 
-out:
        bio_endio(bio);
        return BLK_QC_T_NONE;
 io_error:
@@ -464,11 +415,6 @@ static struct brd_device *brd_alloc(int i)
         *  is harmless)
         */
        blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
-
-       brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
-       blk_queue_max_discard_sectors(brd->brd_queue, UINT_MAX);
-       brd->brd_queue->limits.discard_zeroes_data = 1;
-       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
 #ifdef CONFIG_BLK_DEV_RAM_DAX
        queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
 #endif
index 8e1a455..cd37550 100644 (file)
@@ -1864,8 +1864,7 @@ static void cciss_softirq_done(struct request *rq)
        /* set the residual count for pc requests */
        if (blk_rq_is_passthrough(rq))
                scsi_req(rq)->resid_len = c->err_info->ResidualCnt;
-
-       blk_end_request_all(rq, (rq->errors == 0) ? 0 : -EIO);
+       blk_end_request_all(rq, scsi_req(rq)->result ? -EIO : 0);
 
        spin_lock_irqsave(&h->lock, flags);
        cmd_free(h, c);
@@ -3140,18 +3139,19 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 {
        int retry_cmd = 0;
        struct request *rq = cmd->rq;
+       struct scsi_request *sreq = scsi_req(rq);
 
-       rq->errors = 0;
+       sreq->result = 0;
 
        if (timeout)
-               rq->errors = make_status_bytes(0, 0, 0, DRIVER_TIMEOUT);
+               sreq->result = make_status_bytes(0, 0, 0, DRIVER_TIMEOUT);
 
        if (cmd->err_info->CommandStatus == 0)  /* no error has occurred */
                goto after_error_processing;
 
        switch (cmd->err_info->CommandStatus) {
        case CMD_TARGET_STATUS:
-               rq->errors = evaluate_target_status(h, cmd, &retry_cmd);
+               sreq->result = evaluate_target_status(h, cmd, &retry_cmd);
                break;
        case CMD_DATA_UNDERRUN:
                if (!blk_rq_is_passthrough(cmd->rq)) {
@@ -3169,7 +3169,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
        case CMD_INVALID:
                dev_warn(&h->pdev->dev, "cciss: cmd %p is "
                       "reported invalid\n", cmd);
-               rq->errors = make_status_bytes(SAM_STAT_GOOD,
+               sreq->result = make_status_bytes(SAM_STAT_GOOD,
                        cmd->err_info->CommandStatus, DRIVER_OK,
                        blk_rq_is_passthrough(cmd->rq) ?
                                DID_PASSTHROUGH : DID_ERROR);
@@ -3177,7 +3177,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
        case CMD_PROTOCOL_ERR:
                dev_warn(&h->pdev->dev, "cciss: cmd %p has "
                       "protocol error\n", cmd);
-               rq->errors = make_status_bytes(SAM_STAT_GOOD,
+               sreq->result = make_status_bytes(SAM_STAT_GOOD,
                        cmd->err_info->CommandStatus, DRIVER_OK,
                        blk_rq_is_passthrough(cmd->rq) ?
                                DID_PASSTHROUGH : DID_ERROR);
@@ -3185,7 +3185,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
        case CMD_HARDWARE_ERR:
                dev_warn(&h->pdev->dev, "cciss: cmd %p had "
                       " hardware error\n", cmd);
-               rq->errors = make_status_bytes(SAM_STAT_GOOD,
+               sreq->result = make_status_bytes(SAM_STAT_GOOD,
                        cmd->err_info->CommandStatus, DRIVER_OK,
                        blk_rq_is_passthrough(cmd->rq) ?
                                DID_PASSTHROUGH : DID_ERROR);
@@ -3193,7 +3193,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
        case CMD_CONNECTION_LOST:
                dev_warn(&h->pdev->dev, "cciss: cmd %p had "
                       "connection lost\n", cmd);
-               rq->errors = make_status_bytes(SAM_STAT_GOOD,
+               sreq->result = make_status_bytes(SAM_STAT_GOOD,
                        cmd->err_info->CommandStatus, DRIVER_OK,
                        blk_rq_is_passthrough(cmd->rq) ?
                                DID_PASSTHROUGH : DID_ERROR);
@@ -3201,7 +3201,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
        case CMD_ABORTED:
                dev_warn(&h->pdev->dev, "cciss: cmd %p was "
                       "aborted\n", cmd);
-               rq->errors = make_status_bytes(SAM_STAT_GOOD,
+               sreq->result = make_status_bytes(SAM_STAT_GOOD,
                        cmd->err_info->CommandStatus, DRIVER_OK,
                        blk_rq_is_passthrough(cmd->rq) ?
                                DID_PASSTHROUGH : DID_ABORT);
@@ -3209,7 +3209,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
        case CMD_ABORT_FAILED:
                dev_warn(&h->pdev->dev, "cciss: cmd %p reports "
                       "abort failed\n", cmd);
-               rq->errors = make_status_bytes(SAM_STAT_GOOD,
+               sreq->result = make_status_bytes(SAM_STAT_GOOD,
                        cmd->err_info->CommandStatus, DRIVER_OK,
                        blk_rq_is_passthrough(cmd->rq) ?
                                DID_PASSTHROUGH : DID_ERROR);
@@ -3224,21 +3224,21 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
                } else
                        dev_warn(&h->pdev->dev,
                                "%p retried too many times\n", cmd);
-               rq->errors = make_status_bytes(SAM_STAT_GOOD,
+               sreq->result = make_status_bytes(SAM_STAT_GOOD,
                        cmd->err_info->CommandStatus, DRIVER_OK,
                        blk_rq_is_passthrough(cmd->rq) ?
                                DID_PASSTHROUGH : DID_ABORT);
                break;
        case CMD_TIMEOUT:
                dev_warn(&h->pdev->dev, "cmd %p timedout\n", cmd);
-               rq->errors = make_status_bytes(SAM_STAT_GOOD,
+               sreq->result = make_status_bytes(SAM_STAT_GOOD,
                        cmd->err_info->CommandStatus, DRIVER_OK,
                        blk_rq_is_passthrough(cmd->rq) ?
                                DID_PASSTHROUGH : DID_ERROR);
                break;
        case CMD_UNABORTABLE:
                dev_warn(&h->pdev->dev, "cmd %p unabortable\n", cmd);
-               rq->errors = make_status_bytes(SAM_STAT_GOOD,
+               sreq->result = make_status_bytes(SAM_STAT_GOOD,
                        cmd->err_info->CommandStatus, DRIVER_OK,
                        blk_rq_is_passthrough(cmd->rq) ?
                                DID_PASSTHROUGH : DID_ERROR);
@@ -3247,7 +3247,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
                dev_warn(&h->pdev->dev, "cmd %p returned "
                       "unknown status %x\n", cmd,
                       cmd->err_info->CommandStatus);
-               rq->errors = make_status_bytes(SAM_STAT_GOOD,
+               sreq->result = make_status_bytes(SAM_STAT_GOOD,
                        cmd->err_info->CommandStatus, DRIVER_OK,
                        blk_rq_is_passthrough(cmd->rq) ?
                                DID_PASSTHROUGH : DID_ERROR);
@@ -3380,9 +3380,9 @@ static void do_cciss_request(struct request_queue *q)
                if (dma_mapping_error(&h->pdev->dev, temp64.val)) {
                        dev_warn(&h->pdev->dev,
                                "%s: error mapping page for DMA\n", __func__);
-                       creq->errors = make_status_bytes(SAM_STAT_GOOD,
-                                                       0, DRIVER_OK,
-                                                       DID_SOFT_ERROR);
+                       scsi_req(creq)->result =
+                               make_status_bytes(SAM_STAT_GOOD, 0, DRIVER_OK,
+                                                 DID_SOFT_ERROR);
                        cmd_free(h, c);
                        return;
                }
@@ -3395,9 +3395,9 @@ static void do_cciss_request(struct request_queue *q)
                if (cciss_map_sg_chain_block(h, c, h->cmd_sg_list[c->cmdindex],
                        (seg - (h->max_cmd_sgentries - 1)) *
                                sizeof(SGDescriptor_struct))) {
-                       creq->errors = make_status_bytes(SAM_STAT_GOOD,
-                                                       0, DRIVER_OK,
-                                                       DID_SOFT_ERROR);
+                       scsi_req(creq)->result =
+                               make_status_bytes(SAM_STAT_GOOD, 0, DRIVER_OK,
+                                                 DID_SOFT_ERROR);
                        cmd_free(h, c);
                        return;
                }
index de5c3ee..494837e 100644 (file)
@@ -236,9 +236,6 @@ static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_re
        seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL");
        seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
        seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
-
-       if (f & EE_IS_TRIM)
-               __seq_print_rq_state_bit(m, f & EE_IS_TRIM_USE_ZEROOUT, &sep, "zero-out", "trim");
        seq_print_rq_state_bit(m, f & EE_WRITE_SAME, &sep, "write-same");
        seq_putc(m, '\n');
 }
index 724d1c5..d5da45b 100644 (file)
@@ -437,9 +437,6 @@ enum {
 
        /* is this a TRIM aka REQ_DISCARD? */
        __EE_IS_TRIM,
-       /* our lower level cannot handle trim,
-        * and we want to fall back to zeroout instead */
-       __EE_IS_TRIM_USE_ZEROOUT,
 
        /* In case a barrier failed,
         * we need to resubmit without the barrier flag. */
@@ -482,7 +479,6 @@ enum {
 #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
 #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
 #define EE_IS_TRIM             (1<<__EE_IS_TRIM)
-#define EE_IS_TRIM_USE_ZEROOUT (1<<__EE_IS_TRIM_USE_ZEROOUT)
 #define EE_RESUBMITTED         (1<<__EE_RESUBMITTED)
 #define EE_WAS_ERROR           (1<<__EE_WAS_ERROR)
 #define EE_HAS_DIGEST          (1<<__EE_HAS_DIGEST)
@@ -1561,8 +1557,6 @@ extern void start_resync_timer_fn(unsigned long data);
 extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
 
 /* drbd_receiver.c */
-extern int drbd_issue_discard_or_zero_out(struct drbd_device *device,
-               sector_t start, unsigned int nr_sectors, bool discard);
 extern int drbd_receiver(struct drbd_thread *thi);
 extern int drbd_ack_receiver(struct drbd_thread *thi);
 extern void drbd_send_ping_wf(struct work_struct *ws);
index 92c60cb..84455c3 100644 (file)
@@ -931,7 +931,6 @@ void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct r
                p->qlim->io_min = cpu_to_be32(queue_io_min(q));
                p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
                p->qlim->discard_enabled = blk_queue_discard(q);
-               p->qlim->discard_zeroes_data = queue_discard_zeroes_data(q);
                p->qlim->write_same_capable = !!q->limits.max_write_same_sectors;
        } else {
                q = device->rq_queue;
@@ -941,7 +940,6 @@ void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct r
                p->qlim->io_min = cpu_to_be32(queue_io_min(q));
                p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
                p->qlim->discard_enabled = 0;
-               p->qlim->discard_zeroes_data = 0;
                p->qlim->write_same_capable = 0;
        }
 }
@@ -1668,7 +1666,8 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection,
                        (bio->bi_opf & REQ_FUA ? DP_FUA : 0) |
                        (bio->bi_opf & REQ_PREFLUSH ? DP_FLUSH : 0) |
                        (bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) |
-                       (bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0);
+                       (bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0) |
+                       (bio_op(bio) == REQ_OP_WRITE_ZEROES ? DP_DISCARD : 0);
        else
                return bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : 0;
 }
index 908c704..02255a0 100644 (file)
@@ -1199,10 +1199,6 @@ static void decide_on_discard_support(struct drbd_device *device,
        struct drbd_connection *connection = first_peer_device(device)->connection;
        bool can_do = b ? blk_queue_discard(b) : true;
 
-       if (can_do && b && !b->limits.discard_zeroes_data && !discard_zeroes_if_aligned) {
-               can_do = false;
-               drbd_info(device, "discard_zeroes_data=0 and discard_zeroes_if_aligned=no: disabling discards\n");
-       }
        if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) {
                can_do = false;
                drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n");
@@ -1217,10 +1213,12 @@ static void decide_on_discard_support(struct drbd_device *device,
                blk_queue_discard_granularity(q, 512);
                q->limits.max_discard_sectors = drbd_max_discard_sectors(connection);
                queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+               q->limits.max_write_zeroes_sectors = drbd_max_discard_sectors(connection);
        } else {
                queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
                blk_queue_discard_granularity(q, 0);
                q->limits.max_discard_sectors = 0;
+               q->limits.max_write_zeroes_sectors = 0;
        }
 }
 
@@ -1482,8 +1480,7 @@ static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *dis
        if (disk_conf->al_extents > drbd_al_extents_max(nbc))
                disk_conf->al_extents = drbd_al_extents_max(nbc);
 
-       if (!blk_queue_discard(q)
-           || (!q->limits.discard_zeroes_data && !disk_conf->discard_zeroes_if_aligned)) {
+       if (!blk_queue_discard(q)) {
                if (disk_conf->rs_discard_granularity) {
                        disk_conf->rs_discard_granularity = 0; /* disable feature */
                        drbd_info(device, "rs_discard_granularity feature disabled\n");
index aa6bf96..1b0a2be 100644 (file)
@@ -1448,105 +1448,14 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
                drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
 }
 
-/*
- * We *may* ignore the discard-zeroes-data setting, if so configured.
- *
- * Assumption is that it "discard_zeroes_data=0" is only because the backend
- * may ignore partial unaligned discards.
- *
- * LVM/DM thin as of at least
- *   LVM version:     2.02.115(2)-RHEL7 (2015-01-28)
- *   Library version: 1.02.93-RHEL7 (2015-01-28)
- *   Driver version:  4.29.0
- * still behaves this way.
- *
- * For unaligned (wrt. alignment and granularity) or too small discards,
- * we zero-out the initial (and/or) trailing unaligned partial chunks,
- * but discard all the aligned full chunks.
- *
- * At least for LVM/DM thin, the result is effectively "discard_zeroes_data=1".
- */
-int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, bool discard)
-{
-       struct block_device *bdev = device->ldev->backing_bdev;
-       struct request_queue *q = bdev_get_queue(bdev);
-       sector_t tmp, nr;
-       unsigned int max_discard_sectors, granularity;
-       int alignment;
-       int err = 0;
-
-       if (!discard)
-               goto zero_out;
-
-       /* Zero-sector (unknown) and one-sector granularities are the same.  */
-       granularity = max(q->limits.discard_granularity >> 9, 1U);
-       alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
-
-       max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22));
-       max_discard_sectors -= max_discard_sectors % granularity;
-       if (unlikely(!max_discard_sectors))
-               goto zero_out;
-
-       if (nr_sectors < granularity)
-               goto zero_out;
-
-       tmp = start;
-       if (sector_div(tmp, granularity) != alignment) {
-               if (nr_sectors < 2*granularity)
-                       goto zero_out;
-               /* start + gran - (start + gran - align) % gran */
-               tmp = start + granularity - alignment;
-               tmp = start + granularity - sector_div(tmp, granularity);
-
-               nr = tmp - start;
-               err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0);
-               nr_sectors -= nr;
-               start = tmp;
-       }
-       while (nr_sectors >= granularity) {
-               nr = min_t(sector_t, nr_sectors, max_discard_sectors);
-               err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0);
-               nr_sectors -= nr;
-               start += nr;
-       }
- zero_out:
-       if (nr_sectors) {
-               err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, 0);
-       }
-       return err != 0;
-}
-
-static bool can_do_reliable_discards(struct drbd_device *device)
-{
-       struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
-       struct disk_conf *dc;
-       bool can_do;
-
-       if (!blk_queue_discard(q))
-               return false;
-
-       if (q->limits.discard_zeroes_data)
-               return true;
-
-       rcu_read_lock();
-       dc = rcu_dereference(device->ldev->disk_conf);
-       can_do = dc->discard_zeroes_if_aligned;
-       rcu_read_unlock();
-       return can_do;
-}
-
 static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req)
 {
-       /* If the backend cannot discard, or does not guarantee
-        * read-back zeroes in discarded ranges, we fall back to
-        * zero-out.  Unless configuration specifically requested
-        * otherwise. */
-       if (!can_do_reliable_discards(device))
-               peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
+       struct block_device *bdev = device->ldev->backing_bdev;
 
-       if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector,
-           peer_req->i.size >> 9, !(peer_req->flags & EE_IS_TRIM_USE_ZEROOUT)))
+       if (blkdev_issue_zeroout(bdev, peer_req->i.sector, peer_req->i.size >> 9,
+                       GFP_NOIO, 0))
                peer_req->flags |= EE_WAS_ERROR;
+
        drbd_endio_write_sec_final(peer_req);
 }
 
@@ -2376,7 +2285,7 @@ static unsigned long wire_flags_to_bio_flags(u32 dpf)
 static unsigned long wire_flags_to_bio_op(u32 dpf)
 {
        if (dpf & DP_DISCARD)
-               return REQ_OP_DISCARD;
+               return REQ_OP_WRITE_ZEROES;
        else
                return REQ_OP_WRITE;
 }
@@ -2567,7 +2476,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
        op_flags = wire_flags_to_bio_flags(dp_flags);
        if (pi->cmd == P_TRIM) {
                D_ASSERT(peer_device, peer_req->i.size > 0);
-               D_ASSERT(peer_device, op == REQ_OP_DISCARD);
+               D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES);
                D_ASSERT(peer_device, peer_req->pages == NULL);
        } else if (peer_req->pages == NULL) {
                D_ASSERT(device, peer_req->i.size == 0);
@@ -4880,7 +4789,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac
 
        if (get_ldev(device)) {
                struct drbd_peer_request *peer_req;
-               const int op = REQ_OP_DISCARD;
+               const int op = REQ_OP_WRITE_ZEROES;
 
                peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
                                               size, 0, GFP_NOIO);
index 652114a..b5730e1 100644 (file)
@@ -59,6 +59,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio
        drbd_req_make_private_bio(req, bio_src);
        req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0)
                      | (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0)
+                     | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_UNMAP : 0)
                      | (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0);
        req->device = device;
        req->master_bio = bio_src;
@@ -1148,10 +1149,10 @@ static int drbd_process_write_request(struct drbd_request *req)
 
 static void drbd_process_discard_req(struct drbd_request *req)
 {
-       int err = drbd_issue_discard_or_zero_out(req->device,
-                               req->i.sector, req->i.size >> 9, true);
+       struct block_device *bdev = req->device->ldev->backing_bdev;
 
-       if (err)
+       if (blkdev_issue_zeroout(bdev, req->i.sector, req->i.size >> 9,
+                       GFP_NOIO, 0))
                req->private_bio->bi_error = -EIO;
        bio_endio(req->private_bio);
 }
@@ -1180,7 +1181,8 @@ drbd_submit_req_private_bio(struct drbd_request *req)
        if (get_ldev(device)) {
                if (drbd_insert_fault(device, type))
                        bio_io_error(bio);
-               else if (bio_op(bio) == REQ_OP_DISCARD)
+               else if (bio_op(bio) == REQ_OP_WRITE_ZEROES ||
+                        bio_op(bio) == REQ_OP_DISCARD)
                        drbd_process_discard_req(req);
                else
                        generic_make_request(bio);
@@ -1234,7 +1236,8 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
        _drbd_start_io_acct(device, req);
 
        /* process discards always from our submitter thread */
-       if (bio_op(bio) & REQ_OP_DISCARD)
+       if ((bio_op(bio) & REQ_OP_WRITE_ZEROES) ||
+           (bio_op(bio) & REQ_OP_DISCARD))
                goto queue_for_submitter_thread;
 
        if (rw == WRITE && req->private_bio && req->i.size
index 3bff33f..1afcb4e 100644 (file)
@@ -174,7 +174,8 @@ void drbd_peer_request_endio(struct bio *bio)
        struct drbd_peer_request *peer_req = bio->bi_private;
        struct drbd_device *device = peer_req->peer_device->device;
        bool is_write = bio_data_dir(bio) == WRITE;
-       bool is_discard = !!(bio_op(bio) == REQ_OP_DISCARD);
+       bool is_discard = bio_op(bio) == REQ_OP_WRITE_ZEROES ||
+                         bio_op(bio) == REQ_OP_DISCARD;
 
        if (bio->bi_error && __ratelimit(&drbd_ratelimit_state))
                drbd_warn(device, "%s: error=%d s=%llus\n",
@@ -249,6 +250,7 @@ void drbd_request_endio(struct bio *bio)
        /* to avoid recursion in __req_mod */
        if (unlikely(bio->bi_error)) {
                switch (bio_op(bio)) {
+               case REQ_OP_WRITE_ZEROES:
                case REQ_OP_DISCARD:
                        if (bio->bi_error == -EOPNOTSUPP)
                                what = DISCARD_COMPLETED_NOTSUPP;
index 45b4384..60d4c76 100644 (file)
@@ -2805,8 +2805,10 @@ static int set_next_request(void)
                        fdc_queue = 0;
                if (q) {
                        current_req = blk_fetch_request(q);
-                       if (current_req)
+                       if (current_req) {
+                               current_req->error_count = 0;
                                break;
+                       }
                }
        } while (fdc_queue != old_pos);
 
@@ -2866,7 +2868,7 @@ do_request:
                _floppy = floppy_type + DP->autodetect[DRS->probed_format];
        } else
                probing = 0;
-       errors = &(current_req->errors);
+       errors = &(current_req->error_count);
        tmp = make_raw_rw_request();
        if (tmp < 2) {
                request_done(tmp);
@@ -4207,9 +4209,7 @@ static int __init do_floppy_init(void)
                disks[drive]->fops = &floppy_fops;
                sprintf(disks[drive]->disk_name, "fd%d", drive);
 
-               init_timer(&motor_off_timer[drive]);
-               motor_off_timer[drive].data = drive;
-               motor_off_timer[drive].function = motor_off_callback;
+               setup_timer(&motor_off_timer[drive], motor_off_callback, drive);
        }
 
        err = register_blkdev(FLOPPY_MAJOR, "fd");
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
deleted file mode 100644 (file)
index 6043648..0000000
+++ /dev/null
@@ -1,803 +0,0 @@
-/*
- *  Copyright (C) 1991, 1992  Linus Torvalds
- *
- * This is the low-level hd interrupt support. It traverses the
- * request-list, using interrupts to jump between functions. As
- * all the functions are called within interrupts, we may not
- * sleep. Special care is recommended.
- *
- *  modified by Drew Eckhardt to check nr of hd's from the CMOS.
- *
- *  Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug
- *  in the early extended-partition checks and added DM partitions
- *
- *  IRQ-unmask, drive-id, multiple-mode, support for ">16 heads",
- *  and general streamlining by Mark Lord.
- *
- *  Removed 99% of above. Use Mark's ide driver for those options.
- *  This is now a lightweight ST-506 driver. (Paul Gortmaker)
- *
- *  Modified 1995 Russell King for ARM processor.
- *
- *  Bugfix: max_sectors must be <= 255 or the wheels tend to come
- *  off in a hurry once you queue things up - Paul G. 02/2001
- */
-
-/* Uncomment the following if you want verbose error reports. */
-/* #define VERBOSE_ERRORS */
-
-#include <linux/blkdev.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/interrupt.h>
-#include <linux/timer.h>
-#include <linux/fs.h>
-#include <linux/kernel.h>
-#include <linux/genhd.h>
-#include <linux/string.h>
-#include <linux/ioport.h>
-#include <linux/init.h>
-#include <linux/blkpg.h>
-#include <linux/ata.h>
-#include <linux/hdreg.h>
-
-#define HD_IRQ 14
-
-#define REALLY_SLOW_IO
-#include <asm/io.h>
-#include <linux/uaccess.h>
-
-#ifdef __arm__
-#undef  HD_IRQ
-#endif
-#include <asm/irq.h>
-#ifdef __arm__
-#define HD_IRQ IRQ_HARDDISK
-#endif
-
-/* Hd controller regster ports */
-
-#define HD_DATA                0x1f0           /* _CTL when writing */
-#define HD_ERROR       0x1f1           /* see err-bits */
-#define HD_NSECTOR     0x1f2           /* nr of sectors to read/write */
-#define HD_SECTOR      0x1f3           /* starting sector */
-#define HD_LCYL                0x1f4           /* starting cylinder */
-#define HD_HCYL                0x1f5           /* high byte of starting cyl */
-#define HD_CURRENT     0x1f6           /* 101dhhhh , d=drive, hhhh=head */
-#define HD_STATUS      0x1f7           /* see status-bits */
-#define HD_FEATURE     HD_ERROR        /* same io address, read=error, write=feature */
-#define HD_PRECOMP     HD_FEATURE      /* obsolete use of this port - predates IDE */
-#define HD_COMMAND     HD_STATUS       /* same io address, read=status, write=cmd */
-
-#define HD_CMD         0x3f6           /* used for resets */
-#define HD_ALTSTATUS   0x3f6           /* same as HD_STATUS but doesn't clear irq */
-
-/* Bits of HD_STATUS */
-#define ERR_STAT               0x01
-#define INDEX_STAT             0x02
-#define ECC_STAT               0x04    /* Corrected error */
-#define DRQ_STAT               0x08
-#define SEEK_STAT              0x10
-#define SERVICE_STAT           SEEK_STAT
-#define WRERR_STAT             0x20
-#define READY_STAT             0x40
-#define BUSY_STAT              0x80
-
-/* Bits for HD_ERROR */
-#define MARK_ERR               0x01    /* Bad address mark */
-#define TRK0_ERR               0x02    /* couldn't find track 0 */
-#define ABRT_ERR               0x04    /* Command aborted */
-#define MCR_ERR                        0x08    /* media change request */
-#define ID_ERR                 0x10    /* ID field not found */
-#define MC_ERR                 0x20    /* media changed */
-#define ECC_ERR                        0x40    /* Uncorrectable ECC error */
-#define BBD_ERR                        0x80    /* pre-EIDE meaning:  block marked bad */
-#define ICRC_ERR               0x80    /* new meaning:  CRC error during transfer */
-
-static DEFINE_SPINLOCK(hd_lock);
-static struct request_queue *hd_queue;
-static struct request *hd_req;
-
-#define TIMEOUT_VALUE  (6*HZ)
-#define        HD_DELAY        0
-
-#define MAX_ERRORS     16      /* Max read/write errors/sector */
-#define RESET_FREQ      8      /* Reset controller every 8th retry */
-#define RECAL_FREQ      4      /* Recalibrate every 4th retry */
-#define MAX_HD         2
-
-#define STAT_OK                (READY_STAT|SEEK_STAT)
-#define OK_STATUS(s)   (((s)&(STAT_OK|(BUSY_STAT|WRERR_STAT|ERR_STAT)))==STAT_OK)
-
-static void recal_intr(void);
-static void bad_rw_intr(void);
-
-static int reset;
-static int hd_error;
-
-/*
- *  This struct defines the HD's and their types.
- */
-struct hd_i_struct {
-       unsigned int head, sect, cyl, wpcom, lzone, ctl;
-       int unit;
-       int recalibrate;
-       int special_op;
-};
-
-#ifdef HD_TYPE
-static struct hd_i_struct hd_info[] = { HD_TYPE };
-static int NR_HD = ARRAY_SIZE(hd_info);
-#else
-static struct hd_i_struct hd_info[MAX_HD];
-static int NR_HD;
-#endif
-
-static struct gendisk *hd_gendisk[MAX_HD];
-
-static struct timer_list device_timer;
-
-#define TIMEOUT_VALUE (6*HZ)
-
-#define SET_TIMER                                                      \
-       do {                                                            \
-               mod_timer(&device_timer, jiffies + TIMEOUT_VALUE);      \
-       } while (0)
-
-static void (*do_hd)(void) = NULL;
-#define SET_HANDLER(x) \
-if ((do_hd = (x)) != NULL) \
-       SET_TIMER; \
-else \
-       del_timer(&device_timer);
-
-
-#if (HD_DELAY > 0)
-
-#include <linux/i8253.h>
-
-unsigned long last_req;
-
-unsigned long read_timer(void)
-{
-       unsigned long t, flags;
-       int i;
-
-       raw_spin_lock_irqsave(&i8253_lock, flags);
-       t = jiffies * 11932;
-       outb_p(0, 0x43);
-       i = inb_p(0x40);
-       i |= inb(0x40) << 8;
-       raw_spin_unlock_irqrestore(&i8253_lock, flags);
-       return(t - i);
-}
-#endif
-
-static void __init hd_setup(char *str, int *ints)
-{
-       int hdind = 0;
-
-       if (ints[0] != 3)
-               return;
-       if (hd_info[0].head != 0)
-               hdind = 1;
-       hd_info[hdind].head = ints[2];
-       hd_info[hdind].sect = ints[3];
-       hd_info[hdind].cyl = ints[1];
-       hd_info[hdind].wpcom = 0;
-       hd_info[hdind].lzone = ints[1];
-       hd_info[hdind].ctl = (ints[2] > 8 ? 8 : 0);
-       NR_HD = hdind+1;
-}
-
-static bool hd_end_request(int err, unsigned int bytes)
-{
-       if (__blk_end_request(hd_req, err, bytes))
-               return true;
-       hd_req = NULL;
-       return false;
-}
-
-static bool hd_end_request_cur(int err)
-{
-       return hd_end_request(err, blk_rq_cur_bytes(hd_req));
-}
-
-static void dump_status(const char *msg, unsigned int stat)
-{
-       char *name = "hd?";
-       if (hd_req)
-               name = hd_req->rq_disk->disk_name;
-
-#ifdef VERBOSE_ERRORS
-       printk("%s: %s: status=0x%02x { ", name, msg, stat & 0xff);
-       if (stat & BUSY_STAT)   printk("Busy ");
-       if (stat & READY_STAT)  printk("DriveReady ");
-       if (stat & WRERR_STAT)  printk("WriteFault ");
-       if (stat & SEEK_STAT)   printk("SeekComplete ");
-       if (stat & DRQ_STAT)    printk("DataRequest ");
-       if (stat & ECC_STAT)    printk("CorrectedError ");
-       if (stat & INDEX_STAT)  printk("Index ");
-       if (stat & ERR_STAT)    printk("Error ");
-       printk("}\n");
-       if ((stat & ERR_STAT) == 0) {
-               hd_error = 0;
-       } else {
-               hd_error = inb(HD_ERROR);
-               printk("%s: %s: error=0x%02x { ", name, msg, hd_error & 0xff);
-               if (hd_error & BBD_ERR)         printk("BadSector ");
-               if (hd_error & ECC_ERR)         printk("UncorrectableError ");
-               if (hd_error & ID_ERR)          printk("SectorIdNotFound ");
-               if (hd_error & ABRT_ERR)        printk("DriveStatusError ");
-               if (hd_error & TRK0_ERR)        printk("TrackZeroNotFound ");
-               if (hd_error & MARK_ERR)        printk("AddrMarkNotFound ");
-               printk("}");
-               if (hd_error & (BBD_ERR|ECC_ERR|ID_ERR|MARK_ERR)) {
-                       printk(", CHS=%d/%d/%d", (inb(HD_HCYL)<<8) + inb(HD_LCYL),
-                               inb(HD_CURRENT) & 0xf, inb(HD_SECTOR));
-                       if (hd_req)
-                               printk(", sector=%ld", blk_rq_pos(hd_req));
-               }
-               printk("\n");
-       }
-#else
-       printk("%s: %s: status=0x%02x.\n", name, msg, stat & 0xff);
-       if ((stat & ERR_STAT) == 0) {
-               hd_error = 0;
-       } else {
-               hd_error = inb(HD_ERROR);
-               printk("%s: %s: error=0x%02x.\n", name, msg, hd_error & 0xff);
-       }
-#endif
-}
-
-static void check_status(void)
-{
-       int i = inb_p(HD_STATUS);
-
-       if (!OK_STATUS(i)) {
-               dump_status("check_status", i);
-               bad_rw_intr();
-       }
-}
-
-static int controller_busy(void)
-{
-       int retries = 100000;
-       unsigned char status;
-
-       do {
-               status = inb_p(HD_STATUS);
-       } while ((status & BUSY_STAT) && --retries);
-       return status;
-}
-
-static int status_ok(void)
-{
-       unsigned char status = inb_p(HD_STATUS);
-
-       if (status & BUSY_STAT)
-               return 1;       /* Ancient, but does it make sense??? */
-       if (status & WRERR_STAT)
-               return 0;
-       if (!(status & READY_STAT))
-               return 0;
-       if (!(status & SEEK_STAT))
-               return 0;
-       return 1;
-}
-
-static int controller_ready(unsigned int drive, unsigned int head)
-{
-       int retry = 100;
-
-       do {
-               if (controller_busy() & BUSY_STAT)
-                       return 0;
-               outb_p(0xA0 | (drive<<4) | head, HD_CURRENT);
-               if (status_ok())
-                       return 1;
-       } while (--retry);
-       return 0;
-}
-
-static void hd_out(struct hd_i_struct *disk,
-                  unsigned int nsect,
-                  unsigned int sect,
-                  unsigned int head,
-                  unsigned int cyl,
-                  unsigned int cmd,
-                  void (*intr_addr)(void))
-{
-       unsigned short port;
-
-#if (HD_DELAY > 0)
-       while (read_timer() - last_req < HD_DELAY)
-               /* nothing */;
-#endif
-       if (reset)
-               return;
-       if (!controller_ready(disk->unit, head)) {
-               reset = 1;
-               return;
-       }
-       SET_HANDLER(intr_addr);
-       outb_p(disk->ctl, HD_CMD);
-       port = HD_DATA;
-       outb_p(disk->wpcom >> 2, ++port);
-       outb_p(nsect, ++port);
-       outb_p(sect, ++port);
-       outb_p(cyl, ++port);
-       outb_p(cyl >> 8, ++port);
-       outb_p(0xA0 | (disk->unit << 4) | head, ++port);
-       outb_p(cmd, ++port);
-}
-
-static void hd_request (void);
-
-static int drive_busy(void)
-{
-       unsigned int i;
-       unsigned char c;
-
-       for (i = 0; i < 500000 ; i++) {
-               c = inb_p(HD_STATUS);
-               if ((c & (BUSY_STAT | READY_STAT | SEEK_STAT)) == STAT_OK)
-                       return 0;
-       }
-       dump_status("reset timed out", c);
-       return 1;
-}
-
-static void reset_controller(void)
-{
-       int     i;
-
-       outb_p(4, HD_CMD);
-       for (i = 0; i < 1000; i++) barrier();
-       outb_p(hd_info[0].ctl & 0x0f, HD_CMD);
-       for (i = 0; i < 1000; i++) barrier();
-       if (drive_busy())
-               printk("hd: controller still busy\n");
-       else if ((hd_error = inb(HD_ERROR)) != 1)
-               printk("hd: controller reset failed: %02x\n", hd_error);
-}
-
-static void reset_hd(void)
-{
-       static int i;
-
-repeat:
-       if (reset) {
-               reset = 0;
-               i = -1;
-               reset_controller();
-       } else {
-               check_status();
-               if (reset)
-                       goto repeat;
-       }
-       if (++i < NR_HD) {
-               struct hd_i_struct *disk = &hd_info[i];
-               disk->special_op = disk->recalibrate = 1;
-               hd_out(disk, disk->sect, disk->sect, disk->head-1,
-                       disk->cyl, ATA_CMD_INIT_DEV_PARAMS, &reset_hd);
-               if (reset)
-                       goto repeat;
-       } else
-               hd_request();
-}
-
-/*
- * Ok, don't know what to do with the unexpected interrupts: on some machines
- * doing a reset and a retry seems to result in an eternal loop. Right now I
- * ignore it, and just set the timeout.
- *
- * On laptops (and "green" PCs), an unexpected interrupt occurs whenever the
- * drive enters "idle", "standby", or "sleep" mode, so if the status looks
- * "good", we just ignore the interrupt completely.
- */
-static void unexpected_hd_interrupt(void)
-{
-       unsigned int stat = inb_p(HD_STATUS);
-
-       if (stat & (BUSY_STAT|DRQ_STAT|ECC_STAT|ERR_STAT)) {
-               dump_status("unexpected interrupt", stat);
-               SET_TIMER;
-       }
-}
-
-/*
- * bad_rw_intr() now tries to be a bit smarter and does things
- * according to the error returned by the controller.
- * -Mika Liljeberg (liljeber@cs.Helsinki.FI)
- */
-static void bad_rw_intr(void)
-{
-       struct request *req = hd_req;
-
-       if (req != NULL) {
-               struct hd_i_struct *disk = req->rq_disk->private_data;
-               if (++req->errors >= MAX_ERRORS || (hd_error & BBD_ERR)) {
-                       hd_end_request_cur(-EIO);
-                       disk->special_op = disk->recalibrate = 1;
-               } else if (req->errors % RESET_FREQ == 0)
-                       reset = 1;
-               else if ((hd_error & TRK0_ERR) || req->errors % RECAL_FREQ == 0)
-                       disk->special_op = disk->recalibrate = 1;
-               /* Otherwise just retry */
-       }
-}
-
-static inline int wait_DRQ(void)
-{
-       int retries;
-       int stat;
-
-       for (retries = 0; retries < 100000; retries++) {
-               stat = inb_p(HD_STATUS);
-               if (stat & DRQ_STAT)
-                       return 0;
-       }
-       dump_status("wait_DRQ", stat);
-       return -1;
-}
-
-static void read_intr(void)
-{
-       struct request *req;
-       int i, retries = 100000;
-
-       do {
-               i = (unsigned) inb_p(HD_STATUS);
-               if (i & BUSY_STAT)
-                       continue;
-               if (!OK_STATUS(i))
-                       break;
-               if (i & DRQ_STAT)
-                       goto ok_to_read;
-       } while (--retries > 0);
-       dump_status("read_intr", i);
-       bad_rw_intr();
-       hd_request();
-       return;
-
-ok_to_read:
-       req = hd_req;
-       insw(HD_DATA, bio_data(req->bio), 256);
-#ifdef DEBUG
-       printk("%s: read: sector %ld, remaining = %u, buffer=%p\n",
-              req->rq_disk->disk_name, blk_rq_pos(req) + 1,
-              blk_rq_sectors(req) - 1, bio_data(req->bio)+512);
-#endif
-       if (hd_end_request(0, 512)) {
-               SET_HANDLER(&read_intr);
-               return;
-       }
-
-       (void) inb_p(HD_STATUS);
-#if (HD_DELAY > 0)
-       last_req = read_timer();
-#endif
-       hd_request();
-}
-
-static void write_intr(void)
-{
-       struct request *req = hd_req;
-       int i;
-       int retries = 100000;
-
-       do {
-               i = (unsigned) inb_p(HD_STATUS);
-               if (i & BUSY_STAT)
-                       continue;
-               if (!OK_STATUS(i))
-                       break;
-               if ((blk_rq_sectors(req) <= 1) || (i & DRQ_STAT))
-                       goto ok_to_write;
-       } while (--retries > 0);
-       dump_status("write_intr", i);
-       bad_rw_intr();
-       hd_request();
-       return;
-
-ok_to_write:
-       if (hd_end_request(0, 512)) {
-               SET_HANDLER(&write_intr);
-               outsw(HD_DATA, bio_data(req->bio), 256);
-               return;
-       }
-
-#if (HD_DELAY > 0)
-       last_req = read_timer();
-#endif
-       hd_request();
-}
-
-static void recal_intr(void)
-{
-       check_status();
-#if (HD_DELAY > 0)
-       last_req = read_timer();
-#endif
-       hd_request();
-}
-
-/*
- * This is another of the error-routines I don't know what to do with. The
- * best idea seems to just set reset, and start all over again.
- */
-static void hd_times_out(unsigned long dummy)
-{
-       char *name;
-
-       do_hd = NULL;
-
-       if (!hd_req)
-               return;
-
-       spin_lock_irq(hd_queue->queue_lock);
-       reset = 1;
-       name = hd_req->rq_disk->disk_name;
-       printk("%s: timeout\n", name);
-       if (++hd_req->errors >= MAX_ERRORS) {
-#ifdef DEBUG
-               printk("%s: too many errors\n", name);
-#endif
-               hd_end_request_cur(-EIO);
-       }
-       hd_request();
-       spin_unlock_irq(hd_queue->queue_lock);
-}
-
-static int do_special_op(struct hd_i_struct *disk, struct request *req)
-{
-       if (disk->recalibrate) {
-               disk->recalibrate = 0;
-               hd_out(disk, disk->sect, 0, 0, 0, ATA_CMD_RESTORE, &recal_intr);
-               return reset;
-       }
-       if (disk->head > 16) {
-               printk("%s: cannot handle device with more than 16 heads - giving up\n", req->rq_disk->disk_name);
-               hd_end_request_cur(-EIO);
-       }
-       disk->special_op = 0;
-       return 1;
-}
-
-/*
- * The driver enables interrupts as much as possible.  In order to do this,
- * (a) the device-interrupt is disabled before entering hd_request(),
- * and (b) the timeout-interrupt is disabled before the sti().
- *
- * Interrupts are still masked (by default) whenever we are exchanging
- * data/cmds with a drive, because some drives seem to have very poor
- * tolerance for latency during I/O. The IDE driver has support to unmask
- * interrupts for non-broken hardware, so use that driver if required.
- */
-static void hd_request(void)
-{
-       unsigned int block, nsect, sec, track, head, cyl;
-       struct hd_i_struct *disk;
-       struct request *req;
-
-       if (do_hd)
-               return;
-repeat:
-       del_timer(&device_timer);
-
-       if (!hd_req) {
-               hd_req = blk_fetch_request(hd_queue);
-               if (!hd_req) {
-                       do_hd = NULL;
-                       return;
-               }
-       }
-       req = hd_req;
-
-       if (reset) {
-               reset_hd();
-               return;
-       }
-       disk = req->rq_disk->private_data;
-       block = blk_rq_pos(req);
-       nsect = blk_rq_sectors(req);
-       if (block >= get_capacity(req->rq_disk) ||
-           ((block+nsect) > get_capacity(req->rq_disk))) {
-               printk("%s: bad access: block=%d, count=%d\n",
-                       req->rq_disk->disk_name, block, nsect);
-               hd_end_request_cur(-EIO);
-               goto repeat;
-       }
-
-       if (disk->special_op) {
-               if (do_special_op(disk, req))
-                       goto repeat;
-               return;
-       }
-       sec   = block % disk->sect + 1;
-       track = block / disk->sect;
-       head  = track % disk->head;
-       cyl   = track / disk->head;
-#ifdef DEBUG
-       printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n",
-               req->rq_disk->disk_name,
-               req_data_dir(req) == READ ? "read" : "writ",
-               cyl, head, sec, nsect, bio_data(req->bio));
-#endif
-
-       switch (req_op(req)) {
-       case REQ_OP_READ:
-               hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_READ,
-                       &read_intr);
-               if (reset)
-                       goto repeat;
-               break;
-       case REQ_OP_WRITE:
-               hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_WRITE,
-                       &write_intr);
-               if (reset)
-                       goto repeat;
-               if (wait_DRQ()) {
-                       bad_rw_intr();
-                       goto repeat;
-               }
-               outsw(HD_DATA, bio_data(req->bio), 256);
-               break;
-       default:
-               printk("unknown hd-command\n");
-               hd_end_request_cur(-EIO);
-               break;
-       }
-}
-
-static void do_hd_request(struct request_queue *q)
-{
-       hd_request();
-}
-
-static int hd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
-       struct hd_i_struct *disk = bdev->bd_disk->private_data;
-
-       geo->heads = disk->head;
-       geo->sectors = disk->sect;
-       geo->cylinders = disk->cyl;
-       return 0;
-}
-
-/*
- * Releasing a block device means we sync() it, so that it can safely
- * be forgotten about...
- */
-
-static irqreturn_t hd_interrupt(int irq, void *dev_id)
-{
-       void (*handler)(void) = do_hd;
-
-       spin_lock(hd_queue->queue_lock);
-
-       do_hd = NULL;
-       del_timer(&device_timer);
-       if (!handler)
-               handler = unexpected_hd_interrupt;
-       handler();
-
-       spin_unlock(hd_queue->queue_lock);
-
-       return IRQ_HANDLED;
-}
-
-static const struct block_device_operations hd_fops = {
-       .getgeo =       hd_getgeo,
-};
-
-static int __init hd_init(void)
-{
-       int drive;
-
-       if (register_blkdev(HD_MAJOR, "hd"))
-               return -1;
-
-       hd_queue = blk_init_queue(do_hd_request, &hd_lock);
-       if (!hd_queue) {
-               unregister_blkdev(HD_MAJOR, "hd");
-               return -ENOMEM;
-       }
-
-       blk_queue_max_hw_sectors(hd_queue, 255);
-       init_timer(&device_timer);
-       device_timer.function = hd_times_out;
-       blk_queue_logical_block_size(hd_queue, 512);
-
-       if (!NR_HD) {
-               /*
-                * We don't know anything about the drive.  This means
-                * that you *MUST* specify the drive parameters to the
-                * kernel yourself.
-                *
-                * If we were on an i386, we used to read this info from
-                * the BIOS or CMOS.  This doesn't work all that well,
-                * since this assumes that this is a primary or secondary
-                * drive, and if we're using this legacy driver, it's
-                * probably an auxiliary controller added to recover
-                * legacy data off an ST-506 drive.  Either way, it's
-                * definitely safest to have the user explicitly specify
-                * the information.
-                */
-               printk("hd: no drives specified - use hd=cyl,head,sectors"
-                       " on kernel command line\n");
-               goto out;
-       }
-
-       for (drive = 0 ; drive < NR_HD ; drive++) {
-               struct gendisk *disk = alloc_disk(64);
-               struct hd_i_struct *p = &hd_info[drive];
-               if (!disk)
-                       goto Enomem;
-               disk->major = HD_MAJOR;
-               disk->first_minor = drive << 6;
-               disk->fops = &hd_fops;
-               sprintf(disk->disk_name, "hd%c", 'a'+drive);
-               disk->private_data = p;
-               set_capacity(disk, p->head * p->sect * p->cyl);
-               disk->queue = hd_queue;
-               p->unit = drive;
-               hd_gendisk[drive] = disk;
-               printk("%s: %luMB, CHS=%d/%d/%d\n",
-                       disk->disk_name, (unsigned long)get_capacity(disk)/2048,
-                       p->cyl, p->head, p->sect);
-       }
-
-       if (request_irq(HD_IRQ, hd_interrupt, 0, "hd", NULL)) {
-               printk("hd: unable to get IRQ%d for the hard disk driver\n",
-                       HD_IRQ);
-               goto out1;
-       }
-       if (!request_region(HD_DATA, 8, "hd")) {
-               printk(KERN_WARNING "hd: port 0x%x busy\n", HD_DATA);
-               goto out2;
-       }
-       if (!request_region(HD_CMD, 1, "hd(cmd)")) {
-               printk(KERN_WARNING "hd: port 0x%x busy\n", HD_CMD);
-               goto out3;
-       }
-
-       /* Let them fly */
-       for (drive = 0; drive < NR_HD; drive++)
-               add_disk(hd_gendisk[drive]);
-
-       return 0;
-
-out3:
-       release_region(HD_DATA, 8);
-out2:
-       free_irq(HD_IRQ, NULL);
-out1:
-       for (drive = 0; drive < NR_HD; drive++)
-               put_disk(hd_gendisk[drive]);
-       NR_HD = 0;
-out:
-       del_timer(&device_timer);
-       unregister_blkdev(HD_MAJOR, "hd");
-       blk_cleanup_queue(hd_queue);
-       return -1;
-Enomem:
-       while (drive--)
-               put_disk(hd_gendisk[drive]);
-       goto out;
-}
-
-static int __init parse_hd_setup(char *line)
-{
-       int ints[6];
-
-       (void) get_options(line, ARRAY_SIZE(ints), ints);
-       hd_setup(NULL, ints);
-
-       return 1;
-}
-__setup("hd=", parse_hd_setup);
-
-late_initcall(hd_init);
index 0ecb646..994403e 100644 (file)
@@ -445,32 +445,27 @@ static int lo_req_flush(struct loop_device *lo, struct request *rq)
        return ret;
 }
 
-static inline void handle_partial_read(struct loop_cmd *cmd, long bytes)
+static void lo_complete_rq(struct request *rq)
 {
-       if (bytes < 0 || op_is_write(req_op(cmd->rq)))
-               return;
+       struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
 
-       if (unlikely(bytes < blk_rq_bytes(cmd->rq))) {
+       if (unlikely(req_op(cmd->rq) == REQ_OP_READ && cmd->use_aio &&
+                    cmd->ret >= 0 && cmd->ret < blk_rq_bytes(cmd->rq))) {
                struct bio *bio = cmd->rq->bio;
 
-               bio_advance(bio, bytes);
+               bio_advance(bio, cmd->ret);
                zero_fill_bio(bio);
        }
+
+       blk_mq_end_request(rq, cmd->ret < 0 ? -EIO : 0);
 }
 
 static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
 {
        struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
-       struct request *rq = cmd->rq;
-
-       handle_partial_read(cmd, ret);
 
-       if (ret > 0)
-               ret = 0;
-       else if (ret < 0)
-               ret = -EIO;
-
-       blk_mq_complete_request(rq, ret);
+       cmd->ret = ret;
+       blk_mq_complete_request(cmd->rq);
 }
 
 static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
@@ -528,6 +523,7 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
        case REQ_OP_FLUSH:
                return lo_req_flush(lo, rq);
        case REQ_OP_DISCARD:
+       case REQ_OP_WRITE_ZEROES:
                return lo_discard(lo, rq, pos);
        case REQ_OP_WRITE:
                if (lo->transfer)
@@ -826,7 +822,7 @@ static void loop_config_discard(struct loop_device *lo)
                q->limits.discard_granularity = 0;
                q->limits.discard_alignment = 0;
                blk_queue_max_discard_sectors(q, 0);
-               q->limits.discard_zeroes_data = 0;
+               blk_queue_max_write_zeroes_sectors(q, 0);
                queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
                return;
        }
@@ -834,7 +830,7 @@ static void loop_config_discard(struct loop_device *lo)
        q->limits.discard_granularity = inode->i_sb->s_blocksize;
        q->limits.discard_alignment = 0;
        blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
-       q->limits.discard_zeroes_data = 1;
+       blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
 }
 
@@ -1660,6 +1656,7 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
        switch (req_op(cmd->rq)) {
        case REQ_OP_FLUSH:
        case REQ_OP_DISCARD:
+       case REQ_OP_WRITE_ZEROES:
                cmd->use_aio = false;
                break;
        default:
@@ -1686,8 +1683,10 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
        ret = do_req_filebacked(lo, cmd->rq);
  failed:
        /* complete non-aio request */
-       if (!cmd->use_aio || ret)
-               blk_mq_complete_request(cmd->rq, ret ? -EIO : 0);
+       if (!cmd->use_aio || ret) {
+               cmd->ret = ret ? -EIO : 0;
+               blk_mq_complete_request(cmd->rq);
+       }
 }
 
 static void loop_queue_work(struct kthread_work *work)
@@ -1710,9 +1709,10 @@ static int loop_init_request(void *data, struct request *rq,
        return 0;
 }
 
-static struct blk_mq_ops loop_mq_ops = {
+static const struct blk_mq_ops loop_mq_ops = {
        .queue_rq       = loop_queue_rq,
        .init_request   = loop_init_request,
+       .complete       = lo_complete_rq,
 };
 
 static int loop_add(struct loop_device **l, int i)
index fb2237c..fecd3f9 100644 (file)
@@ -70,6 +70,7 @@ struct loop_cmd {
        struct request *rq;
        struct list_head list;
        bool use_aio;           /* use AIO interface to handle I/O */
+       long ret;
        struct kiocb iocb;
 };
 
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
deleted file mode 100644 (file)
index 286f276..0000000
+++ /dev/null
@@ -1,1112 +0,0 @@
-/*
- *  drivers/block/mg_disk.c
- *
- *  Support for the mGine m[g]flash IO mode.
- *  Based on legacy hd.c
- *
- * (c) 2008 mGine Co.,LTD
- * (c) 2008 unsik Kim <donari75@gmail.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2 as
- *  published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/blkdev.h>
-#include <linux/hdreg.h>
-#include <linux/ata.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/platform_device.h>
-#include <linux/gpio.h>
-#include <linux/mg_disk.h>
-#include <linux/slab.h>
-
-#define MG_RES_SEC (CONFIG_MG_DISK_RES << 1)
-
-/* name for block device */
-#define MG_DISK_NAME "mgd"
-
-#define MG_DISK_MAJ 0
-#define MG_DISK_MAX_PART 16
-#define MG_SECTOR_SIZE 512
-#define MG_MAX_SECTS 256
-
-/* Register offsets */
-#define MG_BUFF_OFFSET                 0x8000
-#define MG_REG_OFFSET                  0xC000
-#define MG_REG_FEATURE                 (MG_REG_OFFSET + 2)     /* write case */
-#define MG_REG_ERROR                   (MG_REG_OFFSET + 2)     /* read case */
-#define MG_REG_SECT_CNT                        (MG_REG_OFFSET + 4)
-#define MG_REG_SECT_NUM                        (MG_REG_OFFSET + 6)
-#define MG_REG_CYL_LOW                 (MG_REG_OFFSET + 8)
-#define MG_REG_CYL_HIGH                        (MG_REG_OFFSET + 0xA)
-#define MG_REG_DRV_HEAD                        (MG_REG_OFFSET + 0xC)
-#define MG_REG_COMMAND                 (MG_REG_OFFSET + 0xE)   /* write case */
-#define MG_REG_STATUS                  (MG_REG_OFFSET + 0xE)   /* read  case */
-#define MG_REG_DRV_CTRL                        (MG_REG_OFFSET + 0x10)
-#define MG_REG_BURST_CTRL              (MG_REG_OFFSET + 0x12)
-
-/* handy status */
-#define MG_STAT_READY  (ATA_DRDY | ATA_DSC)
-#define MG_READY_OK(s) (((s) & (MG_STAT_READY | (ATA_BUSY | ATA_DF | \
-                                ATA_ERR))) == MG_STAT_READY)
-
-/* error code for others */
-#define MG_ERR_NONE            0
-#define MG_ERR_TIMEOUT         0x100
-#define MG_ERR_INIT_STAT       0x101
-#define MG_ERR_TRANSLATION     0x102
-#define MG_ERR_CTRL_RST                0x103
-#define MG_ERR_INV_STAT                0x104
-#define MG_ERR_RSTOUT          0x105
-
-#define MG_MAX_ERRORS  6       /* Max read/write errors */
-
-/* command */
-#define MG_CMD_RD 0x20
-#define MG_CMD_WR 0x30
-#define MG_CMD_SLEEP 0x99
-#define MG_CMD_WAKEUP 0xC3
-#define MG_CMD_ID 0xEC
-#define MG_CMD_WR_CONF 0x3C
-#define MG_CMD_RD_CONF 0x40
-
-/* operation mode */
-#define MG_OP_CASCADE (1 << 0)
-#define MG_OP_CASCADE_SYNC_RD (1 << 1)
-#define MG_OP_CASCADE_SYNC_WR (1 << 2)
-#define MG_OP_INTERLEAVE (1 << 3)
-
-/* synchronous */
-#define MG_BURST_LAT_4 (3 << 4)
-#define MG_BURST_LAT_5 (4 << 4)
-#define MG_BURST_LAT_6 (5 << 4)
-#define MG_BURST_LAT_7 (6 << 4)
-#define MG_BURST_LAT_8 (7 << 4)
-#define MG_BURST_LEN_4 (1 << 1)
-#define MG_BURST_LEN_8 (2 << 1)
-#define MG_BURST_LEN_16 (3 << 1)
-#define MG_BURST_LEN_32 (4 << 1)
-#define MG_BURST_LEN_CONT (0 << 1)
-
-/* timeout value (unit: ms) */
-#define MG_TMAX_CONF_TO_CMD    1
-#define MG_TMAX_WAIT_RD_DRQ    10
-#define MG_TMAX_WAIT_WR_DRQ    500
-#define MG_TMAX_RST_TO_BUSY    10
-#define MG_TMAX_HDRST_TO_RDY   500
-#define MG_TMAX_SWRST_TO_RDY   500
-#define MG_TMAX_RSTOUT         3000
-
-#define MG_DEV_MASK (MG_BOOT_DEV | MG_STORAGE_DEV | MG_STORAGE_DEV_SKIP_RST)
-
-/* main structure for mflash driver */
-struct mg_host {
-       struct device *dev;
-
-       struct request_queue *breq;
-       struct request *req;
-       spinlock_t lock;
-       struct gendisk *gd;
-
-       struct timer_list timer;
-       void (*mg_do_intr) (struct mg_host *);
-
-       u16 id[ATA_ID_WORDS];
-
-       u16 cyls;
-       u16 heads;
-       u16 sectors;
-       u32 n_sectors;
-       u32 nres_sectors;
-
-       void __iomem *dev_base;
-       unsigned int irq;
-       unsigned int rst;
-       unsigned int rstout;
-
-       u32 major;
-       u32 error;
-};
-
-/*
- * Debugging macro and defines
- */
-#undef DO_MG_DEBUG
-#ifdef DO_MG_DEBUG
-#  define MG_DBG(fmt, args...) \
-       printk(KERN_DEBUG "%s:%d "fmt, __func__, __LINE__, ##args)
-#else /* CONFIG_MG_DEBUG */
-#  define MG_DBG(fmt, args...) do { } while (0)
-#endif /* CONFIG_MG_DEBUG */
-
-static void mg_request(struct request_queue *);
-
-static bool mg_end_request(struct mg_host *host, int err, unsigned int nr_bytes)
-{
-       if (__blk_end_request(host->req, err, nr_bytes))
-               return true;
-
-       host->req = NULL;
-       return false;
-}
-
-static bool mg_end_request_cur(struct mg_host *host, int err)
-{
-       return mg_end_request(host, err, blk_rq_cur_bytes(host->req));
-}
-
-static void mg_dump_status(const char *msg, unsigned int stat,
-               struct mg_host *host)
-{
-       char *name = MG_DISK_NAME;
-
-       if (host->req)
-               name = host->req->rq_disk->disk_name;
-
-       printk(KERN_ERR "%s: %s: status=0x%02x { ", name, msg, stat & 0xff);
-       if (stat & ATA_BUSY)
-               printk("Busy ");
-       if (stat & ATA_DRDY)
-               printk("DriveReady ");
-       if (stat & ATA_DF)
-               printk("WriteFault ");
-       if (stat & ATA_DSC)
-               printk("SeekComplete ");
-       if (stat & ATA_DRQ)
-               printk("DataRequest ");
-       if (stat & ATA_CORR)
-               printk("CorrectedError ");
-       if (stat & ATA_ERR)
-               printk("Error ");
-       printk("}\n");
-       if ((stat & ATA_ERR) == 0) {
-               host->error = 0;
-       } else {
-               host->error = inb((unsigned long)host->dev_base + MG_REG_ERROR);
-               printk(KERN_ERR "%s: %s: error=0x%02x { ", name, msg,
-                               host->error & 0xff);
-               if (host->error & ATA_BBK)
-                       printk("BadSector ");
-               if (host->error & ATA_UNC)
-                       printk("UncorrectableError ");
-               if (host->error & ATA_IDNF)
-                       printk("SectorIdNotFound ");
-               if (host->error & ATA_ABORTED)
-                       printk("DriveStatusError ");
-               if (host->error & ATA_AMNF)
-                       printk("AddrMarkNotFound ");
-               printk("}");
-               if (host->error & (ATA_BBK | ATA_UNC | ATA_IDNF | ATA_AMNF)) {
-                       if (host->req)
-                               printk(", sector=%u",
-                                      (unsigned int)blk_rq_pos(host->req));
-               }
-               printk("\n");
-       }
-}
-
-static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec)
-{
-       u8 status;
-       unsigned long expire, cur_jiffies;
-       struct mg_drv_data *prv_data = host->dev->platform_data;
-
-       host->error = MG_ERR_NONE;
-       expire = jiffies + msecs_to_jiffies(msec);
-
-       /* These 2 times dummy status read prevents reading invalid
-        * status. A very little time (3 times of mflash operating clk)
-        * is required for busy bit is set. Use dummy read instead of
-        * busy wait, because mflash's PLL is machine dependent.
-        */
-       if (prv_data->use_polling) {
-               status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-               status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-       }
-
-       status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-
-       do {
-               cur_jiffies = jiffies;
-               if (status & ATA_BUSY) {
-                       if (expect == ATA_BUSY)
-                               break;
-               } else {
-                       /* Check the error condition! */
-                       if (status & ATA_ERR) {
-                               mg_dump_status("mg_wait", status, host);
-                               break;
-                       }
-
-                       if (expect == MG_STAT_READY)
-                               if (MG_READY_OK(status))
-                                       break;
-
-                       if (expect == ATA_DRQ)
-                               if (status & ATA_DRQ)
-                                       break;
-               }
-               if (!msec) {
-                       mg_dump_status("not ready", status, host);
-                       return MG_ERR_INV_STAT;
-               }
-
-               status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-       } while (time_before(cur_jiffies, expire));
-
-       if (time_after_eq(cur_jiffies, expire) && msec)
-               host->error = MG_ERR_TIMEOUT;
-
-       return host->error;
-}
-
-static unsigned int mg_wait_rstout(u32 rstout, u32 msec)
-{
-       unsigned long expire;
-
-       expire = jiffies + msecs_to_jiffies(msec);
-       while (time_before(jiffies, expire)) {
-               if (gpio_get_value(rstout) == 1)
-                       return MG_ERR_NONE;
-               msleep(10);
-       }
-
-       return MG_ERR_RSTOUT;
-}
-
-static void mg_unexpected_intr(struct mg_host *host)
-{
-       u32 status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-
-       mg_dump_status("mg_unexpected_intr", status, host);
-}
-
-static irqreturn_t mg_irq(int irq, void *dev_id)
-{
-       struct mg_host *host = dev_id;
-       void (*handler)(struct mg_host *) = host->mg_do_intr;
-
-       spin_lock(&host->lock);
-
-       host->mg_do_intr = NULL;
-       del_timer(&host->timer);
-       if (!handler)
-               handler = mg_unexpected_intr;
-       handler(host);
-
-       spin_unlock(&host->lock);
-
-       return IRQ_HANDLED;
-}
-
-/* local copy of ata_id_string() */
-static void mg_id_string(const u16 *id, unsigned char *s,
-                        unsigned int ofs, unsigned int len)
-{
-       unsigned int c;
-
-       BUG_ON(len & 1);
-
-       while (len > 0) {
-               c = id[ofs] >> 8;
-               *s = c;
-               s++;
-
-               c = id[ofs] & 0xff;
-               *s = c;
-               s++;
-
-               ofs++;
-               len -= 2;
-       }
-}
-
-/* local copy of ata_id_c_string() */
-static void mg_id_c_string(const u16 *id, unsigned char *s,
-                          unsigned int ofs, unsigned int len)
-{
-       unsigned char *p;
-
-       mg_id_string(id, s, ofs, len - 1);
-
-       p = s + strnlen(s, len - 1);
-       while (p > s && p[-1] == ' ')
-               p--;
-       *p = '\0';
-}
-
-static int mg_get_disk_id(struct mg_host *host)
-{
-       u32 i;
-       s32 err;
-       const u16 *id = host->id;
-       struct mg_drv_data *prv_data = host->dev->platform_data;
-       char fwrev[ATA_ID_FW_REV_LEN + 1];
-       char model[ATA_ID_PROD_LEN + 1];
-       char serial[ATA_ID_SERNO_LEN + 1];
-
-       if (!prv_data->use_polling)
-               outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-
-       outb(MG_CMD_ID, (unsigned long)host->dev_base + MG_REG_COMMAND);
-       err = mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_RD_DRQ);
-       if (err)
-               return err;
-
-       for (i = 0; i < (MG_SECTOR_SIZE >> 1); i++)
-               host->id[i] = le16_to_cpu(inw((unsigned long)host->dev_base +
-                                       MG_BUFF_OFFSET + i * 2));
-
-       outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
-       err = mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD);
-       if (err)
-               return err;
-
-       if ((id[ATA_ID_FIELD_VALID] & 1) == 0)
-               return MG_ERR_TRANSLATION;
-
-       host->n_sectors = ata_id_u32(id, ATA_ID_LBA_CAPACITY);
-       host->cyls = id[ATA_ID_CYLS];
-       host->heads = id[ATA_ID_HEADS];
-       host->sectors = id[ATA_ID_SECTORS];
-
-       if (MG_RES_SEC && host->heads && host->sectors) {
-               /* modify cyls, n_sectors */
-               host->cyls = (host->n_sectors - MG_RES_SEC) /
-                       host->heads / host->sectors;
-               host->nres_sectors = host->n_sectors - host->cyls *
-                       host->heads * host->sectors;
-               host->n_sectors -= host->nres_sectors;
-       }
-
-       mg_id_c_string(id, fwrev, ATA_ID_FW_REV, sizeof(fwrev));
-       mg_id_c_string(id, model, ATA_ID_PROD, sizeof(model));
-       mg_id_c_string(id, serial, ATA_ID_SERNO, sizeof(serial));
-       printk(KERN_INFO "mg_disk: model: %s\n", model);
-       printk(KERN_INFO "mg_disk: firm: %.8s\n", fwrev);
-       printk(KERN_INFO "mg_disk: serial: %s\n", serial);
-       printk(KERN_INFO "mg_disk: %d + reserved %d sectors\n",
-                       host->n_sectors, host->nres_sectors);
-
-       if (!prv_data->use_polling)
-               outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-
-       return err;
-}
-
-
-static int mg_disk_init(struct mg_host *host)
-{
-       struct mg_drv_data *prv_data = host->dev->platform_data;
-       s32 err;
-       u8 init_status;
-
-       /* hdd rst low */
-       gpio_set_value(host->rst, 0);
-       err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY);
-       if (err)
-               return err;
-
-       /* hdd rst high */
-       gpio_set_value(host->rst, 1);
-       err = mg_wait(host, MG_STAT_READY, MG_TMAX_HDRST_TO_RDY);
-       if (err)
-               return err;
-
-       /* soft reset on */
-       outb(ATA_SRST | (prv_data->use_polling ? ATA_NIEN : 0),
-                       (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-       err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY);
-       if (err)
-               return err;
-
-       /* soft reset off */
-       outb(prv_data->use_polling ? ATA_NIEN : 0,
-                       (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-       err = mg_wait(host, MG_STAT_READY, MG_TMAX_SWRST_TO_RDY);
-       if (err)
-               return err;
-
-       init_status = inb((unsigned long)host->dev_base + MG_REG_STATUS) & 0xf;
-
-       if (init_status == 0xf)
-               return MG_ERR_INIT_STAT;
-
-       return err;
-}
-
-static void mg_bad_rw_intr(struct mg_host *host)
-{
-       if (host->req)
-               if (++host->req->errors >= MG_MAX_ERRORS ||
-                   host->error == MG_ERR_TIMEOUT)
-                       mg_end_request_cur(host, -EIO);
-}
-
-static unsigned int mg_out(struct mg_host *host,
-               unsigned int sect_num,
-               unsigned int sect_cnt,
-               unsigned int cmd,
-               void (*intr_addr)(struct mg_host *))
-{
-       struct mg_drv_data *prv_data = host->dev->platform_data;
-
-       if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
-               return host->error;
-
-       if (!prv_data->use_polling) {
-               host->mg_do_intr = intr_addr;
-               mod_timer(&host->timer, jiffies + 3 * HZ);
-       }
-       if (MG_RES_SEC)
-               sect_num += MG_RES_SEC;
-       outb((u8)sect_cnt, (unsigned long)host->dev_base + MG_REG_SECT_CNT);
-       outb((u8)sect_num, (unsigned long)host->dev_base + MG_REG_SECT_NUM);
-       outb((u8)(sect_num >> 8), (unsigned long)host->dev_base +
-                       MG_REG_CYL_LOW);
-       outb((u8)(sect_num >> 16), (unsigned long)host->dev_base +
-                       MG_REG_CYL_HIGH);
-       outb((u8)((sect_num >> 24) | ATA_LBA | ATA_DEVICE_OBS),
-                       (unsigned long)host->dev_base + MG_REG_DRV_HEAD);
-       outb(cmd, (unsigned long)host->dev_base + MG_REG_COMMAND);
-       return MG_ERR_NONE;
-}
-
-static void mg_read_one(struct mg_host *host, struct request *req)
-{
-       u16 *buff = (u16 *)bio_data(req->bio);
-       u32 i;
-
-       for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
-               *buff++ = inw((unsigned long)host->dev_base + MG_BUFF_OFFSET +
-                             (i << 1));
-}
-
-static void mg_read(struct request *req)
-{
-       struct mg_host *host = req->rq_disk->private_data;
-
-       if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req),
-                  MG_CMD_RD, NULL) != MG_ERR_NONE)
-               mg_bad_rw_intr(host);
-
-       MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
-              blk_rq_sectors(req), blk_rq_pos(req), bio_data(req->bio));
-
-       do {
-               if (mg_wait(host, ATA_DRQ,
-                           MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) {
-                       mg_bad_rw_intr(host);
-                       return;
-               }
-
-               mg_read_one(host, req);
-
-               outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base +
-                               MG_REG_COMMAND);
-       } while (mg_end_request(host, 0, MG_SECTOR_SIZE));
-}
-
-static void mg_write_one(struct mg_host *host, struct request *req)
-{
-       u16 *buff = (u16 *)bio_data(req->bio);
-       u32 i;
-
-       for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
-               outw(*buff++, (unsigned long)host->dev_base + MG_BUFF_OFFSET +
-                    (i << 1));
-}
-
-static void mg_write(struct request *req)
-{
-       struct mg_host *host = req->rq_disk->private_data;
-       unsigned int rem = blk_rq_sectors(req);
-
-       if (mg_out(host, blk_rq_pos(req), rem,
-                  MG_CMD_WR, NULL) != MG_ERR_NONE) {
-               mg_bad_rw_intr(host);
-               return;
-       }
-
-       MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
-              rem, blk_rq_pos(req), bio_data(req->bio));
-
-       if (mg_wait(host, ATA_DRQ,
-                   MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
-               mg_bad_rw_intr(host);
-               return;
-       }
-
-       do {
-               mg_write_one(host, req);
-
-               outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
-                               MG_REG_COMMAND);
-
-               rem--;
-               if (rem > 1 && mg_wait(host, ATA_DRQ,
-                                       MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
-                       mg_bad_rw_intr(host);
-                       return;
-               } else if (mg_wait(host, MG_STAT_READY,
-                                       MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
-                       mg_bad_rw_intr(host);
-                       return;
-               }
-       } while (mg_end_request(host, 0, MG_SECTOR_SIZE));
-}
-
-static void mg_read_intr(struct mg_host *host)
-{
-       struct request *req = host->req;
-       u32 i;
-
-       /* check status */
-       do {
-               i = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-               if (i & ATA_BUSY)
-                       break;
-               if (!MG_READY_OK(i))
-                       break;
-               if (i & ATA_DRQ)
-                       goto ok_to_read;
-       } while (0);
-       mg_dump_status("mg_read_intr", i, host);
-       mg_bad_rw_intr(host);
-       mg_request(host->breq);
-       return;
-
-ok_to_read:
-       mg_read_one(host, req);
-
-       MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
-              blk_rq_pos(req), blk_rq_sectors(req) - 1, bio_data(req->bio));
-
-       /* send read confirm */
-       outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
-
-       if (mg_end_request(host, 0, MG_SECTOR_SIZE)) {
-               /* set handler if read remains */
-               host->mg_do_intr = mg_read_intr;
-               mod_timer(&host->timer, jiffies + 3 * HZ);
-       } else /* goto next request */
-               mg_request(host->breq);
-}
-
-static void mg_write_intr(struct mg_host *host)
-{
-       struct request *req = host->req;
-       u32 i;
-       bool rem;
-
-       /* check status */
-       do {
-               i = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-               if (i & ATA_BUSY)
-                       break;
-               if (!MG_READY_OK(i))
-                       break;
-               if ((blk_rq_sectors(req) <= 1) || (i & ATA_DRQ))
-                       goto ok_to_write;
-       } while (0);
-       mg_dump_status("mg_write_intr", i, host);
-       mg_bad_rw_intr(host);
-       mg_request(host->breq);
-       return;
-
-ok_to_write:
-       if ((rem = mg_end_request(host, 0, MG_SECTOR_SIZE))) {
-               /* write 1 sector and set handler if remains */
-               mg_write_one(host, req);
-               MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
-                      blk_rq_pos(req), blk_rq_sectors(req), bio_data(req->bio));
-               host->mg_do_intr = mg_write_intr;
-               mod_timer(&host->timer, jiffies + 3 * HZ);
-       }
-
-       /* send write confirm */
-       outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
-
-       if (!rem)
-               mg_request(host->breq);
-}
-
-static void mg_times_out(unsigned long data)
-{
-       struct mg_host *host = (struct mg_host *)data;
-       char *name;
-
-       spin_lock_irq(&host->lock);
-
-       if (!host->req)
-               goto out_unlock;
-
-       host->mg_do_intr = NULL;
-
-       name = host->req->rq_disk->disk_name;
-       printk(KERN_DEBUG "%s: timeout\n", name);
-
-       host->error = MG_ERR_TIMEOUT;
-       mg_bad_rw_intr(host);
-
-out_unlock:
-       mg_request(host->breq);
-       spin_unlock_irq(&host->lock);
-}
-
-static void mg_request_poll(struct request_queue *q)
-{
-       struct mg_host *host = q->queuedata;
-
-       while (1) {
-               if (!host->req) {
-                       host->req = blk_fetch_request(q);
-                       if (!host->req)
-                               break;
-               }
-
-               switch (req_op(host->req)) {
-               case REQ_OP_READ:
-                       mg_read(host->req);
-                       break;
-               case REQ_OP_WRITE:
-                       mg_write(host->req);
-                       break;
-               default:
-                       mg_end_request_cur(host, -EIO);
-                       break;
-               }
-       }
-}
-
-static unsigned int mg_issue_req(struct request *req,
-               struct mg_host *host,
-               unsigned int sect_num,
-               unsigned int sect_cnt)
-{
-       switch (req_op(host->req)) {
-       case REQ_OP_READ:
-               if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr)
-                               != MG_ERR_NONE) {
-                       mg_bad_rw_intr(host);
-                       return host->error;
-               }
-               break;
-       case REQ_OP_WRITE:
-               /* TODO : handler */
-               outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-               if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr)
-                               != MG_ERR_NONE) {
-                       mg_bad_rw_intr(host);
-                       return host->error;
-               }
-               del_timer(&host->timer);
-               mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ);
-               outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-               if (host->error) {
-                       mg_bad_rw_intr(host);
-                       return host->error;
-               }
-               mg_write_one(host, req);
-               mod_timer(&host->timer, jiffies + 3 * HZ);
-               outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
-                               MG_REG_COMMAND);
-               break;
-       default:
-               mg_end_request_cur(host, -EIO);
-               break;
-       }
-       return MG_ERR_NONE;
-}
-
-/* This function also called from IRQ context */
-static void mg_request(struct request_queue *q)
-{
-       struct mg_host *host = q->queuedata;
-       struct request *req;
-       u32 sect_num, sect_cnt;
-
-       while (1) {
-               if (!host->req) {
-                       host->req = blk_fetch_request(q);
-                       if (!host->req)
-                               break;
-               }
-               req = host->req;
-
-               /* check unwanted request call */
-               if (host->mg_do_intr)
-                       return;
-
-               del_timer(&host->timer);
-
-               sect_num = blk_rq_pos(req);
-               /* deal whole segments */
-               sect_cnt = blk_rq_sectors(req);
-
-               /* sanity check */
-               if (sect_num >= get_capacity(req->rq_disk) ||
-                               ((sect_num + sect_cnt) >
-                                get_capacity(req->rq_disk))) {
-                       printk(KERN_WARNING
-                                       "%s: bad access: sector=%d, count=%d\n",
-                                       req->rq_disk->disk_name,
-                                       sect_num, sect_cnt);
-                       mg_end_request_cur(host, -EIO);
-                       continue;
-               }
-
-               if (!mg_issue_req(req, host, sect_num, sect_cnt))
-                       return;
-       }
-}
-
-static int mg_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
-       struct mg_host *host = bdev->bd_disk->private_data;
-
-       geo->cylinders = (unsigned short)host->cyls;
-       geo->heads = (unsigned char)host->heads;
-       geo->sectors = (unsigned char)host->sectors;
-       return 0;
-}
-
-static const struct block_device_operations mg_disk_ops = {
-       .getgeo = mg_getgeo
-};
-
-#ifdef CONFIG_PM_SLEEP
-static int mg_suspend(struct device *dev)
-{
-       struct mg_drv_data *prv_data = dev->platform_data;
-       struct mg_host *host = prv_data->host;
-
-       if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
-               return -EIO;
-
-       if (!prv_data->use_polling)
-               outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-
-       outb(MG_CMD_SLEEP, (unsigned long)host->dev_base + MG_REG_COMMAND);
-       /* wait until mflash deep sleep */
-       msleep(1);
-
-       if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) {
-               if (!prv_data->use_polling)
-                       outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-               return -EIO;
-       }
-
-       return 0;
-}
-
-static int mg_resume(struct device *dev)
-{
-       struct mg_drv_data *prv_data = dev->platform_data;
-       struct mg_host *host = prv_data->host;
-
-       if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
-               return -EIO;
-
-       outb(MG_CMD_WAKEUP, (unsigned long)host->dev_base + MG_REG_COMMAND);
-       /* wait until mflash wakeup */
-       msleep(1);
-
-       if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
-               return -EIO;
-
-       if (!prv_data->use_polling)
-               outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-
-       return 0;
-}
-#endif
-
-static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume);
-
-static int mg_probe(struct platform_device *plat_dev)
-{
-       struct mg_host *host;
-       struct resource *rsc;
-       struct mg_drv_data *prv_data = plat_dev->dev.platform_data;
-       int err = 0;
-
-       if (!prv_data) {
-               printk(KERN_ERR "%s:%d fail (no driver_data)\n",
-                               __func__, __LINE__);
-               err = -EINVAL;
-               goto probe_err;
-       }
-
-       /* alloc mg_host */
-       host = kzalloc(sizeof(struct mg_host), GFP_KERNEL);
-       if (!host) {
-               printk(KERN_ERR "%s:%d fail (no memory for mg_host)\n",
-                               __func__, __LINE__);
-               err = -ENOMEM;
-               goto probe_err;
-       }
-       host->major = MG_DISK_MAJ;
-
-       /* link each other */
-       prv_data->host = host;
-       host->dev = &plat_dev->dev;
-
-       /* io remap */
-       rsc = platform_get_resource(plat_dev, IORESOURCE_MEM, 0);
-       if (!rsc) {
-               printk(KERN_ERR "%s:%d platform_get_resource fail\n",
-                               __func__, __LINE__);
-               err = -EINVAL;
-               goto probe_err_2;
-       }
-       host->dev_base = ioremap(rsc->start, resource_size(rsc));
-       if (!host->dev_base) {
-               printk(KERN_ERR "%s:%d ioremap fail\n",
-                               __func__, __LINE__);
-               err = -EIO;
-               goto probe_err_2;
-       }
-       MG_DBG("dev_base = 0x%x\n", (u32)host->dev_base);
-
-       /* get reset pin */
-       rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO,
-                       MG_RST_PIN);
-       if (!rsc) {
-               printk(KERN_ERR "%s:%d get reset pin fail\n",
-                               __func__, __LINE__);
-               err = -EIO;
-               goto probe_err_3;
-       }
-       host->rst = rsc->start;
-
-       /* init rst pin */
-       err = gpio_request(host->rst, MG_RST_PIN);
-       if (err)
-               goto probe_err_3;
-       gpio_direction_output(host->rst, 1);
-
-       /* reset out pin */
-       if (!(prv_data->dev_attr & MG_DEV_MASK)) {
-               err = -EINVAL;
-               goto probe_err_3a;
-       }
-
-       if (prv_data->dev_attr != MG_BOOT_DEV) {
-               rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO,
-                               MG_RSTOUT_PIN);
-               if (!rsc) {
-                       printk(KERN_ERR "%s:%d get reset-out pin fail\n",
-                                       __func__, __LINE__);
-                       err = -EIO;
-                       goto probe_err_3a;
-               }
-               host->rstout = rsc->start;
-               err = gpio_request(host->rstout, MG_RSTOUT_PIN);
-               if (err)
-                       goto probe_err_3a;
-               gpio_direction_input(host->rstout);
-       }
-
-       /* disk reset */
-       if (prv_data->dev_attr == MG_STORAGE_DEV) {
-               /* If POR seq. not yet finished, wait */
-               err = mg_wait_rstout(host->rstout, MG_TMAX_RSTOUT);
-               if (err)
-                       goto probe_err_3b;
-               err = mg_disk_init(host);
-               if (err) {
-                       printk(KERN_ERR "%s:%d fail (err code : %d)\n",
-                                       __func__, __LINE__, err);
-                       err = -EIO;
-                       goto probe_err_3b;
-               }
-       }
-
-       /* get irq resource */
-       if (!prv_data->use_polling) {
-               host->irq = platform_get_irq(plat_dev, 0);
-               if (host->irq == -ENXIO) {
-                       err = host->irq;
-                       goto probe_err_3b;
-               }
-               err = request_irq(host->irq, mg_irq,
-                               IRQF_TRIGGER_RISING,
-                               MG_DEV_NAME, host);
-               if (err) {
-                       printk(KERN_ERR "%s:%d fail (request_irq err=%d)\n",
-                                       __func__, __LINE__, err);
-                       goto probe_err_3b;
-               }
-
-       }
-
-       /* get disk id */
-       err = mg_get_disk_id(host);
-       if (err) {
-               printk(KERN_ERR "%s:%d fail (err code : %d)\n",
-                               __func__, __LINE__, err);
-               err = -EIO;
-               goto probe_err_4;
-       }
-
-       err = register_blkdev(host->major, MG_DISK_NAME);
-       if (err < 0) {
-               printk(KERN_ERR "%s:%d register_blkdev fail (err code : %d)\n",
-                               __func__, __LINE__, err);
-               goto probe_err_4;
-       }
-       if (!host->major)
-               host->major = err;
-
-       spin_lock_init(&host->lock);
-
-       if (prv_data->use_polling)
-               host->breq = blk_init_queue(mg_request_poll, &host->lock);
-       else
-               host->breq = blk_init_queue(mg_request, &host->lock);
-
-       if (!host->breq) {
-               err = -ENOMEM;
-               printk(KERN_ERR "%s:%d (blk_init_queue) fail\n",
-                               __func__, __LINE__);
-               goto probe_err_5;
-       }
-       host->breq->queuedata = host;
-
-       /* mflash is random device, thanx for the noop */
-       err = elevator_change(host->breq, "noop");
-       if (err) {
-               printk(KERN_ERR "%s:%d (elevator_init) fail\n",
-                               __func__, __LINE__);
-               goto probe_err_6;
-       }
-       blk_queue_max_hw_sectors(host->breq, MG_MAX_SECTS);
-       blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE);
-
-       init_timer(&host->timer);
-       host->timer.function = mg_times_out;
-       host->timer.data = (unsigned long)host;
-
-       host->gd = alloc_disk(MG_DISK_MAX_PART);
-       if (!host->gd) {
-               printk(KERN_ERR "%s:%d (alloc_disk) fail\n",
-                               __func__, __LINE__);
-               err = -ENOMEM;
-               goto probe_err_7;
-       }
-       host->gd->major = host->major;
-       host->gd->first_minor = 0;
-       host->gd->fops = &mg_disk_ops;
-       host->gd->queue = host->breq;
-       host->gd->private_data = host;
-       sprintf(host->gd->disk_name, MG_DISK_NAME"a");
-
-       set_capacity(host->gd, host->n_sectors);
-
-       add_disk(host->gd);
-
-       return err;
-
-probe_err_7:
-       del_timer_sync(&host->timer);
-probe_err_6:
-       blk_cleanup_queue(host->breq);
-probe_err_5:
-       unregister_blkdev(host->major, MG_DISK_NAME);
-probe_err_4:
-       if (!prv_data->use_polling)
-               free_irq(host->irq, host);
-probe_err_3b:
-       gpio_free(host->rstout);
-probe_err_3a:
-       gpio_free(host->rst);
-probe_err_3:
-       iounmap(host->dev_base);
-probe_err_2:
-       kfree(host);
-probe_err:
-       return err;
-}
-
-static int mg_remove(struct platform_device *plat_dev)
-{
-       struct mg_drv_data *prv_data = plat_dev->dev.platform_data;
-       struct mg_host *host = prv_data->host;
-       int err = 0;
-
-       /* delete timer */
-       del_timer_sync(&host->timer);
-
-       /* remove disk */
-       if (host->gd) {
-               del_gendisk(host->gd);
-               put_disk(host->gd);
-       }
-       /* remove queue */
-       if (host->breq)
-               blk_cleanup_queue(host->breq);
-
-       /* unregister blk device */
-       unregister_blkdev(host->major, MG_DISK_NAME);
-
-       /* free irq */
-       if (!prv_data->use_polling)
-               free_irq(host->irq, host);
-
-       /* free reset-out pin */
-       if (prv_data->dev_attr != MG_BOOT_DEV)
-               gpio_free(host->rstout);
-
-       /* free rst pin */
-       if (host->rst)
-               gpio_free(host->rst);
-
-       /* unmap io */
-       if (host->dev_base)
-               iounmap(host->dev_base);
-
-       /* free mg_host */
-       kfree(host);
-
-       return err;
-}
-
-static struct platform_driver mg_disk_driver = {
-       .probe = mg_probe,
-       .remove = mg_remove,
-       .driver = {
-               .name = MG_DEV_NAME,
-               .pm = &mg_pm,
-       }
-};
-
-/****************************************************************************
- *
- * Module stuff
- *
- ****************************************************************************/
-
-static int __init mg_init(void)
-{
-       printk(KERN_INFO "mGine mflash driver, (c) 2008 mGine Co.\n");
-       return platform_driver_register(&mg_disk_driver);
-}
-
-static void __exit mg_exit(void)
-{
-       printk(KERN_INFO "mflash driver : bye bye\n");
-       platform_driver_unregister(&mg_disk_driver);
-}
-
-module_init(mg_init);
-module_exit(mg_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("unsik Kim <donari75@gmail.com>");
-MODULE_DESCRIPTION("mGine m[g]flash device driver");
index f96ab71..02804cc 100644 (file)
@@ -169,6 +169,25 @@ static bool mtip_check_surprise_removal(struct pci_dev *pdev)
        return false; /* device present */
 }
 
+/* we have to use runtime tag to setup command header */
+static void mtip_init_cmd_header(struct request *rq)
+{
+       struct driver_data *dd = rq->q->queuedata;
+       struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+       u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64;
+
+       /* Point the command headers at the command tables. */
+       cmd->command_header = dd->port->command_list +
+                               (sizeof(struct mtip_cmd_hdr) * rq->tag);
+       cmd->command_header_dma = dd->port->command_list_dma +
+                               (sizeof(struct mtip_cmd_hdr) * rq->tag);
+
+       if (host_cap_64)
+               cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16);
+
+       cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
+}
+
 static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
 {
        struct request *rq;
@@ -180,6 +199,9 @@ static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
        if (IS_ERR(rq))
                return NULL;
 
+       /* Internal cmd isn't submitted via .queue_rq */
+       mtip_init_cmd_header(rq);
+
        return blk_mq_rq_to_pdu(rq);
 }
 
@@ -241,7 +263,8 @@ static void mtip_async_complete(struct mtip_port *port,
 
        rq = mtip_rq_from_tag(dd, tag);
 
-       blk_mq_complete_request(rq, status);
+       cmd->status = status;
+       blk_mq_complete_request(rq);
 }
 
 /*
@@ -2910,18 +2933,19 @@ static void mtip_softirq_done_fn(struct request *rq)
        if (unlikely(cmd->unaligned))
                up(&dd->port->cmd_slot_unal);
 
-       blk_mq_end_request(rq, rq->errors);
+       blk_mq_end_request(rq, cmd->status);
 }
 
 static void mtip_abort_cmd(struct request *req, void *data,
                                                        bool reserved)
 {
+       struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req);
        struct driver_data *dd = data;
 
        dbg_printk(MTIP_DRV_NAME " Aborting request, tag = %d\n", req->tag);
 
        clear_bit(req->tag, dd->port->cmds_to_issue);
-       req->errors = -EIO;
+       cmd->status = -EIO;
        mtip_softirq_done_fn(req);
 }
 
@@ -3807,6 +3831,8 @@ static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
        struct request *rq = bd->rq;
        int ret;
 
+       mtip_init_cmd_header(rq);
+
        if (unlikely(mtip_check_unal_depth(hctx, rq)))
                return BLK_MQ_RQ_QUEUE_BUSY;
 
@@ -3816,7 +3842,6 @@ static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
        if (likely(!ret))
                return BLK_MQ_RQ_QUEUE_OK;
 
-       rq->errors = ret;
        return BLK_MQ_RQ_QUEUE_ERROR;
 }
 
@@ -3838,7 +3863,6 @@ static int mtip_init_cmd(void *data, struct request *rq, unsigned int hctx_idx,
 {
        struct driver_data *dd = data;
        struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
-       u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64;
 
        /*
         * For flush requests, request_idx starts at the end of the
@@ -3855,17 +3879,6 @@ static int mtip_init_cmd(void *data, struct request *rq, unsigned int hctx_idx,
 
        memset(cmd->command, 0, CMD_DMA_ALLOC_SZ);
 
-       /* Point the command headers at the command tables. */
-       cmd->command_header = dd->port->command_list +
-                               (sizeof(struct mtip_cmd_hdr) * request_idx);
-       cmd->command_header_dma = dd->port->command_list_dma +
-                               (sizeof(struct mtip_cmd_hdr) * request_idx);
-
-       if (host_cap_64)
-               cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16);
-
-       cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
-
        sg_init_table(cmd->sg, MTIP_MAX_SG);
        return 0;
 }
@@ -3889,7 +3902,7 @@ exit_handler:
        return BLK_EH_RESET_TIMER;
 }
 
-static struct blk_mq_ops mtip_mq_ops = {
+static const struct blk_mq_ops mtip_mq_ops = {
        .queue_rq       = mtip_queue_rq,
        .init_request   = mtip_init_cmd,
        .exit_request   = mtip_free_cmd,
@@ -3969,7 +3982,7 @@ static int mtip_block_initialize(struct driver_data *dd)
        dd->tags.reserved_tags = 1;
        dd->tags.cmd_size = sizeof(struct mtip_cmd);
        dd->tags.numa_node = dd->numa_node;
-       dd->tags.flags = BLK_MQ_F_SHOULD_MERGE;
+       dd->tags.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_NO_SCHED;
        dd->tags.driver_data = dd;
        dd->tags.timeout = MTIP_NCQ_CMD_TIMEOUT_MS;
 
@@ -4025,7 +4038,6 @@ skip_create_disk:
                dd->queue->limits.discard_granularity = 4096;
                blk_queue_max_discard_sectors(dd->queue,
                        MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES);
-               dd->queue->limits.discard_zeroes_data = 0;
        }
 
        /* Set the capacity of the device in 512 byte sectors. */
@@ -4107,9 +4119,11 @@ static void mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv)
        struct driver_data *dd = (struct driver_data *)data;
        struct mtip_cmd *cmd;
 
-       if (likely(!reserv))
-               blk_mq_complete_request(rq, -ENODEV);
-       else if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &dd->port->flags)) {
+       if (likely(!reserv)) {
+               cmd = blk_mq_rq_to_pdu(rq);
+               cmd->status = -ENODEV;
+               blk_mq_complete_request(rq);
+       } else if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &dd->port->flags)) {
 
                cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL);
                if (cmd->comp_func)
@@ -4162,7 +4176,7 @@ static int mtip_block_remove(struct driver_data *dd)
                dev_info(&dd->pdev->dev, "device %s surprise removal\n",
                                                dd->disk->disk_name);
 
-       blk_mq_freeze_queue_start(dd->queue);
+       blk_freeze_queue_start(dd->queue);
        blk_mq_stop_hw_queues(dd->queue);
        blk_mq_tagset_busy_iter(&dd->tags, mtip_no_dev_cleanup, dd);
 
index 7617888..57b4152 100644 (file)
@@ -352,6 +352,7 @@ struct mtip_cmd {
        int retries; /* The number of retries left for this command. */
 
        int direction; /* Data transfer direction */
+       int status;
 };
 
 /* Structure used to describe a port. */
index d8a2356..56efb04 100644 (file)
 #include <asm/types.h>
 
 #include <linux/nbd.h>
+#include <linux/nbd-netlink.h>
+#include <net/genetlink.h>
 
 static DEFINE_IDR(nbd_index_idr);
 static DEFINE_MUTEX(nbd_index_mutex);
+static int nbd_total_devices = 0;
 
 struct nbd_sock {
        struct socket *sock;
        struct mutex tx_lock;
        struct request *pending;
        int sent;
+       bool dead;
+       int fallback_index;
+       int cookie;
+};
+
+struct recv_thread_args {
+       struct work_struct work;
+       struct nbd_device *nbd;
+       int index;
+};
+
+struct link_dead_args {
+       struct work_struct work;
+       int index;
 };
 
 #define NBD_TIMEDOUT                   0
 #define NBD_DISCONNECT_REQUESTED       1
 #define NBD_DISCONNECTED               2
-#define NBD_RUNNING                    3
+#define NBD_HAS_PID_FILE               3
+#define NBD_HAS_CONFIG_REF             4
+#define NBD_BOUND                      5
+#define NBD_DESTROY_ON_DISCONNECT      6
 
-struct nbd_device {
+struct nbd_config {
        u32 flags;
        unsigned long runtime_flags;
-       struct nbd_sock **socks;
-       int magic;
+       u64 dead_conn_timeout;
 
-       struct blk_mq_tag_set tag_set;
-
-       struct mutex config_lock;
-       struct gendisk *disk;
+       struct nbd_sock **socks;
        int num_connections;
+       atomic_t live_connections;
+       wait_queue_head_t conn_wait;
+
        atomic_t recv_threads;
        wait_queue_head_t recv_wq;
        loff_t blksize;
        loff_t bytesize;
-
-       struct task_struct *task_recv;
-       struct task_struct *task_setup;
-
 #if IS_ENABLED(CONFIG_DEBUG_FS)
        struct dentry *dbg_dir;
 #endif
 };
 
+struct nbd_device {
+       struct blk_mq_tag_set tag_set;
+
+       int index;
+       refcount_t config_refs;
+       refcount_t refs;
+       struct nbd_config *config;
+       struct mutex config_lock;
+       struct gendisk *disk;
+
+       struct list_head list;
+       struct task_struct *task_recv;
+       struct task_struct *task_setup;
+};
+
 struct nbd_cmd {
        struct nbd_device *nbd;
+       int index;
+       int cookie;
        struct completion send_complete;
+       int status;
 };
 
 #if IS_ENABLED(CONFIG_DEBUG_FS)
@@ -100,18 +133,16 @@ static int part_shift;
 
 static int nbd_dev_dbg_init(struct nbd_device *nbd);
 static void nbd_dev_dbg_close(struct nbd_device *nbd);
-
+static void nbd_config_put(struct nbd_device *nbd);
+static void nbd_connect_reply(struct genl_info *info, int index);
+static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info);
+static void nbd_dead_link_work(struct work_struct *work);
 
 static inline struct device *nbd_to_dev(struct nbd_device *nbd)
 {
        return disk_to_dev(nbd->disk);
 }
 
-static bool nbd_is_connected(struct nbd_device *nbd)
-{
-       return !!nbd->task_recv;
-}
-
 static const char *nbdcmd_to_ascii(int cmd)
 {
        switch (cmd) {
@@ -124,44 +155,104 @@ static const char *nbdcmd_to_ascii(int cmd)
        return "invalid";
 }
 
-static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev)
+static ssize_t pid_show(struct device *dev,
+                       struct device_attribute *attr, char *buf)
 {
-       if (bdev->bd_openers <= 1)
-               bd_set_size(bdev, 0);
-       set_capacity(nbd->disk, 0);
-       kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
+       struct gendisk *disk = dev_to_disk(dev);
+       struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
 
-       return 0;
+       return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
+}
+
+static struct device_attribute pid_attr = {
+       .attr = { .name = "pid", .mode = S_IRUGO},
+       .show = pid_show,
+};
+
+static void nbd_dev_remove(struct nbd_device *nbd)
+{
+       struct gendisk *disk = nbd->disk;
+       if (disk) {
+               del_gendisk(disk);
+               blk_cleanup_queue(disk->queue);
+               blk_mq_free_tag_set(&nbd->tag_set);
+               disk->private_data = NULL;
+               put_disk(disk);
+       }
+       kfree(nbd);
+}
+
+static void nbd_put(struct nbd_device *nbd)
+{
+       if (refcount_dec_and_mutex_lock(&nbd->refs,
+                                       &nbd_index_mutex)) {
+               idr_remove(&nbd_index_idr, nbd->index);
+               mutex_unlock(&nbd_index_mutex);
+               nbd_dev_remove(nbd);
+       }
+}
+
+static int nbd_disconnected(struct nbd_config *config)
+{
+       return test_bit(NBD_DISCONNECTED, &config->runtime_flags) ||
+               test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
+}
+
+static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
+                               int notify)
+{
+       if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) {
+               struct link_dead_args *args;
+               args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO);
+               if (args) {
+                       INIT_WORK(&args->work, nbd_dead_link_work);
+                       args->index = nbd->index;
+                       queue_work(system_wq, &args->work);
+               }
+       }
+       if (!nsock->dead) {
+               kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
+               atomic_dec(&nbd->config->live_connections);
+       }
+       nsock->dead = true;
+       nsock->pending = NULL;
+       nsock->sent = 0;
+}
+
+static void nbd_size_clear(struct nbd_device *nbd)
+{
+       if (nbd->config->bytesize) {
+               set_capacity(nbd->disk, 0);
+               kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
+       }
 }
 
-static void nbd_size_update(struct nbd_device *nbd, struct block_device *bdev)
+static void nbd_size_update(struct nbd_device *nbd)
 {
-       blk_queue_logical_block_size(nbd->disk->queue, nbd->blksize);
-       blk_queue_physical_block_size(nbd->disk->queue, nbd->blksize);
-       bd_set_size(bdev, nbd->bytesize);
-       set_capacity(nbd->disk, nbd->bytesize >> 9);
+       struct nbd_config *config = nbd->config;
+       blk_queue_logical_block_size(nbd->disk->queue, config->blksize);
+       blk_queue_physical_block_size(nbd->disk->queue, config->blksize);
+       set_capacity(nbd->disk, config->bytesize >> 9);
        kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
 }
 
-static void nbd_size_set(struct nbd_device *nbd, struct block_device *bdev,
-                       loff_t blocksize, loff_t nr_blocks)
+static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize,
+                        loff_t nr_blocks)
 {
-       nbd->blksize = blocksize;
-       nbd->bytesize = blocksize * nr_blocks;
-       if (nbd_is_connected(nbd))
-               nbd_size_update(nbd, bdev);
+       struct nbd_config *config = nbd->config;
+       config->blksize = blocksize;
+       config->bytesize = blocksize * nr_blocks;
+       nbd_size_update(nbd);
 }
 
-static void nbd_end_request(struct nbd_cmd *cmd)
+static void nbd_complete_rq(struct request *req)
 {
-       struct nbd_device *nbd = cmd->nbd;
-       struct request *req = blk_mq_rq_from_pdu(cmd);
-       int error = req->errors ? -EIO : 0;
+       struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 
-       dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd,
-               error ? "failed" : "done");
+       dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", cmd,
+               cmd->status ? "failed" : "done");
 
-       blk_mq_complete_request(req, error);
+       blk_mq_end_request(req, cmd->status);
 }
 
 /*
@@ -169,17 +260,18 @@ static void nbd_end_request(struct nbd_cmd *cmd)
  */
 static void sock_shutdown(struct nbd_device *nbd)
 {
+       struct nbd_config *config = nbd->config;
        int i;
 
-       if (nbd->num_connections == 0)
+       if (config->num_connections == 0)
                return;
-       if (test_and_set_bit(NBD_DISCONNECTED, &nbd->runtime_flags))
+       if (test_and_set_bit(NBD_DISCONNECTED, &config->runtime_flags))
                return;
 
-       for (i = 0; i < nbd->num_connections; i++) {
-               struct nbd_sock *nsock = nbd->socks[i];
+       for (i = 0; i < config->num_connections; i++) {
+               struct nbd_sock *nsock = config->socks[i];
                mutex_lock(&nsock->tx_lock);
-               kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
+               nbd_mark_nsock_dead(nbd, nsock, 0);
                mutex_unlock(&nsock->tx_lock);
        }
        dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
@@ -190,14 +282,58 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 {
        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
        struct nbd_device *nbd = cmd->nbd;
+       struct nbd_config *config;
 
-       dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
-       set_bit(NBD_TIMEDOUT, &nbd->runtime_flags);
-       req->errors = -EIO;
+       if (!refcount_inc_not_zero(&nbd->config_refs)) {
+               cmd->status = -EIO;
+               return BLK_EH_HANDLED;
+       }
 
-       mutex_lock(&nbd->config_lock);
+       /* If we are waiting on our dead timer then we could get timeout
+        * callbacks for our request.  For this we just want to reset the timer
+        * and let the queue side take care of everything.
+        */
+       if (!completion_done(&cmd->send_complete)) {
+               nbd_config_put(nbd);
+               return BLK_EH_RESET_TIMER;
+       }
+       config = nbd->config;
+
+       if (config->num_connections > 1) {
+               dev_err_ratelimited(nbd_to_dev(nbd),
+                                   "Connection timed out, retrying\n");
+               /*
+                * Hooray we have more connections, requeue this IO, the submit
+                * path will put it on a real connection.
+                */
+               if (config->socks && config->num_connections > 1) {
+                       if (cmd->index < config->num_connections) {
+                               struct nbd_sock *nsock =
+                                       config->socks[cmd->index];
+                               mutex_lock(&nsock->tx_lock);
+                               /* We can have multiple outstanding requests, so
+                                * we don't want to mark the nsock dead if we've
+                                * already reconnected with a new socket, so
+                                * only mark it dead if its the same socket we
+                                * were sent out on.
+                                */
+                               if (cmd->cookie == nsock->cookie)
+                                       nbd_mark_nsock_dead(nbd, nsock, 1);
+                               mutex_unlock(&nsock->tx_lock);
+                       }
+                       blk_mq_requeue_request(req, true);
+                       nbd_config_put(nbd);
+                       return BLK_EH_NOT_HANDLED;
+               }
+       } else {
+               dev_err_ratelimited(nbd_to_dev(nbd),
+                                   "Connection timed out\n");
+       }
+       set_bit(NBD_TIMEDOUT, &config->runtime_flags);
+       cmd->status = -EIO;
        sock_shutdown(nbd);
-       mutex_unlock(&nbd->config_lock);
+       nbd_config_put(nbd);
+
        return BLK_EH_HANDLED;
 }
 
@@ -207,7 +343,8 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 static int sock_xmit(struct nbd_device *nbd, int index, int send,
                     struct iov_iter *iter, int msg_flags, int *sent)
 {
-       struct socket *sock = nbd->socks[index]->sock;
+       struct nbd_config *config = nbd->config;
+       struct socket *sock = config->socks[index]->sock;
        int result;
        struct msghdr msg;
        unsigned long pflags = current->flags;
@@ -244,7 +381,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
                        *sent += result;
        } while (msg_data_left(&msg));
 
-       tsk_restore_flags(current, pflags, PF_MEMALLOC);
+       current_restore_flags(pflags, PF_MEMALLOC);
 
        return result;
 }
@@ -253,7 +390,8 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
 static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 {
        struct request *req = blk_mq_rq_from_pdu(cmd);
-       struct nbd_sock *nsock = nbd->socks[index];
+       struct nbd_config *config = nbd->config;
+       struct nbd_sock *nsock = config->socks[index];
        int result;
        struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
        struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
@@ -284,7 +422,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
        }
 
        if (rq_data_dir(req) == WRITE &&
-           (nbd->flags & NBD_FLAG_READ_ONLY)) {
+           (config->flags & NBD_FLAG_READ_ONLY)) {
                dev_err_ratelimited(disk_to_dev(nbd->disk),
                                    "Write on read-only\n");
                return -EIO;
@@ -301,6 +439,8 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
                }
                iov_iter_advance(&from, sent);
        }
+       cmd->index = index;
+       cmd->cookie = nsock->cookie;
        request.type = htonl(type);
        if (type != NBD_CMD_FLUSH) {
                request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
@@ -328,7 +468,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
                }
                dev_err_ratelimited(disk_to_dev(nbd->disk),
                        "Send control failed (result %d)\n", result);
-               return -EIO;
+               return -EAGAIN;
        }
 send_pages:
        if (type != NBD_CMD_WRITE)
@@ -370,7 +510,7 @@ send_pages:
                                dev_err(disk_to_dev(nbd->disk),
                                        "Send data failed (result %d)\n",
                                        result);
-                               return -EIO;
+                               return -EAGAIN;
                        }
                        /*
                         * The completion might already have come in,
@@ -392,6 +532,7 @@ out:
 /* NULL returned = something went wrong, inform userspace */
 static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
 {
+       struct nbd_config *config = nbd->config;
        int result;
        struct nbd_reply reply;
        struct nbd_cmd *cmd;
@@ -405,8 +546,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
        iov_iter_kvec(&to, READ | ITER_KVEC, &iov, 1, sizeof(reply));
        result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
        if (result <= 0) {
-               if (!test_bit(NBD_DISCONNECTED, &nbd->runtime_flags) &&
-                   !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
+               if (!nbd_disconnected(config))
                        dev_err(disk_to_dev(nbd->disk),
                                "Receive control failed (result %d)\n", result);
                return ERR_PTR(result);
@@ -433,7 +573,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
        if (ntohl(reply.error)) {
                dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
                        ntohl(reply.error));
-               req->errors = -EIO;
+               cmd->status = -EIO;
                return cmd;
        }
 
@@ -449,8 +589,19 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
                        if (result <= 0) {
                                dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
                                        result);
-                               req->errors = -EIO;
-                               return cmd;
+                               /*
+                                * If we've disconnected or we only have 1
+                                * connection then we need to make sure we
+                                * complete this request, otherwise error out
+                                * and let the timeout stuff handle resubmitting
+                                * this request onto another connection.
+                                */
+                               if (nbd_disconnected(config) ||
+                                   config->num_connections <= 1) {
+                                       cmd->status = -EIO;
+                                       return cmd;
+                               }
+                               return ERR_PTR(-EIO);
                        }
                        dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
                                cmd, bvec.bv_len);
@@ -462,54 +613,34 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
        return cmd;
 }
 
-static ssize_t pid_show(struct device *dev,
-                       struct device_attribute *attr, char *buf)
-{
-       struct gendisk *disk = dev_to_disk(dev);
-       struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
-
-       return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
-}
-
-static struct device_attribute pid_attr = {
-       .attr = { .name = "pid", .mode = S_IRUGO},
-       .show = pid_show,
-};
-
-struct recv_thread_args {
-       struct work_struct work;
-       struct nbd_device *nbd;
-       int index;
-};
-
 static void recv_work(struct work_struct *work)
 {
        struct recv_thread_args *args = container_of(work,
                                                     struct recv_thread_args,
                                                     work);
        struct nbd_device *nbd = args->nbd;
+       struct nbd_config *config = nbd->config;
        struct nbd_cmd *cmd;
        int ret = 0;
 
-       BUG_ON(nbd->magic != NBD_MAGIC);
        while (1) {
                cmd = nbd_read_stat(nbd, args->index);
                if (IS_ERR(cmd)) {
+                       struct nbd_sock *nsock = config->socks[args->index];
+
+                       mutex_lock(&nsock->tx_lock);
+                       nbd_mark_nsock_dead(nbd, nsock, 1);
+                       mutex_unlock(&nsock->tx_lock);
                        ret = PTR_ERR(cmd);
                        break;
                }
 
-               nbd_end_request(cmd);
+               blk_mq_complete_request(blk_mq_rq_from_pdu(cmd));
        }
-
-       /*
-        * We got an error, shut everybody down if this wasn't the result of a
-        * disconnect request.
-        */
-       if (ret && !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
-               sock_shutdown(nbd);
-       atomic_dec(&nbd->recv_threads);
-       wake_up(&nbd->recv_wq);
+       atomic_dec(&config->recv_threads);
+       wake_up(&config->recv_wq);
+       nbd_config_put(nbd);
+       kfree(args);
 }
 
 static void nbd_clear_req(struct request *req, void *data, bool reserved)
@@ -519,47 +650,119 @@ static void nbd_clear_req(struct request *req, void *data, bool reserved)
        if (!blk_mq_request_started(req))
                return;
        cmd = blk_mq_rq_to_pdu(req);
-       req->errors = -EIO;
-       nbd_end_request(cmd);
+       cmd->status = -EIO;
+       blk_mq_complete_request(req);
 }
 
 static void nbd_clear_que(struct nbd_device *nbd)
 {
-       BUG_ON(nbd->magic != NBD_MAGIC);
-
+       blk_mq_stop_hw_queues(nbd->disk->queue);
        blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
+       blk_mq_start_hw_queues(nbd->disk->queue);
        dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
 }
 
+static int find_fallback(struct nbd_device *nbd, int index)
+{
+       struct nbd_config *config = nbd->config;
+       int new_index = -1;
+       struct nbd_sock *nsock = config->socks[index];
+       int fallback = nsock->fallback_index;
+
+       if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
+               return new_index;
+
+       if (config->num_connections <= 1) {
+               dev_err_ratelimited(disk_to_dev(nbd->disk),
+                                   "Attempted send on invalid socket\n");
+               return new_index;
+       }
+
+       if (fallback >= 0 && fallback < config->num_connections &&
+           !config->socks[fallback]->dead)
+               return fallback;
+
+       if (nsock->fallback_index < 0 ||
+           nsock->fallback_index >= config->num_connections ||
+           config->socks[nsock->fallback_index]->dead) {
+               int i;
+               for (i = 0; i < config->num_connections; i++) {
+                       if (i == index)
+                               continue;
+                       if (!config->socks[i]->dead) {
+                               new_index = i;
+                               break;
+                       }
+               }
+               nsock->fallback_index = new_index;
+               if (new_index < 0) {
+                       dev_err_ratelimited(disk_to_dev(nbd->disk),
+                                           "Dead connection, failed to find a fallback\n");
+                       return new_index;
+               }
+       }
+       new_index = nsock->fallback_index;
+       return new_index;
+}
+
+static int wait_for_reconnect(struct nbd_device *nbd)
+{
+       struct nbd_config *config = nbd->config;
+       if (!config->dead_conn_timeout)
+               return 0;
+       if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
+               return 0;
+       wait_event_interruptible_timeout(config->conn_wait,
+                                        atomic_read(&config->live_connections),
+                                        config->dead_conn_timeout);
+       return atomic_read(&config->live_connections);
+}
 
 static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
 {
        struct request *req = blk_mq_rq_from_pdu(cmd);
        struct nbd_device *nbd = cmd->nbd;
+       struct nbd_config *config;
        struct nbd_sock *nsock;
        int ret;
 
-       if (index >= nbd->num_connections) {
+       if (!refcount_inc_not_zero(&nbd->config_refs)) {
                dev_err_ratelimited(disk_to_dev(nbd->disk),
-                                   "Attempted send on invalid socket\n");
+                                   "Socks array is empty\n");
                return -EINVAL;
        }
+       config = nbd->config;
 
-       if (test_bit(NBD_DISCONNECTED, &nbd->runtime_flags)) {
+       if (index >= config->num_connections) {
                dev_err_ratelimited(disk_to_dev(nbd->disk),
-                                   "Attempted send on closed socket\n");
+                                   "Attempted send on invalid socket\n");
+               nbd_config_put(nbd);
                return -EINVAL;
        }
-
-       req->errors = 0;
-
-       nsock = nbd->socks[index];
+       cmd->status = 0;
+again:
+       nsock = config->socks[index];
        mutex_lock(&nsock->tx_lock);
-       if (unlikely(!nsock->sock)) {
+       if (nsock->dead) {
+               int old_index = index;
+               index = find_fallback(nbd, index);
                mutex_unlock(&nsock->tx_lock);
-               dev_err_ratelimited(disk_to_dev(nbd->disk),
-                                   "Attempted send on closed socket\n");
-               return -EINVAL;
+               if (index < 0) {
+                       if (wait_for_reconnect(nbd)) {
+                               index = old_index;
+                               goto again;
+                       }
+                       /* All the sockets should already be down at this point,
+                        * we just want to make sure that DISCONNECTED is set so
+                        * any requests that come in that were queued waiting
+                        * for the reconnect timer don't trigger the timer again
+                        * and instead just error out.
+                        */
+                       sock_shutdown(nbd);
+                       nbd_config_put(nbd);
+                       return -EIO;
+               }
+               goto again;
        }
 
        /* Handle the case that we have a pending request that was partially
@@ -572,9 +775,21 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
                ret = 0;
                goto out;
        }
+       /*
+        * Some failures are related to the link going down, so anything that
+        * returns EAGAIN can be retried on a different socket.
+        */
        ret = nbd_send_cmd(nbd, cmd, index);
+       if (ret == -EAGAIN) {
+               dev_err_ratelimited(disk_to_dev(nbd->disk),
+                                   "Request send failed trying another connection\n");
+               nbd_mark_nsock_dead(nbd, nsock, 1);
+               mutex_unlock(&nsock->tx_lock);
+               goto again;
+       }
 out:
        mutex_unlock(&nsock->tx_lock);
+       nbd_config_put(nbd);
        return ret;
 }
 
@@ -611,9 +826,10 @@ static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
        return ret;
 }
 
-static int nbd_add_socket(struct nbd_device *nbd, struct block_device *bdev,
-                         unsigned long arg)
+static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
+                         bool netlink)
 {
+       struct nbd_config *config = nbd->config;
        struct socket *sock;
        struct nbd_sock **socks;
        struct nbd_sock *nsock;
@@ -623,43 +839,107 @@ static int nbd_add_socket(struct nbd_device *nbd, struct block_device *bdev,
        if (!sock)
                return err;
 
-       if (!nbd->task_setup)
+       if (!netlink && !nbd->task_setup &&
+           !test_bit(NBD_BOUND, &config->runtime_flags))
                nbd->task_setup = current;
-       if (nbd->task_setup != current) {
+
+       if (!netlink &&
+           (nbd->task_setup != current ||
+            test_bit(NBD_BOUND, &config->runtime_flags))) {
                dev_err(disk_to_dev(nbd->disk),
                        "Device being setup by another task");
-               return -EINVAL;
+               sockfd_put(sock);
+               return -EBUSY;
        }
 
-       socks = krealloc(nbd->socks, (nbd->num_connections + 1) *
+       socks = krealloc(config->socks, (config->num_connections + 1) *
                         sizeof(struct nbd_sock *), GFP_KERNEL);
-       if (!socks)
+       if (!socks) {
+               sockfd_put(sock);
                return -ENOMEM;
+       }
        nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL);
-       if (!nsock)
+       if (!nsock) {
+               sockfd_put(sock);
                return -ENOMEM;
+       }
 
-       nbd->socks = socks;
+       config->socks = socks;
 
+       nsock->fallback_index = -1;
+       nsock->dead = false;
        mutex_init(&nsock->tx_lock);
        nsock->sock = sock;
        nsock->pending = NULL;
        nsock->sent = 0;
-       socks[nbd->num_connections++] = nsock;
+       nsock->cookie = 0;
+       socks[config->num_connections++] = nsock;
+       atomic_inc(&config->live_connections);
 
-       if (max_part)
-               bdev->bd_invalidated = 1;
        return 0;
 }
 
+static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
+{
+       struct nbd_config *config = nbd->config;
+       struct socket *sock, *old;
+       struct recv_thread_args *args;
+       int i;
+       int err;
+
+       sock = sockfd_lookup(arg, &err);
+       if (!sock)
+               return err;
+
+       args = kzalloc(sizeof(*args), GFP_KERNEL);
+       if (!args) {
+               sockfd_put(sock);
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < config->num_connections; i++) {
+               struct nbd_sock *nsock = config->socks[i];
+
+               if (!nsock->dead)
+                       continue;
+
+               mutex_lock(&nsock->tx_lock);
+               if (!nsock->dead) {
+                       mutex_unlock(&nsock->tx_lock);
+                       continue;
+               }
+               sk_set_memalloc(sock->sk);
+               atomic_inc(&config->recv_threads);
+               refcount_inc(&nbd->config_refs);
+               old = nsock->sock;
+               nsock->fallback_index = -1;
+               nsock->sock = sock;
+               nsock->dead = false;
+               INIT_WORK(&args->work, recv_work);
+               args->index = i;
+               args->nbd = nbd;
+               nsock->cookie++;
+               mutex_unlock(&nsock->tx_lock);
+               sockfd_put(old);
+
+               /* We take the tx_lock in an error path in the recv_work, so we
+                * need to queue_work outside of the tx_lock.
+                */
+               queue_work(recv_workqueue, &args->work);
+
+               atomic_inc(&config->live_connections);
+               wake_up(&config->conn_wait);
+               return 0;
+       }
+       sockfd_put(sock);
+       kfree(args);
+       return -ENOSPC;
+}
+
 /* Reset all properties of an NBD device */
 static void nbd_reset(struct nbd_device *nbd)
 {
-       nbd->runtime_flags = 0;
-       nbd->blksize = 1024;
-       nbd->bytesize = 0;
-       set_capacity(nbd->disk, 0);
-       nbd->flags = 0;
+       nbd->config = NULL;
        nbd->tag_set.timeout = 0;
        queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
 }
@@ -668,21 +948,23 @@ static void nbd_bdev_reset(struct block_device *bdev)
 {
        if (bdev->bd_openers > 1)
                return;
-       set_device_ro(bdev, false);
-       bdev->bd_inode->i_size = 0;
+       bd_set_size(bdev, 0);
        if (max_part > 0) {
                blkdev_reread_part(bdev);
                bdev->bd_invalidated = 1;
        }
 }
 
-static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev)
+static void nbd_parse_flags(struct nbd_device *nbd)
 {
-       if (nbd->flags & NBD_FLAG_READ_ONLY)
-               set_device_ro(bdev, true);
-       if (nbd->flags & NBD_FLAG_SEND_TRIM)
+       struct nbd_config *config = nbd->config;
+       if (config->flags & NBD_FLAG_READ_ONLY)
+               set_disk_ro(nbd->disk, true);
+       else
+               set_disk_ro(nbd->disk, false);
+       if (config->flags & NBD_FLAG_SEND_TRIM)
                queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
-       if (nbd->flags & NBD_FLAG_SEND_FLUSH)
+       if (config->flags & NBD_FLAG_SEND_FLUSH)
                blk_queue_write_cache(nbd->disk->queue, true, false);
        else
                blk_queue_write_cache(nbd->disk->queue, false, false);
@@ -690,6 +972,7 @@ static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev)
 
 static void send_disconnects(struct nbd_device *nbd)
 {
+       struct nbd_config *config = nbd->config;
        struct nbd_request request = {
                .magic = htonl(NBD_REQUEST_MAGIC),
                .type = htonl(NBD_CMD_DISC),
@@ -698,7 +981,7 @@ static void send_disconnects(struct nbd_device *nbd)
        struct iov_iter from;
        int i, ret;
 
-       for (i = 0; i < nbd->num_connections; i++) {
+       for (i = 0; i < config->num_connections; i++) {
                iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));
                ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
                if (ret <= 0)
@@ -707,145 +990,162 @@ static void send_disconnects(struct nbd_device *nbd)
        }
 }
 
-static int nbd_disconnect(struct nbd_device *nbd, struct block_device *bdev)
+static int nbd_disconnect(struct nbd_device *nbd)
 {
-       dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
-       if (!nbd->socks)
-               return -EINVAL;
-
-       mutex_unlock(&nbd->config_lock);
-       fsync_bdev(bdev);
-       mutex_lock(&nbd->config_lock);
-
-       /* Check again after getting mutex back.  */
-       if (!nbd->socks)
-               return -EINVAL;
+       struct nbd_config *config = nbd->config;
 
+       dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
        if (!test_and_set_bit(NBD_DISCONNECT_REQUESTED,
-                             &nbd->runtime_flags))
+                             &config->runtime_flags))
                send_disconnects(nbd);
        return 0;
 }
 
-static int nbd_clear_sock(struct nbd_device *nbd, struct block_device *bdev)
+static void nbd_clear_sock(struct nbd_device *nbd)
 {
        sock_shutdown(nbd);
        nbd_clear_que(nbd);
+       nbd->task_setup = NULL;
+}
 
-       __invalidate_device(bdev, true);
-       nbd_bdev_reset(bdev);
-       /*
-        * We want to give the run thread a chance to wait for everybody
-        * to clean up and then do it's own cleanup.
-        */
-       if (!test_bit(NBD_RUNNING, &nbd->runtime_flags) &&
-           nbd->num_connections) {
-               int i;
-
-               for (i = 0; i < nbd->num_connections; i++) {
-                       sockfd_put(nbd->socks[i]->sock);
-                       kfree(nbd->socks[i]);
+static void nbd_config_put(struct nbd_device *nbd)
+{
+       if (refcount_dec_and_mutex_lock(&nbd->config_refs,
+                                       &nbd->config_lock)) {
+               struct nbd_config *config = nbd->config;
+               nbd_dev_dbg_close(nbd);
+               nbd_size_clear(nbd);
+               if (test_and_clear_bit(NBD_HAS_PID_FILE,
+                                      &config->runtime_flags))
+                       device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
+               nbd->task_recv = NULL;
+               nbd_clear_sock(nbd);
+               if (config->num_connections) {
+                       int i;
+                       for (i = 0; i < config->num_connections; i++) {
+                               sockfd_put(config->socks[i]->sock);
+                               kfree(config->socks[i]);
+                       }
+                       kfree(config->socks);
                }
-               kfree(nbd->socks);
-               nbd->socks = NULL;
-               nbd->num_connections = 0;
-       }
-       nbd->task_setup = NULL;
+               nbd_reset(nbd);
 
-       return 0;
+               mutex_unlock(&nbd->config_lock);
+               nbd_put(nbd);
+               module_put(THIS_MODULE);
+       }
 }
 
-static int nbd_start_device(struct nbd_device *nbd, struct block_device *bdev)
+static int nbd_start_device(struct nbd_device *nbd)
 {
-       struct recv_thread_args *args;
-       int num_connections = nbd->num_connections;
+       struct nbd_config *config = nbd->config;
+       int num_connections = config->num_connections;
        int error = 0, i;
 
        if (nbd->task_recv)
                return -EBUSY;
-       if (!nbd->socks)
+       if (!config->socks)
                return -EINVAL;
        if (num_connections > 1 &&
-           !(nbd->flags & NBD_FLAG_CAN_MULTI_CONN)) {
+           !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
                dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
-               error = -EINVAL;
-               goto out_err;
+               return -EINVAL;
        }
 
-       set_bit(NBD_RUNNING, &nbd->runtime_flags);
-       blk_mq_update_nr_hw_queues(&nbd->tag_set, nbd->num_connections);
-       args = kcalloc(num_connections, sizeof(*args), GFP_KERNEL);
-       if (!args) {
-               error = -ENOMEM;
-               goto out_err;
-       }
+       blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
        nbd->task_recv = current;
-       mutex_unlock(&nbd->config_lock);
 
-       nbd_parse_flags(nbd, bdev);
+       nbd_parse_flags(nbd);
 
        error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
        if (error) {
                dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
-               goto out_recv;
+               return error;
        }
-
-       nbd_size_update(nbd, bdev);
+       set_bit(NBD_HAS_PID_FILE, &config->runtime_flags);
 
        nbd_dev_dbg_init(nbd);
        for (i = 0; i < num_connections; i++) {
-               sk_set_memalloc(nbd->socks[i]->sock->sk);
-               atomic_inc(&nbd->recv_threads);
-               INIT_WORK(&args[i].work, recv_work);
-               args[i].nbd = nbd;
-               args[i].index = i;
-               queue_work(recv_workqueue, &args[i].work);
-       }
-       wait_event_interruptible(nbd->recv_wq,
-                                atomic_read(&nbd->recv_threads) == 0);
-       for (i = 0; i < num_connections; i++)
-               flush_work(&args[i].work);
-       nbd_dev_dbg_close(nbd);
-       nbd_size_clear(nbd, bdev);
-       device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
-out_recv:
-       mutex_lock(&nbd->config_lock);
-       nbd->task_recv = NULL;
-out_err:
-       clear_bit(NBD_RUNNING, &nbd->runtime_flags);
-       nbd_clear_sock(nbd, bdev);
+               struct recv_thread_args *args;
 
+               args = kzalloc(sizeof(*args), GFP_KERNEL);
+               if (!args) {
+                       sock_shutdown(nbd);
+                       return -ENOMEM;
+               }
+               sk_set_memalloc(config->socks[i]->sock->sk);
+               atomic_inc(&config->recv_threads);
+               refcount_inc(&nbd->config_refs);
+               INIT_WORK(&args->work, recv_work);
+               args->nbd = nbd;
+               args->index = i;
+               queue_work(recv_workqueue, &args->work);
+       }
+       return error;
+}
+
+static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev)
+{
+       struct nbd_config *config = nbd->config;
+       int ret;
+
+       ret = nbd_start_device(nbd);
+       if (ret)
+               return ret;
+
+       bd_set_size(bdev, config->bytesize);
+       if (max_part)
+               bdev->bd_invalidated = 1;
+       mutex_unlock(&nbd->config_lock);
+       ret = wait_event_interruptible(config->recv_wq,
+                                        atomic_read(&config->recv_threads) == 0);
+       if (ret)
+               sock_shutdown(nbd);
+       mutex_lock(&nbd->config_lock);
+       bd_set_size(bdev, 0);
        /* user requested, ignore socket errors */
-       if (test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
-               error = 0;
-       if (test_bit(NBD_TIMEDOUT, &nbd->runtime_flags))
-               error = -ETIMEDOUT;
+       if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags))
+               ret = 0;
+       if (test_bit(NBD_TIMEDOUT, &config->runtime_flags))
+               ret = -ETIMEDOUT;
+       return ret;
+}
 
-       nbd_reset(nbd);
-       return error;
+static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
+                                struct block_device *bdev)
+{
+       sock_shutdown(nbd);
+       kill_bdev(bdev);
+       nbd_bdev_reset(bdev);
+       if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
+                              &nbd->config->runtime_flags))
+               nbd_config_put(nbd);
 }
 
 /* Must be called with config_lock held */
 static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
                       unsigned int cmd, unsigned long arg)
 {
+       struct nbd_config *config = nbd->config;
+
        switch (cmd) {
        case NBD_DISCONNECT:
-               return nbd_disconnect(nbd, bdev);
+               return nbd_disconnect(nbd);
        case NBD_CLEAR_SOCK:
-               return nbd_clear_sock(nbd, bdev);
+               nbd_clear_sock_ioctl(nbd, bdev);
+               return 0;
        case NBD_SET_SOCK:
-               return nbd_add_socket(nbd, bdev, arg);
+               return nbd_add_socket(nbd, arg, false);
        case NBD_SET_BLKSIZE:
-               nbd_size_set(nbd, bdev, arg,
-                            div_s64(nbd->bytesize, arg));
+               nbd_size_set(nbd, arg,
+                            div_s64(config->bytesize, arg));
                return 0;
        case NBD_SET_SIZE:
-               nbd_size_set(nbd, bdev, nbd->blksize,
-                            div_s64(arg, nbd->blksize));
+               nbd_size_set(nbd, config->blksize,
+                            div_s64(arg, config->blksize));
                return 0;
        case NBD_SET_SIZE_BLOCKS:
-               nbd_size_set(nbd, bdev, nbd->blksize, arg);
+               nbd_size_set(nbd, config->blksize, arg);
                return 0;
        case NBD_SET_TIMEOUT:
                if (arg) {
@@ -855,10 +1155,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
                return 0;
 
        case NBD_SET_FLAGS:
-               nbd->flags = arg;
+               config->flags = arg;
                return 0;
        case NBD_DO_IT:
-               return nbd_start_device(nbd, bdev);
+               return nbd_start_device_ioctl(nbd, bdev);
        case NBD_CLEAR_QUE:
                /*
                 * This is for compatibility only.  The queue is always cleared
@@ -879,23 +1179,92 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
                     unsigned int cmd, unsigned long arg)
 {
        struct nbd_device *nbd = bdev->bd_disk->private_data;
-       int error;
+       struct nbd_config *config = nbd->config;
+       int error = -EINVAL;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       BUG_ON(nbd->magic != NBD_MAGIC);
-
        mutex_lock(&nbd->config_lock);
-       error = __nbd_ioctl(bdev, nbd, cmd, arg);
-       mutex_unlock(&nbd->config_lock);
 
+       /* Don't allow ioctl operations on a nbd device that was created with
+        * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
+        */
+       if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
+           (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
+               error = __nbd_ioctl(bdev, nbd, cmd, arg);
+       else
+               dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n");
+       mutex_unlock(&nbd->config_lock);
        return error;
 }
 
+static struct nbd_config *nbd_alloc_config(void)
+{
+       struct nbd_config *config;
+
+       config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
+       if (!config)
+               return NULL;
+       atomic_set(&config->recv_threads, 0);
+       init_waitqueue_head(&config->recv_wq);
+       init_waitqueue_head(&config->conn_wait);
+       config->blksize = 1024;
+       atomic_set(&config->live_connections, 0);
+       try_module_get(THIS_MODULE);
+       return config;
+}
+
+static int nbd_open(struct block_device *bdev, fmode_t mode)
+{
+       struct nbd_device *nbd;
+       int ret = 0;
+
+       mutex_lock(&nbd_index_mutex);
+       nbd = bdev->bd_disk->private_data;
+       if (!nbd) {
+               ret = -ENXIO;
+               goto out;
+       }
+       if (!refcount_inc_not_zero(&nbd->refs)) {
+               ret = -ENXIO;
+               goto out;
+       }
+       if (!refcount_inc_not_zero(&nbd->config_refs)) {
+               struct nbd_config *config;
+
+               mutex_lock(&nbd->config_lock);
+               if (refcount_inc_not_zero(&nbd->config_refs)) {
+                       mutex_unlock(&nbd->config_lock);
+                       goto out;
+               }
+               config = nbd->config = nbd_alloc_config();
+               if (!config) {
+                       ret = -ENOMEM;
+                       mutex_unlock(&nbd->config_lock);
+                       goto out;
+               }
+               refcount_set(&nbd->config_refs, 1);
+               refcount_inc(&nbd->refs);
+               mutex_unlock(&nbd->config_lock);
+       }
+out:
+       mutex_unlock(&nbd_index_mutex);
+       return ret;
+}
+
+static void nbd_release(struct gendisk *disk, fmode_t mode)
+{
+       struct nbd_device *nbd = disk->private_data;
+       nbd_config_put(nbd);
+       nbd_put(nbd);
+}
+
 static const struct block_device_operations nbd_fops =
 {
        .owner =        THIS_MODULE,
+       .open =         nbd_open,
+       .release =      nbd_release,
        .ioctl =        nbd_ioctl,
        .compat_ioctl = nbd_ioctl,
 };
@@ -927,7 +1296,7 @@ static const struct file_operations nbd_dbg_tasks_ops = {
 static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
 {
        struct nbd_device *nbd = s->private;
-       u32 flags = nbd->flags;
+       u32 flags = nbd->config->flags;
 
        seq_printf(s, "Hex: 0x%08x\n\n", flags);
 
@@ -960,6 +1329,7 @@ static const struct file_operations nbd_dbg_flags_ops = {
 static int nbd_dev_dbg_init(struct nbd_device *nbd)
 {
        struct dentry *dir;
+       struct nbd_config *config = nbd->config;
 
        if (!nbd_dbg_dir)
                return -EIO;
@@ -970,12 +1340,12 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd)
                        nbd_name(nbd));
                return -EIO;
        }
-       nbd->dbg_dir = dir;
+       config->dbg_dir = dir;
 
        debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
-       debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize);
+       debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
        debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
-       debugfs_create_u64("blocksize", 0444, dir, &nbd->blksize);
+       debugfs_create_u64("blocksize", 0444, dir, &config->blksize);
        debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
 
        return 0;
@@ -983,7 +1353,7 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd)
 
 static void nbd_dev_dbg_close(struct nbd_device *nbd)
 {
-       debugfs_remove_recursive(nbd->dbg_dir);
+       debugfs_remove_recursive(nbd->config->dbg_dir);
 }
 
 static int nbd_dbg_init(void)
@@ -1035,25 +1405,13 @@ static int nbd_init_request(void *data, struct request *rq,
        return 0;
 }
 
-static struct blk_mq_ops nbd_mq_ops = {
+static const struct blk_mq_ops nbd_mq_ops = {
        .queue_rq       = nbd_queue_rq,
+       .complete       = nbd_complete_rq,
        .init_request   = nbd_init_request,
        .timeout        = nbd_xmit_timeout,
 };
 
-static void nbd_dev_remove(struct nbd_device *nbd)
-{
-       struct gendisk *disk = nbd->disk;
-       nbd->magic = 0;
-       if (disk) {
-               del_gendisk(disk);
-               blk_cleanup_queue(disk->queue);
-               blk_mq_free_tag_set(&nbd->tag_set);
-               put_disk(disk);
-       }
-       kfree(nbd);
-}
-
 static int nbd_dev_add(int index)
 {
        struct nbd_device *nbd;
@@ -1082,6 +1440,7 @@ static int nbd_dev_add(int index)
        if (err < 0)
                goto out_free_disk;
 
+       nbd->index = index;
        nbd->disk = disk;
        nbd->tag_set.ops = &nbd_mq_ops;
        nbd->tag_set.nr_hw_queues = 1;
@@ -1110,20 +1469,23 @@ static int nbd_dev_add(int index)
        queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
        disk->queue->limits.discard_granularity = 512;
        blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
-       disk->queue->limits.discard_zeroes_data = 0;
+       blk_queue_max_segment_size(disk->queue, UINT_MAX);
+       blk_queue_max_segments(disk->queue, USHRT_MAX);
        blk_queue_max_hw_sectors(disk->queue, 65536);
        disk->queue->limits.max_sectors = 256;
 
-       nbd->magic = NBD_MAGIC;
        mutex_init(&nbd->config_lock);
+       refcount_set(&nbd->config_refs, 0);
+       refcount_set(&nbd->refs, 1);
+       INIT_LIST_HEAD(&nbd->list);
        disk->major = NBD_MAJOR;
        disk->first_minor = index << part_shift;
        disk->fops = &nbd_fops;
        disk->private_data = nbd;
        sprintf(disk->disk_name, "nbd%d", index);
-       init_waitqueue_head(&nbd->recv_wq);
        nbd_reset(nbd);
        add_disk(disk);
+       nbd_total_devices++;
        return index;
 
 out_free_tags:
@@ -1138,10 +1500,535 @@ out:
        return err;
 }
 
-/*
- * And here should be modules and kernel interface 
- *  (Just smiley confuses emacs :-)
+/*
+ * idr_for_each() callback used to locate an unused device.
+ *
+ * @ptr is the nbd_device stored in the idr; @data points to a
+ * struct nbd_device * that receives the first device whose config_refs
+ * count reads as zero.  Returns 1 when such a device was stored, 0 otherwise.
+ */
+static int find_free_cb(int id, void *ptr, void *data)
+{
+       struct nbd_device *candidate = ptr;
+       struct nbd_device **result = data;
+
+       if (refcount_read(&candidate->config_refs))
+               return 0;
+
+       *result = candidate;
+       return 1;
+}
+
+/* Netlink interface. */
+
+/*
+ * Attribute policy for the top-level NBD_ATTR_* attributes; it is attached
+ * to every command in nbd_connect_genl_ops.  Apart from the index (u32) and
+ * the two nested lists, every value is declared as a u64.
+ */
+static struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
+       [NBD_ATTR_INDEX]                =       { .type = NLA_U32 },
+       [NBD_ATTR_SIZE_BYTES]           =       { .type = NLA_U64 },
+       [NBD_ATTR_BLOCK_SIZE_BYTES]     =       { .type = NLA_U64 },
+       [NBD_ATTR_TIMEOUT]              =       { .type = NLA_U64 },
+       [NBD_ATTR_SERVER_FLAGS]         =       { .type = NLA_U64 },
+       [NBD_ATTR_CLIENT_FLAGS]         =       { .type = NLA_U64 },
+       [NBD_ATTR_SOCKETS]              =       { .type = NLA_NESTED},
+       [NBD_ATTR_DEAD_CONN_TIMEOUT]    =       { .type = NLA_U64 },
+       [NBD_ATTR_DEVICE_LIST]          =       { .type = NLA_NESTED},
+};
+
+/*
+ * Policy for the attributes nested inside each NBD_SOCK_ITEM entry of
+ * NBD_ATTR_SOCKETS: a single u32 that the handlers read as a socket fd.
+ */
+static struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
+       [NBD_SOCK_FD]                   =       { .type = NLA_U32 },
+};
+
+/* We don't use this right now since we don't parse the incoming list, but we
+ * still want it here so userspace knows what to expect.
  */
+static struct nla_policy __attribute__((unused))
+nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
+       [NBD_DEVICE_INDEX]              =       { .type = NLA_U32 },
+       [NBD_DEVICE_CONNECTED]          =       { .type = NLA_U8 },
+};
+
+/*
+ * NBD_CMD_CONNECT handler: configure a device over netlink and start it.
+ *
+ * The sender needs CAP_SYS_ADMIN.  NBD_ATTR_SOCKETS and NBD_ATTR_SIZE_BYTES
+ * are mandatory.  Without NBD_ATTR_INDEX the first device that has no config
+ * references is picked, and a new device is added when none is found.
+ *
+ * On success the device index is sent back via nbd_connect_reply() and an
+ * extra config reference is kept, flagged by NBD_HAS_CONFIG_REF.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
+{
+       struct nbd_device *nbd = NULL;
+       struct nbd_config *config;
+       int index = -1;
+       int ret;
+       bool put_dev = false;
+
+       if (!netlink_capable(skb, CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (info->attrs[NBD_ATTR_INDEX])
+               index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
+       if (!info->attrs[NBD_ATTR_SOCKETS]) {
+               printk(KERN_ERR "nbd: must specify at least one socket\n");
+               return -EINVAL;
+       }
+       if (!info->attrs[NBD_ATTR_SIZE_BYTES]) {
+               printk(KERN_ERR "nbd: must specify a size in bytes for the device\n");
+               return -EINVAL;
+       }
+again:
+       mutex_lock(&nbd_index_mutex);
+       if (index == -1) {
+               ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd);
+               if (ret == 0) {
+                       int new_index;
+                       new_index = nbd_dev_add(-1);
+                       if (new_index < 0) {
+                               mutex_unlock(&nbd_index_mutex);
+                               printk(KERN_ERR "nbd: failed to add new device\n");
+                               /*
+                                * ret is 0 here; report the error from
+                                * nbd_dev_add() instead of success.
+                                */
+                               return new_index;
+                       }
+                       nbd = idr_find(&nbd_index_idr, new_index);
+               }
+       } else {
+               nbd = idr_find(&nbd_index_idr, index);
+       }
+       if (!nbd) {
+               printk(KERN_ERR "nbd: couldn't find device at index %d\n",
+                      index);
+               mutex_unlock(&nbd_index_mutex);
+               return -EINVAL;
+       }
+       if (!refcount_inc_not_zero(&nbd->refs)) {
+               mutex_unlock(&nbd_index_mutex);
+               /* Auto-selected device vanished under us: pick another one. */
+               if (index == -1)
+                       goto again;
+               printk(KERN_ERR "nbd: device at index %d is going down\n",
+                      index);
+               return -EINVAL;
+       }
+       mutex_unlock(&nbd_index_mutex);
+
+       mutex_lock(&nbd->config_lock);
+       if (refcount_read(&nbd->config_refs)) {
+               mutex_unlock(&nbd->config_lock);
+               nbd_put(nbd);
+               /* Somebody configured it first; retry when auto-selecting. */
+               if (index == -1)
+                       goto again;
+               printk(KERN_ERR "nbd: nbd%d already in use\n", index);
+               return -EBUSY;
+       }
+       if (WARN_ON(nbd->config)) {
+               mutex_unlock(&nbd->config_lock);
+               nbd_put(nbd);
+               return -EINVAL;
+       }
+       config = nbd->config = nbd_alloc_config();
+       if (!nbd->config) {
+               mutex_unlock(&nbd->config_lock);
+               nbd_put(nbd);
+               printk(KERN_ERR "nbd: couldn't allocate config\n");
+               return -ENOMEM;
+       }
+       refcount_set(&nbd->config_refs, 1);
+       set_bit(NBD_BOUND, &config->runtime_flags);
+
+       if (info->attrs[NBD_ATTR_SIZE_BYTES]) {
+               u64 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
+               nbd_size_set(nbd, config->blksize,
+                            div64_u64(bytes, config->blksize));
+       }
+       if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) {
+               u64 bsize =
+                       nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
+
+               /* bsize is used as a divisor below; reject zero up front. */
+               if (!bsize) {
+                       printk(KERN_ERR "nbd: block size must not be zero\n");
+                       ret = -EINVAL;
+                       goto out;
+               }
+               nbd_size_set(nbd, bsize, div64_u64(config->bytesize, bsize));
+       }
+       if (info->attrs[NBD_ATTR_TIMEOUT]) {
+               u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
+               nbd->tag_set.timeout = timeout * HZ;
+               blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
+       }
+       if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
+               config->dead_conn_timeout =
+                       nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
+               config->dead_conn_timeout *= HZ;
+       }
+       if (info->attrs[NBD_ATTR_SERVER_FLAGS])
+               config->flags =
+                       nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]);
+       if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
+               u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
+               if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
+                       set_bit(NBD_DESTROY_ON_DISCONNECT,
+                               &config->runtime_flags);
+                       /* Extra nbd_put() below once the request is done. */
+                       put_dev = true;
+               }
+       }
+
+       if (info->attrs[NBD_ATTR_SOCKETS]) {
+               struct nlattr *attr;
+               int rem, fd;
+
+               nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
+                                   rem) {
+                       struct nlattr *socks[NBD_SOCK_MAX+1];
+
+                       if (nla_type(attr) != NBD_SOCK_ITEM) {
+                               printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
+                               ret = -EINVAL;
+                               goto out;
+                       }
+                       ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr,
+                                              nbd_sock_policy);
+                       if (ret != 0) {
+                               printk(KERN_ERR "nbd: error processing sock list\n");
+                               ret = -EINVAL;
+                               goto out;
+                       }
+                       /* Items without an fd are silently ignored. */
+                       if (!socks[NBD_SOCK_FD])
+                               continue;
+                       fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
+                       ret = nbd_add_socket(nbd, fd, true);
+                       if (ret)
+                               goto out;
+               }
+       }
+       ret = nbd_start_device(nbd);
+out:
+       mutex_unlock(&nbd->config_lock);
+       if (!ret) {
+               /* Keep one config reference for the running connection. */
+               set_bit(NBD_HAS_CONFIG_REF, &config->runtime_flags);
+               refcount_inc(&nbd->config_refs);
+               nbd_connect_reply(info, nbd->index);
+       }
+       nbd_config_put(nbd);
+       if (put_dev)
+               nbd_put(nbd);
+       return ret;
+}
+
+/*
+ * NBD_CMD_DISCONNECT handler: disconnect the device named by NBD_ATTR_INDEX.
+ *
+ * The sender needs CAP_SYS_ADMIN.  Returns -EPERM without the capability,
+ * -EINVAL when the index is missing, unknown, or the device's refcount has
+ * already hit zero, and 0 otherwise -- including the case where the device
+ * has no live configuration and nothing is done.
+ */
+static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
+{
+       struct nbd_device *nbd;
+       int index;
+
+       if (!netlink_capable(skb, CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (!info->attrs[NBD_ATTR_INDEX]) {
+               printk(KERN_ERR "nbd: must specify an index to disconnect\n");
+               return -EINVAL;
+       }
+       index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
+       mutex_lock(&nbd_index_mutex);
+       nbd = idr_find(&nbd_index_idr, index);
+       if (!nbd) {
+               mutex_unlock(&nbd_index_mutex);
+               printk(KERN_ERR "nbd: couldn't find device at index %d\n",
+                      index);
+               return -EINVAL;
+       }
+       if (!refcount_inc_not_zero(&nbd->refs)) {
+               mutex_unlock(&nbd_index_mutex);
+               printk(KERN_ERR "nbd: device at index %d is going down\n",
+                      index);
+               return -EINVAL;
+       }
+       mutex_unlock(&nbd_index_mutex);
+       /* No live config: only the device reference needs to be dropped. */
+       if (!refcount_inc_not_zero(&nbd->config_refs)) {
+               nbd_put(nbd);
+               return 0;
+       }
+       mutex_lock(&nbd->config_lock);
+       nbd_disconnect(nbd);
+       mutex_unlock(&nbd->config_lock);
+       /*
+        * NBD_HAS_CONFIG_REF marks the extra config reference that
+        * nbd_genl_connect() keeps on success; release it exactly once.
+        */
+       if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
+                              &nbd->config->runtime_flags))
+               nbd_config_put(nbd);
+       /* Drop the config and device references taken by this handler. */
+       nbd_config_put(nbd);
+       nbd_put(nbd);
+       return 0;
+}
+
+/*
+ * NBD_CMD_RECONFIGURE handler: adjust a device that is already running.
+ *
+ * The sender needs CAP_SYS_ADMIN and must pass NBD_ATTR_INDEX.  The request
+ * timeout, dead-connection timeout and the DESTROY_ON_DISCONNECT client
+ * flag can be updated, and each fd in NBD_ATTR_SOCKETS is handed to
+ * nbd_reconnect_socket().
+ *
+ * Returns 0 on success or a negative errno; -ENOSPC from
+ * nbd_reconnect_socket() stops the socket loop but is reported as success.
+ */
+static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
+{
+       struct nbd_device *nbd = NULL;
+       struct nbd_config *config;
+       int index;
+       /* Success by default so attribute-only updates do not return -EINVAL. */
+       int ret = 0;
+       bool put_dev = false;
+
+       if (!netlink_capable(skb, CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (!info->attrs[NBD_ATTR_INDEX]) {
+               printk(KERN_ERR "nbd: must specify a device to reconfigure\n");
+               return -EINVAL;
+       }
+       index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
+       mutex_lock(&nbd_index_mutex);
+       nbd = idr_find(&nbd_index_idr, index);
+       if (!nbd) {
+               mutex_unlock(&nbd_index_mutex);
+               printk(KERN_ERR "nbd: couldn't find a device at index %d\n",
+                      index);
+               return -EINVAL;
+       }
+       if (!refcount_inc_not_zero(&nbd->refs)) {
+               mutex_unlock(&nbd_index_mutex);
+               printk(KERN_ERR "nbd: device at index %d is going down\n",
+                      index);
+               return -EINVAL;
+       }
+       mutex_unlock(&nbd_index_mutex);
+
+       if (!refcount_inc_not_zero(&nbd->config_refs)) {
+               dev_err(nbd_to_dev(nbd),
+                       "not configured, cannot reconfigure\n");
+               nbd_put(nbd);
+               return -EINVAL;
+       }
+
+       mutex_lock(&nbd->config_lock);
+       config = nbd->config;
+       if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
+           !nbd->task_recv) {
+               dev_err(nbd_to_dev(nbd),
+                       "not configured, cannot reconfigure\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       if (info->attrs[NBD_ATTR_TIMEOUT]) {
+               u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
+               nbd->tag_set.timeout = timeout * HZ;
+               blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
+       }
+       if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
+               config->dead_conn_timeout =
+                       nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
+               config->dead_conn_timeout *= HZ;
+       }
+       if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
+               u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
+               /*
+                * Only an actual change of the bit moves the device refcount:
+                * newly set drops one reference at the end of this function,
+                * newly cleared takes one.
+                */
+               if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
+                       if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
+                                             &config->runtime_flags))
+                               put_dev = true;
+               } else {
+                       if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
+                                              &config->runtime_flags))
+                               refcount_inc(&nbd->refs);
+               }
+       }
+
+       if (info->attrs[NBD_ATTR_SOCKETS]) {
+               struct nlattr *attr;
+               int rem, fd;
+
+               nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
+                                   rem) {
+                       struct nlattr *socks[NBD_SOCK_MAX+1];
+
+                       if (nla_type(attr) != NBD_SOCK_ITEM) {
+                               printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
+                               ret = -EINVAL;
+                               goto out;
+                       }
+                       ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr,
+                                              nbd_sock_policy);
+                       if (ret != 0) {
+                               printk(KERN_ERR "nbd: error processing sock list\n");
+                               ret = -EINVAL;
+                               goto out;
+                       }
+                       /* Items without an fd are silently ignored. */
+                       if (!socks[NBD_SOCK_FD])
+                               continue;
+                       fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
+                       ret = nbd_reconnect_socket(nbd, fd);
+                       if (ret) {
+                               /* -ENOSPC ends the loop without failing. */
+                               if (ret == -ENOSPC)
+                                       ret = 0;
+                               goto out;
+                       }
+                       dev_info(nbd_to_dev(nbd), "reconnected socket\n");
+               }
+       }
+out:
+       mutex_unlock(&nbd->config_lock);
+       nbd_config_put(nbd);
+       nbd_put(nbd);
+       if (put_dev)
+               nbd_put(nbd);
+       return ret;
+}
+
+/*
+ * Generic netlink command table: maps each NBD_CMD_* request to its handler.
+ * All commands share nbd_attr_policy.
+ */
+static const struct genl_ops nbd_connect_genl_ops[] = {
+       {
+               .cmd    = NBD_CMD_CONNECT,
+               .policy = nbd_attr_policy,
+               .doit   = nbd_genl_connect,
+       },
+       {
+               .cmd    = NBD_CMD_DISCONNECT,
+               .policy = nbd_attr_policy,
+               .doit   = nbd_genl_disconnect,
+       },
+       {
+               .cmd    = NBD_CMD_RECONFIGURE,
+               .policy = nbd_attr_policy,
+               .doit   = nbd_genl_reconfigure,
+       },
+       {
+               .cmd    = NBD_CMD_STATUS,
+               .policy = nbd_attr_policy,
+               .doit   = nbd_genl_status,
+       },
+};
+
+/* The family's only multicast group; nbd_mcast_index() sends to group 0. */
+static const struct genl_multicast_group nbd_mcast_grps[] = {
+       { .name = NBD_GENL_MCAST_GROUP_NAME, },
+};
+
+/*
+ * Generic netlink family for nbd: no family-specific header, the command
+ * table and multicast group defined above, and attributes up to
+ * NBD_ATTR_MAX.  Registered from nbd_init() and unregistered in
+ * nbd_cleanup().
+ */
+static struct genl_family nbd_genl_family __ro_after_init = {
+       .hdrsize        = 0,
+       .name           = NBD_GENL_FAMILY_NAME,
+       .version        = NBD_GENL_VERSION,
+       .module         = THIS_MODULE,
+       .ops            = nbd_connect_genl_ops,
+       .n_ops          = ARRAY_SIZE(nbd_connect_genl_ops),
+       .maxattr        = NBD_ATTR_MAX,
+       .mcgrps         = nbd_mcast_grps,
+       .n_mcgrps       = ARRAY_SIZE(nbd_mcast_grps),
+};
+
+/*
+ * Append one NBD_DEVICE_ITEM nest to @reply describing @nbd: its index and
+ * whether it currently has config references.
+ *
+ * Returns 0 on success or -EMSGSIZE if any attribute could not be added.
+ */
+static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
+{
+       struct nlattr *item;
+       u8 in_use;
+
+       /*
+        * Slightly racy, which is acceptable for a status query.  No
+        * reference is taken because, in the index == -1 case, the matching
+        * put would have to happen under nbd_index_mutex, and that could
+        * deadlock if the device is configured to remove itself once it is
+        * disconnected.
+        */
+       in_use = refcount_read(&nbd->config_refs) ? 1 : 0;
+
+       item = nla_nest_start(reply, NBD_DEVICE_ITEM);
+       if (!item)
+               return -EMSGSIZE;
+
+       if (nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index) ||
+           nla_put_u8(reply, NBD_DEVICE_CONNECTED, in_use))
+               return -EMSGSIZE;
+
+       nla_nest_end(reply, item);
+       return 0;
+}
+
+/*
+ * idr_for_each() adapter: @ptr is the nbd_device, @data the reply skb.
+ * Forwards to populate_nbd_status() and returns its result.
+ */
+static int status_cb(int id, void *ptr, void *data)
+{
+       struct sk_buff *reply = data;
+
+       return populate_nbd_status(ptr, reply);
+}
+
+/*
+ * NBD_CMD_STATUS handler: reply with an NBD_ATTR_DEVICE_LIST nest holding one
+ * NBD_DEVICE_ITEM per device, or only the device named by NBD_ATTR_INDEX
+ * when that attribute is present (the list is empty if the index is
+ * unknown).
+ *
+ * Returns 0 on success, -ENOMEM if the reply could not be allocated or
+ * started, -EMSGSIZE if it could not hold the attributes, or the error from
+ * genlmsg_reply().
+ */
+static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
+{
+       struct nlattr *dev_list;
+       struct sk_buff *reply;
+       void *reply_head;
+       size_t msg_size;
+       int index = -1;
+       int ret = -ENOMEM;
+
+       if (info->attrs[NBD_ATTR_INDEX])
+               index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
+
+       /* Held across the whole walk so the idr cannot change under us. */
+       mutex_lock(&nbd_index_mutex);
+
+       /* Room for one index/connected pair per reported device. */
+       msg_size = nla_total_size(nla_attr_size(sizeof(u32)) +
+                                 nla_attr_size(sizeof(u8)));
+       msg_size *= (index == -1) ? nbd_total_devices : 1;
+
+       reply = genlmsg_new(msg_size, GFP_KERNEL);
+       if (!reply)
+               goto out;
+       reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0,
+                                      NBD_CMD_STATUS);
+       if (!reply_head) {
+               nlmsg_free(reply);
+               goto out;
+       }
+
+       dev_list = nla_nest_start(reply, NBD_ATTR_DEVICE_LIST);
+       /* Without room for the nest header, nla_nest_end() would get NULL. */
+       if (!dev_list) {
+               nlmsg_free(reply);
+               ret = -EMSGSIZE;
+               goto out;
+       }
+       if (index == -1) {
+               ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
+               if (ret) {
+                       nlmsg_free(reply);
+                       goto out;
+               }
+       } else {
+               struct nbd_device *nbd;
+               nbd = idr_find(&nbd_index_idr, index);
+               if (nbd) {
+                       ret = populate_nbd_status(nbd, reply);
+                       if (ret) {
+                               nlmsg_free(reply);
+                               goto out;
+                       }
+               }
+       }
+       nla_nest_end(reply, dev_list);
+       genlmsg_end(reply, reply_head);
+       /* Report delivery failures instead of claiming success. */
+       ret = genlmsg_reply(reply, info);
+out:
+       mutex_unlock(&nbd_index_mutex);
+       return ret;
+}
+
+/*
+ * Send the NBD_CMD_CONNECT reply carrying @index in NBD_ATTR_INDEX back to
+ * the requester described by @info.  Best effort: any failure to build the
+ * message is silently dropped.
+ */
+static void nbd_connect_reply(struct genl_info *info, int index)
+{
+       struct sk_buff *msg;
+       void *hdr;
+
+       msg = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
+       if (!msg)
+               return;
+
+       hdr = genlmsg_put_reply(msg, info, &nbd_genl_family, 0,
+                               NBD_CMD_CONNECT);
+       if (!hdr || nla_put_u32(msg, NBD_ATTR_INDEX, index)) {
+               nlmsg_free(msg);
+               return;
+       }
+
+       genlmsg_end(msg, hdr);
+       genlmsg_reply(msg, info);
+}
+
+/*
+ * Multicast an NBD_CMD_LINK_DEAD message carrying @index in NBD_ATTR_INDEX
+ * to group 0 of the nbd family.  Best effort: any failure to build the
+ * message is silently dropped.
+ */
+static void nbd_mcast_index(int index)
+{
+       struct sk_buff *msg;
+       void *hdr;
+
+       msg = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
+       if (!msg)
+               return;
+
+       hdr = genlmsg_put(msg, 0, 0, &nbd_genl_family, 0,
+                         NBD_CMD_LINK_DEAD);
+       if (!hdr || nla_put_u32(msg, NBD_ATTR_INDEX, index)) {
+               nlmsg_free(msg);
+               return;
+       }
+
+       genlmsg_end(msg, hdr);
+       genlmsg_multicast(&nbd_genl_family, msg, 0, 0, GFP_KERNEL);
+}
+
+/*
+ * Work item: announce the dead link for the index stored in the enclosing
+ * link_dead_args, then free that structure.
+ */
+static void nbd_dead_link_work(struct work_struct *work)
+{
+       struct link_dead_args *args;
+
+       args = container_of(work, struct link_dead_args, work);
+
+       nbd_mcast_index(args->index);
+       kfree(args);
+}
 
 static int __init nbd_init(void)
 {
@@ -1184,6 +2071,11 @@ static int __init nbd_init(void)
                return -EIO;
        }
 
+       if (genl_register_family(&nbd_genl_family)) {
+               unregister_blkdev(NBD_MAJOR, "nbd");
+               destroy_workqueue(recv_workqueue);
+               return -EINVAL;
+       }
        nbd_dbg_init();
 
        mutex_lock(&nbd_index_mutex);
@@ -1195,17 +2087,34 @@ static int __init nbd_init(void)
 
 static int nbd_exit_cb(int id, void *ptr, void *data)
 {
+       struct list_head *list = (struct list_head *)data;
        struct nbd_device *nbd = ptr;
-       nbd_dev_remove(nbd);
+
+       list_add_tail(&nbd->list, list);
        return 0;
 }
 
 static void __exit nbd_cleanup(void)
 {
+       struct nbd_device *nbd;
+       LIST_HEAD(del_list);
+
        nbd_dbg_close();
 
-       idr_for_each(&nbd_index_idr, &nbd_exit_cb, NULL);
+       mutex_lock(&nbd_index_mutex);
+       idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list);
+       mutex_unlock(&nbd_index_mutex);
+
+       while (!list_empty(&del_list)) {
+               nbd = list_first_entry(&del_list, struct nbd_device, list);
+               list_del_init(&nbd->list);
+               if (refcount_read(&nbd->refs) != 1)
+                       printk(KERN_ERR "nbd: possibly leaking a device\n");
+               nbd_put(nbd);
+       }
+
        idr_destroy(&nbd_index_idr);
+       genl_unregister_family(&nbd_genl_family);
        destroy_workqueue(recv_workqueue);
        unregister_blkdev(NBD_MAJOR, "nbd");
 }
index 6f2e565..d946e1e 100644 (file)
@@ -117,6 +117,10 @@ static bool use_lightnvm;
 module_param(use_lightnvm, bool, S_IRUGO);
 MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device");
 
+static bool blocking;
+module_param(blocking, bool, S_IRUGO);
+MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
+
 static int irqmode = NULL_IRQ_SOFTIRQ;
 
 static int null_set_irqmode(const char *str, const struct kernel_param *kp)
@@ -277,7 +281,7 @@ static inline void null_handle_cmd(struct nullb_cmd *cmd)
        case NULL_IRQ_SOFTIRQ:
                switch (queue_mode)  {
                case NULL_Q_MQ:
-                       blk_mq_complete_request(cmd->rq, cmd->rq->errors);
+                       blk_mq_complete_request(cmd->rq);
                        break;
                case NULL_Q_RQ:
                        blk_complete_request(cmd->rq);
@@ -357,6 +361,8 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx,
 {
        struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
 
+       might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
+
        if (irqmode == NULL_IRQ_TIMER) {
                hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
                cmd->timer.function = null_cmd_timer_expired;
@@ -392,7 +398,7 @@ static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
        return 0;
 }
 
-static struct blk_mq_ops null_mq_ops = {
+static const struct blk_mq_ops null_mq_ops = {
        .queue_rq       = null_queue_rq,
        .init_hctx      = null_init_hctx,
        .complete       = null_softirq_done_fn,
@@ -437,14 +443,7 @@ static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
        if (IS_ERR(rq))
                return -ENOMEM;
 
-       rq->__sector = bio->bi_iter.bi_sector;
-       rq->ioprio = bio_prio(bio);
-
-       if (bio_has_data(bio))
-               rq->nr_phys_segments = bio_phys_segments(q, bio);
-
-       rq->__data_len = bio->bi_iter.bi_size;
-       rq->bio = rq->biotail = bio;
+       blk_init_request_from_bio(rq, bio);
 
        rq->end_io_data = rqd;
 
@@ -724,6 +723,9 @@ static int null_add_dev(void)
                nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
                nullb->tag_set.driver_data = nullb;
 
+               if (blocking)
+                       nullb->tag_set.flags |= BLK_MQ_F_BLOCKING;
+
                rv = blk_mq_alloc_tag_set(&nullb->tag_set);
                if (rv)
                        goto out_cleanup_queues;
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
deleted file mode 100644 (file)
index 8127b82..0000000
+++ /dev/null
@@ -1,693 +0,0 @@
-
-/*
-   osdblk.c -- Export a single SCSI OSD object as a Linux block device
-
-
-   Copyright 2009 Red Hat, Inc.
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; see the file COPYING.  If not, write to
-   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
-
-
-   Instructions for use
-   --------------------
-
-   1) Map a Linux block device to an existing OSD object.
-
-      In this example, we will use partition id 1234, object id 5678,
-      OSD device /dev/osd1.
-
-      $ echo "1234 5678 /dev/osd1" > /sys/class/osdblk/add
-
-
-   2) List all active blkdev<->object mappings.
-
-      In this example, we have performed step #1 twice, creating two blkdevs,
-      mapped to two separate OSD objects.
-
-      $ cat /sys/class/osdblk/list
-      0 174 1234 5678 /dev/osd1
-      1 179 1994 897123 /dev/osd0
-
-      The columns, in order, are:
-      - blkdev unique id
-      - blkdev assigned major
-      - OSD object partition id
-      - OSD object id
-      - OSD device
-
-
-   3) Remove an active blkdev<->object mapping.
-
-      In this example, we remove the mapping with blkdev unique id 1.
-
-      $ echo 1 > /sys/class/osdblk/remove
-
-
-   NOTE:  The actual creation and deletion of OSD objects is outside the scope
-   of this driver.
-
- */
-
-#include <linux/kernel.h>
-#include <linux/device.h>
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <scsi/osd_initiator.h>
-#include <scsi/osd_attributes.h>
-#include <scsi/osd_sec.h>
-#include <scsi/scsi_device.h>
-
-#define DRV_NAME "osdblk"
-#define PFX DRV_NAME ": "
-
-/* #define _OSDBLK_DEBUG */
-#ifdef _OSDBLK_DEBUG
-#define OSDBLK_DEBUG(fmt, a...) \
-       printk(KERN_NOTICE "osdblk @%s:%d: " fmt, __func__, __LINE__, ##a)
-#else
-#define OSDBLK_DEBUG(fmt, a...) \
-       do { if (0) printk(fmt, ##a); } while (0)
-#endif
-
-MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
-MODULE_DESCRIPTION("block device inside an OSD object osdblk.ko");
-MODULE_LICENSE("GPL");
-
-struct osdblk_device;
-
-enum {
-       OSDBLK_MINORS_PER_MAJOR = 256,          /* max minors per blkdev */
-       OSDBLK_MAX_REQ          = 32,           /* max parallel requests */
-       OSDBLK_OP_TIMEOUT       = 4 * 60,       /* sync OSD req timeout */
-};
-
-struct osdblk_request {
-       struct request          *rq;            /* blk layer request */
-       struct bio              *bio;           /* cloned bio */
-       struct osdblk_device    *osdev;         /* associated blkdev */
-};
-
-struct osdblk_device {
-       int                     id;             /* blkdev unique id */
-
-       int                     major;          /* blkdev assigned major */
-       struct gendisk          *disk;          /* blkdev's gendisk and rq */
-       struct request_queue    *q;
-
-       struct osd_dev          *osd;           /* associated OSD */
-
-       char                    name[32];       /* blkdev name, e.g. osdblk34 */
-
-       spinlock_t              lock;           /* queue lock */
-
-       struct osd_obj_id       obj;            /* OSD partition, obj id */
-       uint8_t                 obj_cred[OSD_CAP_LEN]; /* OSD cred */
-
-       struct osdblk_request   req[OSDBLK_MAX_REQ]; /* request table */
-
-       struct list_head        node;
-
-       char                    osd_path[0];    /* OSD device path */
-};
-
-static struct class *class_osdblk;             /* /sys/class/osdblk */
-static DEFINE_MUTEX(ctl_mutex);        /* Serialize open/close/setup/teardown */
-static LIST_HEAD(osdblkdev_list);
-
-static const struct block_device_operations osdblk_bd_ops = {
-       .owner          = THIS_MODULE,
-};
-
-static const struct osd_attr g_attr_logical_length = ATTR_DEF(
-       OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
-
-static void osdblk_make_credential(u8 cred_a[OSD_CAP_LEN],
-                                  const struct osd_obj_id *obj)
-{
-       osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
-}
-
-/* copied from exofs; move to libosd? */
-/*
- * Perform a synchronous OSD operation.  copied from exofs; move to libosd?
- */
-static int osd_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
-{
-       int ret;
-
-       or->timeout = timeout;
-       ret = osd_finalize_request(or, 0, credential, NULL);
-       if (ret)
-               return ret;
-
-       ret = osd_execute_request(or);
-
-       /* osd_req_decode_sense(or, ret); */
-       return ret;
-}
-
-/*
- * Perform an asynchronous OSD operation.  copied from exofs; move to libosd?
- */
-static int osd_async_op(struct osd_request *or, osd_req_done_fn *async_done,
-                  void *caller_context, u8 *cred)
-{
-       int ret;
-
-       ret = osd_finalize_request(or, 0, cred, NULL);
-       if (ret)
-               return ret;
-
-       ret = osd_execute_request_async(or, async_done, caller_context);
-
-       return ret;
-}
-
-/* copied from exofs; move to libosd? */
-static int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
-{
-       struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
-       void *iter = NULL;
-       int nelem;
-
-       do {
-               nelem = 1;
-               osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
-               if ((cur_attr.attr_page == attr->attr_page) &&
-                   (cur_attr.attr_id == attr->attr_id)) {
-                       attr->len = cur_attr.len;
-                       attr->val_ptr = cur_attr.val_ptr;
-                       return 0;
-               }
-       } while (iter);
-
-       return -EIO;
-}
-
-static int osdblk_get_obj_size(struct osdblk_device *osdev, u64 *size_out)
-{
-       struct osd_request *or;
-       struct osd_attr attr;
-       int ret;
-
-       /* start request */
-       or = osd_start_request(osdev->osd, GFP_KERNEL);
-       if (!or)
-               return -ENOMEM;
-
-       /* create a get-attributes(length) request */
-       osd_req_get_attributes(or, &osdev->obj);
-
-       osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
-
-       /* execute op synchronously */
-       ret = osd_sync_op(or, OSDBLK_OP_TIMEOUT, osdev->obj_cred);
-       if (ret)
-               goto out;
-
-       /* extract length from returned attribute info */
-       attr = g_attr_logical_length;
-       ret = extract_attr_from_req(or, &attr);
-       if (ret)
-               goto out;
-
-       *size_out = get_unaligned_be64(attr.val_ptr);
-
-out:
-       osd_end_request(or);
-       return ret;
-
-}
-
-static void osdblk_osd_complete(struct osd_request *or, void *private)
-{
-       struct osdblk_request *orq = private;
-       struct osd_sense_info osi;
-       int ret = osd_req_decode_sense(or, &osi);
-
-       if (ret) {
-               ret = -EIO;
-               OSDBLK_DEBUG("osdblk_osd_complete with err=%d\n", ret);
-       }
-
-       /* complete OSD request */
-       osd_end_request(or);
-
-       /* complete request passed to osdblk by block layer */
-       __blk_end_request_all(orq->rq, ret);
-}
-
-static void bio_chain_put(struct bio *chain)
-{
-       struct bio *tmp;
-
-       while (chain) {
-               tmp = chain;
-               chain = chain->bi_next;
-
-               bio_put(tmp);
-       }
-}
-
-static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask)
-{
-       struct bio *tmp, *new_chain = NULL, *tail = NULL;
-
-       while (old_chain) {
-               tmp = bio_clone_kmalloc(old_chain, gfpmask);
-               if (!tmp)
-                       goto err_out;
-
-               tmp->bi_bdev = NULL;
-               gfpmask &= ~__GFP_DIRECT_RECLAIM;
-               tmp->bi_next = NULL;
-
-               if (!new_chain)
-                       new_chain = tail = tmp;
-               else {
-                       tail->bi_next = tmp;
-                       tail = tmp;
-               }
-
-               old_chain = old_chain->bi_next;
-       }
-
-       return new_chain;
-
-err_out:
-       OSDBLK_DEBUG("bio_chain_clone with err\n");
-       bio_chain_put(new_chain);
-       return NULL;
-}
-
-static void osdblk_rq_fn(struct request_queue *q)
-{
-       struct osdblk_device *osdev = q->queuedata;
-
-       while (1) {
-               struct request *rq;
-               struct osdblk_request *orq;
-               struct osd_request *or;
-               struct bio *bio;
-               bool do_write, do_flush;
-
-               /* peek at request from block layer */
-               rq = blk_fetch_request(q);
-               if (!rq)
-                       break;
-
-               /* deduce our operation (read, write, flush) */
-               /* I wish the block layer simplified cmd_type/cmd_flags/cmd[]
-                * into a clearly defined set of RPC commands:
-                * read, write, flush, scsi command, power mgmt req,
-                * driver-specific, etc.
-                */
-
-               do_flush = (req_op(rq) == REQ_OP_FLUSH);
-               do_write = (rq_data_dir(rq) == WRITE);
-
-               if (!do_flush) { /* osd_flush does not use a bio */
-                       /* a bio clone to be passed down to OSD request */
-                       bio = bio_chain_clone(rq->bio, GFP_ATOMIC);
-                       if (!bio)
-                               break;
-               } else
-                       bio = NULL;
-
-               /* alloc internal OSD request, for OSD command execution */
-               or = osd_start_request(osdev->osd, GFP_ATOMIC);
-               if (!or) {
-                       bio_chain_put(bio);
-                       OSDBLK_DEBUG("osd_start_request with err\n");
-                       break;
-               }
-
-               orq = &osdev->req[rq->tag];
-               orq->rq = rq;
-               orq->bio = bio;
-               orq->osdev = osdev;
-
-               /* init OSD command: flush, write or read */
-               if (do_flush)
-                       osd_req_flush_object(or, &osdev->obj,
-                                            OSD_CDB_FLUSH_ALL, 0, 0);
-               else if (do_write)
-                       osd_req_write(or, &osdev->obj, blk_rq_pos(rq) * 512ULL,
-                                     bio, blk_rq_bytes(rq));
-               else
-                       osd_req_read(or, &osdev->obj, blk_rq_pos(rq) * 512ULL,
-                                    bio, blk_rq_bytes(rq));
-
-               OSDBLK_DEBUG("%s 0x%x bytes at 0x%llx\n",
-                       do_flush ? "flush" : do_write ?
-                               "write" : "read", blk_rq_bytes(rq),
-                       blk_rq_pos(rq) * 512ULL);
-
-               /* begin OSD command execution */
-               if (osd_async_op(or, osdblk_osd_complete, orq,
-                                osdev->obj_cred)) {
-                       osd_end_request(or);
-                       blk_requeue_request(q, rq);
-                       bio_chain_put(bio);
-                       OSDBLK_DEBUG("osd_execute_request_async with err\n");
-                       break;
-               }
-
-               /* remove the special 'flush' marker, now that the command
-                * is executing
-                */
-               rq->special = NULL;
-       }
-}
-
-static void osdblk_free_disk(struct osdblk_device *osdev)
-{
-       struct gendisk *disk = osdev->disk;
-
-       if (!disk)
-               return;
-
-       if (disk->flags & GENHD_FL_UP)
-               del_gendisk(disk);
-       if (disk->queue)
-               blk_cleanup_queue(disk->queue);
-       put_disk(disk);
-}
-
-static int osdblk_init_disk(struct osdblk_device *osdev)
-{
-       struct gendisk *disk;
-       struct request_queue *q;
-       int rc;
-       u64 obj_size = 0;
-
-       /* contact OSD, request size info about the object being mapped */
-       rc = osdblk_get_obj_size(osdev, &obj_size);
-       if (rc)
-               return rc;
-
-       /* create gendisk info */
-       disk = alloc_disk(OSDBLK_MINORS_PER_MAJOR);
-       if (!disk)
-               return -ENOMEM;
-
-       sprintf(disk->disk_name, DRV_NAME "%d", osdev->id);
-       disk->major = osdev->major;
-       disk->first_minor = 0;
-       disk->fops = &osdblk_bd_ops;
-       disk->private_data = osdev;
-
-       /* init rq */
-       q = blk_init_queue(osdblk_rq_fn, &osdev->lock);
-       if (!q) {
-               put_disk(disk);
-               return -ENOMEM;
-       }
-
-       /* switch queue to TCQ mode; allocate tag map */
-       rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL, BLK_TAG_ALLOC_FIFO);
-       if (rc) {
-               blk_cleanup_queue(q);
-               put_disk(disk);
-               return rc;
-       }
-
-       /* Set our limits to the lower device limits, because osdblk cannot
-        * sleep when allocating a lower-request and therefore cannot be
-        * bouncing.
-        */
-       blk_queue_stack_limits(q, osd_request_queue(osdev->osd));
-
-       blk_queue_prep_rq(q, blk_queue_start_tag);
-       blk_queue_write_cache(q, true, false);
-
-       disk->queue = q;
-
-       q->queuedata = osdev;
-
-       osdev->disk = disk;
-       osdev->q = q;
-
-       /* finally, announce the disk to the world */
-       set_capacity(disk, obj_size / 512ULL);
-       add_disk(disk);
-
-       printk(KERN_INFO "%s: Added of size 0x%llx\n",
-               disk->disk_name, (unsigned long long)obj_size);
-
-       return 0;
-}
-
-/********************************************************************
- * /sys/class/osdblk/
- *                   add       map OSD object to blkdev
- *                   remove    unmap OSD object
- *                   list      show mappings
- *******************************************************************/
-
-static void class_osdblk_release(struct class *cls)
-{
-       kfree(cls);
-}
-
-static ssize_t class_osdblk_list(struct class *c,
-                               struct class_attribute *attr,
-                               char *data)
-{
-       int n = 0;
-       struct list_head *tmp;
-
-       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
-       list_for_each(tmp, &osdblkdev_list) {
-               struct osdblk_device *osdev;
-
-               osdev = list_entry(tmp, struct osdblk_device, node);
-
-               n += sprintf(data+n, "%d %d %llu %llu %s\n",
-                       osdev->id,
-                       osdev->major,
-                       osdev->obj.partition,
-                       osdev->obj.id,
-                       osdev->osd_path);
-       }
-
-       mutex_unlock(&ctl_mutex);
-       return n;
-}
-
-static ssize_t class_osdblk_add(struct class *c,
-                               struct class_attribute *attr,
-                               const char *buf, size_t count)
-{
-       struct osdblk_device *osdev;
-       ssize_t rc;
-       int irc, new_id = 0;
-       struct list_head *tmp;
-
-       if (!try_module_get(THIS_MODULE))
-               return -ENODEV;
-
-       /* new osdblk_device object */
-       osdev = kzalloc(sizeof(*osdev) + strlen(buf) + 1, GFP_KERNEL);
-       if (!osdev) {
-               rc = -ENOMEM;
-               goto err_out_mod;
-       }
-
-       /* static osdblk_device initialization */
-       spin_lock_init(&osdev->lock);
-       INIT_LIST_HEAD(&osdev->node);
-
-       /* generate unique id: find highest unique id, add one */
-
-       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
-       list_for_each(tmp, &osdblkdev_list) {
-               struct osdblk_device *osdev;
-
-               osdev = list_entry(tmp, struct osdblk_device, node);
-               if (osdev->id > new_id)
-                       new_id = osdev->id + 1;
-       }
-
-       osdev->id = new_id;
-
-       /* add to global list */
-       list_add_tail(&osdev->node, &osdblkdev_list);
-
-       mutex_unlock(&ctl_mutex);
-
-       /* parse add command */
-       if (sscanf(buf, "%llu %llu %s", &osdev->obj.partition, &osdev->obj.id,
-                  osdev->osd_path) != 3) {
-               rc = -EINVAL;
-               goto err_out_slot;
-       }
-
-       /* initialize rest of new object */
-       sprintf(osdev->name, DRV_NAME "%d", osdev->id);
-
-       /* contact requested OSD */
-       osdev->osd = osduld_path_lookup(osdev->osd_path);
-       if (IS_ERR(osdev->osd)) {
-               rc = PTR_ERR(osdev->osd);
-               goto err_out_slot;
-       }
-
-       /* build OSD credential */
-       osdblk_make_credential(osdev->obj_cred, &osdev->obj);
-
-       /* register our block device */
-       irc = register_blkdev(0, osdev->name);
-       if (irc < 0) {
-               rc = irc;
-               goto err_out_osd;
-       }
-
-       osdev->major = irc;
-
-       /* set up and announce blkdev mapping */
-       rc = osdblk_init_disk(osdev);
-       if (rc)
-               goto err_out_blkdev;
-
-       return count;
-
-err_out_blkdev:
-       unregister_blkdev(osdev->major, osdev->name);
-err_out_osd:
-       osduld_put_device(osdev->osd);
-err_out_slot:
-       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-       list_del_init(&osdev->node);
-       mutex_unlock(&ctl_mutex);
-
-       kfree(osdev);
-err_out_mod:
-       OSDBLK_DEBUG("Error adding device %s\n", buf);
-       module_put(THIS_MODULE);
-       return rc;
-}
-
-static ssize_t class_osdblk_remove(struct class *c,
-                                       struct class_attribute *attr,
-                                       const char *buf,
-                                       size_t count)
-{
-       struct osdblk_device *osdev = NULL;
-       int target_id, rc;
-       unsigned long ul;
-       struct list_head *tmp;
-
-       rc = kstrtoul(buf, 10, &ul);
-       if (rc)
-               return rc;
-
-       /* convert to int; abort if we lost anything in the conversion */
-       target_id = (int) ul;
-       if (target_id != ul)
-               return -EINVAL;
-
-       /* remove object from list immediately */
-       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
-       list_for_each(tmp, &osdblkdev_list) {
-               osdev = list_entry(tmp, struct osdblk_device, node);
-               if (osdev->id == target_id) {
-                       list_del_init(&osdev->node);
-                       break;
-               }
-               osdev = NULL;
-       }
-
-       mutex_unlock(&ctl_mutex);
-
-       if (!osdev)
-               return -ENOENT;
-
-       /* clean up and free blkdev and associated OSD connection */
-       osdblk_free_disk(osdev);
-       unregister_blkdev(osdev->major, osdev->name);
-       osduld_put_device(osdev->osd);
-       kfree(osdev);
-
-       /* release module ref */
-       module_put(THIS_MODULE);
-
-       return count;
-}
-
-static struct class_attribute class_osdblk_attrs[] = {
-       __ATTR(add,     0200, NULL, class_osdblk_add),
-       __ATTR(remove,  0200, NULL, class_osdblk_remove),
-       __ATTR(list,    0444, class_osdblk_list, NULL),
-       __ATTR_NULL
-};
-
-static int osdblk_sysfs_init(void)
-{
-       int ret = 0;
-
-       /*
-        * create control files in sysfs
-        * /sys/class/osdblk/...
-        */
-       class_osdblk = kzalloc(sizeof(*class_osdblk), GFP_KERNEL);
-       if (!class_osdblk)
-               return -ENOMEM;
-
-       class_osdblk->name = DRV_NAME;
-       class_osdblk->owner = THIS_MODULE;
-       class_osdblk->class_release = class_osdblk_release;
-       class_osdblk->class_attrs = class_osdblk_attrs;
-
-       ret = class_register(class_osdblk);
-       if (ret) {
-               kfree(class_osdblk);
-               class_osdblk = NULL;
-               printk(PFX "failed to create class osdblk\n");
-               return ret;
-       }
-
-       return 0;
-}
-
-static void osdblk_sysfs_cleanup(void)
-{
-       if (class_osdblk)
-               class_destroy(class_osdblk);
-       class_osdblk = NULL;
-}
-
-static int __init osdblk_init(void)
-{
-       int rc;
-
-       rc = osdblk_sysfs_init();
-       if (rc)
-               return rc;
-
-       return 0;
-}
-
-static void __exit osdblk_exit(void)
-{
-       osdblk_sysfs_cleanup();
-}
-
-module_init(osdblk_init);
-module_exit(osdblk_exit);
-
index 939641d..b1267ef 100644 (file)
@@ -300,6 +300,11 @@ static void pcd_init_units(void)
                struct gendisk *disk = alloc_disk(1);
                if (!disk)
                        continue;
+               disk->queue = blk_init_queue(do_pcd_request, &pcd_lock);
+               if (!disk->queue) {
+                       put_disk(disk);
+                       continue;
+               }
                cd->disk = disk;
                cd->pi = &cd->pia;
                cd->present = 0;
@@ -735,18 +740,36 @@ static int pcd_detect(void)
 }
 
 /* I/O request processing */
-static struct request_queue *pcd_queue;
+static int pcd_queue;
+
+static int set_next_request(void)
+{
+       struct pcd_unit *cd;
+       struct request_queue *q;
+       int old_pos = pcd_queue;
+
+       do {
+               cd = &pcd[pcd_queue];
+               q = cd->present ? cd->disk->queue : NULL;
+               if (++pcd_queue == PCD_UNITS)
+                       pcd_queue = 0;
+               if (q) {
+                       pcd_req = blk_fetch_request(q);
+                       if (pcd_req)
+                               break;
+               }
+       } while (pcd_queue != old_pos);
+
+       return pcd_req != NULL;
+}
 
-static void do_pcd_request(struct request_queue * q)
+static void pcd_request(void)
 {
        if (pcd_busy)
                return;
        while (1) {
-               if (!pcd_req) {
-                       pcd_req = blk_fetch_request(q);
-                       if (!pcd_req)
-                               return;
-               }
+               if (!pcd_req && !set_next_request())
+                       return;
 
                if (rq_data_dir(pcd_req) == READ) {
                        struct pcd_unit *cd = pcd_req->rq_disk->private_data;
@@ -766,6 +789,11 @@ static void do_pcd_request(struct request_queue * q)
        }
 }
 
+static void do_pcd_request(struct request_queue *q)
+{
+       pcd_request();
+}
+
 static inline void next_request(int err)
 {
        unsigned long saved_flags;
@@ -774,7 +802,7 @@ static inline void next_request(int err)
        if (!__blk_end_request_cur(pcd_req, err))
                pcd_req = NULL;
        pcd_busy = 0;
-       do_pcd_request(pcd_queue);
+       pcd_request();
        spin_unlock_irqrestore(&pcd_lock, saved_flags);
 }
 
@@ -849,7 +877,7 @@ static void do_pcd_read_drq(void)
 
        do_pcd_read();
        spin_lock_irqsave(&pcd_lock, saved_flags);
-       do_pcd_request(pcd_queue);
+       pcd_request();
        spin_unlock_irqrestore(&pcd_lock, saved_flags);
 }
 
@@ -957,19 +985,10 @@ static int __init pcd_init(void)
                return -EBUSY;
        }
 
-       pcd_queue = blk_init_queue(do_pcd_request, &pcd_lock);
-       if (!pcd_queue) {
-               unregister_blkdev(major, name);
-               for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++)
-                       put_disk(cd->disk);
-               return -ENOMEM;
-       }
-
        for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
                if (cd->present) {
                        register_cdrom(&cd->info);
                        cd->disk->private_data = cd;
-                       cd->disk->queue = pcd_queue;
                        add_disk(cd->disk);
                }
        }
@@ -988,9 +1007,9 @@ static void __exit pcd_exit(void)
                        pi_release(cd->pi);
                        unregister_cdrom(&cd->info);
                }
+               blk_cleanup_queue(cd->disk->queue);
                put_disk(cd->disk);
        }
-       blk_cleanup_queue(pcd_queue);
        unregister_blkdev(major, name);
        pi_unregister_driver(par_drv);
 }
index 9cfd2e0..7d2402f 100644 (file)
@@ -381,12 +381,33 @@ static enum action do_pd_write_start(void);
 static enum action do_pd_read_drq(void);
 static enum action do_pd_write_done(void);
 
-static struct request_queue *pd_queue;
+static int pd_queue;
 static int pd_claimed;
 
 static struct pd_unit *pd_current; /* current request's drive */
 static PIA *pi_current; /* current request's PIA */
 
+static int set_next_request(void)
+{
+       struct gendisk *disk;
+       struct request_queue *q;
+       int old_pos = pd_queue;
+
+       do {
+               disk = pd[pd_queue].gd;
+               q = disk ? disk->queue : NULL;
+               if (++pd_queue == PD_UNITS)
+                       pd_queue = 0;
+               if (q) {
+                       pd_req = blk_fetch_request(q);
+                       if (pd_req)
+                               break;
+               }
+       } while (pd_queue != old_pos);
+
+       return pd_req != NULL;
+}
+
 static void run_fsm(void)
 {
        while (1) {
@@ -418,8 +439,7 @@ static void run_fsm(void)
                                spin_lock_irqsave(&pd_lock, saved_flags);
                                if (!__blk_end_request_cur(pd_req,
                                                res == Ok ? 0 : -EIO)) {
-                                       pd_req = blk_fetch_request(pd_queue);
-                                       if (!pd_req)
+                                       if (!set_next_request())
                                                stop = 1;
                                }
                                spin_unlock_irqrestore(&pd_lock, saved_flags);
@@ -719,18 +739,15 @@ static int pd_special_command(struct pd_unit *disk,
                      enum action (*func)(struct pd_unit *disk))
 {
        struct request *rq;
-       int err = 0;
 
        rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
 
        rq->special = func;
-
-       err = blk_execute_rq(disk->gd->queue, disk->gd, rq, 0);
-
+       blk_execute_rq(disk->gd->queue, disk->gd, rq, 0);
        blk_put_request(rq);
-       return err;
+       return 0;
 }
 
 /* kernel glue structures */
@@ -839,7 +856,13 @@ static void pd_probe_drive(struct pd_unit *disk)
        p->first_minor = (disk - pd) << PD_BITS;
        disk->gd = p;
        p->private_data = disk;
-       p->queue = pd_queue;
+       p->queue = blk_init_queue(do_pd_request, &pd_lock);
+       if (!p->queue) {
+               disk->gd = NULL;
+               put_disk(p);
+               return;
+       }
+       blk_queue_max_hw_sectors(p->queue, cluster);
 
        if (disk->drive == -1) {
                for (disk->drive = 0; disk->drive <= 1; disk->drive++)
@@ -919,26 +942,18 @@ static int __init pd_init(void)
        if (disable)
                goto out1;
 
-       pd_queue = blk_init_queue(do_pd_request, &pd_lock);
-       if (!pd_queue)
-               goto out1;
-
-       blk_queue_max_hw_sectors(pd_queue, cluster);
-
        if (register_blkdev(major, name))
-               goto out2;
+               goto out1;
 
        printk("%s: %s version %s, major %d, cluster %d, nice %d\n",
               name, name, PD_VERSION, major, cluster, nice);
        if (!pd_detect())
-               goto out3;
+               goto out2;
 
        return 0;
 
-out3:
-       unregister_blkdev(major, name);
 out2:
-       blk_cleanup_queue(pd_queue);
+       unregister_blkdev(major, name);
 out1:
        return -ENODEV;
 }
@@ -953,11 +968,11 @@ static void __exit pd_exit(void)
                if (p) {
                        disk->gd = NULL;
                        del_gendisk(p);
+                       blk_cleanup_queue(p->queue);
                        put_disk(p);
                        pi_release(disk->pi);
                }
        }
-       blk_cleanup_queue(pd_queue);
 }
 
 MODULE_LICENSE("GPL");
index 14c5d32..f24ca73 100644 (file)
@@ -287,6 +287,12 @@ static void __init pf_init_units(void)
                struct gendisk *disk = alloc_disk(1);
                if (!disk)
                        continue;
+               disk->queue = blk_init_queue(do_pf_request, &pf_spin_lock);
+               if (!disk->queue) {
+                       put_disk(disk);
+                       return;
+               }
+               blk_queue_max_segments(disk->queue, cluster);
                pf->disk = disk;
                pf->pi = &pf->pia;
                pf->media_status = PF_NM;
@@ -772,7 +778,28 @@ static int pf_ready(void)
        return (((status_reg(pf_current) & (STAT_BUSY | pf_mask)) == pf_mask));
 }
 
-static struct request_queue *pf_queue;
+static int pf_queue;
+
+static int set_next_request(void)
+{
+       struct pf_unit *pf;
+       struct request_queue *q;
+       int old_pos = pf_queue;
+
+       do {
+               pf = &units[pf_queue];
+               q = pf->present ? pf->disk->queue : NULL;
+               if (++pf_queue == PF_UNITS)
+                       pf_queue = 0;
+               if (q) {
+                       pf_req = blk_fetch_request(q);
+                       if (pf_req)
+                               break;
+               }
+       } while (pf_queue != old_pos);
+
+       return pf_req != NULL;
+}
 
 static void pf_end_request(int err)
 {
@@ -780,16 +807,13 @@ static void pf_end_request(int err)
                pf_req = NULL;
 }
 
-static void do_pf_request(struct request_queue * q)
+static void pf_request(void)
 {
        if (pf_busy)
                return;
 repeat:
-       if (!pf_req) {
-               pf_req = blk_fetch_request(q);
-               if (!pf_req)
-                       return;
-       }
+       if (!pf_req && !set_next_request())
+               return;
 
        pf_current = pf_req->rq_disk->private_data;
        pf_block = blk_rq_pos(pf_req);
@@ -817,6 +841,11 @@ repeat:
        }
 }
 
+static void do_pf_request(struct request_queue *q)
+{
+       pf_request();
+}
+
 static int pf_next_buf(void)
 {
        unsigned long saved_flags;
@@ -846,7 +875,7 @@ static inline void next_request(int err)
        spin_lock_irqsave(&pf_spin_lock, saved_flags);
        pf_end_request(err);
        pf_busy = 0;
-       do_pf_request(pf_queue);
+       pf_request();
        spin_unlock_irqrestore(&pf_spin_lock, saved_flags);
 }
 
@@ -972,15 +1001,6 @@ static int __init pf_init(void)
                        put_disk(pf->disk);
                return -EBUSY;
        }
-       pf_queue = blk_init_queue(do_pf_request, &pf_spin_lock);
-       if (!pf_queue) {
-               unregister_blkdev(major, name);
-               for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++)
-                       put_disk(pf->disk);
-               return -ENOMEM;
-       }
-
-       blk_queue_max_segments(pf_queue, cluster);
 
        for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
                struct gendisk *disk = pf->disk;
@@ -988,7 +1008,6 @@ static int __init pf_init(void)
                if (!pf->present)
                        continue;
                disk->private_data = pf;
-               disk->queue = pf_queue;
                add_disk(disk);
        }
        return 0;
@@ -1003,10 +1022,10 @@ static void __exit pf_exit(void)
                if (!pf->present)
                        continue;
                del_gendisk(pf->disk);
+               blk_cleanup_queue(pf->disk->queue);
                put_disk(pf->disk);
                pi_release(pf->pi);
        }
-       blk_cleanup_queue(pf_queue);
 }
 
 MODULE_LICENSE("GPL");
index 66d846b..205b865 100644 (file)
@@ -724,7 +724,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
                rq->rq_flags |= RQF_QUIET;
 
        blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0);
-       if (rq->errors)
+       if (scsi_req(rq)->result)
                ret = -EIO;
 out:
        blk_put_request(rq);
index 517838b..089ac41 100644 (file)
@@ -4317,7 +4317,7 @@ static int rbd_init_request(void *data, struct request *rq,
        return 0;
 }
 
-static struct blk_mq_ops rbd_mq_ops = {
+static const struct blk_mq_ops rbd_mq_ops = {
        .queue_rq       = rbd_queue_rq,
        .init_request   = rbd_init_request,
 };
@@ -4380,7 +4380,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
        q->limits.discard_granularity = segment_size;
        q->limits.discard_alignment = segment_size;
        blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
-       q->limits.discard_zeroes_data = 1;
 
        if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
                q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
index f81d70b..9c56636 100644 (file)
@@ -300,7 +300,6 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card)
                                                RSXX_HW_BLK_SIZE >> 9);
                card->queue->limits.discard_granularity = RSXX_HW_BLK_SIZE;
                card->queue->limits.discard_alignment   = RSXX_HW_BLK_SIZE;
-               card->queue->limits.discard_zeroes_data = 1;
        }
 
        card->queue->queuedata = card;
index b5afd49..3064be6 100644 (file)
@@ -211,7 +211,7 @@ enum head {
 struct swim_priv {
        struct swim __iomem *base;
        spinlock_t lock;
-       struct request_queue *queue;
+       int fdc_queue;
        int floppy_count;
        struct floppy_state unit[FD_MAX_UNIT];
 };
@@ -525,12 +525,33 @@ static int floppy_read_sectors(struct floppy_state *fs,
        return 0;
 }
 
-static void redo_fd_request(struct request_queue *q)
+static struct request *swim_next_request(struct swim_priv *swd)
 {
+       struct request_queue *q;
+       struct request *rq;
+       int old_pos = swd->fdc_queue;
+
+       do {
+               q = swd->unit[swd->fdc_queue].disk->queue;
+               if (++swd->fdc_queue == swd->floppy_count)
+                       swd->fdc_queue = 0;
+               if (q) {
+                       rq = blk_fetch_request(q);
+                       if (rq)
+                               return rq;
+               }
+       } while (swd->fdc_queue != old_pos);
+
+       return NULL;
+}
+
+static void do_fd_request(struct request_queue *q)
+{
+       struct swim_priv *swd = q->queuedata;
        struct request *req;
        struct floppy_state *fs;
 
-       req = blk_fetch_request(q);
+       req = swim_next_request(swd);
        while (req) {
                int err = -EIO;
 
@@ -554,15 +575,10 @@ static void redo_fd_request(struct request_queue *q)
                }
        done:
                if (!__blk_end_request_cur(req, err))
-                       req = blk_fetch_request(q);
+                       req = swim_next_request(swd);
        }
 }
 
-static void do_fd_request(struct request_queue *q)
-{
-       redo_fd_request(q);
-}
-
 static struct floppy_struct floppy_type[4] = {
        {    0,  0, 0,  0, 0, 0x00, 0x00, 0x00, 0x00, NULL }, /* no testing   */
        {  720,  9, 1, 80, 0, 0x2A, 0x02, 0xDF, 0x50, NULL }, /* 360KB SS 3.5"*/
@@ -833,22 +849,25 @@ static int swim_floppy_init(struct swim_priv *swd)
                return -EBUSY;
        }
 
+       spin_lock_init(&swd->lock);
+
        for (drive = 0; drive < swd->floppy_count; drive++) {
                swd->unit[drive].disk = alloc_disk(1);
                if (swd->unit[drive].disk == NULL) {
                        err = -ENOMEM;
                        goto exit_put_disks;
                }
+               swd->unit[drive].disk->queue = blk_init_queue(do_fd_request,
+                                                             &swd->lock);
+               if (!swd->unit[drive].disk->queue) {
+                       err = -ENOMEM;
+                       put_disk(swd->unit[drive].disk);
+                       goto exit_put_disks;
+               }
+               swd->unit[drive].disk->queue->queuedata = swd;
                swd->unit[drive].swd = swd;
        }
 
-       spin_lock_init(&swd->lock);
-       swd->queue = blk_init_queue(do_fd_request, &swd->lock);
-       if (!swd->queue) {
-               err = -ENOMEM;
-               goto exit_put_disks;
-       }
-
        for (drive = 0; drive < swd->floppy_count; drive++) {
                swd->unit[drive].disk->flags = GENHD_FL_REMOVABLE;
                swd->unit[drive].disk->major = FLOPPY_MAJOR;
@@ -856,7 +875,6 @@ static int swim_floppy_init(struct swim_priv *swd)
                sprintf(swd->unit[drive].disk->disk_name, "fd%d", drive);
                swd->unit[drive].disk->fops = &floppy_fops;
                swd->unit[drive].disk->private_data = &swd->unit[drive];
-               swd->unit[drive].disk->queue = swd->queue;
                set_capacity(swd->unit[drive].disk, 2880);
                add_disk(swd->unit[drive].disk);
        }
@@ -943,13 +961,12 @@ static int swim_remove(struct platform_device *dev)
 
        for (drive = 0; drive < swd->floppy_count; drive++) {
                del_gendisk(swd->unit[drive].disk);
+               blk_cleanup_queue(swd->unit[drive].disk->queue);
                put_disk(swd->unit[drive].disk);
        }
 
        unregister_blkdev(FLOPPY_MAJOR, "fd");
 
-       blk_cleanup_queue(swd->queue);
-
        /* eject floppies */
 
        for (drive = 0; drive < swd->floppy_count; drive++)
index 61b3ffa..ba4809c 100644 (file)
@@ -343,8 +343,8 @@ static void start_request(struct floppy_state *fs)
                          req->rq_disk->disk_name, req->cmd,
                          (long)blk_rq_pos(req), blk_rq_sectors(req),
                          bio_data(req->bio));
-               swim3_dbg("           errors=%d current_nr_sectors=%u\n",
-                         req->errors, blk_rq_cur_sectors(req));
+               swim3_dbg("           current_nr_sectors=%u\n",
+                         blk_rq_cur_sectors(req));
 #endif
 
                if (blk_rq_pos(req) >= fs->total_secs) {
index 1d4c9f8..f946142 100644 (file)
@@ -111,7 +111,7 @@ static int virtblk_add_req_scsi(struct virtqueue *vq, struct virtblk_req *vbr,
        return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
 }
 
-static inline void virtblk_scsi_reques_done(struct request *req)
+static inline void virtblk_scsi_request_done(struct request *req)
 {
        struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
        struct virtio_blk *vblk = req->q->queuedata;
@@ -119,7 +119,7 @@ static inline void virtblk_scsi_reques_done(struct request *req)
 
        sreq->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual);
        sreq->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len);
-       req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors);
+       sreq->result = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors);
 }
 
 static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
@@ -144,7 +144,7 @@ static inline int virtblk_add_req_scsi(struct virtqueue *vq,
 {
        return -EIO;
 }
-static inline void virtblk_scsi_reques_done(struct request *req)
+static inline void virtblk_scsi_request_done(struct request *req)
 {
 }
 #define virtblk_ioctl  NULL
@@ -175,19 +175,15 @@ static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr,
 static inline void virtblk_request_done(struct request *req)
 {
        struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
-       int error = virtblk_result(vbr);
 
        switch (req_op(req)) {
        case REQ_OP_SCSI_IN:
        case REQ_OP_SCSI_OUT:
-               virtblk_scsi_reques_done(req);
-               break;
-       case REQ_OP_DRV_IN:
-               req->errors = (error != 0);
+               virtblk_scsi_request_done(req);
                break;
        }
 
-       blk_mq_end_request(req, error);
+       blk_mq_end_request(req, virtblk_result(vbr));
 }
 
 static void virtblk_done(struct virtqueue *vq)
@@ -205,7 +201,7 @@ static void virtblk_done(struct virtqueue *vq)
                while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
                        struct request *req = blk_mq_rq_from_pdu(vbr);
 
-                       blk_mq_complete_request(req, req->errors);
+                       blk_mq_complete_request(req);
                        req_done = true;
                }
                if (unlikely(virtqueue_is_broken(vq)))
@@ -310,7 +306,8 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
        if (err)
                goto out;
 
-       err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
+       blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
+       err = virtblk_result(blk_mq_rq_to_pdu(req));
 out:
        blk_put_request(req);
        return err;
@@ -597,7 +594,7 @@ static int virtblk_map_queues(struct blk_mq_tag_set *set)
        return blk_mq_virtio_map_queues(set, vblk->vdev, 0);
 }
 
-static struct blk_mq_ops virtio_mq_ops = {
+static const struct blk_mq_ops virtio_mq_ops = {
        .queue_rq       = virtio_queue_rq,
        .complete       = virtblk_request_done,
        .init_request   = virtblk_init_request,
index 5067a0a..3945963 100644 (file)
@@ -115,6 +115,15 @@ struct split_bio {
        atomic_t pending;
 };
 
+struct blkif_req {
+       int     error;
+};
+
+static inline struct blkif_req *blkif_req(struct request *rq)
+{
+       return blk_mq_rq_to_pdu(rq);
+}
+
 static DEFINE_MUTEX(blkfront_mutex);
 static const struct block_device_operations xlvbd_block_fops;
 
@@ -907,8 +916,14 @@ out_busy:
        return BLK_MQ_RQ_QUEUE_BUSY;
 }
 
-static struct blk_mq_ops blkfront_mq_ops = {
+static void blkif_complete_rq(struct request *rq)
+{
+       blk_mq_end_request(rq, blkif_req(rq)->error);
+}
+
+static const struct blk_mq_ops blkfront_mq_ops = {
        .queue_rq = blkif_queue_rq,
+       .complete = blkif_complete_rq,
 };
 
 static void blkif_set_queue_limits(struct blkfront_info *info)
@@ -969,7 +984,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
                info->tag_set.queue_depth = BLK_RING_SIZE(info);
        info->tag_set.numa_node = NUMA_NO_NODE;
        info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
-       info->tag_set.cmd_size = 0;
+       info->tag_set.cmd_size = sizeof(struct blkif_req);
        info->tag_set.driver_data = info;
 
        if (blk_mq_alloc_tag_set(&info->tag_set))
@@ -1543,7 +1558,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
        unsigned long flags;
        struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id;
        struct blkfront_info *info = rinfo->dev_info;
-       int error;
 
        if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
                return IRQ_HANDLED;
@@ -1587,37 +1601,36 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
                        continue;
                }
 
-               error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
+               blkif_req(req)->error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
                switch (bret->operation) {
                case BLKIF_OP_DISCARD:
                        if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
                                struct request_queue *rq = info->rq;
                                printk(KERN_WARNING "blkfront: %s: %s op failed\n",
                                           info->gd->disk_name, op_name(bret->operation));
-                               error = -EOPNOTSUPP;
+                               blkif_req(req)->error = -EOPNOTSUPP;
                                info->feature_discard = 0;
                                info->feature_secdiscard = 0;
                                queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
                                queue_flag_clear(QUEUE_FLAG_SECERASE, rq);
                        }
-                       blk_mq_complete_request(req, error);
                        break;
                case BLKIF_OP_FLUSH_DISKCACHE:
                case BLKIF_OP_WRITE_BARRIER:
                        if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
                                printk(KERN_WARNING "blkfront: %s: %s op failed\n",
                                       info->gd->disk_name, op_name(bret->operation));
-                               error = -EOPNOTSUPP;
+                               blkif_req(req)->error = -EOPNOTSUPP;
                        }
                        if (unlikely(bret->status == BLKIF_RSP_ERROR &&
                                     rinfo->shadow[id].req.u.rw.nr_segments == 0)) {
                                printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
                                       info->gd->disk_name, op_name(bret->operation));
-                               error = -EOPNOTSUPP;
+                               blkif_req(req)->error = -EOPNOTSUPP;
                        }
-                       if (unlikely(error)) {
-                               if (error == -EOPNOTSUPP)
-                                       error = 0;
+                       if (unlikely(blkif_req(req)->error)) {
+                               if (blkif_req(req)->error == -EOPNOTSUPP)
+                                       blkif_req(req)->error = 0;
                                info->feature_fua = 0;
                                info->feature_flush = 0;
                                xlvbd_flush(info);
@@ -1629,11 +1642,12 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
                                dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
                                        "request: %x\n", bret->status);
 
-                       blk_mq_complete_request(req, error);
                        break;
                default:
                        BUG();
                }
+
+               blk_mq_complete_request(req);
        }
 
        rinfo->ring.rsp_cons = i;
@@ -2345,6 +2359,7 @@ static void blkfront_connect(struct blkfront_info *info)
        unsigned long sector_size;
        unsigned int physical_sector_size;
        unsigned int binfo;
+       char *envp[] = { "RESIZE=1", NULL };
        int err, i;
 
        switch (info->connected) {
@@ -2361,6 +2376,8 @@ static void blkfront_connect(struct blkfront_info *info)
                       sectors);
                set_capacity(info->gd, sectors);
                revalidate_disk(info->gd);
+               kobject_uevent_env(&disk_to_dev(info->gd)->kobj,
+                                  KOBJ_CHANGE, envp);
 
                return;
        case BLKIF_STATE_SUSPENDED:
index dceb5ed..6fac5fe 100644 (file)
@@ -523,7 +523,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
 
        cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
        if (size == PAGE_SIZE) {
-               copy_page(mem, cmem);
+               memcpy(mem, cmem, PAGE_SIZE);
        } else {
                struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
 
@@ -717,7 +717,7 @@ compress_again:
 
        if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
                src = kmap_atomic(page);
-               copy_page(cmem, src);
+               memcpy(cmem, src, PAGE_SIZE);
                kunmap_atomic(src);
        } else {
                memcpy(cmem, src, clen);
@@ -829,10 +829,14 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
        offset = (bio->bi_iter.bi_sector &
                  (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
 
-       if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+       switch (bio_op(bio)) {
+       case REQ_OP_DISCARD:
+       case REQ_OP_WRITE_ZEROES:
                zram_bio_discard(zram, index, offset, bio);
                bio_endio(bio);
                return;
+       default:
+               break;
        }
 
        bio_for_each_segment(bvec, bio, iter) {
@@ -928,7 +932,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
        }
 
        index = sector >> SECTORS_PER_PAGE_SHIFT;
-       offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;
+       offset = (sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
 
        bv.bv_page = page;
        bv.bv_len = PAGE_SIZE;
@@ -1192,6 +1196,8 @@ static int zram_add(void)
        zram->disk->queue->limits.max_sectors = SECTORS_PER_PAGE;
        zram->disk->queue->limits.chunk_sectors = 0;
        blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
+       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);
+
        /*
         * zram_bio_discard() will clear all logical blocks if logical block
         * size is identical with physical block size(PAGE_SIZE). But if it is
@@ -1201,10 +1207,7 @@ static int zram_add(void)
         * zeroed.
         */
        if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
-               zram->disk->queue->limits.discard_zeroes_data = 1;
-       else
-               zram->disk->queue->limits.discard_zeroes_data = 0;
-       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);
+               blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
 
        add_disk(zram->disk);
 
index 8773964..76c952f 100644 (file)
@@ -2218,7 +2218,8 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
                rq->timeout = 60 * HZ;
                bio = rq->bio;
 
-               if (blk_execute_rq(q, cdi->disk, rq, 0)) {
+               blk_execute_rq(q, cdi->disk, rq, 0);
+               if (scsi_req(rq)->result) {
                        struct request_sense *s = req->sense;
                        ret = -EIO;
                        cdi->last_sense = s->sense_key;
index 0ef3500..c99cd19 100644 (file)
@@ -14,7 +14,7 @@
 #include <linux/agp_backend.h>
 #include <linux/mmzone.h>
 #include <asm/page.h>          /* PAGE_SIZE */
-#include <asm/e820.h>
+#include <asm/e820/api.h>
 #include <asm/amd_nb.h>
 #include <asm/gart.h>
 #include "agp.h"
index d6f5d9e..70d434b 100644 (file)
@@ -523,6 +523,7 @@ static int bt_bmc_remove(struct platform_device *pdev)
 
 static const struct of_device_id bt_bmc_match[] = {
        { .compatible = "aspeed,ast2400-ibt-bmc" },
+       { .compatible = "aspeed,ast2500-ibt-bmc" },
        { },
 };
 
index 2a7c425..b2b618f 100644 (file)
@@ -1954,7 +1954,9 @@ static int hotmod_handler(const char *val, struct kernel_param *kp)
                                kfree(info);
                                goto out;
                        }
+                       mutex_lock(&smi_infos_lock);
                        rv = try_smi_init(info);
+                       mutex_unlock(&smi_infos_lock);
                        if (rv) {
                                cleanup_one_si(info);
                                goto out;
@@ -2042,8 +2044,10 @@ static int hardcode_find_bmc(void)
                info->slave_addr = slave_addrs[i];
 
                if (!add_smi(info)) {
+                       mutex_lock(&smi_infos_lock);
                        if (try_smi_init(info))
                                cleanup_one_si(info);
+                       mutex_unlock(&smi_infos_lock);
                        ret = 0;
                } else {
                        kfree(info);
@@ -3492,6 +3496,11 @@ out_err:
        return rv;
 }
 
+/*
+ * Try to start up an interface.  Must be called with smi_infos_lock
+ * held, primarily to keep smi_num consistent; we only want to do these
+ * one at a time.
+ */
 static int try_smi_init(struct smi_info *new_smi)
 {
        int rv = 0;
@@ -3524,9 +3533,12 @@ static int try_smi_init(struct smi_info *new_smi)
                goto out_err;
        }
 
+       new_smi->intf_num = smi_num;
+
        /* Do this early so it's available for logs. */
        if (!new_smi->dev) {
-               init_name = kasprintf(GFP_KERNEL, "ipmi_si.%d", 0);
+               init_name = kasprintf(GFP_KERNEL, "ipmi_si.%d",
+                                     new_smi->intf_num);
 
                /*
                 * If we don't already have a device from something
@@ -3593,8 +3605,6 @@ static int try_smi_init(struct smi_info *new_smi)
 
        new_smi->interrupt_disabled = true;
        atomic_set(&new_smi->need_watch, 0);
-       new_smi->intf_num = smi_num;
-       smi_num++;
 
        rv = try_enable_event_buffer(new_smi);
        if (rv == 0)
@@ -3661,6 +3671,9 @@ static int try_smi_init(struct smi_info *new_smi)
                goto out_err_stop_timer;
        }
 
+       /* Don't increment till we know we have succeeded. */
+       smi_num++;
+
        dev_info(new_smi->dev, "IPMI %s interface initialized\n",
                 si_to_str[new_smi->si_type]);
 
index cca6e5b..0b22a9b 100644 (file)
@@ -891,6 +891,7 @@ static void msg_written_handler(struct ssif_info *ssif_info, int result,
                 * for details on the intricacies of this.
                 */
                int left;
+               unsigned char *data_to_send;
 
                ssif_inc_stat(ssif_info, sent_messages_parts);
 
@@ -899,6 +900,7 @@ static void msg_written_handler(struct ssif_info *ssif_info, int result,
                        left = 32;
                /* Length byte. */
                ssif_info->multi_data[ssif_info->multi_pos] = left;
+               data_to_send = ssif_info->multi_data + ssif_info->multi_pos;
                ssif_info->multi_pos += left;
                if (left < 32)
                        /*
@@ -912,7 +914,7 @@ static void msg_written_handler(struct ssif_info *ssif_info, int result,
                rv = ssif_i2c_send(ssif_info, msg_written_handler,
                                  I2C_SMBUS_WRITE,
                                  SSIF_IPMI_MULTI_PART_REQUEST_MIDDLE,
-                                 ssif_info->multi_data + ssif_info->multi_pos,
+                                 data_to_send,
                                  I2C_SMBUS_BLOCK_DATA);
                if (rv < 0) {
                        /* request failed, just return the error. */
@@ -1642,9 +1644,8 @@ static int ssif_probe(struct i2c_client *client, const struct i2c_device_id *id)
 
        spin_lock_init(&ssif_info->lock);
        ssif_info->ssif_state = SSIF_NORMAL;
-       init_timer(&ssif_info->retry_timer);
-       ssif_info->retry_timer.data = (unsigned long) ssif_info;
-       ssif_info->retry_timer.function = retry_timeout;
+       setup_timer(&ssif_info->retry_timer, retry_timeout,
+                   (unsigned long)ssif_info);
 
        for (i = 0; i < SSIF_NUM_STATS; i++)
                atomic_set(&ssif_info->stats[i], 0);
index 5ca24d9..d165af8 100644 (file)
@@ -516,7 +516,7 @@ static void panic_halt_ipmi_heartbeat(void)
        msg.cmd = IPMI_WDOG_RESET_TIMER;
        msg.data = NULL;
        msg.data_len = 0;
-       atomic_add(2, &panic_done_count);
+       atomic_add(1, &panic_done_count);
        rv = ipmi_request_supply_msgs(watchdog_user,
                                      (struct ipmi_addr *) &addr,
                                      0,
@@ -526,7 +526,7 @@ static void panic_halt_ipmi_heartbeat(void)
                                      &panic_halt_heartbeat_recv_msg,
                                      1);
        if (rv)
-               atomic_sub(2, &panic_done_count);
+               atomic_sub(1, &panic_done_count);
 }
 
 static struct ipmi_smi_msg panic_halt_smi_msg = {
@@ -550,12 +550,12 @@ static void panic_halt_ipmi_set_timeout(void)
        /* Wait for the messages to be free. */
        while (atomic_read(&panic_done_count) != 0)
                ipmi_poll_interface(watchdog_user);
-       atomic_add(2, &panic_done_count);
+       atomic_add(1, &panic_done_count);
        rv = i_ipmi_set_timeout(&panic_halt_smi_msg,
                                &panic_halt_recv_msg,
                                &send_heartbeat_now);
        if (rv) {
-               atomic_sub(2, &panic_done_count);
+               atomic_sub(1, &panic_done_count);
                printk(KERN_WARNING PFX
                       "Unable to extend the watchdog timeout.");
        } else {
index 6d9cc2d..7e4a9d1 100644 (file)
@@ -60,6 +60,10 @@ static inline int valid_mmap_phys_addr_range(unsigned long pfn, size_t size)
 #endif
 
 #ifdef CONFIG_STRICT_DEVMEM
+static inline int page_is_allowed(unsigned long pfn)
+{
+       return devmem_is_allowed(pfn);
+}
 static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 {
        u64 from = ((u64)pfn) << PAGE_SHIFT;
@@ -75,6 +79,10 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
        return 1;
 }
 #else
+static inline int page_is_allowed(unsigned long pfn)
+{
+       return 1;
+}
 static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 {
        return 1;
@@ -122,23 +130,31 @@ static ssize_t read_mem(struct file *file, char __user *buf,
 
        while (count > 0) {
                unsigned long remaining;
+               int allowed;
 
                sz = size_inside_page(p, count);
 
-               if (!range_is_allowed(p >> PAGE_SHIFT, count))
+               allowed = page_is_allowed(p >> PAGE_SHIFT);
+               if (!allowed)
                        return -EPERM;
+               if (allowed == 2) {
+                       /* Show zeros for restricted memory. */
+                       remaining = clear_user(buf, sz);
+               } else {
+                       /*
+                        * On ia64 if a page has been mapped somewhere as
+                        * uncached, then it must also be accessed uncached
+                        * by the kernel or data corruption may occur.
+                        */
+                       ptr = xlate_dev_mem_ptr(p);
+                       if (!ptr)
+                               return -EFAULT;
 
-               /*
-                * On ia64 if a page has been mapped somewhere as uncached, then
-                * it must also be accessed uncached by the kernel or data
-                * corruption may occur.
-                */
-               ptr = xlate_dev_mem_ptr(p);
-               if (!ptr)
-                       return -EFAULT;
+                       remaining = copy_to_user(buf, ptr, sz);
+
+                       unxlate_dev_mem_ptr(p, ptr);
+               }
 
-               remaining = copy_to_user(buf, ptr, sz);
-               unxlate_dev_mem_ptr(p, ptr);
                if (remaining)
                        return -EFAULT;
 
@@ -181,30 +197,36 @@ static ssize_t write_mem(struct file *file, const char __user *buf,
 #endif
 
        while (count > 0) {
+               int allowed;
+
                sz = size_inside_page(p, count);
 
-               if (!range_is_allowed(p >> PAGE_SHIFT, sz))
+               allowed = page_is_allowed(p >> PAGE_SHIFT);
+               if (!allowed)
                        return -EPERM;
 
-               /*
-                * On ia64 if a page has been mapped somewhere as uncached, then
-                * it must also be accessed uncached by the kernel or data
-                * corruption may occur.
-                */
-               ptr = xlate_dev_mem_ptr(p);
-               if (!ptr) {
-                       if (written)
-                               break;
-                       return -EFAULT;
-               }
+               /* Skip actual writing when a page is marked as restricted. */
+               if (allowed == 1) {
+                       /*
+                        * On ia64 if a page has been mapped somewhere as
+                        * uncached, then it must also be accessed uncached
+                        * by the kernel or data corruption may occur.
+                        */
+                       ptr = xlate_dev_mem_ptr(p);
+                       if (!ptr) {
+                               if (written)
+                                       break;
+                               return -EFAULT;
+                       }
 
-               copied = copy_from_user(ptr, buf, sz);
-               unxlate_dev_mem_ptr(p, ptr);
-               if (copied) {
-                       written += sz - copied;
-                       if (written)
-                               break;
-                       return -EFAULT;
+                       copied = copy_from_user(ptr, buf, sz);
+                       unxlate_dev_mem_ptr(p, ptr);
+                       if (copied) {
+                               written += sz - copied;
+                               if (written)
+                                       break;
+                               return -EFAULT;
+                       }
                }
 
                buf += sz;
index b708c85..0e7fcb0 100644 (file)
@@ -478,18 +478,18 @@ static int sgi_clock_period;
 static struct timespec sgi_clock_offset;
 static int sgi_clock_period;
 
-static int sgi_clock_get(clockid_t clockid, struct timespec *tp)
+static int sgi_clock_get(clockid_t clockid, struct timespec64 *tp)
 {
        u64 nsec;
 
        nsec = rtc_time() * sgi_clock_period
                        + sgi_clock_offset.tv_nsec;
-       *tp = ns_to_timespec(nsec);
+       *tp = ns_to_timespec64(nsec);
        tp->tv_sec += sgi_clock_offset.tv_sec;
        return 0;
 };
 
-static int sgi_clock_set(const clockid_t clockid, const struct timespec *tp)
+static int sgi_clock_set(const clockid_t clockid, const struct timespec64 *tp)
 {
 
        u64 nsec;
@@ -657,7 +657,7 @@ static int sgi_timer_del(struct k_itimer *timr)
 }
 
 /* Assumption: it_lock is already held with irq's disabled */
-static void sgi_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
+static void sgi_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
 {
 
        if (timr->it.mmtimer.clock == TIMER_OFF) {
@@ -668,14 +668,14 @@ static void sgi_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
                return;
        }
 
-       cur_setting->it_interval = ns_to_timespec(timr->it.mmtimer.incr * sgi_clock_period);
-       cur_setting->it_value = ns_to_timespec((timr->it.mmtimer.expires - rtc_time()) * sgi_clock_period);
+       cur_setting->it_interval = ns_to_timespec64(timr->it.mmtimer.incr * sgi_clock_period);
+       cur_setting->it_value = ns_to_timespec64((timr->it.mmtimer.expires - rtc_time()) * sgi_clock_period);
 }
 
 
 static int sgi_timer_set(struct k_itimer *timr, int flags,
-       struct itimerspec * new_setting,
-       struct itimerspec * old_setting)
+       struct itimerspec64 *new_setting,
+       struct itimerspec64 *old_setting)
 {
        unsigned long when, period, irqflags;
        int err = 0;
@@ -687,8 +687,8 @@ static int sgi_timer_set(struct k_itimer *timr, int flags,
                sgi_timer_get(timr, old_setting);
 
        sgi_timer_del(timr);
-       when = timespec_to_ns(&new_setting->it_value);
-       period = timespec_to_ns(&new_setting->it_interval);
+       when = timespec64_to_ns(&new_setting->it_value);
+       period = timespec64_to_ns(&new_setting->it_interval);
 
        if (when == 0)
                /* Clear timer */
@@ -699,11 +699,11 @@ static int sgi_timer_set(struct k_itimer *timr, int flags,
                return -ENOMEM;
 
        if (flags & TIMER_ABSTIME) {
-               struct timespec n;
+               struct timespec64 n;
                unsigned long now;
 
-               getnstimeofday(&n);
-               now = timespec_to_ns(&n);
+               getnstimeofday64(&n);
+               now = timespec64_to_ns(&n);
                if (when > now)
                        when -= now;
                else
@@ -765,7 +765,7 @@ static int sgi_timer_set(struct k_itimer *timr, int flags,
        return err;
 }
 
-static int sgi_clock_getres(const clockid_t which_clock, struct timespec *tp)
+static int sgi_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
 {
        tp->tv_sec = 0;
        tp->tv_nsec = sgi_clock_period;
index e9b7e0b..87fe111 100644 (file)
@@ -2202,14 +2202,16 @@ static int virtcons_freeze(struct virtio_device *vdev)
 
        vdev->config->reset(vdev);
 
-       virtqueue_disable_cb(portdev->c_ivq);
+       if (use_multiport(portdev))
+               virtqueue_disable_cb(portdev->c_ivq);
        cancel_work_sync(&portdev->control_work);
        cancel_work_sync(&portdev->config_work);
        /*
         * Once more: if control_work_handler() was running, it would
         * enable the cb as the last step.
         */
-       virtqueue_disable_cb(portdev->c_ivq);
+       if (use_multiport(portdev))
+               virtqueue_disable_cb(portdev->c_ivq);
        remove_controlq_data(portdev);
 
        list_for_each_entry(port, &portdev->ports, list) {
index ab609a7..cf9449b 100644 (file)
@@ -429,6 +429,13 @@ static const struct clk_div_table pll_divp_table[] = {
        { 0, 2 }, { 1, 4 }, { 2, 6 }, { 3, 8 }, { 0 }
 };
 
+static const struct clk_div_table pll_divq_table[] = {
+       { 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 5 }, { 6, 6 }, { 7, 7 },
+       { 8, 8 }, { 9, 9 }, { 10, 10 }, { 11, 11 }, { 12, 12 }, { 13, 13 },
+       { 14, 14 }, { 15, 15 },
+       { 0 }
+};
+
 static const struct clk_div_table pll_divr_table[] = {
        { 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 5 }, { 6, 6 }, { 7, 7 }, { 0 }
 };
@@ -496,9 +503,9 @@ struct stm32f4_div_data {
 
 #define MAX_PLL_DIV 3
 static const struct stm32f4_div_data  div_data[MAX_PLL_DIV] = {
-       { 16, 2, 0,                     pll_divp_table  },
-       { 24, 4, CLK_DIVIDER_ONE_BASED, NULL            },
-       { 28, 3, 0,                     pll_divr_table  },
+       { 16, 2, 0, pll_divp_table },
+       { 24, 4, 0, pll_divq_table },
+       { 28, 3, 0, pll_divr_table },
 };
 
 struct stm32f4_pll_data {
index 72109d2..a077ab6 100644 (file)
@@ -1,6 +1,7 @@
 config SUNXI_CCU
        bool "Clock support for Allwinner SoCs"
        depends on ARCH_SUNXI || COMPILE_TEST
+       select RESET_CONTROLLER
        default ARCH_SUNXI
 
 if SUNXI_CCU
@@ -15,7 +16,7 @@ config SUNXI_CCU_FRAC
        bool
 
 config SUNXI_CCU_GATE
-       bool
+       def_bool y
 
 config SUNXI_CCU_MUX
        bool
@@ -135,6 +136,7 @@ config SUN8I_V3S_CCU
 config SUN9I_A80_CCU
        bool "Support for the Allwinner A80 CCU"
        select SUNXI_CCU_DIV
+       select SUNXI_CCU_MULT
        select SUNXI_CCU_GATE
        select SUNXI_CCU_NKMP
        select SUNXI_CCU_NM
index a7b3c08..2c69b63 100644 (file)
@@ -752,6 +752,13 @@ static const struct sunxi_ccu_desc sun8i_a33_ccu_desc = {
        .num_resets     = ARRAY_SIZE(sun8i_a33_ccu_resets),
 };
 
+static struct ccu_pll_nb sun8i_a33_pll_cpu_nb = {
+       .common = &pll_cpux_clk.common,
+       /* copy from pll_cpux_clk */
+       .enable = BIT(31),
+       .lock   = BIT(28),
+};
+
 static struct ccu_mux_nb sun8i_a33_cpu_nb = {
        .common         = &cpux_clk.common,
        .cm             = &cpux_clk.mux,
@@ -783,6 +790,10 @@ static void __init sun8i_a33_ccu_setup(struct device_node *node)
 
        sunxi_ccu_probe(node, reg, &sun8i_a33_ccu_desc);
 
+       /* Gate then ungate PLL CPU after any rate changes */
+       ccu_pll_notifier_register(&sun8i_a33_pll_cpu_nb);
+
+       /* Reparent CPU during PLL CPU rate changes */
        ccu_mux_notifier_register(pll_cpux_clk.common.hw.clk,
                                  &sun8i_a33_cpu_nb);
 }
index 8a47baf..9d87247 100644 (file)
  * GNU General Public License for more details.
  */
 
+#include <linux/clk.h>
 #include <linux/clk-provider.h>
 #include <linux/iopoll.h>
 #include <linux/slab.h>
 
 #include "ccu_common.h"
+#include "ccu_gate.h"
 #include "ccu_reset.h"
 
 static DEFINE_SPINLOCK(ccu_lock);
@@ -39,6 +41,53 @@ void ccu_helper_wait_for_lock(struct ccu_common *common, u32 lock)
        WARN_ON(readl_relaxed_poll_timeout(addr, reg, reg & lock, 100, 70000));
 }
 
+/*
+ * This clock notifier is called when the frequency of a PLL clock is
+ * changed. In common PLL designs, changes to the dividers take effect
+ * almost immediately, while changes to the multipliers (implemented
+ * as dividers in the feedback loop) take a few cycles to work into
+ * the feedback loop for the PLL to stabilize.
+ *
+ * Sometimes when the PLL clock rate is changed, the decrease in the
+ * divider is too much for the decrease in the multiplier to catch up.
+ * The PLL clock rate will spike, and in some cases, might lock up
+ * completely.
+ *
+ * This notifier callback will gate and then ungate the clock,
+ * effectively resetting it, so it proceeds to work. Care must be
+ * taken to reparent consumers to other temporary clocks during the
+ * rate change, and that this notifier callback must be the first
+ * to be registered.
+ */
+static int ccu_pll_notifier_cb(struct notifier_block *nb,
+                              unsigned long event, void *data)
+{
+       struct ccu_pll_nb *pll = to_ccu_pll_nb(nb);
+       int ret = 0;
+
+       if (event != POST_RATE_CHANGE)
+               goto out;
+
+       ccu_gate_helper_disable(pll->common, pll->enable);
+
+       ret = ccu_gate_helper_enable(pll->common, pll->enable);
+       if (ret)
+               goto out;
+
+       ccu_helper_wait_for_lock(pll->common, pll->lock);
+
+out:
+       return notifier_from_errno(ret);
+}
+
+int ccu_pll_notifier_register(struct ccu_pll_nb *pll_nb)
+{
+       pll_nb->clk_nb.notifier_call = ccu_pll_notifier_cb;
+
+       return clk_notifier_register(pll_nb->common->hw.clk,
+                                    &pll_nb->clk_nb);
+}
+
 int sunxi_ccu_probe(struct device_node *node, void __iomem *reg,
                    const struct sunxi_ccu_desc *desc)
 {
index 73d81dc..d6fdd7a 100644 (file)
@@ -83,6 +83,18 @@ struct sunxi_ccu_desc {
 
 void ccu_helper_wait_for_lock(struct ccu_common *common, u32 lock);
 
+struct ccu_pll_nb {
+       struct notifier_block   clk_nb;
+       struct ccu_common       *common;
+
+       u32     enable;
+       u32     lock;
+};
+
+#define to_ccu_pll_nb(_nb) container_of(_nb, struct ccu_pll_nb, clk_nb)
+
+int ccu_pll_notifier_register(struct ccu_pll_nb *pll_nb);
+
 int sunxi_ccu_probe(struct device_node *node, void __iomem *reg,
                    const struct sunxi_ccu_desc *desc);
 
index 3356ab8..545d541 100644 (file)
@@ -67,20 +67,22 @@ config DW_APB_TIMER_OF
        select DW_APB_TIMER
        select CLKSRC_OF
 
-config GEMINI_TIMER
-       bool "Cortina Gemini timer driver" if COMPILE_TEST
+config FTTMR010_TIMER
+       bool "Faraday Technology timer driver" if COMPILE_TEST
        depends on GENERIC_CLOCKEVENTS
        depends on HAS_IOMEM
        select CLKSRC_MMIO
        select CLKSRC_OF
        select MFD_SYSCON
        help
-         Enables support for the Gemini timer
+         Enables support for the Faraday Technology timer block
+         FTTMR010.
 
 config ROCKCHIP_TIMER
        bool "Rockchip timer driver" if COMPILE_TEST
        depends on ARM || ARM64
        select CLKSRC_OF
+       select CLKSRC_MMIO
        help
          Enables the support for the rockchip timer driver.
 
@@ -366,6 +368,17 @@ config HISILICON_ERRATUM_161010101
          161010101. The workaround will be active if the hisilicon,erratum-161010101
          property is found in the timer node.
 
+config ARM64_ERRATUM_858921
+       bool "Workaround for Cortex-A73 erratum 858921"
+       default y
+       select ARM_ARCH_TIMER_OOL_WORKAROUND
+       depends on ARM_ARCH_TIMER && ARM64
+       help
+         This option enables a workaround applicable to Cortex-A73
+         (all versions), whose counter may return incorrect values.
+         The workaround will be dynamically enabled when an affected
+         core is detected.
+
 config ARM_GLOBAL_TIMER
        bool "Support for the ARM global timer" if COMPILE_TEST
        select CLKSRC_OF if OF
index d227d13..2b5b56a 100644 (file)
@@ -17,7 +17,7 @@ obj-$(CONFIG_CLKSRC_MMIO)     += mmio.o
 obj-$(CONFIG_DIGICOLOR_TIMER)  += timer-digicolor.o
 obj-$(CONFIG_DW_APB_TIMER)     += dw_apb_timer.o
 obj-$(CONFIG_DW_APB_TIMER_OF)  += dw_apb_timer_of.o
-obj-$(CONFIG_GEMINI_TIMER)     += timer-gemini.o
+obj-$(CONFIG_FTTMR010_TIMER)   += timer-fttmr010.o
 obj-$(CONFIG_ROCKCHIP_TIMER)      += rockchip_timer.o
 obj-$(CONFIG_CLKSRC_NOMADIK_MTU)       += nomadik-mtu.o
 obj-$(CONFIG_CLKSRC_DBX500_PRCMU)      += clksrc-dbx500-prcmu.o
index 7517f95..2164973 100644 (file)
@@ -37,7 +37,7 @@ static int noinline arc_get_timer_clk(struct device_node *node)
 
        clk = of_clk_get(node, 0);
        if (IS_ERR(clk)) {
-               pr_err("timer missing clk");
+               pr_err("timer missing clk\n");
                return PTR_ERR(clk);
        }
 
@@ -89,7 +89,7 @@ static int __init arc_cs_setup_gfrc(struct device_node *node)
 
        READ_BCR(ARC_REG_MCIP_BCR, mp);
        if (!mp.gfrc) {
-               pr_warn("Global-64-bit-Ctr clocksource not detected");
+               pr_warn("Global-64-bit-Ctr clocksource not detected\n");
                return -ENXIO;
        }
 
@@ -140,13 +140,13 @@ static int __init arc_cs_setup_rtc(struct device_node *node)
 
        READ_BCR(ARC_REG_TIMERS_BCR, timer);
        if (!timer.rtc) {
-               pr_warn("Local-64-bit-Ctr clocksource not detected");
+               pr_warn("Local-64-bit-Ctr clocksource not detected\n");
                return -ENXIO;
        }
 
        /* Local to CPU hence not usable in SMP */
        if (IS_ENABLED(CONFIG_SMP)) {
-               pr_warn("Local-64-bit-Ctr not usable in SMP");
+               pr_warn("Local-64-bit-Ctr not usable in SMP\n");
                return -EINVAL;
        }
 
@@ -290,13 +290,13 @@ static int __init arc_clockevent_setup(struct device_node *node)
 
        arc_timer_irq = irq_of_parse_and_map(node, 0);
        if (arc_timer_irq <= 0) {
-               pr_err("clockevent: missing irq");
+               pr_err("clockevent: missing irq\n");
                return -EINVAL;
        }
 
        ret = arc_get_timer_clk(node);
        if (ret) {
-               pr_err("clockevent: missing clk");
+               pr_err("clockevent: missing clk\n");
                return ret;
        }
 
@@ -313,7 +313,7 @@ static int __init arc_clockevent_setup(struct device_node *node)
                                arc_timer_starting_cpu,
                                arc_timer_dying_cpu);
        if (ret) {
-               pr_err("Failed to setup hotplug state");
+               pr_err("Failed to setup hotplug state\n");
                return ret;
        }
        return 0;
index 7a8a411..a1fb918 100644 (file)
@@ -33,6 +33,9 @@
 
 #include <clocksource/arm_arch_timer.h>
 
+#undef pr_fmt
+#define pr_fmt(fmt) "arch_timer: " fmt
+
 #define CNTTIDR                0x08
 #define CNTTIDR_VIRT(n)        (BIT(1) << ((n) * 4))
 
@@ -52,8 +55,6 @@
 #define CNTV_TVAL      0x38
 #define CNTV_CTL       0x3c
 
-#define ARCH_CP15_TIMER        BIT(0)
-#define ARCH_MEM_TIMER BIT(1)
 static unsigned arch_timers_present __initdata;
 
 static void __iomem *arch_counter_base;
@@ -66,23 +67,15 @@ struct arch_timer {
 #define to_arch_timer(e) container_of(e, struct arch_timer, evt)
 
 static u32 arch_timer_rate;
-
-enum ppi_nr {
-       PHYS_SECURE_PPI,
-       PHYS_NONSECURE_PPI,
-       VIRT_PPI,
-       HYP_PPI,
-       MAX_TIMER_PPI
-};
-
-static int arch_timer_ppi[MAX_TIMER_PPI];
+static int arch_timer_ppi[ARCH_TIMER_MAX_TIMER_PPI];
 
 static struct clock_event_device __percpu *arch_timer_evt;
 
-static enum ppi_nr arch_timer_uses_ppi = VIRT_PPI;
+static enum arch_timer_ppi_nr arch_timer_uses_ppi = ARCH_TIMER_VIRT_PPI;
 static bool arch_timer_c3stop;
 static bool arch_timer_mem_use_virtual;
 static bool arch_counter_suspend_stop;
+static bool vdso_default = true;
 
 static bool evtstrm_enable = IS_ENABLED(CONFIG_ARM_ARCH_TIMER_EVTSTREAM);
 
@@ -96,6 +89,105 @@ early_param("clocksource.arm_arch_timer.evtstrm", early_evtstrm_cfg);
  * Architected system timer support.
  */
 
+static __always_inline
+void arch_timer_reg_write(int access, enum arch_timer_reg reg, u32 val,
+                         struct clock_event_device *clk)
+{
+       if (access == ARCH_TIMER_MEM_PHYS_ACCESS) {
+               struct arch_timer *timer = to_arch_timer(clk);
+               switch (reg) {
+               case ARCH_TIMER_REG_CTRL:
+                       writel_relaxed(val, timer->base + CNTP_CTL);
+                       break;
+               case ARCH_TIMER_REG_TVAL:
+                       writel_relaxed(val, timer->base + CNTP_TVAL);
+                       break;
+               }
+       } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) {
+               struct arch_timer *timer = to_arch_timer(clk);
+               switch (reg) {
+               case ARCH_TIMER_REG_CTRL:
+                       writel_relaxed(val, timer->base + CNTV_CTL);
+                       break;
+               case ARCH_TIMER_REG_TVAL:
+                       writel_relaxed(val, timer->base + CNTV_TVAL);
+                       break;
+               }
+       } else {
+               arch_timer_reg_write_cp15(access, reg, val);
+       }
+}
+
+static __always_inline
+u32 arch_timer_reg_read(int access, enum arch_timer_reg reg,
+                       struct clock_event_device *clk)
+{
+       u32 val;
+
+       if (access == ARCH_TIMER_MEM_PHYS_ACCESS) {
+               struct arch_timer *timer = to_arch_timer(clk);
+               switch (reg) {
+               case ARCH_TIMER_REG_CTRL:
+                       val = readl_relaxed(timer->base + CNTP_CTL);
+                       break;
+               case ARCH_TIMER_REG_TVAL:
+                       val = readl_relaxed(timer->base + CNTP_TVAL);
+                       break;
+               }
+       } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) {
+               struct arch_timer *timer = to_arch_timer(clk);
+               switch (reg) {
+               case ARCH_TIMER_REG_CTRL:
+                       val = readl_relaxed(timer->base + CNTV_CTL);
+                       break;
+               case ARCH_TIMER_REG_TVAL:
+                       val = readl_relaxed(timer->base + CNTV_TVAL);
+                       break;
+               }
+       } else {
+               val = arch_timer_reg_read_cp15(access, reg);
+       }
+
+       return val;
+}
+
+/*
+ * Default to cp15 based access because arm64 uses this function for
+ * sched_clock() before DT is probed and the cp15 method is guaranteed
+ * to exist on arm64. arm doesn't use this before DT is probed so even
+ * if we don't have the cp15 accessors we won't have a problem.
+ */
+u64 (*arch_timer_read_counter)(void) = arch_counter_get_cntvct;
+
+static u64 arch_counter_read(struct clocksource *cs)
+{
+       return arch_timer_read_counter();
+}
+
+static u64 arch_counter_read_cc(const struct cyclecounter *cc)
+{
+       return arch_timer_read_counter();
+}
+
+static struct clocksource clocksource_counter = {
+       .name   = "arch_sys_counter",
+       .rating = 400,
+       .read   = arch_counter_read,
+       .mask   = CLOCKSOURCE_MASK(56),
+       .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static struct cyclecounter cyclecounter __ro_after_init = {
+       .read   = arch_counter_read_cc,
+       .mask   = CLOCKSOURCE_MASK(56),
+};
+
+struct ate_acpi_oem_info {
+       char oem_id[ACPI_OEM_ID_SIZE + 1];
+       char oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1];
+       u32 oem_revision;
+};
+
 #ifdef CONFIG_FSL_ERRATUM_A008585
 /*
  * The number of retries is an arbitrary value well beyond the highest number
@@ -170,97 +262,289 @@ static u64 notrace hisi_161010101_read_cntvct_el0(void)
 {
        return __hisi_161010101_read_reg(cntvct_el0);
 }
+
+static struct ate_acpi_oem_info hisi_161010101_oem_info[] = {
+       /*
+        * Note that trailing spaces are required to properly match
+        * the OEM table information.
+        */
+       {
+               .oem_id         = "HISI  ",
+               .oem_table_id   = "HIP05   ",
+               .oem_revision   = 0,
+       },
+       {
+               .oem_id         = "HISI  ",
+               .oem_table_id   = "HIP06   ",
+               .oem_revision   = 0,
+       },
+       {
+               .oem_id         = "HISI  ",
+               .oem_table_id   = "HIP07   ",
+               .oem_revision   = 0,
+       },
+       { /* Sentinel indicating the end of the OEM array */ },
+};
+#endif
+
+#ifdef CONFIG_ARM64_ERRATUM_858921
+static u64 notrace arm64_858921_read_cntvct_el0(void)
+{
+       u64 old, new;
+
+       old = read_sysreg(cntvct_el0);
+       new = read_sysreg(cntvct_el0);
+       return (((old ^ new) >> 32) & 1) ? old : new;
+}
 #endif
 
 #ifdef CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND
-const struct arch_timer_erratum_workaround *timer_unstable_counter_workaround = NULL;
+DEFINE_PER_CPU(const struct arch_timer_erratum_workaround *,
+              timer_unstable_counter_workaround);
 EXPORT_SYMBOL_GPL(timer_unstable_counter_workaround);
 
 DEFINE_STATIC_KEY_FALSE(arch_timer_read_ool_enabled);
 EXPORT_SYMBOL_GPL(arch_timer_read_ool_enabled);
 
+static void erratum_set_next_event_tval_generic(const int access, unsigned long evt,
+                                               struct clock_event_device *clk)
+{
+       unsigned long ctrl;
+       u64 cval = evt + arch_counter_get_cntvct();
+
+       ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk);
+       ctrl |= ARCH_TIMER_CTRL_ENABLE;
+       ctrl &= ~ARCH_TIMER_CTRL_IT_MASK;
+
+       if (access == ARCH_TIMER_PHYS_ACCESS)
+               write_sysreg(cval, cntp_cval_el0);
+       else
+               write_sysreg(cval, cntv_cval_el0);
+
+       arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk);
+}
+
+static __maybe_unused int erratum_set_next_event_tval_virt(unsigned long evt,
+                                           struct clock_event_device *clk)
+{
+       erratum_set_next_event_tval_generic(ARCH_TIMER_VIRT_ACCESS, evt, clk);
+       return 0;
+}
+
+static __maybe_unused int erratum_set_next_event_tval_phys(unsigned long evt,
+                                           struct clock_event_device *clk)
+{
+       erratum_set_next_event_tval_generic(ARCH_TIMER_PHYS_ACCESS, evt, clk);
+       return 0;
+}
+
 static const struct arch_timer_erratum_workaround ool_workarounds[] = {
 #ifdef CONFIG_FSL_ERRATUM_A008585
        {
+               .match_type = ate_match_dt,
                .id = "fsl,erratum-a008585",
+               .desc = "Freescale erratum a005858",
                .read_cntp_tval_el0 = fsl_a008585_read_cntp_tval_el0,
                .read_cntv_tval_el0 = fsl_a008585_read_cntv_tval_el0,
                .read_cntvct_el0 = fsl_a008585_read_cntvct_el0,
+               .set_next_event_phys = erratum_set_next_event_tval_phys,
+               .set_next_event_virt = erratum_set_next_event_tval_virt,
        },
 #endif
 #ifdef CONFIG_HISILICON_ERRATUM_161010101
        {
+               .match_type = ate_match_dt,
                .id = "hisilicon,erratum-161010101",
+               .desc = "HiSilicon erratum 161010101",
                .read_cntp_tval_el0 = hisi_161010101_read_cntp_tval_el0,
                .read_cntv_tval_el0 = hisi_161010101_read_cntv_tval_el0,
                .read_cntvct_el0 = hisi_161010101_read_cntvct_el0,
+               .set_next_event_phys = erratum_set_next_event_tval_phys,
+               .set_next_event_virt = erratum_set_next_event_tval_virt,
+       },
+       {
+               .match_type = ate_match_acpi_oem_info,
+               .id = hisi_161010101_oem_info,
+               .desc = "HiSilicon erratum 161010101",
+               .read_cntp_tval_el0 = hisi_161010101_read_cntp_tval_el0,
+               .read_cntv_tval_el0 = hisi_161010101_read_cntv_tval_el0,
+               .read_cntvct_el0 = hisi_161010101_read_cntvct_el0,
+               .set_next_event_phys = erratum_set_next_event_tval_phys,
+               .set_next_event_virt = erratum_set_next_event_tval_virt,
+       },
+#endif
+#ifdef CONFIG_ARM64_ERRATUM_858921
+       {
+               .match_type = ate_match_local_cap_id,
+               .id = (void *)ARM64_WORKAROUND_858921,
+               .desc = "ARM erratum 858921",
+               .read_cntvct_el0 = arm64_858921_read_cntvct_el0,
        },
 #endif
 };
-#endif /* CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND */
 
-static __always_inline
-void arch_timer_reg_write(int access, enum arch_timer_reg reg, u32 val,
-                         struct clock_event_device *clk)
+typedef bool (*ate_match_fn_t)(const struct arch_timer_erratum_workaround *,
+                              const void *);
+
+static
+bool arch_timer_check_dt_erratum(const struct arch_timer_erratum_workaround *wa,
+                                const void *arg)
 {
-       if (access == ARCH_TIMER_MEM_PHYS_ACCESS) {
-               struct arch_timer *timer = to_arch_timer(clk);
-               switch (reg) {
-               case ARCH_TIMER_REG_CTRL:
-                       writel_relaxed(val, timer->base + CNTP_CTL);
-                       break;
-               case ARCH_TIMER_REG_TVAL:
-                       writel_relaxed(val, timer->base + CNTP_TVAL);
-                       break;
-               }
-       } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) {
-               struct arch_timer *timer = to_arch_timer(clk);
-               switch (reg) {
-               case ARCH_TIMER_REG_CTRL:
-                       writel_relaxed(val, timer->base + CNTV_CTL);
-                       break;
-               case ARCH_TIMER_REG_TVAL:
-                       writel_relaxed(val, timer->base + CNTV_TVAL);
-                       break;
-               }
-       } else {
-               arch_timer_reg_write_cp15(access, reg, val);
+       const struct device_node *np = arg;
+
+       return of_property_read_bool(np, wa->id);
+}
+
+static
+bool arch_timer_check_local_cap_erratum(const struct arch_timer_erratum_workaround *wa,
+                                       const void *arg)
+{
+       return this_cpu_has_cap((uintptr_t)wa->id);
+}
+
+
+static
+bool arch_timer_check_acpi_oem_erratum(const struct arch_timer_erratum_workaround *wa,
+                                      const void *arg)
+{
+       static const struct ate_acpi_oem_info empty_oem_info = {};
+       const struct ate_acpi_oem_info *info = wa->id;
+       const struct acpi_table_header *table = arg;
+
+       /* Iterate over the ACPI OEM info array, looking for a match */
+       while (memcmp(info, &empty_oem_info, sizeof(*info))) {
+               if (!memcmp(info->oem_id, table->oem_id, ACPI_OEM_ID_SIZE) &&
+                   !memcmp(info->oem_table_id, table->oem_table_id, ACPI_OEM_TABLE_ID_SIZE) &&
+                   info->oem_revision == table->oem_revision)
+                       return true;
+
+               info++;
        }
+
+       return false;
 }
 
-static __always_inline
-u32 arch_timer_reg_read(int access, enum arch_timer_reg reg,
-                       struct clock_event_device *clk)
+static const struct arch_timer_erratum_workaround *
+arch_timer_iterate_errata(enum arch_timer_erratum_match_type type,
+                         ate_match_fn_t match_fn,
+                         void *arg)
 {
-       u32 val;
+       int i;
 
-       if (access == ARCH_TIMER_MEM_PHYS_ACCESS) {
-               struct arch_timer *timer = to_arch_timer(clk);
-               switch (reg) {
-               case ARCH_TIMER_REG_CTRL:
-                       val = readl_relaxed(timer->base + CNTP_CTL);
-                       break;
-               case ARCH_TIMER_REG_TVAL:
-                       val = readl_relaxed(timer->base + CNTP_TVAL);
-                       break;
-               }
-       } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) {
-               struct arch_timer *timer = to_arch_timer(clk);
-               switch (reg) {
-               case ARCH_TIMER_REG_CTRL:
-                       val = readl_relaxed(timer->base + CNTV_CTL);
-                       break;
-               case ARCH_TIMER_REG_TVAL:
-                       val = readl_relaxed(timer->base + CNTV_TVAL);
-                       break;
-               }
+       for (i = 0; i < ARRAY_SIZE(ool_workarounds); i++) {
+               if (ool_workarounds[i].match_type != type)
+                       continue;
+
+               if (match_fn(&ool_workarounds[i], arg))
+                       return &ool_workarounds[i];
+       }
+
+       return NULL;
+}
+
+static
+void arch_timer_enable_workaround(const struct arch_timer_erratum_workaround *wa,
+                                 bool local)
+{
+       int i;
+
+       if (local) {
+               __this_cpu_write(timer_unstable_counter_workaround, wa);
        } else {
-               val = arch_timer_reg_read_cp15(access, reg);
+               for_each_possible_cpu(i)
+                       per_cpu(timer_unstable_counter_workaround, i) = wa;
        }
 
-       return val;
+       static_branch_enable(&arch_timer_read_ool_enabled);
+
+       /*
+        * Don't use the vdso fastpath if errata require using the
+        * out-of-line counter accessor. We may change our mind pretty
+        * late in the game (with a per-CPU erratum, for example), so
+        * change both the default value and the vdso itself.
+        */
+       if (wa->read_cntvct_el0) {
+               clocksource_counter.archdata.vdso_direct = false;
+               vdso_default = false;
+       }
+}
+
+static void arch_timer_check_ool_workaround(enum arch_timer_erratum_match_type type,
+                                           void *arg)
+{
+       const struct arch_timer_erratum_workaround *wa;
+       ate_match_fn_t match_fn = NULL;
+       bool local = false;
+
+       switch (type) {
+       case ate_match_dt:
+               match_fn = arch_timer_check_dt_erratum;
+               break;
+       case ate_match_local_cap_id:
+               match_fn = arch_timer_check_local_cap_erratum;
+               local = true;
+               break;
+       case ate_match_acpi_oem_info:
+               match_fn = arch_timer_check_acpi_oem_erratum;
+               break;
+       default:
+               WARN_ON(1);
+               return;
+       }
+
+       wa = arch_timer_iterate_errata(type, match_fn, arg);
+       if (!wa)
+               return;
+
+       if (needs_unstable_timer_counter_workaround()) {
+               const struct arch_timer_erratum_workaround *__wa;
+               __wa = __this_cpu_read(timer_unstable_counter_workaround);
+               if (__wa && wa != __wa)
+                       pr_warn("Can't enable workaround for %s (clashes with %s)\n",
+                               wa->desc, __wa->desc);
+
+               if (__wa)
+                       return;
+       }
+
+       arch_timer_enable_workaround(wa, local);
+       pr_info("Enabling %s workaround for %s\n",
+               local ? "local" : "global", wa->desc);
+}
 
+#define erratum_handler(fn, r, ...)                                    \
+({                                                                     \
+       bool __val;                                                     \
+       if (needs_unstable_timer_counter_workaround()) {                \
+               const struct arch_timer_erratum_workaround *__wa;       \
+               __wa = __this_cpu_read(timer_unstable_counter_workaround); \
+               if (__wa && __wa->fn) {                                 \
+                       r = __wa->fn(__VA_ARGS__);                      \
+                       __val = true;                                   \
+               } else {                                                \
+                       __val = false;                                  \
+               }                                                       \
+       } else {                                                        \
+               __val = false;                                          \
+       }                                                               \
+       __val;                                                          \
+})
+
+static bool arch_timer_this_cpu_has_cntvct_wa(void)
+{
+       const struct arch_timer_erratum_workaround *wa;
+
+       wa = __this_cpu_read(timer_unstable_counter_workaround);
+       return wa && wa->read_cntvct_el0;
+}
+#else
+#define arch_timer_check_ool_workaround(t,a)           do { } while(0)
+#define erratum_set_next_event_tval_virt(...)          ({BUG(); 0;})
+#define erratum_set_next_event_tval_phys(...)          ({BUG(); 0;})
+#define erratum_handler(fn, r, ...)                    ({false;})
+#define arch_timer_this_cpu_has_cntvct_wa()            ({false;})
+#endif /* CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND */
+
 static __always_inline irqreturn_t timer_handler(const int access,
                                        struct clock_event_device *evt)
 {
@@ -348,43 +632,14 @@ static __always_inline void set_next_event(const int access, unsigned long evt,
        arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk);
 }
 
-#ifdef CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND
-static __always_inline void erratum_set_next_event_generic(const int access,
-               unsigned long evt, struct clock_event_device *clk)
-{
-       unsigned long ctrl;
-       u64 cval = evt + arch_counter_get_cntvct();
-
-       ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk);
-       ctrl |= ARCH_TIMER_CTRL_ENABLE;
-       ctrl &= ~ARCH_TIMER_CTRL_IT_MASK;
-
-       if (access == ARCH_TIMER_PHYS_ACCESS)
-               write_sysreg(cval, cntp_cval_el0);
-       else if (access == ARCH_TIMER_VIRT_ACCESS)
-               write_sysreg(cval, cntv_cval_el0);
-
-       arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk);
-}
-
-static int erratum_set_next_event_virt(unsigned long evt,
-                                          struct clock_event_device *clk)
-{
-       erratum_set_next_event_generic(ARCH_TIMER_VIRT_ACCESS, evt, clk);
-       return 0;
-}
-
-static int erratum_set_next_event_phys(unsigned long evt,
-                                          struct clock_event_device *clk)
-{
-       erratum_set_next_event_generic(ARCH_TIMER_PHYS_ACCESS, evt, clk);
-       return 0;
-}
-#endif /* CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND */
-
 static int arch_timer_set_next_event_virt(unsigned long evt,
                                          struct clock_event_device *clk)
 {
+       int ret;
+
+       if (erratum_handler(set_next_event_virt, ret, evt, clk))
+               return ret;
+
        set_next_event(ARCH_TIMER_VIRT_ACCESS, evt, clk);
        return 0;
 }
@@ -392,6 +647,11 @@ static int arch_timer_set_next_event_virt(unsigned long evt,
 static int arch_timer_set_next_event_phys(unsigned long evt,
                                          struct clock_event_device *clk)
 {
+       int ret;
+
+       if (erratum_handler(set_next_event_phys, ret, evt, clk))
+               return ret;
+
        set_next_event(ARCH_TIMER_PHYS_ACCESS, evt, clk);
        return 0;
 }
@@ -410,25 +670,12 @@ static int arch_timer_set_next_event_phys_mem(unsigned long evt,
        return 0;
 }
 
-static void erratum_workaround_set_sne(struct clock_event_device *clk)
-{
-#ifdef CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND
-       if (!static_branch_unlikely(&arch_timer_read_ool_enabled))
-               return;
-
-       if (arch_timer_uses_ppi == VIRT_PPI)
-               clk->set_next_event = erratum_set_next_event_virt;
-       else
-               clk->set_next_event = erratum_set_next_event_phys;
-#endif
-}
-
 static void __arch_timer_setup(unsigned type,
                               struct clock_event_device *clk)
 {
        clk->features = CLOCK_EVT_FEAT_ONESHOT;
 
-       if (type == ARCH_CP15_TIMER) {
+       if (type == ARCH_TIMER_TYPE_CP15) {
                if (arch_timer_c3stop)
                        clk->features |= CLOCK_EVT_FEAT_C3STOP;
                clk->name = "arch_sys_timer";
@@ -436,14 +683,14 @@ static void __arch_timer_setup(unsigned type,
                clk->cpumask = cpumask_of(smp_processor_id());
                clk->irq = arch_timer_ppi[arch_timer_uses_ppi];
                switch (arch_timer_uses_ppi) {
-               case VIRT_PPI:
+               case ARCH_TIMER_VIRT_PPI:
                        clk->set_state_shutdown = arch_timer_shutdown_virt;
                        clk->set_state_oneshot_stopped = arch_timer_shutdown_virt;
                        clk->set_next_event = arch_timer_set_next_event_virt;
                        break;
-               case PHYS_SECURE_PPI:
-               case PHYS_NONSECURE_PPI:
-               case HYP_PPI:
+               case ARCH_TIMER_PHYS_SECURE_PPI:
+               case ARCH_TIMER_PHYS_NONSECURE_PPI:
+               case ARCH_TIMER_HYP_PPI:
                        clk->set_state_shutdown = arch_timer_shutdown_phys;
                        clk->set_state_oneshot_stopped = arch_timer_shutdown_phys;
                        clk->set_next_event = arch_timer_set_next_event_phys;
@@ -452,7 +699,7 @@ static void __arch_timer_setup(unsigned type,
                        BUG();
                }
 
-               erratum_workaround_set_sne(clk);
+               arch_timer_check_ool_workaround(ate_match_local_cap_id, NULL);
        } else {
                clk->features |= CLOCK_EVT_FEAT_DYNIRQ;
                clk->name = "arch_mem_timer";
@@ -508,23 +755,31 @@ static void arch_counter_set_user_access(void)
 {
        u32 cntkctl = arch_timer_get_cntkctl();
 
-       /* Disable user access to the timers and the physical counter */
+       /* Disable user access to the timers and both counters */
        /* Also disable virtual event stream */
        cntkctl &= ~(ARCH_TIMER_USR_PT_ACCESS_EN
                        | ARCH_TIMER_USR_VT_ACCESS_EN
+                       | ARCH_TIMER_USR_VCT_ACCESS_EN
                        | ARCH_TIMER_VIRT_EVT_EN
                        | ARCH_TIMER_USR_PCT_ACCESS_EN);
 
-       /* Enable user access to the virtual counter */
-       cntkctl |= ARCH_TIMER_USR_VCT_ACCESS_EN;
+       /*
+        * Enable user access to the virtual counter if it doesn't
+        * need to be worked around. The vdso may have been already
+        * disabled though.
+        */
+       if (arch_timer_this_cpu_has_cntvct_wa())
+               pr_info("CPU%d: Trapping CNTVCT access\n", smp_processor_id());
+       else
+               cntkctl |= ARCH_TIMER_USR_VCT_ACCESS_EN;
 
        arch_timer_set_cntkctl(cntkctl);
 }
 
 static bool arch_timer_has_nonsecure_ppi(void)
 {
-       return (arch_timer_uses_ppi == PHYS_SECURE_PPI &&
-               arch_timer_ppi[PHYS_NONSECURE_PPI]);
+       return (arch_timer_uses_ppi == ARCH_TIMER_PHYS_SECURE_PPI &&
+               arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]);
 }
 
 static u32 check_ppi_trigger(int irq)
@@ -545,14 +800,15 @@ static int arch_timer_starting_cpu(unsigned int cpu)
        struct clock_event_device *clk = this_cpu_ptr(arch_timer_evt);
        u32 flags;
 
-       __arch_timer_setup(ARCH_CP15_TIMER, clk);
+       __arch_timer_setup(ARCH_TIMER_TYPE_CP15, clk);
 
        flags = check_ppi_trigger(arch_timer_ppi[arch_timer_uses_ppi]);
        enable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi], flags);
 
        if (arch_timer_has_nonsecure_ppi()) {
-               flags = check_ppi_trigger(arch_timer_ppi[PHYS_NONSECURE_PPI]);
-               enable_percpu_irq(arch_timer_ppi[PHYS_NONSECURE_PPI], flags);
+               flags = check_ppi_trigger(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]);
+               enable_percpu_irq(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI],
+                                 flags);
        }
 
        arch_counter_set_user_access();
@@ -562,43 +818,39 @@ static int arch_timer_starting_cpu(unsigned int cpu)
        return 0;
 }
 
-static void
-arch_timer_detect_rate(void __iomem *cntbase, struct device_node *np)
+/*
+ * For historical reasons, when probing with DT we use whichever (non-zero)
+ * rate was probed first, and don't verify that others match. If the first node
+ * probed has a clock-frequency property, this overrides the HW register.
+ */
+static void arch_timer_of_configure_rate(u32 rate, struct device_node *np)
 {
        /* Who has more than one independent system counter? */
        if (arch_timer_rate)
                return;
 
-       /*
-        * Try to determine the frequency from the device tree or CNTFRQ,
-        * if ACPI is enabled, get the frequency from CNTFRQ ONLY.
-        */
-       if (!acpi_disabled ||
-           of_property_read_u32(np, "clock-frequency", &arch_timer_rate)) {
-               if (cntbase)
-                       arch_timer_rate = readl_relaxed(cntbase + CNTFRQ);
-               else
-                       arch_timer_rate = arch_timer_get_cntfrq();
-       }
+       if (of_property_read_u32(np, "clock-frequency", &arch_timer_rate))
+               arch_timer_rate = rate;
 
        /* Check the timer frequency. */
        if (arch_timer_rate == 0)
-               pr_warn("Architected timer frequency not available\n");
+               pr_warn("frequency not available\n");
 }
 
 static void arch_timer_banner(unsigned type)
 {
-       pr_info("Architected %s%s%s timer(s) running at %lu.%02luMHz (%s%s%s).\n",
-                    type & ARCH_CP15_TIMER ? "cp15" : "",
-                    type == (ARCH_CP15_TIMER | ARCH_MEM_TIMER) ?  " and " : "",
-                    type & ARCH_MEM_TIMER ? "mmio" : "",
-                    (unsigned long)arch_timer_rate / 1000000,
-                    (unsigned long)(arch_timer_rate / 10000) % 100,
-                    type & ARCH_CP15_TIMER ?
-                    (arch_timer_uses_ppi == VIRT_PPI) ? "virt" : "phys" :
+       pr_info("%s%s%s timer(s) running at %lu.%02luMHz (%s%s%s).\n",
+               type & ARCH_TIMER_TYPE_CP15 ? "cp15" : "",
+               type == (ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM) ?
+                       " and " : "",
+               type & ARCH_TIMER_TYPE_MEM ? "mmio" : "",
+               (unsigned long)arch_timer_rate / 1000000,
+               (unsigned long)(arch_timer_rate / 10000) % 100,
+               type & ARCH_TIMER_TYPE_CP15 ?
+                       (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) ? "virt" : "phys" :
                        "",
-                    type == (ARCH_CP15_TIMER | ARCH_MEM_TIMER) ?  "/" : "",
-                    type & ARCH_MEM_TIMER ?
+               type == (ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM) ? "/" : "",
+               type & ARCH_TIMER_TYPE_MEM ?
                        arch_timer_mem_use_virtual ? "virt" : "phys" :
                        "");
 }
@@ -621,37 +873,6 @@ static u64 arch_counter_get_cntvct_mem(void)
        return ((u64) vct_hi << 32) | vct_lo;
 }
 
-/*
- * Default to cp15 based access because arm64 uses this function for
- * sched_clock() before DT is probed and the cp15 method is guaranteed
- * to exist on arm64. arm doesn't use this before DT is probed so even
- * if we don't have the cp15 accessors we won't have a problem.
- */
-u64 (*arch_timer_read_counter)(void) = arch_counter_get_cntvct;
-
-static u64 arch_counter_read(struct clocksource *cs)
-{
-       return arch_timer_read_counter();
-}
-
-static u64 arch_counter_read_cc(const struct cyclecounter *cc)
-{
-       return arch_timer_read_counter();
-}
-
-static struct clocksource clocksource_counter = {
-       .name   = "arch_sys_counter",
-       .rating = 400,
-       .read   = arch_counter_read,
-       .mask   = CLOCKSOURCE_MASK(56),
-       .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-static struct cyclecounter cyclecounter __ro_after_init = {
-       .read   = arch_counter_read_cc,
-       .mask   = CLOCKSOURCE_MASK(56),
-};
-
 static struct arch_timer_kvm_info arch_timer_kvm_info;
 
 struct arch_timer_kvm_info *arch_timer_get_kvm_info(void)
@@ -664,22 +885,14 @@ static void __init arch_counter_register(unsigned type)
        u64 start_count;
 
        /* Register the CP15 based counter if we have one */
-       if (type & ARCH_CP15_TIMER) {
-               if (IS_ENABLED(CONFIG_ARM64) || arch_timer_uses_ppi == VIRT_PPI)
+       if (type & ARCH_TIMER_TYPE_CP15) {
+               if (IS_ENABLED(CONFIG_ARM64) ||
+                   arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI)
                        arch_timer_read_counter = arch_counter_get_cntvct;
                else
                        arch_timer_read_counter = arch_counter_get_cntpct;
 
-               clocksource_counter.archdata.vdso_direct = true;
-
-#ifdef CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND
-               /*
-                * Don't use the vdso fastpath if errata require using
-                * the out-of-line counter accessor.
-                */
-               if (static_branch_unlikely(&arch_timer_read_ool_enabled))
-                       clocksource_counter.archdata.vdso_direct = false;
-#endif
+               clocksource_counter.archdata.vdso_direct = vdso_default;
        } else {
                arch_timer_read_counter = arch_counter_get_cntvct_mem;
        }
@@ -699,12 +912,11 @@ static void __init arch_counter_register(unsigned type)
 
 static void arch_timer_stop(struct clock_event_device *clk)
 {
-       pr_debug("arch_timer_teardown disable IRQ%d cpu #%d\n",
-                clk->irq, smp_processor_id());
+       pr_debug("disable IRQ%d cpu #%d\n", clk->irq, smp_processor_id());
 
        disable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi]);
        if (arch_timer_has_nonsecure_ppi())
-               disable_percpu_irq(arch_timer_ppi[PHYS_NONSECURE_PPI]);
+               disable_percpu_irq(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]);
 
        clk->set_state_shutdown(clk);
 }
@@ -718,14 +930,14 @@ static int arch_timer_dying_cpu(unsigned int cpu)
 }
 
 #ifdef CONFIG_CPU_PM
-static unsigned int saved_cntkctl;
+static DEFINE_PER_CPU(unsigned long, saved_cntkctl);
 static int arch_timer_cpu_pm_notify(struct notifier_block *self,
                                    unsigned long action, void *hcpu)
 {
        if (action == CPU_PM_ENTER)
-               saved_cntkctl = arch_timer_get_cntkctl();
+               __this_cpu_write(saved_cntkctl, arch_timer_get_cntkctl());
        else if (action == CPU_PM_ENTER_FAILED || action == CPU_PM_EXIT)
-               arch_timer_set_cntkctl(saved_cntkctl);
+               arch_timer_set_cntkctl(__this_cpu_read(saved_cntkctl));
        return NOTIFY_OK;
 }
 
@@ -767,24 +979,24 @@ static int __init arch_timer_register(void)
 
        ppi = arch_timer_ppi[arch_timer_uses_ppi];
        switch (arch_timer_uses_ppi) {
-       case VIRT_PPI:
+       case ARCH_TIMER_VIRT_PPI:
                err = request_percpu_irq(ppi, arch_timer_handler_virt,
                                         "arch_timer", arch_timer_evt);
                break;
-       case PHYS_SECURE_PPI:
-       case PHYS_NONSECURE_PPI:
+       case ARCH_TIMER_PHYS_SECURE_PPI:
+       case ARCH_TIMER_PHYS_NONSECURE_PPI:
                err = request_percpu_irq(ppi, arch_timer_handler_phys,
                                         "arch_timer", arch_timer_evt);
-               if (!err && arch_timer_ppi[PHYS_NONSECURE_PPI]) {
-                       ppi = arch_timer_ppi[PHYS_NONSECURE_PPI];
+               if (!err && arch_timer_has_nonsecure_ppi()) {
+                       ppi = arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI];
                        err = request_percpu_irq(ppi, arch_timer_handler_phys,
                                                 "arch_timer", arch_timer_evt);
                        if (err)
-                               free_percpu_irq(arch_timer_ppi[PHYS_SECURE_PPI],
+                               free_percpu_irq(arch_timer_ppi[ARCH_TIMER_PHYS_SECURE_PPI],
                                                arch_timer_evt);
                }
                break;
-       case HYP_PPI:
+       case ARCH_TIMER_HYP_PPI:
                err = request_percpu_irq(ppi, arch_timer_handler_phys,
                                         "arch_timer", arch_timer_evt);
                break;
@@ -793,8 +1005,7 @@ static int __init arch_timer_register(void)
        }
 
        if (err) {
-               pr_err("arch_timer: can't register interrupt %d (%d)\n",
-                      ppi, err);
+               pr_err("can't register interrupt %d (%d)\n", ppi, err);
                goto out_free;
        }
 
@@ -817,7 +1028,7 @@ out_unreg_cpupm:
 out_unreg_notify:
        free_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi], arch_timer_evt);
        if (arch_timer_has_nonsecure_ppi())
-               free_percpu_irq(arch_timer_ppi[PHYS_NONSECURE_PPI],
+               free_percpu_irq(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI],
                                arch_timer_evt);
 
 out_free:
@@ -838,7 +1049,7 @@ static int __init arch_timer_mem_register(void __iomem *base, unsigned int irq)
 
        t->base = base;
        t->evt.irq = irq;
-       __arch_timer_setup(ARCH_MEM_TIMER, &t->evt);
+       __arch_timer_setup(ARCH_TIMER_TYPE_MEM, &t->evt);
 
        if (arch_timer_mem_use_virtual)
                func = arch_timer_handler_virt_mem;
@@ -847,7 +1058,7 @@ static int __init arch_timer_mem_register(void __iomem *base, unsigned int irq)
 
        ret = request_irq(irq, func, IRQF_TIMER, "arch_mem_timer", &t->evt);
        if (ret) {
-               pr_err("arch_timer: Failed to request mem timer irq\n");
+               pr_err("Failed to request mem timer irq\n");
                kfree(t);
        }
 
@@ -865,15 +1076,28 @@ static const struct of_device_id arch_timer_mem_of_match[] __initconst = {
        {},
 };
 
-static bool __init
-arch_timer_needs_probing(int type, const struct of_device_id *matches)
+static bool __init arch_timer_needs_of_probing(void)
 {
        struct device_node *dn;
        bool needs_probing = false;
+       unsigned int mask = ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM;
+
+       /* We have two timers, and both device-tree nodes are probed. */
+       if ((arch_timers_present & mask) == mask)
+               return false;
 
-       dn = of_find_matching_node(NULL, matches);
-       if (dn && of_device_is_available(dn) && !(arch_timers_present & type))
+       /*
+        * Only one type of timer is probed,
+        * check if we have another type of timer node in device-tree.
+        */
+       if (arch_timers_present & ARCH_TIMER_TYPE_CP15)
+               dn = of_find_matching_node(NULL, arch_timer_mem_of_match);
+       else
+               dn = of_find_matching_node(NULL, arch_timer_of_match);
+
+       if (dn && of_device_is_available(dn))
                needs_probing = true;
+
        of_node_put(dn);
 
        return needs_probing;
@@ -881,96 +1105,66 @@ arch_timer_needs_probing(int type, const struct of_device_id *matches)
 
 static int __init arch_timer_common_init(void)
 {
-       unsigned mask = ARCH_CP15_TIMER | ARCH_MEM_TIMER;
-
-       /* Wait until both nodes are probed if we have two timers */
-       if ((arch_timers_present & mask) != mask) {
-               if (arch_timer_needs_probing(ARCH_MEM_TIMER, arch_timer_mem_of_match))
-                       return 0;
-               if (arch_timer_needs_probing(ARCH_CP15_TIMER, arch_timer_of_match))
-                       return 0;
-       }
-
        arch_timer_banner(arch_timers_present);
        arch_counter_register(arch_timers_present);
        return arch_timer_arch_init();
 }
 
-static int __init arch_timer_init(void)
+/**
+ * arch_timer_select_ppi() - Select suitable PPI for the current system.
+ *
+ * If HYP mode is available, we know that the physical timer
+ * has been configured to be accessible from PL1. Use it, so
+ * that a guest can use the virtual timer instead.
+ *
+ * On ARMv8.1 with VH extensions, the kernel runs in HYP. VHE
+ * accesses to CNTP_*_EL1 registers are silently redirected to
+ * their CNTHP_*_EL2 counterparts, and use a different PPI
+ * number.
+ *
+ * If no interrupt provided for virtual timer, we'll have to
+ * stick to the physical timer. It'd better be accessible...
+ * For arm64 we never use the secure interrupt.
+ *
+ * Return: a suitable PPI type for the current system.
+ */
+static enum arch_timer_ppi_nr __init arch_timer_select_ppi(void)
 {
-       int ret;
-       /*
-        * If HYP mode is available, we know that the physical timer
-        * has been configured to be accessible from PL1. Use it, so
-        * that a guest can use the virtual timer instead.
-        *
-        * If no interrupt provided for virtual timer, we'll have to
-        * stick to the physical timer. It'd better be accessible...
-        *
-        * On ARMv8.1 with VH extensions, the kernel runs in HYP. VHE
-        * accesses to CNTP_*_EL1 registers are silently redirected to
-        * their CNTHP_*_EL2 counterparts, and use a different PPI
-        * number.
-        */
-       if (is_hyp_mode_available() || !arch_timer_ppi[VIRT_PPI]) {
-               bool has_ppi;
-
-               if (is_kernel_in_hyp_mode()) {
-                       arch_timer_uses_ppi = HYP_PPI;
-                       has_ppi = !!arch_timer_ppi[HYP_PPI];
-               } else {
-                       arch_timer_uses_ppi = PHYS_SECURE_PPI;
-                       has_ppi = (!!arch_timer_ppi[PHYS_SECURE_PPI] ||
-                                  !!arch_timer_ppi[PHYS_NONSECURE_PPI]);
-               }
-
-               if (!has_ppi) {
-                       pr_warn("arch_timer: No interrupt available, giving up\n");
-                       return -EINVAL;
-               }
-       }
+       if (is_kernel_in_hyp_mode())
+               return ARCH_TIMER_HYP_PPI;
 
-       ret = arch_timer_register();
-       if (ret)
-               return ret;
+       if (!is_hyp_mode_available() && arch_timer_ppi[ARCH_TIMER_VIRT_PPI])
+               return ARCH_TIMER_VIRT_PPI;
 
-       ret = arch_timer_common_init();
-       if (ret)
-               return ret;
+       if (IS_ENABLED(CONFIG_ARM64))
+               return ARCH_TIMER_PHYS_NONSECURE_PPI;
 
-       arch_timer_kvm_info.virtual_irq = arch_timer_ppi[VIRT_PPI];
-       
-       return 0;
+       return ARCH_TIMER_PHYS_SECURE_PPI;
 }
 
 static int __init arch_timer_of_init(struct device_node *np)
 {
-       int i;
+       int i, ret;
+       u32 rate;
 
-       if (arch_timers_present & ARCH_CP15_TIMER) {
-               pr_warn("arch_timer: multiple nodes in dt, skipping\n");
+       if (arch_timers_present & ARCH_TIMER_TYPE_CP15) {
+               pr_warn("multiple nodes in dt, skipping\n");
                return 0;
        }
 
-       arch_timers_present |= ARCH_CP15_TIMER;
-       for (i = PHYS_SECURE_PPI; i < MAX_TIMER_PPI; i++)
+       arch_timers_present |= ARCH_TIMER_TYPE_CP15;
+       for (i = ARCH_TIMER_PHYS_SECURE_PPI; i < ARCH_TIMER_MAX_TIMER_PPI; i++)
                arch_timer_ppi[i] = irq_of_parse_and_map(np, i);
 
-       arch_timer_detect_rate(NULL, np);
+       arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI];
+
+       rate = arch_timer_get_cntfrq();
+       arch_timer_of_configure_rate(rate, np);
 
        arch_timer_c3stop = !of_property_read_bool(np, "always-on");
 
-#ifdef CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND
-       for (i = 0; i < ARRAY_SIZE(ool_workarounds); i++) {
-               if (of_property_read_bool(np, ool_workarounds[i].id)) {
-                       timer_unstable_counter_workaround = &ool_workarounds[i];
-                       static_branch_enable(&arch_timer_read_ool_enabled);
-                       pr_info("arch_timer: Enabling workaround for %s\n",
-                               timer_unstable_counter_workaround->id);
-                       break;
-               }
-       }
-#endif
+       /* Check for globally applicable workarounds */
+       arch_timer_check_ool_workaround(ate_match_dt, np);
 
        /*
         * If we cannot rely on firmware initializing the timer registers then
@@ -978,29 +1172,63 @@ static int __init arch_timer_of_init(struct device_node *np)
         */
        if (IS_ENABLED(CONFIG_ARM) &&
            of_property_read_bool(np, "arm,cpu-registers-not-fw-configured"))
-               arch_timer_uses_ppi = PHYS_SECURE_PPI;
+               arch_timer_uses_ppi = ARCH_TIMER_PHYS_SECURE_PPI;
+       else
+               arch_timer_uses_ppi = arch_timer_select_ppi();
+
+       if (!arch_timer_ppi[arch_timer_uses_ppi]) {
+               pr_err("No interrupt available, giving up\n");
+               return -EINVAL;
+       }
 
        /* On some systems, the counter stops ticking when in suspend. */
        arch_counter_suspend_stop = of_property_read_bool(np,
                                                         "arm,no-tick-in-suspend");
 
-       return arch_timer_init();
+       ret = arch_timer_register();
+       if (ret)
+               return ret;
+
+       if (arch_timer_needs_of_probing())
+               return 0;
+
+       return arch_timer_common_init();
 }
 CLOCKSOURCE_OF_DECLARE(armv7_arch_timer, "arm,armv7-timer", arch_timer_of_init);
 CLOCKSOURCE_OF_DECLARE(armv8_arch_timer, "arm,armv8-timer", arch_timer_of_init);
 
-static int __init arch_timer_mem_init(struct device_node *np)
+static u32 __init
+arch_timer_mem_frame_get_cntfrq(struct arch_timer_mem_frame *frame)
 {
-       struct device_node *frame, *best_frame = NULL;
-       void __iomem *cntctlbase, *base;
-       unsigned int irq, ret = -EINVAL;
+       void __iomem *base;
+       u32 rate;
+
+       base = ioremap(frame->cntbase, frame->size);
+       if (!base) {
+               pr_err("Unable to map frame @ %pa\n", &frame->cntbase);
+               return 0;
+       }
+       /* CNTFRQ lives in the ioremap()ed window, not in the descriptor. */
+       rate = readl_relaxed(base + CNTFRQ);
+
+       iounmap(base);
+
+       return rate;
+}
+
+static struct arch_timer_mem_frame * __init
+arch_timer_mem_find_best_frame(struct arch_timer_mem *timer_mem)
+{
+       struct arch_timer_mem_frame *frame, *best_frame = NULL;
+       void __iomem *cntctlbase;
        u32 cnttidr;
+       int i;
 
-       arch_timers_present |= ARCH_MEM_TIMER;
-       cntctlbase = of_iomap(np, 0);
+       cntctlbase = ioremap(timer_mem->cntctlbase, timer_mem->size);
        if (!cntctlbase) {
-               pr_err("arch_timer: Can't find CNTCTLBase\n");
-               return -ENXIO;
+               pr_err("Can't map CNTCTLBase @ %pa\n",
+                       &timer_mem->cntctlbase);
+               return NULL;
        }
 
        cnttidr = readl_relaxed(cntctlbase + CNTTIDR);
@@ -1009,25 +1237,20 @@ static int __init arch_timer_mem_init(struct device_node *np)
         * Try to find a virtual capable frame. Otherwise fall back to a
         * physical capable frame.
         */
-       for_each_available_child_of_node(np, frame) {
-               int n;
-               u32 cntacr;
+       for (i = 0; i < ARCH_TIMER_MEM_MAX_FRAMES; i++) {
+               u32 cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT |
+                            CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT;
 
-               if (of_property_read_u32(frame, "frame-number", &n)) {
-                       pr_err("arch_timer: Missing frame-number\n");
-                       of_node_put(frame);
-                       goto out;
-               }
+               frame = &timer_mem->frame[i];
+               if (!frame->valid)
+                       continue;
 
                /* Try enabling everything, and see what sticks */
-               cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT |
-                        CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT;
-               writel_relaxed(cntacr, cntctlbase + CNTACR(n));
-               cntacr = readl_relaxed(cntctlbase + CNTACR(n));
+               writel_relaxed(cntacr, cntctlbase + CNTACR(i));
+               cntacr = readl_relaxed(cntctlbase + CNTACR(i));
 
-               if ((cnttidr & CNTTIDR_VIRT(n)) &&
+               if ((cnttidr & CNTTIDR_VIRT(i)) &&
                    !(~cntacr & (CNTACR_RWVT | CNTACR_RVCT))) {
-                       of_node_put(best_frame);
                        best_frame = frame;
                        arch_timer_mem_use_virtual = true;
                        break;
@@ -1036,99 +1259,262 @@ static int __init arch_timer_mem_init(struct device_node *np)
                if (~cntacr & (CNTACR_RWPT | CNTACR_RPCT))
                        continue;
 
-               of_node_put(best_frame);
-               best_frame = of_node_get(frame);
+               best_frame = frame;
        }
 
-       ret= -ENXIO;
-       base = arch_counter_base = of_io_request_and_map(best_frame, 0,
-                                                        "arch_mem_timer");
-       if (IS_ERR(base)) {
-               pr_err("arch_timer: Can't map frame's registers\n");
-               goto out;
-       }
+       iounmap(cntctlbase);
+
+       if (!best_frame)
+               pr_err("Unable to find a suitable frame in timer @ %pa\n",
+                       &timer_mem->cntctlbase);
+       /* Return the selected frame (NULL if none), not the loop cursor. */
+       return best_frame;
+}
+
+static int __init
+arch_timer_mem_frame_register(struct arch_timer_mem_frame *frame)
+{
+       void __iomem *base;
+       int ret, irq = 0;
 
        if (arch_timer_mem_use_virtual)
-               irq = irq_of_parse_and_map(best_frame, 1);
+               irq = frame->virt_irq;
        else
-               irq = irq_of_parse_and_map(best_frame, 0);
+               irq = frame->phys_irq;
 
-       ret = -EINVAL;
        if (!irq) {
-               pr_err("arch_timer: Frame missing %s irq",
+               pr_err("Frame missing %s irq.\n",
                       arch_timer_mem_use_virtual ? "virt" : "phys");
-               goto out;
+               return -EINVAL;
+       }
+
+       if (!request_mem_region(frame->cntbase, frame->size,
+                               "arch_mem_timer"))
+               return -EBUSY;
+       /* NOTE(review): region is not released on the failure paths below */
+       base = ioremap(frame->cntbase, frame->size);
+       if (!base) {
+               pr_err("Can't map frame's registers\n");
+               return -ENXIO;
        }
 
-       arch_timer_detect_rate(base, np);
        ret = arch_timer_mem_register(base, irq);
-       if (ret)
+       if (ret) {
+               iounmap(base);
+               return ret;
+       }
+
+       arch_counter_base = base;
+       arch_timers_present |= ARCH_TIMER_TYPE_MEM;
+
+       return 0;
+}
+
+static int __init arch_timer_mem_of_init(struct device_node *np)
+{
+       struct arch_timer_mem *timer_mem;
+       struct arch_timer_mem_frame *frame;
+       struct device_node *frame_node;
+       struct resource res;
+       int ret = -EINVAL;
+       u32 rate;
+
+       timer_mem = kzalloc(sizeof(*timer_mem), GFP_KERNEL);
+       if (!timer_mem)
+               return -ENOMEM;
+
+       if (of_address_to_resource(np, 0, &res))
                goto out;
+       timer_mem->cntctlbase = res.start;
+       timer_mem->size = resource_size(&res);
 
-       return arch_timer_common_init();
+       for_each_available_child_of_node(np, frame_node) {
+               u32 n;
+               struct arch_timer_mem_frame *frame;
+
+               if (of_property_read_u32(frame_node, "frame-number", &n)) {
+                       pr_err(FW_BUG "Missing frame-number.\n");
+                       of_node_put(frame_node);
+                       goto out;
+               }
+               if (n >= ARCH_TIMER_MEM_MAX_FRAMES) {
+                       pr_err(FW_BUG "Wrong frame-number, only 0-%u are permitted.\n",
+                              ARCH_TIMER_MEM_MAX_FRAMES - 1);
+                       of_node_put(frame_node);
+                       goto out;
+               }
+               frame = &timer_mem->frame[n];
+
+               if (frame->valid) {
+                       pr_err(FW_BUG "Duplicated frame-number.\n");
+                       of_node_put(frame_node);
+                       goto out;
+               }
+
+               if (of_address_to_resource(frame_node, 0, &res)) {
+                       of_node_put(frame_node);
+                       goto out;
+               }
+               frame->cntbase = res.start;
+               frame->size = resource_size(&res);
+
+               frame->virt_irq = irq_of_parse_and_map(frame_node,
+                                                      ARCH_TIMER_VIRT_SPI);
+               frame->phys_irq = irq_of_parse_and_map(frame_node,
+                                                      ARCH_TIMER_PHYS_SPI);
+
+               frame->valid = true;
+       }
+
+       frame = arch_timer_mem_find_best_frame(timer_mem);
+       if (!frame) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       rate = arch_timer_mem_frame_get_cntfrq(frame);
+       arch_timer_of_configure_rate(rate, np);
+
+       ret = arch_timer_mem_frame_register(frame);
+       if (!ret && !arch_timer_needs_of_probing())
+               ret = arch_timer_common_init();
 out:
-       iounmap(cntctlbase);
-       of_node_put(best_frame);
+       kfree(timer_mem);
        return ret;
 }
 CLOCKSOURCE_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem",
-                      arch_timer_mem_init);
+                      arch_timer_mem_of_init);
 
-#ifdef CONFIG_ACPI
-static int __init map_generic_timer_interrupt(u32 interrupt, u32 flags)
+#ifdef CONFIG_ACPI_GTDT
+static int __init
+arch_timer_mem_verify_cntfrq(struct arch_timer_mem *timer_mem)
 {
-       int trigger, polarity;
+       struct arch_timer_mem_frame *frame;
+       u32 rate;
+       int i;
 
-       if (!interrupt)
-               return 0;
+       for (i = 0; i < ARCH_TIMER_MEM_MAX_FRAMES; i++) {
+               frame = &timer_mem->frame[i];
 
-       trigger = (flags & ACPI_GTDT_INTERRUPT_MODE) ? ACPI_EDGE_SENSITIVE
-                       : ACPI_LEVEL_SENSITIVE;
+               if (!frame->valid)
+                       continue;
+
+               rate = arch_timer_mem_frame_get_cntfrq(frame);
+               if (rate == arch_timer_rate)
+                       continue;
+
+               pr_err(FW_BUG "CNTFRQ mismatch: frame @ %pa: (0x%08lx), CPU: (0x%08lx)\n",
+                       &frame->cntbase,
+                       (unsigned long)rate, (unsigned long)arch_timer_rate);
 
-       polarity = (flags & ACPI_GTDT_INTERRUPT_POLARITY) ? ACPI_ACTIVE_LOW
-                       : ACPI_ACTIVE_HIGH;
+               return -EINVAL;
+       }
 
-       return acpi_register_gsi(NULL, interrupt, trigger, polarity);
+       return 0;
 }
 
-/* Initialize per-processor generic timer */
+static int __init arch_timer_mem_acpi_init(int platform_timer_count)
+{
+       struct arch_timer_mem *timers, *timer;
+       struct arch_timer_mem_frame *frame = NULL;
+       int timer_count, i, ret = 0;
+
+       timers = kcalloc(platform_timer_count, sizeof(*timers),
+                           GFP_KERNEL);
+       if (!timers)
+               return -ENOMEM;
+
+       ret = acpi_arch_timer_mem_init(timers, &timer_count);
+       if (ret || !timer_count)
+               goto out;
+
+       for (i = 0; i < timer_count; i++) {
+               ret = arch_timer_mem_verify_cntfrq(&timers[i]);
+               if (ret) {
+                       pr_err("Disabling MMIO timers due to CNTFRQ mismatch\n");
+                       goto out;
+               }
+       }
+
+       /*
+        * Restart at the first timer: it is possible, if unlikely, that none
+        * of the frames in a timer expose the combination of features we want.
+        */
+       for (i = 0; i < timer_count; i++) {
+               timer = &timers[i];
+
+               frame = arch_timer_mem_find_best_frame(timer);
+               if (frame)
+                       break;
+       }
+
+       if (frame)
+               ret = arch_timer_mem_frame_register(frame);
+out:
+       kfree(timers);
+       return ret;
+}
+
+/* Initialize per-processor generic timer and memory-mapped timer, if present */
 static int __init arch_timer_acpi_init(struct acpi_table_header *table)
 {
-       struct acpi_table_gtdt *gtdt;
+       int ret, platform_timer_count;
 
-       if (arch_timers_present & ARCH_CP15_TIMER) {
-               pr_warn("arch_timer: already initialized, skipping\n");
+       if (arch_timers_present & ARCH_TIMER_TYPE_CP15) {
+               pr_warn("already initialized, skipping\n");
                return -EINVAL;
        }
 
-       gtdt = container_of(table, struct acpi_table_gtdt, header);
+       arch_timers_present |= ARCH_TIMER_TYPE_CP15;
+
+       ret = acpi_gtdt_init(table, &platform_timer_count);
+       if (ret) {
+               pr_err("Failed to init GTDT table.\n");
+               return ret;
+       }
 
-       arch_timers_present |= ARCH_CP15_TIMER;
+       arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI] =
+               acpi_gtdt_map_ppi(ARCH_TIMER_PHYS_NONSECURE_PPI);
 
-       arch_timer_ppi[PHYS_SECURE_PPI] =
-               map_generic_timer_interrupt(gtdt->secure_el1_interrupt,
-               gtdt->secure_el1_flags);
+       arch_timer_ppi[ARCH_TIMER_VIRT_PPI] =
+               acpi_gtdt_map_ppi(ARCH_TIMER_VIRT_PPI);
 
-       arch_timer_ppi[PHYS_NONSECURE_PPI] =
-               map_generic_timer_interrupt(gtdt->non_secure_el1_interrupt,
-               gtdt->non_secure_el1_flags);
+       arch_timer_ppi[ARCH_TIMER_HYP_PPI] =
+               acpi_gtdt_map_ppi(ARCH_TIMER_HYP_PPI);
 
-       arch_timer_ppi[VIRT_PPI] =
-               map_generic_timer_interrupt(gtdt->virtual_timer_interrupt,
-               gtdt->virtual_timer_flags);
+       arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI];
 
-       arch_timer_ppi[HYP_PPI] =
-               map_generic_timer_interrupt(gtdt->non_secure_el2_interrupt,
-               gtdt->non_secure_el2_flags);
+       /*
+        * When probing via ACPI, we have no mechanism to override the sysreg
+        * CNTFRQ value. This *must* be correct.
+        */
+       arch_timer_rate = arch_timer_get_cntfrq();
+       if (!arch_timer_rate) {
+               pr_err(FW_BUG "frequency not available.\n");
+               return -EINVAL;
+       }
 
-       /* Get the frequency from CNTFRQ */
-       arch_timer_detect_rate(NULL, NULL);
+       arch_timer_uses_ppi = arch_timer_select_ppi();
+       if (!arch_timer_ppi[arch_timer_uses_ppi]) {
+               pr_err("No interrupt available, giving up\n");
+               return -EINVAL;
+       }
 
        /* Always-on capability */
-       arch_timer_c3stop = !(gtdt->non_secure_el1_flags & ACPI_GTDT_ALWAYS_ON);
+       arch_timer_c3stop = acpi_gtdt_c3stop(arch_timer_uses_ppi);
 
-       arch_timer_init();
-       return 0;
+       /* Check for globally applicable workarounds */
+       arch_timer_check_ool_workaround(ate_match_acpi_oem_info, table);
+
+       ret = arch_timer_register();
+       if (ret)
+               return ret;
+
+       if (platform_timer_count &&
+           arch_timer_mem_acpi_init(platform_timer_count))
+               pr_err("Failed to initialize memory-mapped timer.\n");
+
+       return arch_timer_common_init();
 }
 CLOCKSOURCE_ACPI_DECLARE(arch_timer, ACPI_SIG_GTDT, arch_timer_acpi_init);
 #endif
index 1ba871b..c678083 100644 (file)
@@ -193,7 +193,7 @@ static int __init asm9260_timer_init(struct device_node *np)
 
        priv.base = of_io_request_and_map(np, 0, np->name);
        if (IS_ERR(priv.base)) {
-               pr_err("%s: unable to map resource", np->name);
+               pr_err("%s: unable to map resource\n", np->name);
                return PTR_ERR(priv.base);
        }
 
index f2f29d2..dce4430 100644 (file)
@@ -89,13 +89,13 @@ static int __init bcm2835_timer_init(struct device_node *node)
 
        base = of_iomap(node, 0);
        if (!base) {
-               pr_err("Can't remap registers");
+               pr_err("Can't remap registers\n");
                return -ENXIO;
        }
 
        ret = of_property_read_u32(node, "clock-frequency", &freq);
        if (ret) {
-               pr_err("Can't read clock-frequency");
+               pr_err("Can't read clock-frequency\n");
                goto err_iounmap;
        }
 
@@ -107,7 +107,7 @@ static int __init bcm2835_timer_init(struct device_node *node)
 
        irq = irq_of_parse_and_map(node, DEFAULT_TIMER);
        if (irq <= 0) {
-               pr_err("Can't parse IRQ");
+               pr_err("Can't parse IRQ\n");
                ret = -EINVAL;
                goto err_iounmap;
        }
index 92f6e4d..fda5e14 100644 (file)
@@ -179,7 +179,7 @@ static int __init kona_timer_init(struct device_node *node)
        } else if (!of_property_read_u32(node, "clock-frequency", &freq)) {
                arch_timer_rate = freq;
        } else {
-               pr_err("Kona Timer v1 unable to determine clock-frequency");
+               pr_err("Kona Timer v1 unable to determine clock-frequency\n");
                return -EINVAL;
        }
 
index bc62be9..ac701ff 100644 (file)
@@ -40,7 +40,7 @@ void __init clocksource_probe(void)
 
                ret = init_func_ret(np);
                if (ret) {
-                       pr_err("Failed to initialize '%s': %d",
+                       pr_err("Failed to initialize '%s': %d\n",
                               of_node_full_name(np), ret);
                        continue;
                }
index 63e4f55..1f5f734 100644 (file)
@@ -101,7 +101,7 @@ static irqreturn_t dw_apb_clockevent_irq(int irq, void *data)
        struct dw_apb_clock_event_device *dw_ced = ced_to_dw_apb_ced(evt);
 
        if (!evt->event_handler) {
-               pr_info("Spurious APBT timer interrupt %d", irq);
+               pr_info("Spurious APBT timer interrupt %d\n", irq);
                return IRQ_NONE;
        }
 
@@ -257,7 +257,9 @@ dw_apb_clockevent_init(int cpu, const char *name, unsigned rating,
        clockevents_calc_mult_shift(&dw_ced->ced, freq, APBT_MIN_PERIOD);
        dw_ced->ced.max_delta_ns = clockevent_delta2ns(0x7fffffff,
                                                       &dw_ced->ced);
+       dw_ced->ced.max_delta_ticks = 0x7fffffff;
        dw_ced->ced.min_delta_ns = clockevent_delta2ns(5000, &dw_ced->ced);
+       dw_ced->ced.min_delta_ticks = 5000;
        dw_ced->ced.cpumask = cpumask_of(cpu);
        dw_ced->ced.features = CLOCK_EVT_FEAT_PERIODIC |
                                CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_DYNIRQ;
index aff87df..bc48cbf 100644 (file)
@@ -78,15 +78,12 @@ static int em_sti_enable(struct em_sti_priv *p)
        int ret;
 
        /* enable clock */
-       ret = clk_prepare_enable(p->clk);
+       ret = clk_enable(p->clk);
        if (ret) {
                dev_err(&p->pdev->dev, "cannot enable clock\n");
                return ret;
        }
 
-       /* configure channel, periodic mode and maximum timeout */
-       p->rate = clk_get_rate(p->clk);
-
        /* reset the counter */
        em_sti_write(p, STI_SET_H, 0x40000000);
        em_sti_write(p, STI_SET_L, 0x00000000);
@@ -107,7 +104,7 @@ static void em_sti_disable(struct em_sti_priv *p)
        em_sti_write(p, STI_INTENCLR, 3);
 
        /* stop clock */
-       clk_disable_unprepare(p->clk);
+       clk_disable(p->clk);
 }
 
 static u64 em_sti_count(struct em_sti_priv *p)
@@ -205,13 +202,9 @@ static u64 em_sti_clocksource_read(struct clocksource *cs)
 
 static int em_sti_clocksource_enable(struct clocksource *cs)
 {
-       int ret;
        struct em_sti_priv *p = cs_to_em_sti(cs);
 
-       ret = em_sti_start(p, USER_CLOCKSOURCE);
-       if (!ret)
-               __clocksource_update_freq_hz(cs, p->rate);
-       return ret;
+       return em_sti_start(p, USER_CLOCKSOURCE);
 }
 
 static void em_sti_clocksource_disable(struct clocksource *cs)
@@ -240,8 +233,7 @@ static int em_sti_register_clocksource(struct em_sti_priv *p)
 
        dev_info(&p->pdev->dev, "used as clock source\n");
 
-       /* Register with dummy 1 Hz value, gets updated in ->enable() */
-       clocksource_register_hz(cs, 1);
+       clocksource_register_hz(cs, p->rate);
        return 0;
 }
 
@@ -263,7 +255,6 @@ static int em_sti_clock_event_set_oneshot(struct clock_event_device *ced)
 
        dev_info(&p->pdev->dev, "used for oneshot clock events\n");
        em_sti_start(p, USER_CLOCKEVENT);
-       clockevents_config(&p->ced, p->rate);
        return 0;
 }
 
@@ -294,8 +285,7 @@ static void em_sti_register_clockevent(struct em_sti_priv *p)
 
        dev_info(&p->pdev->dev, "used for clock events\n");
 
-       /* Register with dummy 1 Hz value, gets updated in ->set_state_oneshot() */
-       clockevents_config_and_register(ced, 1, 2, 0xffffffff);
+       clockevents_config_and_register(ced, p->rate, 2, 0xffffffff);
 }
 
 static int em_sti_probe(struct platform_device *pdev)
@@ -303,6 +293,7 @@ static int em_sti_probe(struct platform_device *pdev)
        struct em_sti_priv *p;
        struct resource *res;
        int irq;
+       int ret;
 
        p = devm_kzalloc(&pdev->dev, sizeof(*p), GFP_KERNEL);
        if (p == NULL)
@@ -323,6 +314,13 @@ static int em_sti_probe(struct platform_device *pdev)
        if (IS_ERR(p->base))
                return PTR_ERR(p->base);
 
+       if (devm_request_irq(&pdev->dev, irq, em_sti_interrupt,
+                            IRQF_TIMER | IRQF_IRQPOLL | IRQF_NOBALANCING,
+                            dev_name(&pdev->dev), p)) {
+               dev_err(&pdev->dev, "failed to request low IRQ\n");
+               return -ENOENT;
+       }
+
        /* get hold of clock */
        p->clk = devm_clk_get(&pdev->dev, "sclk");
        if (IS_ERR(p->clk)) {
@@ -330,12 +328,20 @@ static int em_sti_probe(struct platform_device *pdev)
                return PTR_ERR(p->clk);
        }
 
-       if (devm_request_irq(&pdev->dev, irq, em_sti_interrupt,
-                            IRQF_TIMER | IRQF_IRQPOLL | IRQF_NOBALANCING,
-                            dev_name(&pdev->dev), p)) {
-               dev_err(&pdev->dev, "failed to request low IRQ\n");
-               return -ENOENT;
+       ret = clk_prepare(p->clk);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "cannot prepare clock\n");
+               return ret;
+       }
+
+       ret = clk_enable(p->clk);
+       if (ret < 0) {
+               dev_err(&p->pdev->dev, "cannot enable clock\n");
+               clk_unprepare(p->clk);
+               return ret;
        }
+       p->rate = clk_get_rate(p->clk);
+       clk_disable(p->clk);
 
        raw_spin_lock_init(&p->lock);
        em_sti_register_clockevent(p);
index 546bb18..804c489 100644 (file)
@@ -101,15 +101,7 @@ static inline struct timer8_priv *ced_to_priv(struct clock_event_device *ced)
 
 static void timer8_clock_event_start(struct timer8_priv *p, unsigned long delta)
 {
-       struct clock_event_device *ced = &p->ced;
-
        timer8_start(p);
-
-       ced->shift = 32;
-       ced->mult = div_sc(p->rate, NSEC_PER_SEC, ced->shift);
-       ced->max_delta_ns = clockevent_delta2ns(0xffff, ced);
-       ced->min_delta_ns = clockevent_delta2ns(0x0001, ced);
-
        timer8_set_next(p, delta);
 }
 
index 52af591..39d21f6 100644 (file)
@@ -133,13 +133,13 @@ static int __init meson6_timer_init(struct device_node *node)
 
        timer_base = of_io_request_and_map(node, 0, "meson6-timer");
        if (IS_ERR(timer_base)) {
-               pr_err("Can't map registers");
+               pr_err("Can't map registers\n");
                return -ENXIO;
        }
 
        irq = irq_of_parse_and_map(node, 0);
        if (irq <= 0) {
-               pr_err("Can't parse IRQ");
+               pr_err("Can't parse IRQ\n");
                return -EINVAL;
        }
 
index 6fcf965..3e5fa2f 100644 (file)
@@ -114,7 +114,9 @@ static int arch_timer_starting_cpu(unsigned int cpu)
 
        clk->mult = div_sc(hwtimer_freq, NSEC_PER_SEC, clk->shift);
        clk->max_delta_ns = clockevent_delta2ns(0x7fffffff, clk);
+       clk->max_delta_ticks = 0x7fffffff;
        clk->min_delta_ns = clockevent_delta2ns(0xf, clk);
+       clk->min_delta_ticks = 0xf;
        clk->cpumask = cpumask_of(cpu);
 
        clockevents_register_device(clk);
index d9ef7a6..3f52ee2 100644 (file)
@@ -154,19 +154,6 @@ static int __init __gic_clocksource_init(void)
        return ret;
 }
 
-void __init gic_clocksource_init(unsigned int frequency)
-{
-       gic_frequency = frequency;
-       gic_timer_irq = MIPS_GIC_IRQ_BASE +
-               GIC_LOCAL_TO_HWIRQ(GIC_LOCAL_INT_COMPARE);
-
-       __gic_clocksource_init();
-       gic_clockevent_init();
-
-       /* And finally start the counter */
-       gic_start_count();
-}
-
 static int __init gic_clocksource_of_init(struct device_node *node)
 {
        struct clk *clk;
@@ -174,7 +161,7 @@ static int __init gic_clocksource_of_init(struct device_node *node)
 
        if (!gic_present || !node->parent ||
            !of_device_is_compatible(node->parent, "mti,gic")) {
-               pr_warn("No DT definition for the mips gic driver");
+               pr_warn("No DT definition for the mips gic driver\n");
                return -ENXIO;
        }
 
index 3c124d1..7d44de3 100644 (file)
@@ -260,25 +260,25 @@ static int __init nmdk_timer_of_init(struct device_node *node)
 
        base = of_iomap(node, 0);
        if (!base) {
-               pr_err("Can't remap registers");
+               pr_err("Can't remap registers\n");
                return -ENXIO;
        }
 
        pclk = of_clk_get_by_name(node, "apb_pclk");
        if (IS_ERR(pclk)) {
-               pr_err("could not get apb_pclk");
+               pr_err("could not get apb_pclk\n");
                return PTR_ERR(pclk);
        }
 
        clk = of_clk_get_by_name(node, "timclk");
        if (IS_ERR(clk)) {
-               pr_err("could not get timclk");
+               pr_err("could not get timclk\n");
                return PTR_ERR(clk);
        }
 
        irq = irq_of_parse_and_map(node, 0);
        if (irq <= 0) {
-               pr_err("Can't parse IRQ");
+               pr_err("Can't parse IRQ\n");
                return -EINVAL;
        }
 
index 4e0f11f..6a20dc8 100644 (file)
@@ -51,7 +51,9 @@ static struct clock_event_device numachip2_clockevent = {
        .mult            = 1,
        .shift           = 0,
        .min_delta_ns    = 1250,
+       .min_delta_ticks = 1250,
        .max_delta_ns    = LONG_MAX,
+       .max_delta_ticks = LONG_MAX,
 };
 
 static void numachip_timer_interrupt(void)
index 1c24de2..a10fa66 100644 (file)
@@ -166,14 +166,14 @@ static int __init pxa_timer_common_init(int irq, unsigned long clock_tick_rate)
 
        ret = setup_irq(irq, &pxa_ost0_irq);
        if (ret) {
-               pr_err("Failed to setup irq");
+               pr_err("Failed to setup irq\n");
                return ret;
        }
 
        ret = clocksource_mmio_init(timer_base + OSCR, "oscr0", clock_tick_rate, 200,
                                    32, clocksource_mmio_readl_up);
        if (ret) {
-               pr_err("Failed to init clocksource");
+               pr_err("Failed to init clocksource\n");
                return ret;
        }
 
@@ -203,7 +203,7 @@ static int __init pxa_timer_dt_init(struct device_node *np)
 
        ret = clk_prepare_enable(clk);
        if (ret) {
-               pr_crit("Failed to prepare clock");
+               pr_crit("Failed to prepare clock\n");
                return ret;
        }
 
index 23e267a..49c02be 100644 (file)
@@ -11,6 +11,8 @@
 #include <linux/clockchips.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
+#include <linux/sched_clock.h>
+#include <linux/slab.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
@@ -19,6 +21,8 @@
 
 #define TIMER_LOAD_COUNT0      0x00
 #define TIMER_LOAD_COUNT1      0x04
+#define TIMER_CURRENT_VALUE0   0x08
+#define TIMER_CURRENT_VALUE1   0x0C
 #define TIMER_CONTROL_REG3288  0x10
 #define TIMER_CONTROL_REG3399  0x1c
 #define TIMER_INT_STATUS       0x18
 #define TIMER_MODE_USER_DEFINED_COUNT          (1 << 1)
 #define TIMER_INT_UNMASK                       (1 << 2)
 
-struct bc_timer {
-       struct clock_event_device ce;
+struct rk_timer {
        void __iomem *base;
        void __iomem *ctrl;
+       struct clk *clk;
+       struct clk *pclk;
        u32 freq;
+       int irq;
 };
 
-static struct bc_timer bc_timer;
-
-static inline struct bc_timer *rk_timer(struct clock_event_device *ce)
-{
-       return container_of(ce, struct bc_timer, ce);
-}
+struct rk_clkevt {
+       struct clock_event_device ce;
+       struct rk_timer timer;
+};
 
-static inline void __iomem *rk_base(struct clock_event_device *ce)
-{
-       return rk_timer(ce)->base;
-}
+static struct rk_clkevt *rk_clkevt;
+static struct rk_timer *rk_clksrc;
 
-static inline void __iomem *rk_ctrl(struct clock_event_device *ce)
+static inline struct rk_timer *rk_timer(struct clock_event_device *ce)
 {
-       return rk_timer(ce)->ctrl;
+       return &container_of(ce, struct rk_clkevt, ce)->timer;
 }
 
-static inline void rk_timer_disable(struct clock_event_device *ce)
+static inline void rk_timer_disable(struct rk_timer *timer)
 {
-       writel_relaxed(TIMER_DISABLE, rk_ctrl(ce));
+       writel_relaxed(TIMER_DISABLE, timer->ctrl);
 }
 
-static inline void rk_timer_enable(struct clock_event_device *ce, u32 flags)
+static inline void rk_timer_enable(struct rk_timer *timer, u32 flags)
 {
-       writel_relaxed(TIMER_ENABLE | TIMER_INT_UNMASK | flags,
-                      rk_ctrl(ce));
+       writel_relaxed(TIMER_ENABLE | flags, timer->ctrl);
 }
 
 static void rk_timer_update_counter(unsigned long cycles,
-                                   struct clock_event_device *ce)
+                                   struct rk_timer *timer)
 {
-       writel_relaxed(cycles, rk_base(ce) + TIMER_LOAD_COUNT0);
-       writel_relaxed(0, rk_base(ce) + TIMER_LOAD_COUNT1);
+       writel_relaxed(cycles, timer->base + TIMER_LOAD_COUNT0);
+       writel_relaxed(0, timer->base + TIMER_LOAD_COUNT1);
 }
 
-static void rk_timer_interrupt_clear(struct clock_event_device *ce)
+static void rk_timer_interrupt_clear(struct rk_timer *timer)
 {
-       writel_relaxed(1, rk_base(ce) + TIMER_INT_STATUS);
+       writel_relaxed(1, timer->base + TIMER_INT_STATUS);
 }
 
 static inline int rk_timer_set_next_event(unsigned long cycles,
                                          struct clock_event_device *ce)
 {
-       rk_timer_disable(ce);
-       rk_timer_update_counter(cycles, ce);
-       rk_timer_enable(ce, TIMER_MODE_USER_DEFINED_COUNT);
+       struct rk_timer *timer = rk_timer(ce);
+
+       rk_timer_disable(timer);
+       rk_timer_update_counter(cycles, timer);
+       rk_timer_enable(timer, TIMER_MODE_USER_DEFINED_COUNT |
+                              TIMER_INT_UNMASK);
        return 0;
 }
 
 static int rk_timer_shutdown(struct clock_event_device *ce)
 {
-       rk_timer_disable(ce);
+       struct rk_timer *timer = rk_timer(ce);
+
+       rk_timer_disable(timer);
        return 0;
 }
 
 static int rk_timer_set_periodic(struct clock_event_device *ce)
 {
-       rk_timer_disable(ce);
-       rk_timer_update_counter(rk_timer(ce)->freq / HZ - 1, ce);
-       rk_timer_enable(ce, TIMER_MODE_FREE_RUNNING);
+       struct rk_timer *timer = rk_timer(ce);
+
+       rk_timer_disable(timer);
+       rk_timer_update_counter(timer->freq / HZ - 1, timer);
+       rk_timer_enable(timer, TIMER_MODE_FREE_RUNNING | TIMER_INT_UNMASK);
        return 0;
 }
 
 static irqreturn_t rk_timer_interrupt(int irq, void *dev_id)
 {
        struct clock_event_device *ce = dev_id;
+       struct rk_timer *timer = rk_timer(ce);
 
-       rk_timer_interrupt_clear(ce);
+       rk_timer_interrupt_clear(timer);
 
        if (clockevent_state_oneshot(ce))
-               rk_timer_disable(ce);
+               rk_timer_disable(timer);
 
        ce->event_handler(ce);
 
        return IRQ_HANDLED;
 }
 
-static int __init rk_timer_init(struct device_node *np, u32 ctrl_reg)
+static u64 notrace rk_timer_sched_read(void)
+{
+       return ~readl_relaxed(rk_clksrc->base + TIMER_CURRENT_VALUE0);
+}
+
+static int __init
+rk_timer_probe(struct rk_timer *timer, struct device_node *np)
 {
-       struct clock_event_device *ce = &bc_timer.ce;
        struct clk *timer_clk;
        struct clk *pclk;
        int ret = -EINVAL, irq;
+       u32 ctrl_reg = TIMER_CONTROL_REG3288;
 
-       bc_timer.base = of_iomap(np, 0);
-       if (!bc_timer.base) {
+       timer->base = of_iomap(np, 0);
+       if (!timer->base) {
                pr_err("Failed to get base address for '%s'\n", TIMER_NAME);
                return -ENXIO;
        }
-       bc_timer.ctrl = bc_timer.base + ctrl_reg;
+
+       if (of_device_is_compatible(np, "rockchip,rk3399-timer"))
+               ctrl_reg = TIMER_CONTROL_REG3399;
+
+       timer->ctrl = timer->base + ctrl_reg;
 
        pclk = of_clk_get_by_name(np, "pclk");
        if (IS_ERR(pclk)) {
@@ -139,6 +158,7 @@ static int __init rk_timer_init(struct device_node *np, u32 ctrl_reg)
                pr_err("Failed to enable pclk for '%s'\n", TIMER_NAME);
                goto out_unmap;
        }
+       timer->pclk = pclk;
 
        timer_clk = of_clk_get_by_name(np, "timer");
        if (IS_ERR(timer_clk)) {
@@ -152,8 +172,9 @@ static int __init rk_timer_init(struct device_node *np, u32 ctrl_reg)
                pr_err("Failed to enable timer clock\n");
                goto out_timer_clk;
        }
+       timer->clk = timer_clk;
 
-       bc_timer.freq = clk_get_rate(timer_clk);
+       timer->freq = clk_get_rate(timer_clk);
 
        irq = irq_of_parse_and_map(np, 0);
        if (!irq) {
@@ -161,51 +182,126 @@ static int __init rk_timer_init(struct device_node *np, u32 ctrl_reg)
                pr_err("Failed to map interrupts for '%s'\n", TIMER_NAME);
                goto out_irq;
        }
+       timer->irq = irq;
+
+       rk_timer_interrupt_clear(timer);
+       rk_timer_disable(timer);
+       return 0;
+
+out_irq:
+       clk_disable_unprepare(timer_clk);
+out_timer_clk:
+       clk_disable_unprepare(pclk);
+out_unmap:
+       iounmap(timer->base);
+
+       return ret;
+}
+
+static void __init rk_timer_cleanup(struct rk_timer *timer)
+{
+       clk_disable_unprepare(timer->clk);
+       clk_disable_unprepare(timer->pclk);
+       iounmap(timer->base);
+}
+
+static int __init rk_clkevt_init(struct device_node *np)
+{
+       struct clock_event_device *ce;
+       int ret = -EINVAL;
+
+       rk_clkevt = kzalloc(sizeof(struct rk_clkevt), GFP_KERNEL);
+       if (!rk_clkevt) {
+               ret = -ENOMEM;
+               goto out;
+       }
 
+       ret = rk_timer_probe(&rk_clkevt->timer, np);
+       if (ret)
+               goto out_probe;
+
+       ce = &rk_clkevt->ce;
        ce->name = TIMER_NAME;
        ce->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT |
                       CLOCK_EVT_FEAT_DYNIRQ;
        ce->set_next_event = rk_timer_set_next_event;
        ce->set_state_shutdown = rk_timer_shutdown;
        ce->set_state_periodic = rk_timer_set_periodic;
-       ce->irq = irq;
+       ce->irq = rk_clkevt->timer.irq;
        ce->cpumask = cpu_possible_mask;
        ce->rating = 250;
 
-       rk_timer_interrupt_clear(ce);
-       rk_timer_disable(ce);
-
-       ret = request_irq(irq, rk_timer_interrupt, IRQF_TIMER, TIMER_NAME, ce);
+       ret = request_irq(rk_clkevt->timer.irq, rk_timer_interrupt, IRQF_TIMER,
+                         TIMER_NAME, ce);
        if (ret) {
-               pr_err("Failed to initialize '%s': %d\n", TIMER_NAME, ret);
+               pr_err("Failed to initialize '%s': %d\n",
+                       TIMER_NAME, ret);
                goto out_irq;
        }
 
-       clockevents_config_and_register(ce, bc_timer.freq, 1, UINT_MAX);
-
+       clockevents_config_and_register(&rk_clkevt->ce,
+                                       rk_clkevt->timer.freq, 1, UINT_MAX);
        return 0;
 
 out_irq:
-       clk_disable_unprepare(timer_clk);
-out_timer_clk:
-       clk_disable_unprepare(pclk);
-out_unmap:
-       iounmap(bc_timer.base);
-
+       rk_timer_cleanup(&rk_clkevt->timer);
+out_probe:
+       kfree(rk_clkevt);
+out:
+       /* Leave rk_clkevt not NULL to prevent future init */
+       rk_clkevt = ERR_PTR(ret);
        return ret;
 }
 
-static int __init rk3288_timer_init(struct device_node *np)
+static int __init rk_clksrc_init(struct device_node *np)
 {
-       return rk_timer_init(np, TIMER_CONTROL_REG3288);
+       int ret = -EINVAL;
+
+       rk_clksrc = kzalloc(sizeof(struct rk_timer), GFP_KERNEL);
+       if (!rk_clksrc) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       ret = rk_timer_probe(rk_clksrc, np);
+       if (ret)
+               goto out_probe;
+
+       rk_timer_update_counter(UINT_MAX, rk_clksrc);
+       rk_timer_enable(rk_clksrc, 0);
+
+       ret = clocksource_mmio_init(rk_clksrc->base + TIMER_CURRENT_VALUE0,
+               TIMER_NAME, rk_clksrc->freq, 250, 32,
+               clocksource_mmio_readl_down);
+       if (ret) {
+               pr_err("Failed to register clocksource\n");
+               goto out_clocksource;
+       }
+
+       sched_clock_register(rk_timer_sched_read, 32, rk_clksrc->freq);
+       return 0;
+
+out_clocksource:
+       rk_timer_cleanup(rk_clksrc);
+out_probe:
+       kfree(rk_clksrc);
+out:
+       /* Leave rk_clksrc not NULL to prevent future init */
+       rk_clksrc = ERR_PTR(ret);
+       return ret;
 }
 
-static int __init rk3399_timer_init(struct device_node *np)
+static int __init rk_timer_init(struct device_node *np)
 {
-       return rk_timer_init(np, TIMER_CONTROL_REG3399);
+       if (!rk_clkevt)
+               return rk_clkevt_init(np);
+
+       if (!rk_clksrc)
+               return rk_clksrc_init(np);
+
+       pr_err("Too many timer definitions for '%s'\n", TIMER_NAME);
+       return -EINVAL;
 }
 
-CLOCKSOURCE_OF_DECLARE(rk3288_timer, "rockchip,rk3288-timer",
-                      rk3288_timer_init);
-CLOCKSOURCE_OF_DECLARE(rk3399_timer, "rockchip,rk3399-timer",
-                      rk3399_timer_init);
+CLOCKSOURCE_OF_DECLARE(rk3288_timer, "rockchip,rk3288-timer", rk_timer_init);
+CLOCKSOURCE_OF_DECLARE(rk3399_timer, "rockchip,rk3399-timer", rk_timer_init);
index 0093ece..a68e653 100644 (file)
@@ -385,7 +385,7 @@ static int __init _samsung_pwm_clocksource_init(void)
        mask = ~pwm.variant.output_mask & ((1 << SAMSUNG_PWM_NUM) - 1);
        channel = fls(mask) - 1;
        if (channel < 0) {
-               pr_crit("failed to find PWM channel for clocksource");
+               pr_crit("failed to find PWM channel for clocksource\n");
                return -EINVAL;
        }
        pwm.source_id = channel;
@@ -393,7 +393,7 @@ static int __init _samsung_pwm_clocksource_init(void)
        mask &= ~(1 << channel);
        channel = fls(mask) - 1;
        if (channel < 0) {
-               pr_crit("failed to find PWM channel for clock event");
+               pr_crit("failed to find PWM channel for clock event\n");
                return -EINVAL;
        }
        pwm.event_id = channel;
@@ -448,7 +448,7 @@ static int __init samsung_pwm_alloc(struct device_node *np,
 
        pwm.timerclk = of_clk_get_by_name(np, "timers");
        if (IS_ERR(pwm.timerclk)) {
-               pr_crit("failed to get timers clock for timer");
+               pr_crit("failed to get timers clock for timer\n");
                return PTR_ERR(pwm.timerclk);
        }
 
index 28757ed..e09e8bf 100644 (file)
@@ -103,7 +103,6 @@ struct sh_cmt_channel {
        unsigned long match_value;
        unsigned long next_match_value;
        unsigned long max_match_value;
-       unsigned long rate;
        raw_spinlock_t lock;
        struct clock_event_device ced;
        struct clocksource cs;
@@ -118,6 +117,7 @@ struct sh_cmt_device {
 
        void __iomem *mapbase;
        struct clk *clk;
+       unsigned long rate;
 
        raw_spinlock_t lock; /* Protect the shared start/stop register */
 
@@ -320,7 +320,7 @@ static void sh_cmt_start_stop_ch(struct sh_cmt_channel *ch, int start)
        raw_spin_unlock_irqrestore(&ch->cmt->lock, flags);
 }
 
-static int sh_cmt_enable(struct sh_cmt_channel *ch, unsigned long *rate)
+static int sh_cmt_enable(struct sh_cmt_channel *ch)
 {
        int k, ret;
 
@@ -340,11 +340,9 @@ static int sh_cmt_enable(struct sh_cmt_channel *ch, unsigned long *rate)
 
        /* configure channel, periodic mode and maximum timeout */
        if (ch->cmt->info->width == 16) {
-               *rate = clk_get_rate(ch->cmt->clk) / 512;
                sh_cmt_write_cmcsr(ch, SH_CMT16_CMCSR_CMIE |
                                   SH_CMT16_CMCSR_CKS512);
        } else {
-               *rate = clk_get_rate(ch->cmt->clk) / 8;
                sh_cmt_write_cmcsr(ch, SH_CMT32_CMCSR_CMM |
                                   SH_CMT32_CMCSR_CMTOUT_IE |
                                   SH_CMT32_CMCSR_CMR_IRQ |
@@ -572,7 +570,7 @@ static int sh_cmt_start(struct sh_cmt_channel *ch, unsigned long flag)
        raw_spin_lock_irqsave(&ch->lock, flags);
 
        if (!(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE)))
-               ret = sh_cmt_enable(ch, &ch->rate);
+               ret = sh_cmt_enable(ch);
 
        if (ret)
                goto out;
@@ -640,10 +638,9 @@ static int sh_cmt_clocksource_enable(struct clocksource *cs)
        ch->total_cycles = 0;
 
        ret = sh_cmt_start(ch, FLAG_CLOCKSOURCE);
-       if (!ret) {
-               __clocksource_update_freq_hz(cs, ch->rate);
+       if (!ret)
                ch->cs_enabled = true;
-       }
+
        return ret;
 }
 
@@ -697,8 +694,7 @@ static int sh_cmt_register_clocksource(struct sh_cmt_channel *ch,
        dev_info(&ch->cmt->pdev->dev, "ch%u: used as clock source\n",
                 ch->index);
 
-       /* Register with dummy 1 Hz value, gets updated in ->enable() */
-       clocksource_register_hz(cs, 1);
+       clocksource_register_hz(cs, ch->cmt->rate);
        return 0;
 }
 
@@ -709,19 +705,10 @@ static struct sh_cmt_channel *ced_to_sh_cmt(struct clock_event_device *ced)
 
 static void sh_cmt_clock_event_start(struct sh_cmt_channel *ch, int periodic)
 {
-       struct clock_event_device *ced = &ch->ced;
-
        sh_cmt_start(ch, FLAG_CLOCKEVENT);
 
-       /* TODO: calculate good shift from rate and counter bit width */
-
-       ced->shift = 32;
-       ced->mult = div_sc(ch->rate, NSEC_PER_SEC, ced->shift);
-       ced->max_delta_ns = clockevent_delta2ns(ch->max_match_value, ced);
-       ced->min_delta_ns = clockevent_delta2ns(0x1f, ced);
-
        if (periodic)
-               sh_cmt_set_next(ch, ((ch->rate + HZ/2) / HZ) - 1);
+               sh_cmt_set_next(ch, ((ch->cmt->rate + HZ/2) / HZ) - 1);
        else
                sh_cmt_set_next(ch, ch->max_match_value);
 }
@@ -824,6 +811,14 @@ static int sh_cmt_register_clockevent(struct sh_cmt_channel *ch,
        ced->suspend = sh_cmt_clock_event_suspend;
        ced->resume = sh_cmt_clock_event_resume;
 
+       /* TODO: calculate good shift from rate and counter bit width */
+       ced->shift = 32;
+       ced->mult = div_sc(ch->cmt->rate, NSEC_PER_SEC, ced->shift);
+       ced->max_delta_ns = clockevent_delta2ns(ch->max_match_value, ced);
+       ced->max_delta_ticks = ch->max_match_value;
+       ced->min_delta_ns = clockevent_delta2ns(0x1f, ced);
+       ced->min_delta_ticks = 0x1f;
+
        dev_info(&ch->cmt->pdev->dev, "ch%u: used for clock events\n",
                 ch->index);
        clockevents_register_device(ced);
@@ -996,6 +991,18 @@ static int sh_cmt_setup(struct sh_cmt_device *cmt, struct platform_device *pdev)
        if (ret < 0)
                goto err_clk_put;
 
+       /* Determine clock rate. */
+       ret = clk_enable(cmt->clk);
+       if (ret < 0)
+               goto err_clk_unprepare;
+
+       if (cmt->info->width == 16)
+               cmt->rate = clk_get_rate(cmt->clk) / 512;
+       else
+               cmt->rate = clk_get_rate(cmt->clk) / 8;
+
+       clk_disable(cmt->clk);
+
        /* Map the memory resource(s). */
        ret = sh_cmt_map_memory(cmt);
        if (ret < 0)
index 1fbf2aa..31d8816 100644 (file)
@@ -46,7 +46,6 @@ struct sh_tmu_channel {
        void __iomem *base;
        int irq;
 
-       unsigned long rate;
        unsigned long periodic;
        struct clock_event_device ced;
        struct clocksource cs;
@@ -59,6 +58,7 @@ struct sh_tmu_device {
 
        void __iomem *mapbase;
        struct clk *clk;
+       unsigned long rate;
 
        enum sh_tmu_model model;
 
@@ -165,7 +165,6 @@ static int __sh_tmu_enable(struct sh_tmu_channel *ch)
        sh_tmu_write(ch, TCNT, 0xffffffff);
 
        /* configure channel to parent clock / 4, irq off */
-       ch->rate = clk_get_rate(ch->tmu->clk) / 4;
        sh_tmu_write(ch, TCR, TCR_TPSC_CLK4);
 
        /* enable channel */
@@ -271,10 +270,8 @@ static int sh_tmu_clocksource_enable(struct clocksource *cs)
                return 0;
 
        ret = sh_tmu_enable(ch);
-       if (!ret) {
-               __clocksource_update_freq_hz(cs, ch->rate);
+       if (!ret)
                ch->cs_enabled = true;
-       }
 
        return ret;
 }
@@ -334,8 +331,7 @@ static int sh_tmu_register_clocksource(struct sh_tmu_channel *ch,
        dev_info(&ch->tmu->pdev->dev, "ch%u: used as clock source\n",
                 ch->index);
 
-       /* Register with dummy 1 Hz value, gets updated in ->enable() */
-       clocksource_register_hz(cs, 1);
+       clocksource_register_hz(cs, ch->tmu->rate);
        return 0;
 }
 
@@ -346,14 +342,10 @@ static struct sh_tmu_channel *ced_to_sh_tmu(struct clock_event_device *ced)
 
 static void sh_tmu_clock_event_start(struct sh_tmu_channel *ch, int periodic)
 {
-       struct clock_event_device *ced = &ch->ced;
-
        sh_tmu_enable(ch);
 
-       clockevents_config(ced, ch->rate);
-
        if (periodic) {
-               ch->periodic = (ch->rate + HZ/2) / HZ;
+               ch->periodic = (ch->tmu->rate + HZ/2) / HZ;
                sh_tmu_set_next(ch, ch->periodic, 1);
        }
 }
@@ -435,7 +427,7 @@ static void sh_tmu_register_clockevent(struct sh_tmu_channel *ch,
        dev_info(&ch->tmu->pdev->dev, "ch%u: used for clock events\n",
                 ch->index);
 
-       clockevents_config_and_register(ced, 1, 0x300, 0xffffffff);
+       clockevents_config_and_register(ced, ch->tmu->rate, 0x300, 0xffffffff);
 
        ret = request_irq(ch->irq, sh_tmu_interrupt,
                          IRQF_TIMER | IRQF_IRQPOLL | IRQF_NOBALANCING,
@@ -561,6 +553,14 @@ static int sh_tmu_setup(struct sh_tmu_device *tmu, struct platform_device *pdev)
        if (ret < 0)
                goto err_clk_put;
 
+       /* Determine clock rate. */
+       ret = clk_enable(tmu->clk);
+       if (ret < 0)
+               goto err_clk_unprepare;
+
+       tmu->rate = clk_get_rate(tmu->clk) / 4;
+       clk_disable(tmu->clk);
+
        /* Map the memory resource. */
        ret = sh_tmu_map_memory(tmu);
        if (ret < 0) {
index c83452c..4452d5c 100644 (file)
@@ -159,25 +159,25 @@ static int __init sun4i_timer_init(struct device_node *node)
 
        timer_base = of_iomap(node, 0);
        if (!timer_base) {
-               pr_crit("Can't map registers");
+               pr_crit("Can't map registers\n");
                return -ENXIO;
        }
 
        irq = irq_of_parse_and_map(node, 0);
        if (irq <= 0) {
-               pr_crit("Can't parse IRQ");
+               pr_crit("Can't parse IRQ\n");
                return -EINVAL;
        }
 
        clk = of_clk_get(node, 0);
        if (IS_ERR(clk)) {
-               pr_crit("Can't get timer clock");
+               pr_crit("Can't get timer clock\n");
                return PTR_ERR(clk);
        }
 
        ret = clk_prepare_enable(clk);
        if (ret) {
-               pr_err("Failed to prepare clock");
+               pr_err("Failed to prepare clock\n");
                return ret;
        }
 
@@ -200,7 +200,7 @@ static int __init sun4i_timer_init(struct device_node *node)
        ret = clocksource_mmio_init(timer_base + TIMER_CNTVAL_REG(1), node->name,
                                    rate, 350, 32, clocksource_mmio_readl_down);
        if (ret) {
-               pr_err("Failed to register clocksource");
+               pr_err("Failed to register clocksource\n");
                return ret;
        }
 
index f960891..b9990b9 100644 (file)
@@ -245,7 +245,7 @@ static int __init tegra20_init_rtc(struct device_node *np)
 
        rtc_base = of_iomap(np, 0);
        if (!rtc_base) {
-               pr_err("Can't map RTC registers");
+               pr_err("Can't map RTC registers\n");
                return -ENXIO;
        }
 
index 4440aef..aea4380 100644 (file)
@@ -247,13 +247,13 @@ static int __init armada_370_xp_timer_common_init(struct device_node *np)
 
        timer_base = of_iomap(np, 0);
        if (!timer_base) {
-               pr_err("Failed to iomap");
+               pr_err("Failed to iomap\n");
                return -ENXIO;
        }
 
        local_base = of_iomap(np, 1);
        if (!local_base) {
-               pr_err("Failed to iomap");
+               pr_err("Failed to iomap\n");
                return -ENXIO;
        }
 
@@ -298,7 +298,7 @@ static int __init armada_370_xp_timer_common_init(struct device_node *np)
                                    "armada_370_xp_clocksource",
                                    timer_clk, 300, 32, clocksource_mmio_readl_down);
        if (res) {
-               pr_err("Failed to initialize clocksource mmio");
+               pr_err("Failed to initialize clocksource mmio\n");
                return res;
        }
 
@@ -315,7 +315,7 @@ static int __init armada_370_xp_timer_common_init(struct device_node *np)
                                armada_370_xp_evt);
        /* Immediately configure the timer on the boot CPU */
        if (res) {
-               pr_err("Failed to request percpu irq");
+               pr_err("Failed to request percpu irq\n");
                return res;
        }
 
@@ -324,7 +324,7 @@ static int __init armada_370_xp_timer_common_init(struct device_node *np)
                                armada_370_xp_timer_starting_cpu,
                                armada_370_xp_timer_dying_cpu);
        if (res) {
-               pr_err("Failed to setup hotplug state and timer");
+               pr_err("Failed to setup hotplug state and timer\n");
                return res;
        }
 
@@ -339,7 +339,7 @@ static int __init armada_xp_timer_init(struct device_node *np)
        int ret;
 
        if (IS_ERR(clk)) {
-               pr_err("Failed to get clock");
+               pr_err("Failed to get clock\n");
                return PTR_ERR(clk);
        }
 
@@ -375,7 +375,7 @@ static int __init armada_375_timer_init(struct device_node *np)
 
                /* Must have at least a clock */
                if (IS_ERR(clk)) {
-                       pr_err("Failed to get clock");
+                       pr_err("Failed to get clock\n");
                        return PTR_ERR(clk);
                }
 
@@ -399,7 +399,7 @@ static int __init armada_370_timer_init(struct device_node *np)
 
        clk = of_clk_get(np, 0);
        if (IS_ERR(clk)) {
-               pr_err("Failed to get clock");
+               pr_err("Failed to get clock\n");
                return PTR_ERR(clk);
        }
 
index 5ac344b..ce0f97b 100644 (file)
@@ -235,7 +235,7 @@ static int __init efm32_clockevent_init(struct device_node *np)
 
        ret = setup_irq(irq, &efm32_clock_event_irq);
        if (ret) {
-               pr_err("Failed setup irq");
+               pr_err("Failed setup irq\n");
                goto err_setup_irq;
        }
 
index a28f496..b9b97f6 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/bitops.h>
 #include <linux/clk.h>
 #include <linux/clockchips.h>
+#include <linux/delay.h>
 #include <linux/interrupt.h>
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
 
 static void __iomem *timer_base;
 
+static unsigned long notrace orion_read_timer(void) /* read hook installed in orion_delay_timer */
+{
+       return ~readl(timer_base + TIMER0_VAL); /* bitwise complement of the raw TIMER0 value register */
+}
+
+static struct delay_timer orion_delay_timer = { /* .freq is filled in later by orion_delay_timer_init() */
+       .read_current_timer = orion_read_timer,
+};
+
+static void orion_delay_timer_init(unsigned long rate) /* rate: stored as the delay timer frequency; orion_timer_init() passes clk_get_rate(clk) */
+{
+       orion_delay_timer.freq = rate;
+       register_current_timer_delay(&orion_delay_timer); /* register the filled-in descriptor */
+}
+
 /*
  * Free-running clocksource handling.
  */
@@ -106,6 +122,7 @@ static struct irqaction orion_clkevt_irq = {
 
 static int __init orion_timer_init(struct device_node *np)
 {
+       unsigned long rate;
        struct clk *clk;
        int irq, ret;
 
@@ -124,7 +141,7 @@ static int __init orion_timer_init(struct device_node *np)
 
        ret = clk_prepare_enable(clk);
        if (ret) {
-               pr_err("Failed to prepare clock");
+               pr_err("Failed to prepare clock\n");
                return ret;
        }
 
@@ -135,6 +152,8 @@ static int __init orion_timer_init(struct device_node *np)
                return -EINVAL;
        }
 
+       rate = clk_get_rate(clk);
+
        /* setup timer0 as free-running clocksource */
        writel(~0, timer_base + TIMER0_VAL);
        writel(~0, timer_base + TIMER0_RELOAD);
@@ -142,15 +161,15 @@ static int __init orion_timer_init(struct device_node *np)
                TIMER0_RELOAD_EN | TIMER0_EN,
                TIMER0_RELOAD_EN | TIMER0_EN);
 
-       ret = clocksource_mmio_init(timer_base + TIMER0_VAL, "orion_clocksource",
-                                   clk_get_rate(clk), 300, 32,
+       ret = clocksource_mmio_init(timer_base + TIMER0_VAL,
+                                   "orion_clocksource", rate, 300, 32,
                                    clocksource_mmio_readl_down);
        if (ret) {
-               pr_err("Failed to initialize mmio timer");
+               pr_err("Failed to initialize mmio timer\n");
                return ret;
        }
 
-       sched_clock_register(orion_read_sched_clock, 32, clk_get_rate(clk));
+       sched_clock_register(orion_read_sched_clock, 32, rate);
 
        /* setup timer1 as clockevent timer */
        ret = setup_irq(irq, &orion_clkevt_irq);
@@ -162,9 +181,12 @@ static int __init orion_timer_init(struct device_node *np)
        ticks_per_jiffy = (clk_get_rate(clk) + HZ/2) / HZ;
        orion_clkevt.cpumask = cpumask_of(0);
        orion_clkevt.irq = irq;
-       clockevents_config_and_register(&orion_clkevt, clk_get_rate(clk),
+       clockevents_config_and_register(&orion_clkevt, rate,
                                        ORION_ONESHOT_MIN, ORION_ONESHOT_MAX);
 
+
+       orion_delay_timer_init(rate);
+
        return 0;
 }
 CLOCKSOURCE_OF_DECLARE(orion_timer, "marvell,orion-timer", orion_timer_init);
index 3d8a181..50300ee 100644 (file)
@@ -192,7 +192,9 @@ static int sirfsoc_local_timer_starting_cpu(unsigned int cpu)
        ce->set_next_event = sirfsoc_timer_set_next_event;
        clockevents_calc_mult_shift(ce, atlas7_timer_rate, 60);
        ce->max_delta_ns = clockevent_delta2ns(-2, ce);
+       ce->max_delta_ticks = (unsigned long)-2;
        ce->min_delta_ns = clockevent_delta2ns(2, ce);
+       ce->min_delta_ticks = 2;
        ce->cpumask = cpumask_of(cpu);
 
        action->dev_id = ce;
index c0b5df3..cc11235 100644 (file)
@@ -226,7 +226,7 @@ static int __init at91sam926x_pit_dt_init(struct device_node *node)
        
        ret = clocksource_register_hz(&data->clksrc, pit_rate);
        if (ret) {
-               pr_err("Failed to register clocksource");
+               pr_err("Failed to register clocksource\n");
                return ret;
        }
 
index e9f50d2..94a161e 100644 (file)
@@ -161,19 +161,19 @@ static int __init digicolor_timer_init(struct device_node *node)
         */
        dc_timer_dev.base = of_iomap(node, 0);
        if (!dc_timer_dev.base) {
-               pr_err("Can't map registers");
+               pr_err("Can't map registers\n");
                return -ENXIO;
        }
 
        irq = irq_of_parse_and_map(node, dc_timer_dev.timer_id);
        if (irq <= 0) {
-               pr_err("Can't parse IRQ");
+               pr_err("Can't parse IRQ\n");
                return -EINVAL;
        }
 
        clk = of_clk_get(node, 0);
        if (IS_ERR(clk)) {
-               pr_err("Can't get timer clock");
+               pr_err("Can't get timer clock\n");
                return PTR_ERR(clk);
        }
        clk_prepare_enable(clk);
similarity index 72%
rename from drivers/clocksource/timer-gemini.c
rename to drivers/clocksource/timer-fttmr010.c
index dda27b7..b4a6f1e 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Gemini timer driver
+ * Faraday Technology FTTMR010 timer driver
  * Copyright (C) 2017 Linus Walleij <linus.walleij@linaro.org>
  *
  * Based on a rewrite of arch/arm/mach-gemini/timer.c:
 #include <linux/clockchips.h>
 #include <linux/clocksource.h>
 #include <linux/sched_clock.h>
-
-/*
- * Relevant registers in the global syscon
- */
-#define GLOBAL_STATUS          0x04
-#define CPU_AHB_RATIO_MASK     (0x3 << 18)
-#define CPU_AHB_1_1            (0x0 << 18)
-#define CPU_AHB_3_2            (0x1 << 18)
-#define CPU_AHB_24_13          (0x2 << 18)
-#define CPU_AHB_2_1            (0x3 << 18)
-#define REG_TO_AHB_SPEED(reg)  ((((reg) >> 15) & 0x7) * 10 + 130)
+#include <linux/clk.h>
 
 /*
  * Register definitions for the timers
 static unsigned int tick_rate;
 static void __iomem *base;
 
-static u64 notrace gemini_read_sched_clock(void)
+static u64 notrace fttmr010_read_sched_clock(void)
 {
        return readl(base + TIMER3_COUNT);
 }
 
-static int gemini_timer_set_next_event(unsigned long cycles,
+static int fttmr010_timer_set_next_event(unsigned long cycles,
                                       struct clock_event_device *evt)
 {
        u32 cr;
@@ -96,7 +86,7 @@ static int gemini_timer_set_next_event(unsigned long cycles,
        return 0;
 }
 
-static int gemini_timer_shutdown(struct clock_event_device *evt)
+static int fttmr010_timer_shutdown(struct clock_event_device *evt)
 {
        u32 cr;
 
@@ -127,7 +117,7 @@ static int gemini_timer_shutdown(struct clock_event_device *evt)
        return 0;
 }
 
-static int gemini_timer_set_periodic(struct clock_event_device *evt)
+static int fttmr010_timer_set_periodic(struct clock_event_device *evt)
 {
        u32 period = DIV_ROUND_CLOSEST(tick_rate, HZ);
        u32 cr;
@@ -158,54 +148,40 @@ static int gemini_timer_set_periodic(struct clock_event_device *evt)
 }
 
 /* Use TIMER1 as clock event */
-static struct clock_event_device gemini_clockevent = {
+static struct clock_event_device fttmr010_clockevent = {
        .name                   = "TIMER1",
        /* Reasonably fast and accurate clock event */
        .rating                 = 300,
        .shift                  = 32,
        .features               = CLOCK_EVT_FEAT_PERIODIC |
                                  CLOCK_EVT_FEAT_ONESHOT,
-       .set_next_event         = gemini_timer_set_next_event,
-       .set_state_shutdown     = gemini_timer_shutdown,
-       .set_state_periodic     = gemini_timer_set_periodic,
-       .set_state_oneshot      = gemini_timer_shutdown,
-       .tick_resume            = gemini_timer_shutdown,
+       .set_next_event         = fttmr010_timer_set_next_event,
+       .set_state_shutdown     = fttmr010_timer_shutdown,
+       .set_state_periodic     = fttmr010_timer_set_periodic,
+       .set_state_oneshot      = fttmr010_timer_shutdown,
+       .tick_resume            = fttmr010_timer_shutdown,
 };
 
 /*
  * IRQ handler for the timer
  */
-static irqreturn_t gemini_timer_interrupt(int irq, void *dev_id)
+static irqreturn_t fttmr010_timer_interrupt(int irq, void *dev_id)
 {
-       struct clock_event_device *evt = &gemini_clockevent;
+       struct clock_event_device *evt = &fttmr010_clockevent;
 
        evt->event_handler(evt);
        return IRQ_HANDLED;
 }
 
-static struct irqaction gemini_timer_irq = {
-       .name           = "Gemini Timer Tick",
+static struct irqaction fttmr010_timer_irq = {
+       .name           = "Faraday FTTMR010 Timer Tick",
        .flags          = IRQF_TIMER,
-       .handler        = gemini_timer_interrupt,
+       .handler        = fttmr010_timer_interrupt,
 };
 
-static int __init gemini_timer_of_init(struct device_node *np)
+static int __init fttmr010_timer_common_init(struct device_node *np)
 {
-       static struct regmap *map;
        int irq;
-       int ret;
-       u32 val;
-
-       map = syscon_regmap_lookup_by_phandle(np, "syscon");
-       if (IS_ERR(map)) {
-               pr_err("Can't get regmap for syscon handle");
-               return -ENODEV;
-       }
-       ret = regmap_read(map, GLOBAL_STATUS, &val);
-       if (ret) {
-               pr_err("Can't read syscon status register");
-               return -ENXIO;
-       }
 
        base = of_iomap(np, 0);
        if (!base) {
@@ -219,26 +195,6 @@ static int __init gemini_timer_of_init(struct device_node *np)
                return -EINVAL;
        }
 
-       tick_rate = REG_TO_AHB_SPEED(val) * 1000000;
-       printk(KERN_INFO "Bus: %dMHz", tick_rate / 1000000);
-
-       tick_rate /= 6;         /* APB bus run AHB*(1/6) */
-
-       switch (val & CPU_AHB_RATIO_MASK) {
-       case CPU_AHB_1_1:
-               printk(KERN_CONT "(1/1)\n");
-               break;
-       case CPU_AHB_3_2:
-               printk(KERN_CONT "(3/2)\n");
-               break;
-       case CPU_AHB_24_13:
-               printk(KERN_CONT "(24/13)\n");
-               break;
-       case CPU_AHB_2_1:
-               printk(KERN_CONT "(2/1)\n");
-               break;
-       }
-
        /*
         * Reset the interrupt mask and status
         */
@@ -255,9 +211,9 @@ static int __init gemini_timer_of_init(struct device_node *np)
        writel(0, base + TIMER3_MATCH1);
        writel(0, base + TIMER3_MATCH2);
        clocksource_mmio_init(base + TIMER3_COUNT,
-                             "gemini_clocksource", tick_rate,
+                             "fttmr010_clocksource", tick_rate,
                              300, 32, clocksource_mmio_readl_up);
-       sched_clock_register(gemini_read_sched_clock, 32, tick_rate);
+       sched_clock_register(fttmr010_read_sched_clock, 32, tick_rate);
 
        /*
         * Setup clockevent timer (interrupt-driven.)
@@ -266,12 +222,82 @@ static int __init gemini_timer_of_init(struct device_node *np)
        writel(0, base + TIMER1_LOAD);
        writel(0, base + TIMER1_MATCH1);
        writel(0, base + TIMER1_MATCH2);
-       setup_irq(irq, &gemini_timer_irq);
-       gemini_clockevent.cpumask = cpumask_of(0);
-       clockevents_config_and_register(&gemini_clockevent, tick_rate,
+       setup_irq(irq, &fttmr010_timer_irq);
+       fttmr010_clockevent.cpumask = cpumask_of(0);
+       clockevents_config_and_register(&fttmr010_clockevent, tick_rate,
                                        1, 0xffffffff);
 
        return 0;
 }
-CLOCKSOURCE_OF_DECLARE(nomadik_mtu, "cortina,gemini-timer",
-                      gemini_timer_of_init);
+
+static int __init fttmr010_timer_of_init(struct device_node *np) /* generic entry: take tick_rate from the "PCLK" clock, then run the common init */
+{
+       /*
+        * These implementations require a clock reference.
+        * FIXME: we currently only support clocking using PCLK
+        * and using EXTCLK is not supported in the driver.
+        */
+       struct clk *clk;
+
+       clk = of_clk_get_by_name(np, "PCLK");
+       if (IS_ERR(clk)) {
+               pr_err("could not get PCLK\n"); /* terminate the message like every other pr_err() in these drivers */
+               return PTR_ERR(clk);
+       }
+       tick_rate = clk_get_rate(clk); /* NOTE(review): clk is never prepared/enabled or released in this function - confirm that is intended */
+
+       return fttmr010_timer_common_init(np);
+}
+CLOCKSOURCE_OF_DECLARE(fttmr010, "faraday,fttmr010", fttmr010_timer_of_init);
+
+/*
+ * Gemini-specific: relevant registers in the global syscon
+ */
+#define GLOBAL_STATUS          0x04
+#define CPU_AHB_RATIO_MASK     (0x3 << 18)
+#define CPU_AHB_1_1            (0x0 << 18)
+#define CPU_AHB_3_2            (0x1 << 18)
+#define CPU_AHB_24_13          (0x2 << 18)
+#define CPU_AHB_2_1            (0x3 << 18)
+#define REG_TO_AHB_SPEED(reg)  ((((reg) >> 15) & 0x7) * 10 + 130)
+
+static int __init gemini_timer_of_init(struct device_node *np) /* Gemini entry: derive tick_rate from the syscon, then run the common init */
+{
+       static struct regmap *map;
+       int ret;
+       u32 val; /* raw GLOBAL_STATUS register contents */
+
+       map = syscon_regmap_lookup_by_phandle(np, "syscon");
+       if (IS_ERR(map)) {
+               pr_err("Can't get regmap for syscon handle\n");
+               return -ENODEV; /* fixed code; PTR_ERR(map) is not propagated */
+       }
+       ret = regmap_read(map, GLOBAL_STATUS, &val);
+       if (ret) {
+               pr_err("Can't read syscon status register\n");
+               return -ENXIO; /* fixed code; ret is not propagated */
+       }
+
+       tick_rate = REG_TO_AHB_SPEED(val) * 1000000; /* AHB speed in Hz; the macro result is what gets logged as MHz */
+       pr_info("Bus: %dMHz ", tick_rate / 1000000); /* no newline: the pr_cont() calls below finish the line */
+
+       tick_rate /= 6;         /* APB bus runs at AHB*(1/6) */
+
+       switch (val & CPU_AHB_RATIO_MASK) { /* ratio field is only used to complete the log line */
+       case CPU_AHB_1_1:
+               pr_cont("(1/1)\n");
+               break;
+       case CPU_AHB_3_2:
+               pr_cont("(3/2)\n");
+               break;
+       case CPU_AHB_24_13:
+               pr_cont("(24/13)\n");
+               break;
+       case CPU_AHB_2_1:
+               pr_cont("(2/1)\n");
+               break;
+       }
+
+       return fttmr010_timer_common_init(np);
+}
+CLOCKSOURCE_OF_DECLARE(gemini, "cortina,gemini-timer", gemini_timer_of_init);
index df6e672..04ad306 100644 (file)
@@ -200,7 +200,7 @@ static int __init integrator_ap_timer_init_of(struct device_node *node)
        err = of_property_read_string(of_aliases,
                                "arm,timer-primary", &path);
        if (err) {
-               pr_warn("Failed to read property");
+               pr_warn("Failed to read property\n");
                return err;
        }
 
@@ -209,7 +209,7 @@ static int __init integrator_ap_timer_init_of(struct device_node *node)
        err = of_property_read_string(of_aliases,
                                "arm,timer-secondary", &path);
        if (err) {
-               pr_warn("Failed to read property");             
+               pr_warn("Failed to read property\n");
                return err;
        }
 
index da1f798..e74ea17 100644 (file)
@@ -55,7 +55,7 @@ static int __init nps_get_timer_clk(struct device_node *node,
        *clk = of_clk_get(node, 0);
        ret = PTR_ERR_OR_ZERO(*clk);
        if (ret) {
-               pr_err("timer missing clk");
+               pr_err("timer missing clk\n");
                return ret;
        }
 
@@ -247,7 +247,7 @@ static int __init nps_setup_clockevent(struct device_node *node)
 
        nps_timer0_irq = irq_of_parse_and_map(node, 0);
        if (nps_timer0_irq <= 0) {
-               pr_err("clockevent: missing irq");
+               pr_err("clockevent: missing irq\n");
                return -EINVAL;
        }
 
@@ -270,7 +270,7 @@ static int __init nps_setup_clockevent(struct device_node *node)
                                nps_timer_starting_cpu,
                                nps_timer_dying_cpu);
        if (ret) {
-               pr_err("Failed to setup hotplug state");
+               pr_err("Failed to setup hotplug state\n");
                clk_disable_unprepare(clk);
                free_percpu_irq(nps_timer0_irq, &nps_clockevent_device);
                return ret;
index bfa981a..b4122ed 100644 (file)
@@ -196,20 +196,20 @@ static int __init sirfsoc_prima2_timer_init(struct device_node *np)
 
        clk = of_clk_get(np, 0);
        if (IS_ERR(clk)) {
-               pr_err("Failed to get clock");
+               pr_err("Failed to get clock\n");
                return PTR_ERR(clk);
        }
 
        ret = clk_prepare_enable(clk);
        if (ret) {
-               pr_err("Failed to enable clock");
+               pr_err("Failed to enable clock\n");
                return ret;
        }
 
        rate = clk_get_rate(clk);
 
        if (rate < PRIMA2_CLOCK_FREQ || rate % PRIMA2_CLOCK_FREQ) {
-               pr_err("Invalid clock rate");
+               pr_err("Invalid clock rate\n");
                return -EINVAL;
        }
 
@@ -229,7 +229,7 @@ static int __init sirfsoc_prima2_timer_init(struct device_node *np)
 
        ret = clocksource_register_hz(&sirfsoc_clocksource, PRIMA2_CLOCK_FREQ);
        if (ret) {
-               pr_err("Failed to register clocksource");
+               pr_err("Failed to register clocksource\n");
                return ret;
        }
 
@@ -237,7 +237,7 @@ static int __init sirfsoc_prima2_timer_init(struct device_node *np)
 
        ret = setup_irq(sirfsoc_timer_irq.irq, &sirfsoc_timer_irq);
        if (ret) {
-               pr_err("Failed to setup irq");
+               pr_err("Failed to setup irq\n");
                return ret;
        }
 
index d078633..2d575a8 100644 (file)
@@ -299,13 +299,13 @@ static int __init integrator_cp_of_init(struct device_node *np)
 
        base = of_iomap(np, 0);
        if (!base) {
-               pr_err("Failed to iomap");
+               pr_err("Failed to iomap\n");
                return -ENXIO;
        }
 
        clk = of_clk_get(np, 0);
        if (IS_ERR(clk)) {
-               pr_err("Failed to get clock");
+               pr_err("Failed to get clock\n");
                return PTR_ERR(clk);
        }
 
index a3e662b..2e9c830 100644 (file)
@@ -332,19 +332,19 @@ static int __init sun5i_timer_init(struct device_node *node)
 
        timer_base = of_io_request_and_map(node, 0, of_node_full_name(node));
        if (IS_ERR(timer_base)) {
-               pr_err("Can't map registers");
+               pr_err("Can't map registers\n");
                return PTR_ERR(timer_base);;
        }
 
        irq = irq_of_parse_and_map(node, 0);
        if (irq <= 0) {
-               pr_err("Can't parse IRQ");
+               pr_err("Can't parse IRQ\n");
                return -EINVAL;
        }
 
        clk = of_clk_get(node, 0);
        if (IS_ERR(clk)) {
-               pr_err("Can't get timer clock");
+               pr_err("Can't get timer clock\n");
                return PTR_ERR(clk);
        }
 
index 55d8d84..e0849e2 100644 (file)
@@ -165,7 +165,7 @@ static int __init pit_timer_init(struct device_node *np)
 
        timer_base = of_iomap(np, 0);
        if (!timer_base) {
-               pr_err("Failed to iomap");
+               pr_err("Failed to iomap\n");
                return -ENXIO;
        }
 
index 74fa5c5..74ed7e9 100644 (file)
@@ -247,6 +247,12 @@ config ARM_TEGRA124_CPUFREQ
        help
          This adds the CPUFreq driver support for Tegra124 SOCs.
 
+config ARM_TEGRA186_CPUFREQ
+       tristate "Tegra186 CPUFreq support"
+       depends on ARCH_TEGRA && TEGRA_BPMP
+       help
+         This adds the CPUFreq driver support for Tegra186 SOCs.
+
 config ARM_TI_CPUFREQ
        bool "Texas Instruments CPUFreq support"
        depends on ARCH_OMAP2PLUS
index 9f5a804..b7e78f0 100644 (file)
@@ -77,6 +77,7 @@ obj-$(CONFIG_ARM_SPEAR_CPUFREQ)               += spear-cpufreq.o
 obj-$(CONFIG_ARM_STI_CPUFREQ)          += sti-cpufreq.o
 obj-$(CONFIG_ARM_TEGRA20_CPUFREQ)      += tegra20-cpufreq.o
 obj-$(CONFIG_ARM_TEGRA124_CPUFREQ)     += tegra124-cpufreq.o
+obj-$(CONFIG_ARM_TEGRA186_CPUFREQ)     += tegra186-cpufreq.o
 obj-$(CONFIG_ARM_TI_CPUFREQ)           += ti-cpufreq.o
 obj-$(CONFIG_ARM_VEXPRESS_SPC_CPUFREQ) += vexpress-spc-cpufreq.o
 obj-$(CONFIG_ACPI_CPPC_CPUFREQ) += cppc_cpufreq.o
index bc96d42..0e3f649 100644 (file)
@@ -2398,6 +2398,20 @@ EXPORT_SYMBOL_GPL(cpufreq_boost_enabled);
  *********************************************************************/
 static enum cpuhp_state hp_online;
 
+static int cpuhp_cpufreq_online(unsigned int cpu) /* hotplug callback wrapper passed to cpuhp_setup_state_nocalls() */
+{
+       cpufreq_online(cpu); /* any result of cpufreq_online() is deliberately dropped here */
+
+       return 0; /* always report success to the caller */
+}
+
+static int cpuhp_cpufreq_offline(unsigned int cpu) /* hotplug callback wrapper passed to cpuhp_setup_state_nocalls() */
+{
+       cpufreq_offline(cpu); /* any result of cpufreq_offline() is deliberately dropped here */
+
+       return 0; /* always report success to the caller */
+}
+
 /**
  * cpufreq_register_driver - register a CPU Frequency driver
  * @driver_data: A struct cpufreq_driver containing the values#
@@ -2460,8 +2474,8 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
        }
 
        ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "cpufreq:online",
-                                       cpufreq_online,
-                                       cpufreq_offline);
+                                       cpuhp_cpufreq_online,
+                                       cpuhp_cpufreq_offline);
        if (ret < 0)
                goto err_if_unreg;
        hp_online = ret;
index 5c3ec1d..3575b82 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/cpufreq.h>
+#include <linux/cpu_cooling.h>
 #include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/platform_device.h>
@@ -18,6 +19,7 @@
 
 static struct cpufreq_frequency_table *freq_table;
 static struct clk *armss_clk;
+static struct thermal_cooling_device *cdev;
 
 static int dbx500_cpufreq_target(struct cpufreq_policy *policy,
                                unsigned int index)
@@ -32,6 +34,22 @@ static int dbx500_cpufreq_init(struct cpufreq_policy *policy)
        return cpufreq_generic_init(policy, freq_table, 20 * 1000);
 }
 
+static int dbx500_cpufreq_exit(struct cpufreq_policy *policy) /* .exit hook: drops the cooling device stored in cdev; policy is unused */
+{
+       if (!IS_ERR(cdev)) /* dbx500_cpufreq_ready() may have left an ERR_PTR in cdev */
+               cpufreq_cooling_unregister(cdev); /* NOTE(review): cdev is NULL if ready() never ran - confirm unregister tolerates NULL */
+       return 0;
+}
+
+static void dbx500_cpufreq_ready(struct cpufreq_policy *policy) /* .ready hook: registers a cooling device for policy->cpus */
+{
+       cdev = cpufreq_cooling_register(policy->cpus); /* result (possibly an ERR_PTR) is kept in the file-scope cdev */
+       if (IS_ERR(cdev))
+               pr_err("Failed to register cooling device %ld\n", PTR_ERR(cdev)); /* failure is only logged */
+       else
+               pr_info("Cooling device registered: %s\n", cdev->type);
+}
+
 static struct cpufreq_driver dbx500_cpufreq_driver = {
        .flags  = CPUFREQ_STICKY | CPUFREQ_CONST_LOOPS |
                        CPUFREQ_NEED_INITIAL_FREQ_CHECK,
@@ -39,6 +57,8 @@ static struct cpufreq_driver dbx500_cpufreq_driver = {
        .target_index = dbx500_cpufreq_target,
        .get    = cpufreq_generic_get,
        .init   = dbx500_cpufreq_init,
+       .exit  = dbx500_cpufreq_exit,
+       .ready  = dbx500_cpufreq_ready,
        .name   = "DBX500",
        .attr   = cpufreq_generic_attr,
 };
index e28a31a..a757c0a 100644 (file)
@@ -34,6 +34,11 @@ struct cpufreq_acpi_io {
        unsigned int                            resume;
 };
 
+struct cpufreq_acpi_req { /* argument bundle handed to the work_on_cpu() callbacks */
+       unsigned int            cpu; /* CPU the callback must be running on; also indexes acpi_io_data[] */
+       unsigned int            state; /* target state index; set by acpi_cpufreq_target(), read by processor_set_freq() */
+};
+
 static struct cpufreq_acpi_io  *acpi_io_data[NR_CPUS];
 
 static struct cpufreq_driver acpi_cpufreq_driver;
@@ -83,8 +88,7 @@ processor_get_pstate (
 static unsigned
 extract_clock (
        struct cpufreq_acpi_io *data,
-       unsigned value,
-       unsigned int cpu)
+       unsigned value)
 {
        unsigned long i;
 
@@ -98,60 +102,43 @@ extract_clock (
 }
 
 
-static unsigned int
+static long
 processor_get_freq (
-       struct cpufreq_acpi_io  *data,
-       unsigned int            cpu)
+       void *arg)
 {
-       int                     ret = 0;
-       u32                     value = 0;
-       cpumask_t               saved_mask;
-       unsigned long           clock_freq;
+       struct cpufreq_acpi_req *req = arg;
+       unsigned int            cpu = req->cpu;
+       struct cpufreq_acpi_io  *data = acpi_io_data[cpu];
+       u32                     value;
+       int                     ret;
 
        pr_debug("processor_get_freq\n");
-
-       saved_mask = current->cpus_allowed;
-       set_cpus_allowed_ptr(current, cpumask_of(cpu));
        if (smp_processor_id() != cpu)
-               goto migrate_end;
+               return -EAGAIN;
 
        /* processor_get_pstate gets the instantaneous frequency */
        ret = processor_get_pstate(&value);
-
        if (ret) {
-               set_cpus_allowed_ptr(current, &saved_mask);
                pr_warn("get performance failed with error %d\n", ret);
-               ret = 0;
-               goto migrate_end;
+               return ret;
        }
-       clock_freq = extract_clock(data, value, cpu);
-       ret = (clock_freq*1000);
-
-migrate_end:
-       set_cpus_allowed_ptr(current, &saved_mask);
-       return ret;
+       return 1000 * extract_clock(data, value);
 }
 
 
-static int
+static long
 processor_set_freq (
-       struct cpufreq_acpi_io  *data,
-       struct cpufreq_policy   *policy,
-       int                     state)
+       void *arg)
 {
-       int                     ret = 0;
-       u32                     value = 0;
-       cpumask_t               saved_mask;
-       int                     retval;
+       struct cpufreq_acpi_req *req = arg;
+       unsigned int            cpu = req->cpu;
+       struct cpufreq_acpi_io  *data = acpi_io_data[cpu];
+       int                     ret, state = req->state;
+       u32                     value;
 
        pr_debug("processor_set_freq\n");
-
-       saved_mask = current->cpus_allowed;
-       set_cpus_allowed_ptr(current, cpumask_of(policy->cpu));
-       if (smp_processor_id() != policy->cpu) {
-               retval = -EAGAIN;
-               goto migrate_end;
-       }
+       if (smp_processor_id() != cpu)
+               return -EAGAIN;
 
        if (state == data->acpi_data.state) {
                if (unlikely(data->resume)) {
@@ -159,8 +146,7 @@ processor_set_freq (
                        data->resume = 0;
                } else {
                        pr_debug("Already at target state (P%d)\n", state);
-                       retval = 0;
-                       goto migrate_end;
+                       return 0;
                }
        }
 
@@ -171,7 +157,6 @@ processor_set_freq (
         * First we write the target state's 'control' value to the
         * control_register.
         */
-
        value = (u32) data->acpi_data.states[state].control;
 
        pr_debug("Transitioning to state: 0x%08x\n", value);
@@ -179,17 +164,11 @@ processor_set_freq (
        ret = processor_set_pstate(value);
        if (ret) {
                pr_warn("Transition failed with error %d\n", ret);
-               retval = -ENODEV;
-               goto migrate_end;
+               return -ENODEV;
        }
 
        data->acpi_data.state = state;
-
-       retval = 0;
-
-migrate_end:
-       set_cpus_allowed_ptr(current, &saved_mask);
-       return (retval);
+       return 0;
 }
 
 
@@ -197,11 +176,13 @@ static unsigned int
 acpi_cpufreq_get (
        unsigned int            cpu)
 {
-       struct cpufreq_acpi_io *data = acpi_io_data[cpu];
+       struct cpufreq_acpi_req req;
+       long ret;
 
-       pr_debug("acpi_cpufreq_get\n");
+       req.cpu = cpu;
+       ret = work_on_cpu(cpu, processor_get_freq, &req);
 
-       return processor_get_freq(data, cpu);
+       return ret > 0 ? (unsigned int) ret : 0;
 }
 
 
@@ -210,7 +191,12 @@ acpi_cpufreq_target (
        struct cpufreq_policy   *policy,
        unsigned int index)
 {
-       return processor_set_freq(acpi_io_data[policy->cpu], policy, index);
+       struct cpufreq_acpi_req req;
+
+       req.cpu = policy->cpu;
+       req.state = index;
+
+       return work_on_cpu(req.cpu, processor_set_freq, &req);
 }
 
 static int
index 7719b02..9c13f09 100644 (file)
@@ -161,8 +161,13 @@ static int imx6q_set_target(struct cpufreq_policy *policy, unsigned int index)
 
 static int imx6q_cpufreq_init(struct cpufreq_policy *policy)
 {
+       int ret;
+
        policy->clk = arm_clk;
-       return cpufreq_generic_init(policy, freq_table, transition_latency);
+       ret = cpufreq_generic_init(policy, freq_table, transition_latency);
+       policy->suspend_freq = policy->max;
+
+       return ret;
 }
 
 static struct cpufreq_driver imx6q_cpufreq_driver = {
@@ -173,6 +178,7 @@ static struct cpufreq_driver imx6q_cpufreq_driver = {
        .init = imx6q_cpufreq_init,
        .name = "imx6q-cpufreq",
        .attr = cpufreq_generic_attr,
+       .suspend = cpufreq_generic_suspend,
 };
 
 static int imx6q_cpufreq_probe(struct platform_device *pdev)
@@ -222,6 +228,13 @@ static int imx6q_cpufreq_probe(struct platform_device *pdev)
        arm_reg = regulator_get(cpu_dev, "arm");
        pu_reg = regulator_get_optional(cpu_dev, "pu");
        soc_reg = regulator_get(cpu_dev, "soc");
+       if (PTR_ERR(arm_reg) == -EPROBE_DEFER ||
+                       PTR_ERR(soc_reg) == -EPROBE_DEFER ||
+                       PTR_ERR(pu_reg) == -EPROBE_DEFER) {
+               ret = -EPROBE_DEFER;
+               dev_dbg(cpu_dev, "regulators not ready, defer\n");
+               goto put_reg;
+       }
        if (IS_ERR(arm_reg) || IS_ERR(soc_reg)) {
                dev_err(cpu_dev, "failed to get regulators\n");
                ret = -ENOENT;
@@ -255,7 +268,7 @@ static int imx6q_cpufreq_probe(struct platform_device *pdev)
        ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table);
        if (ret) {
                dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret);
-               goto put_reg;
+               goto out_free_opp;
        }
 
        /* Make imx6_soc_volt array's size same as arm opp number */
index 283491f..b7de5bd 100644 (file)
 #include <asm/cpufeature.h>
 #include <asm/intel-family.h>
 
+#define INTEL_PSTATE_DEFAULT_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC)
+#define INTEL_PSTATE_HWP_SAMPLING_INTERVAL     (50 * NSEC_PER_MSEC)
+
 #define INTEL_CPUFREQ_TRANSITION_LATENCY       20000
+#define INTEL_CPUFREQ_TRANSITION_DELAY         500
 
 #ifdef CONFIG_ACPI
 #include <acpi/processor.h>
@@ -74,6 +78,11 @@ static inline int ceiling_fp(int32_t x)
        return ret;
 }
 
+static inline int32_t percent_fp(int percent)
+{
+       return div_fp(percent, 100);
+}
+
 static inline u64 mul_ext_fp(u64 x, u64 y)
 {
        return (x * y) >> EXT_FRAC_BITS;
@@ -186,45 +195,22 @@ struct _pid {
 };
 
 /**
- * struct perf_limits - Store user and policy limits
- * @no_turbo:          User requested turbo state from intel_pstate sysfs
- * @turbo_disabled:    Platform turbo status either from msr
- *                     MSR_IA32_MISC_ENABLE or when maximum available pstate
- *                     matches the maximum turbo pstate
- * @max_perf_pct:      Effective maximum performance limit in percentage, this
- *                     is minimum of either limits enforced by cpufreq policy
- *                     or limits from user set limits via intel_pstate sysfs
- * @min_perf_pct:      Effective minimum performance limit in percentage, this
- *                     is maximum of either limits enforced by cpufreq policy
- *                     or limits from user set limits via intel_pstate sysfs
- * @max_perf:          This is a scaled value between 0 to 255 for max_perf_pct
- *                     This value is used to limit max pstate
- * @min_perf:          This is a scaled value between 0 to 255 for min_perf_pct
- *                     This value is used to limit min pstate
- * @max_policy_pct:    The maximum performance in percentage enforced by
- *                     cpufreq setpolicy interface
- * @max_sysfs_pct:     The maximum performance in percentage enforced by
- *                     intel pstate sysfs interface, unused when per cpu
- *                     controls are enforced
- * @min_policy_pct:    The minimum performance in percentage enforced by
- *                     cpufreq setpolicy interface
- * @min_sysfs_pct:     The minimum performance in percentage enforced by
- *                     intel pstate sysfs interface, unused when per cpu
- *                     controls are enforced
- *
- * Storage for user and policy defined limits.
+ * struct global_params - Global parameters, mostly tunable via sysfs.
+ * @no_turbo:          Whether or not to use turbo P-states.
+ * @turbo_disabled:    Whether or not turbo P-states are available at all,
+ *                     based on the MSR_IA32_MISC_ENABLE value and whether or
+ *                     not the maximum reported turbo P-state is different from
+ *                     the maximum reported non-turbo one.
+ * @min_perf_pct:      Minimum capacity limit in percent of the maximum turbo
+ *                     P-state capacity.
+ * @max_perf_pct:      Maximum capacity limit in percent of the maximum turbo
+ *                     P-state capacity.
  */
-struct perf_limits {
-       int no_turbo;
-       int turbo_disabled;
+struct global_params {
+       bool no_turbo;
+       bool turbo_disabled;
        int max_perf_pct;
        int min_perf_pct;
-       int32_t max_perf;
-       int32_t min_perf;
-       int max_policy_pct;
-       int max_sysfs_pct;
-       int min_policy_pct;
-       int min_sysfs_pct;
 };
 
 /**
@@ -245,9 +231,10 @@ struct perf_limits {
  * @prev_cummulative_iowait: IO Wait time difference from last and
  *                     current sample
  * @sample:            Storage for storing last Sample data
- * @perf_limits:       Pointer to perf_limit unique to this CPU
- *                     Not all field in the structure are applicable
- *                     when per cpu controls are enforced
+ * @min_perf:          Minimum capacity limit as a fraction of the maximum
+ *                     turbo P-state capacity.
+ * @max_perf:          Maximum capacity limit as a fraction of the maximum
+ *                     turbo P-state capacity.
  * @acpi_perf_data:    Stores ACPI perf information read from _PSS
  * @valid_pss_table:   Set to true for valid ACPI _PSS entries found
  * @epp_powersave:     Last saved HWP energy performance preference
@@ -279,7 +266,8 @@ struct cpudata {
        u64     prev_tsc;
        u64     prev_cummulative_iowait;
        struct sample sample;
-       struct perf_limits *perf_limits;
+       int32_t min_perf;
+       int32_t max_perf;
 #ifdef CONFIG_ACPI
        struct acpi_processor_performance acpi_perf_data;
        bool valid_pss_table;
@@ -324,7 +312,7 @@ struct pstate_adjust_policy {
  * @get_scaling:       Callback to get frequency scaling factor
  * @get_val:           Callback to convert P state to actual MSR write value
  * @get_vid:           Callback to get VID data for Atom platforms
- * @get_target_pstate: Callback to a function to calculate next P state to use
+ * @update_util:       Active mode utilization update callback.
  *
  * Core and Atom CPU models have different way to get P State limits. This
  * structure is used to store those callbacks.
@@ -337,43 +325,31 @@ struct pstate_funcs {
        int (*get_scaling)(void);
        u64 (*get_val)(struct cpudata*, int pstate);
        void (*get_vid)(struct cpudata *);
-       int32_t (*get_target_pstate)(struct cpudata *);
+       void (*update_util)(struct update_util_data *data, u64 time,
+                           unsigned int flags);
 };
 
-/**
- * struct cpu_defaults- Per CPU model default config data
- * @pid_policy:        PID config data
- * @funcs:             Callback function data
- */
-struct cpu_defaults {
-       struct pstate_adjust_policy pid_policy;
-       struct pstate_funcs funcs;
+static struct pstate_funcs pstate_funcs __read_mostly;
+static struct pstate_adjust_policy pid_params __read_mostly = {
+       .sample_rate_ms = 10,
+       .sample_rate_ns = 10 * NSEC_PER_MSEC,
+       .deadband = 0,
+       .setpoint = 97,
+       .p_gain_pct = 20,
+       .d_gain_pct = 0,
+       .i_gain_pct = 0,
 };
 
-static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu);
-static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu);
-
-static struct pstate_adjust_policy pid_params __read_mostly;
-static struct pstate_funcs pstate_funcs __read_mostly;
 static int hwp_active __read_mostly;
 static bool per_cpu_limits __read_mostly;
 
-static bool driver_registered __read_mostly;
+static struct cpufreq_driver *intel_pstate_driver __read_mostly;
 
 #ifdef CONFIG_ACPI
 static bool acpi_ppc;
 #endif
 
-static struct perf_limits global;
-
-static void intel_pstate_init_limits(struct perf_limits *limits)
-{
-       memset(limits, 0, sizeof(*limits));
-       limits->max_perf_pct = 100;
-       limits->max_perf = int_ext_tofp(1);
-       limits->max_policy_pct = 100;
-       limits->max_sysfs_pct = 100;
-}
+static struct global_params global;
 
 static DEFINE_MUTEX(intel_pstate_driver_lock);
 static DEFINE_MUTEX(intel_pstate_limits_lock);
@@ -530,29 +506,6 @@ static inline void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
 }
 #endif
 
-static inline void pid_reset(struct _pid *pid, int setpoint, int busy,
-                            int deadband, int integral) {
-       pid->setpoint = int_tofp(setpoint);
-       pid->deadband  = int_tofp(deadband);
-       pid->integral  = int_tofp(integral);
-       pid->last_err  = int_tofp(setpoint) - int_tofp(busy);
-}
-
-static inline void pid_p_gain_set(struct _pid *pid, int percent)
-{
-       pid->p_gain = div_fp(percent, 100);
-}
-
-static inline void pid_i_gain_set(struct _pid *pid, int percent)
-{
-       pid->i_gain = div_fp(percent, 100);
-}
-
-static inline void pid_d_gain_set(struct _pid *pid, int percent)
-{
-       pid->d_gain = div_fp(percent, 100);
-}
-
 static signed int pid_calc(struct _pid *pid, int32_t busy)
 {
        signed int result;
@@ -590,23 +543,17 @@ static signed int pid_calc(struct _pid *pid, int32_t busy)
        return (signed int)fp_toint(result);
 }
 
-static inline void intel_pstate_busy_pid_reset(struct cpudata *cpu)
-{
-       pid_p_gain_set(&cpu->pid, pid_params.p_gain_pct);
-       pid_d_gain_set(&cpu->pid, pid_params.d_gain_pct);
-       pid_i_gain_set(&cpu->pid, pid_params.i_gain_pct);
-
-       pid_reset(&cpu->pid, pid_params.setpoint, 100, pid_params.deadband, 0);
-}
-
-static inline void intel_pstate_reset_all_pid(void)
+static inline void intel_pstate_pid_reset(struct cpudata *cpu)
 {
-       unsigned int cpu;
+       struct _pid *pid = &cpu->pid;
 
-       for_each_online_cpu(cpu) {
-               if (all_cpu_data[cpu])
-                       intel_pstate_busy_pid_reset(all_cpu_data[cpu]);
-       }
+       pid->p_gain = percent_fp(pid_params.p_gain_pct);
+       pid->d_gain = percent_fp(pid_params.d_gain_pct);
+       pid->i_gain = percent_fp(pid_params.i_gain_pct);
+       pid->setpoint = int_tofp(pid_params.setpoint);
+       pid->last_err  = pid->setpoint - int_tofp(100);
+       pid->deadband  = int_tofp(pid_params.deadband);
+       pid->integral  = 0;
 }
 
 static inline void update_turbo_state(void)
@@ -621,6 +568,14 @@ static inline void update_turbo_state(void)
                 cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);
 }
 
+static int min_perf_pct_min(void)
+{
+       struct cpudata *cpu = all_cpu_data[0];
+
+       return DIV_ROUND_UP(cpu->pstate.min_pstate * 100,
+                           cpu->pstate.turbo_pstate);
+}
+
 static s16 intel_pstate_get_epb(struct cpudata *cpu_data)
 {
        u64 epb;
@@ -838,96 +793,80 @@ static struct freq_attr *hwp_cpufreq_attrs[] = {
        NULL,
 };
 
-static void intel_pstate_hwp_set(struct cpufreq_policy *policy)
+static void intel_pstate_hwp_set(unsigned int cpu)
 {
-       int min, hw_min, max, hw_max, cpu;
-       struct perf_limits *perf_limits = &global;
+       struct cpudata *cpu_data = all_cpu_data[cpu];
+       int min, hw_min, max, hw_max;
        u64 value, cap;
+       s16 epp;
 
-       for_each_cpu(cpu, policy->cpus) {
-               struct cpudata *cpu_data = all_cpu_data[cpu];
-               s16 epp;
-
-               if (per_cpu_limits)
-                       perf_limits = all_cpu_data[cpu]->perf_limits;
-
-               rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap);
-               hw_min = HWP_LOWEST_PERF(cap);
-               if (global.no_turbo)
-                       hw_max = HWP_GUARANTEED_PERF(cap);
-               else
-                       hw_max = HWP_HIGHEST_PERF(cap);
-
-               max = fp_ext_toint(hw_max * perf_limits->max_perf);
-               if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE)
-                       min = max;
-               else
-                       min = fp_ext_toint(hw_max * perf_limits->min_perf);
+       rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap);
+       hw_min = HWP_LOWEST_PERF(cap);
+       if (global.no_turbo)
+               hw_max = HWP_GUARANTEED_PERF(cap);
+       else
+               hw_max = HWP_HIGHEST_PERF(cap);
 
-               rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value);
+       max = fp_ext_toint(hw_max * cpu_data->max_perf);
+       if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE)
+               min = max;
+       else
+               min = fp_ext_toint(hw_max * cpu_data->min_perf);
 
-               value &= ~HWP_MIN_PERF(~0L);
-               value |= HWP_MIN_PERF(min);
+       rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value);
 
-               value &= ~HWP_MAX_PERF(~0L);
-               value |= HWP_MAX_PERF(max);
+       value &= ~HWP_MIN_PERF(~0L);
+       value |= HWP_MIN_PERF(min);
 
-               if (cpu_data->epp_policy == cpu_data->policy)
-                       goto skip_epp;
+       value &= ~HWP_MAX_PERF(~0L);
+       value |= HWP_MAX_PERF(max);
 
-               cpu_data->epp_policy = cpu_data->policy;
+       if (cpu_data->epp_policy == cpu_data->policy)
+               goto skip_epp;
 
-               if (cpu_data->epp_saved >= 0) {
-                       epp = cpu_data->epp_saved;
-                       cpu_data->epp_saved = -EINVAL;
-                       goto update_epp;
-               }
+       cpu_data->epp_policy = cpu_data->policy;
 
-               if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) {
-                       epp = intel_pstate_get_epp(cpu_data, value);
-                       cpu_data->epp_powersave = epp;
-                       /* If EPP read was failed, then don't try to write */
-                       if (epp < 0)
-                               goto skip_epp;
+       if (cpu_data->epp_saved >= 0) {
+               epp = cpu_data->epp_saved;
+               cpu_data->epp_saved = -EINVAL;
+               goto update_epp;
+       }
 
+       if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) {
+               epp = intel_pstate_get_epp(cpu_data, value);
+               cpu_data->epp_powersave = epp;
+               /* If the EPP read failed, don't try to write it back */
+               if (epp < 0)
+                       goto skip_epp;
 
-                       epp = 0;
-               } else {
-                       /* skip setting EPP, when saved value is invalid */
-                       if (cpu_data->epp_powersave < 0)
-                               goto skip_epp;
+               epp = 0;
+       } else {
+               /* skip setting EPP, when saved value is invalid */
+               if (cpu_data->epp_powersave < 0)
+                       goto skip_epp;
 
-                       /*
-                        * No need to restore EPP when it is not zero. This
-                        * means:
-                        *  - Policy is not changed
-                        *  - user has manually changed
-                        *  - Error reading EPB
-                        */
-                       epp = intel_pstate_get_epp(cpu_data, value);
-                       if (epp)
-                               goto skip_epp;
+               /*
+                * No need to restore EPP when it is not zero. This
+                * means:
+                *  - Policy is not changed
+                *  - user has manually changed
+                *  - Error reading EPB
+                */
+               epp = intel_pstate_get_epp(cpu_data, value);
+               if (epp)
+                       goto skip_epp;
 
-                       epp = cpu_data->epp_powersave;
-               }
+               epp = cpu_data->epp_powersave;
+       }
 update_epp:
-               if (static_cpu_has(X86_FEATURE_HWP_EPP)) {
-                       value &= ~GENMASK_ULL(31, 24);
-                       value |= (u64)epp << 24;
-               } else {
-                       intel_pstate_set_epb(cpu, epp);
-               }
-skip_epp:
-               wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
+       if (static_cpu_has(X86_FEATURE_HWP_EPP)) {
+               value &= ~GENMASK_ULL(31, 24);
+               value |= (u64)epp << 24;
+       } else {
+               intel_pstate_set_epb(cpu, epp);
        }
-}
-
-static int intel_pstate_hwp_set_policy(struct cpufreq_policy *policy)
-{
-       if (hwp_active)
-               intel_pstate_hwp_set(policy);
-
-       return 0;
+skip_epp:
+       wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
 }
 
 static int intel_pstate_hwp_save_state(struct cpufreq_policy *policy)
@@ -944,20 +883,17 @@ static int intel_pstate_hwp_save_state(struct cpufreq_policy *policy)
 
 static int intel_pstate_resume(struct cpufreq_policy *policy)
 {
-       int ret;
-
        if (!hwp_active)
                return 0;
 
        mutex_lock(&intel_pstate_limits_lock);
 
        all_cpu_data[policy->cpu]->epp_policy = 0;
-
-       ret = intel_pstate_hwp_set_policy(policy);
+       intel_pstate_hwp_set(policy->cpu);
 
        mutex_unlock(&intel_pstate_limits_lock);
 
-       return ret;
+       return 0;
 }
 
 static void intel_pstate_update_policies(void)
@@ -971,9 +907,14 @@ static void intel_pstate_update_policies(void)
 /************************** debugfs begin ************************/
 static int pid_param_set(void *data, u64 val)
 {
+       unsigned int cpu;
+
        *(u32 *)data = val;
        pid_params.sample_rate_ns = pid_params.sample_rate_ms * NSEC_PER_MSEC;
-       intel_pstate_reset_all_pid();
+       for_each_possible_cpu(cpu)
+               if (all_cpu_data[cpu])
+                       intel_pstate_pid_reset(all_cpu_data[cpu]);
+
        return 0;
 }
 
@@ -1084,7 +1025,7 @@ static ssize_t show_turbo_pct(struct kobject *kobj,
 
        mutex_lock(&intel_pstate_driver_lock);
 
-       if (!driver_registered) {
+       if (!intel_pstate_driver) {
                mutex_unlock(&intel_pstate_driver_lock);
                return -EAGAIN;
        }
@@ -1109,7 +1050,7 @@ static ssize_t show_num_pstates(struct kobject *kobj,
 
        mutex_lock(&intel_pstate_driver_lock);
 
-       if (!driver_registered) {
+       if (!intel_pstate_driver) {
                mutex_unlock(&intel_pstate_driver_lock);
                return -EAGAIN;
        }
@@ -1129,7 +1070,7 @@ static ssize_t show_no_turbo(struct kobject *kobj,
 
        mutex_lock(&intel_pstate_driver_lock);
 
-       if (!driver_registered) {
+       if (!intel_pstate_driver) {
                mutex_unlock(&intel_pstate_driver_lock);
                return -EAGAIN;
        }
@@ -1157,7 +1098,7 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b,
 
        mutex_lock(&intel_pstate_driver_lock);
 
-       if (!driver_registered) {
+       if (!intel_pstate_driver) {
                mutex_unlock(&intel_pstate_driver_lock);
                return -EAGAIN;
        }
@@ -1174,6 +1115,15 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b,
 
        global.no_turbo = clamp_t(int, input, 0, 1);
 
+       if (global.no_turbo) {
+               struct cpudata *cpu = all_cpu_data[0];
+               int pct = cpu->pstate.max_pstate * 100 / cpu->pstate.turbo_pstate;
+
+               /* Squash the global minimum into the permitted range. */
+               if (global.min_perf_pct > pct)
+                       global.min_perf_pct = pct;
+       }
+
        mutex_unlock(&intel_pstate_limits_lock);
 
        intel_pstate_update_policies();
@@ -1195,18 +1145,14 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
 
        mutex_lock(&intel_pstate_driver_lock);
 
-       if (!driver_registered) {
+       if (!intel_pstate_driver) {
                mutex_unlock(&intel_pstate_driver_lock);
                return -EAGAIN;
        }
 
        mutex_lock(&intel_pstate_limits_lock);
 
-       global.max_sysfs_pct = clamp_t(int, input, 0 , 100);
-       global.max_perf_pct = min(global.max_policy_pct, global.max_sysfs_pct);
-       global.max_perf_pct = max(global.min_policy_pct, global.max_perf_pct);
-       global.max_perf_pct = max(global.min_perf_pct, global.max_perf_pct);
-       global.max_perf = percent_ext_fp(global.max_perf_pct);
+       global.max_perf_pct = clamp_t(int, input, global.min_perf_pct, 100);
 
        mutex_unlock(&intel_pstate_limits_lock);
 
@@ -1229,18 +1175,15 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
 
        mutex_lock(&intel_pstate_driver_lock);
 
-       if (!driver_registered) {
+       if (!intel_pstate_driver) {
                mutex_unlock(&intel_pstate_driver_lock);
                return -EAGAIN;
        }
 
        mutex_lock(&intel_pstate_limits_lock);
 
-       global.min_sysfs_pct = clamp_t(int, input, 0 , 100);
-       global.min_perf_pct = max(global.min_policy_pct, global.min_sysfs_pct);
-       global.min_perf_pct = min(global.max_policy_pct, global.min_perf_pct);
-       global.min_perf_pct = min(global.max_perf_pct, global.min_perf_pct);
-       global.min_perf = percent_ext_fp(global.min_perf_pct);
+       global.min_perf_pct = clamp_t(int, input,
+                                     min_perf_pct_min(), global.max_perf_pct);
 
        mutex_unlock(&intel_pstate_limits_lock);
 
@@ -1554,132 +1497,10 @@ static int knl_get_turbo_pstate(void)
        return ret;
 }
 
-static struct cpu_defaults core_params = {
-       .pid_policy = {
-               .sample_rate_ms = 10,
-               .deadband = 0,
-               .setpoint = 97,
-               .p_gain_pct = 20,
-               .d_gain_pct = 0,
-               .i_gain_pct = 0,
-       },
-       .funcs = {
-               .get_max = core_get_max_pstate,
-               .get_max_physical = core_get_max_pstate_physical,
-               .get_min = core_get_min_pstate,
-               .get_turbo = core_get_turbo_pstate,
-               .get_scaling = core_get_scaling,
-               .get_val = core_get_val,
-               .get_target_pstate = get_target_pstate_use_performance,
-       },
-};
-
-static const struct cpu_defaults silvermont_params = {
-       .pid_policy = {
-               .sample_rate_ms = 10,
-               .deadband = 0,
-               .setpoint = 60,
-               .p_gain_pct = 14,
-               .d_gain_pct = 0,
-               .i_gain_pct = 4,
-       },
-       .funcs = {
-               .get_max = atom_get_max_pstate,
-               .get_max_physical = atom_get_max_pstate,
-               .get_min = atom_get_min_pstate,
-               .get_turbo = atom_get_turbo_pstate,
-               .get_val = atom_get_val,
-               .get_scaling = silvermont_get_scaling,
-               .get_vid = atom_get_vid,
-               .get_target_pstate = get_target_pstate_use_cpu_load,
-       },
-};
-
-static const struct cpu_defaults airmont_params = {
-       .pid_policy = {
-               .sample_rate_ms = 10,
-               .deadband = 0,
-               .setpoint = 60,
-               .p_gain_pct = 14,
-               .d_gain_pct = 0,
-               .i_gain_pct = 4,
-       },
-       .funcs = {
-               .get_max = atom_get_max_pstate,
-               .get_max_physical = atom_get_max_pstate,
-               .get_min = atom_get_min_pstate,
-               .get_turbo = atom_get_turbo_pstate,
-               .get_val = atom_get_val,
-               .get_scaling = airmont_get_scaling,
-               .get_vid = atom_get_vid,
-               .get_target_pstate = get_target_pstate_use_cpu_load,
-       },
-};
-
-static const struct cpu_defaults knl_params = {
-       .pid_policy = {
-               .sample_rate_ms = 10,
-               .deadband = 0,
-               .setpoint = 97,
-               .p_gain_pct = 20,
-               .d_gain_pct = 0,
-               .i_gain_pct = 0,
-       },
-       .funcs = {
-               .get_max = core_get_max_pstate,
-               .get_max_physical = core_get_max_pstate_physical,
-               .get_min = core_get_min_pstate,
-               .get_turbo = knl_get_turbo_pstate,
-               .get_scaling = core_get_scaling,
-               .get_val = core_get_val,
-               .get_target_pstate = get_target_pstate_use_performance,
-       },
-};
-
-static const struct cpu_defaults bxt_params = {
-       .pid_policy = {
-               .sample_rate_ms = 10,
-               .deadband = 0,
-               .setpoint = 60,
-               .p_gain_pct = 14,
-               .d_gain_pct = 0,
-               .i_gain_pct = 4,
-       },
-       .funcs = {
-               .get_max = core_get_max_pstate,
-               .get_max_physical = core_get_max_pstate_physical,
-               .get_min = core_get_min_pstate,
-               .get_turbo = core_get_turbo_pstate,
-               .get_scaling = core_get_scaling,
-               .get_val = core_get_val,
-               .get_target_pstate = get_target_pstate_use_cpu_load,
-       },
-};
-
-static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max)
+static int intel_pstate_get_base_pstate(struct cpudata *cpu)
 {
-       int max_perf = cpu->pstate.turbo_pstate;
-       int max_perf_adj;
-       int min_perf;
-       struct perf_limits *perf_limits = &global;
-
-       if (global.no_turbo || global.turbo_disabled)
-               max_perf = cpu->pstate.max_pstate;
-
-       if (per_cpu_limits)
-               perf_limits = cpu->perf_limits;
-
-       /*
-        * performance can be limited by user through sysfs, by cpufreq
-        * policy, or by cpu specific default values determined through
-        * experimentation.
-        */
-       max_perf_adj = fp_ext_toint(max_perf * perf_limits->max_perf);
-       *max = clamp_t(int, max_perf_adj,
-                       cpu->pstate.min_pstate, cpu->pstate.turbo_pstate);
-
-       min_perf = fp_ext_toint(max_perf * perf_limits->min_perf);
-       *min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf);
+       return global.no_turbo || global.turbo_disabled ?
+                       cpu->pstate.max_pstate : cpu->pstate.turbo_pstate;
 }
 
 static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate)
@@ -1702,11 +1523,13 @@ static void intel_pstate_set_min_pstate(struct cpudata *cpu)
 
 static void intel_pstate_max_within_limits(struct cpudata *cpu)
 {
-       int min_pstate, max_pstate;
+       int pstate;
 
        update_turbo_state();
-       intel_pstate_get_min_max(cpu, &min_pstate, &max_pstate);
-       intel_pstate_set_pstate(cpu, max_pstate);
+       pstate = intel_pstate_get_base_pstate(cpu);
+       pstate = max(cpu->pstate.min_pstate,
+                    fp_ext_toint(pstate * cpu->max_perf));
+       intel_pstate_set_pstate(cpu, pstate);
 }
 
 static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
@@ -1767,7 +1590,11 @@ static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time)
         * that sample.time will always be reset before setting the utilization
         * update hook and make the caller skip the sample then.
         */
-       return !!cpu->last_sample_time;
+       if (cpu->last_sample_time) {
+               intel_pstate_calc_avg_perf(cpu);
+               return true;
+       }
+       return false;
 }
 
 static inline int32_t get_avg_frequency(struct cpudata *cpu)
@@ -1788,6 +1615,9 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
        int32_t busy_frac, boost;
        int target, avg_pstate;
 
+       if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE)
+               return cpu->pstate.turbo_pstate;
+
        busy_frac = div_fp(sample->mperf, sample->tsc);
 
        boost = cpu->iowait_boost;
@@ -1824,6 +1654,9 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
        int32_t perf_scaled, max_pstate, current_pstate, sample_ratio;
        u64 duration_ns;
 
+       if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE)
+               return cpu->pstate.turbo_pstate;
+
        /*
         * perf_scaled is the ratio of the average P-state during the last
         * sampling period to the P-state requested last time (in percent).
@@ -1858,11 +1691,13 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
 
 static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate)
 {
-       int max_perf, min_perf;
+       int max_pstate = intel_pstate_get_base_pstate(cpu);
+       int min_pstate;
 
-       intel_pstate_get_min_max(cpu, &min_perf, &max_perf);
-       pstate = clamp_t(int, pstate, min_perf, max_perf);
-       return pstate;
+       min_pstate = max(cpu->pstate.min_pstate,
+                        fp_ext_toint(max_pstate * cpu->min_perf));
+       max_pstate = max(min_pstate, fp_ext_toint(max_pstate * cpu->max_perf));
+       return clamp_t(int, pstate, min_pstate, max_pstate);
 }
 
 static void intel_pstate_update_pstate(struct cpudata *cpu, int pstate)
@@ -1874,16 +1709,11 @@ static void intel_pstate_update_pstate(struct cpudata *cpu, int pstate)
        wrmsrl(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate));
 }
 
-static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
+static void intel_pstate_adjust_pstate(struct cpudata *cpu, int target_pstate)
 {
-       int from, target_pstate;
+       int from = cpu->pstate.current_pstate;
        struct sample *sample;
 
-       from = cpu->pstate.current_pstate;
-
-       target_pstate = cpu->policy == CPUFREQ_POLICY_PERFORMANCE ?
-               cpu->pstate.turbo_pstate : pstate_funcs.get_target_pstate(cpu);
-
        update_turbo_state();
 
        target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
@@ -1902,76 +1732,155 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
                fp_toint(cpu->iowait_boost * 100));
 }
 
+static void intel_pstate_update_util_hwp(struct update_util_data *data,
+                                        u64 time, unsigned int flags)
+{
+       struct cpudata *cpu = container_of(data, struct cpudata, update_util);
+       u64 delta_ns = time - cpu->sample.time;
+
+       if ((s64)delta_ns >= INTEL_PSTATE_HWP_SAMPLING_INTERVAL)
+               intel_pstate_sample(cpu, time);
+}
+
+static void intel_pstate_update_util_pid(struct update_util_data *data,
+                                        u64 time, unsigned int flags)
+{
+       struct cpudata *cpu = container_of(data, struct cpudata, update_util);
+       u64 delta_ns = time - cpu->sample.time;
+
+       if ((s64)delta_ns < pid_params.sample_rate_ns)
+               return;
+
+       if (intel_pstate_sample(cpu, time)) {
+               int target_pstate;
+
+               target_pstate = get_target_pstate_use_performance(cpu);
+               intel_pstate_adjust_pstate(cpu, target_pstate);
+       }
+}
+
 static void intel_pstate_update_util(struct update_util_data *data, u64 time,
                                     unsigned int flags)
 {
        struct cpudata *cpu = container_of(data, struct cpudata, update_util);
        u64 delta_ns;
 
-       if (pstate_funcs.get_target_pstate == get_target_pstate_use_cpu_load) {
-               if (flags & SCHED_CPUFREQ_IOWAIT) {
-                       cpu->iowait_boost = int_tofp(1);
-               } else if (cpu->iowait_boost) {
-                       /* Clear iowait_boost if the CPU may have been idle. */
-                       delta_ns = time - cpu->last_update;
-                       if (delta_ns > TICK_NSEC)
-                               cpu->iowait_boost = 0;
-               }
-               cpu->last_update = time;
+       if (flags & SCHED_CPUFREQ_IOWAIT) {
+               cpu->iowait_boost = int_tofp(1);
+       } else if (cpu->iowait_boost) {
+               /* Clear iowait_boost if the CPU may have been idle. */
+               delta_ns = time - cpu->last_update;
+               if (delta_ns > TICK_NSEC)
+                       cpu->iowait_boost = 0;
        }
-
+       cpu->last_update = time;
        delta_ns = time - cpu->sample.time;
-       if ((s64)delta_ns >= pid_params.sample_rate_ns) {
-               bool sample_taken = intel_pstate_sample(cpu, time);
+       if ((s64)delta_ns < INTEL_PSTATE_DEFAULT_SAMPLING_INTERVAL)
+               return;
 
-               if (sample_taken) {
-                       intel_pstate_calc_avg_perf(cpu);
-                       if (!hwp_active)
-                               intel_pstate_adjust_busy_pstate(cpu);
-               }
+       if (intel_pstate_sample(cpu, time)) {
+               int target_pstate;
+
+               target_pstate = get_target_pstate_use_cpu_load(cpu);
+               intel_pstate_adjust_pstate(cpu, target_pstate);
        }
 }
 
+static struct pstate_funcs core_funcs = {
+       .get_max = core_get_max_pstate,
+       .get_max_physical = core_get_max_pstate_physical,
+       .get_min = core_get_min_pstate,
+       .get_turbo = core_get_turbo_pstate,
+       .get_scaling = core_get_scaling,
+       .get_val = core_get_val,
+       .update_util = intel_pstate_update_util_pid,
+};
+
+static const struct pstate_funcs silvermont_funcs = {
+       .get_max = atom_get_max_pstate,
+       .get_max_physical = atom_get_max_pstate,
+       .get_min = atom_get_min_pstate,
+       .get_turbo = atom_get_turbo_pstate,
+       .get_val = atom_get_val,
+       .get_scaling = silvermont_get_scaling,
+       .get_vid = atom_get_vid,
+       .update_util = intel_pstate_update_util,
+};
+
+static const struct pstate_funcs airmont_funcs = {
+       .get_max = atom_get_max_pstate,
+       .get_max_physical = atom_get_max_pstate,
+       .get_min = atom_get_min_pstate,
+       .get_turbo = atom_get_turbo_pstate,
+       .get_val = atom_get_val,
+       .get_scaling = airmont_get_scaling,
+       .get_vid = atom_get_vid,
+       .update_util = intel_pstate_update_util,
+};
+
+static const struct pstate_funcs knl_funcs = {
+       .get_max = core_get_max_pstate,
+       .get_max_physical = core_get_max_pstate_physical,
+       .get_min = core_get_min_pstate,
+       .get_turbo = knl_get_turbo_pstate,
+       .get_scaling = core_get_scaling,
+       .get_val = core_get_val,
+       .update_util = intel_pstate_update_util_pid,
+};
+
+static const struct pstate_funcs bxt_funcs = {
+       .get_max = core_get_max_pstate,
+       .get_max_physical = core_get_max_pstate_physical,
+       .get_min = core_get_min_pstate,
+       .get_turbo = core_get_turbo_pstate,
+       .get_scaling = core_get_scaling,
+       .get_val = core_get_val,
+       .update_util = intel_pstate_update_util,
+};
+
 #define ICPU(model, policy) \
        { X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF,\
                        (unsigned long)&policy }
 
 static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
-       ICPU(INTEL_FAM6_SANDYBRIDGE,            core_params),
-       ICPU(INTEL_FAM6_SANDYBRIDGE_X,          core_params),
-       ICPU(INTEL_FAM6_ATOM_SILVERMONT1,       silvermont_params),
-       ICPU(INTEL_FAM6_IVYBRIDGE,              core_params),
-       ICPU(INTEL_FAM6_HASWELL_CORE,           core_params),
-       ICPU(INTEL_FAM6_BROADWELL_CORE,         core_params),
-       ICPU(INTEL_FAM6_IVYBRIDGE_X,            core_params),
-       ICPU(INTEL_FAM6_HASWELL_X,              core_params),
-       ICPU(INTEL_FAM6_HASWELL_ULT,            core_params),
-       ICPU(INTEL_FAM6_HASWELL_GT3E,           core_params),
-       ICPU(INTEL_FAM6_BROADWELL_GT3E,         core_params),
-       ICPU(INTEL_FAM6_ATOM_AIRMONT,           airmont_params),
-       ICPU(INTEL_FAM6_SKYLAKE_MOBILE,         core_params),
-       ICPU(INTEL_FAM6_BROADWELL_X,            core_params),
-       ICPU(INTEL_FAM6_SKYLAKE_DESKTOP,        core_params),
-       ICPU(INTEL_FAM6_BROADWELL_XEON_D,       core_params),
-       ICPU(INTEL_FAM6_XEON_PHI_KNL,           knl_params),
-       ICPU(INTEL_FAM6_XEON_PHI_KNM,           knl_params),
-       ICPU(INTEL_FAM6_ATOM_GOLDMONT,          bxt_params),
+       ICPU(INTEL_FAM6_SANDYBRIDGE,            core_funcs),
+       ICPU(INTEL_FAM6_SANDYBRIDGE_X,          core_funcs),
+       ICPU(INTEL_FAM6_ATOM_SILVERMONT1,       silvermont_funcs),
+       ICPU(INTEL_FAM6_IVYBRIDGE,              core_funcs),
+       ICPU(INTEL_FAM6_HASWELL_CORE,           core_funcs),
+       ICPU(INTEL_FAM6_BROADWELL_CORE,         core_funcs),
+       ICPU(INTEL_FAM6_IVYBRIDGE_X,            core_funcs),
+       ICPU(INTEL_FAM6_HASWELL_X,              core_funcs),
+       ICPU(INTEL_FAM6_HASWELL_ULT,            core_funcs),
+       ICPU(INTEL_FAM6_HASWELL_GT3E,           core_funcs),
+       ICPU(INTEL_FAM6_BROADWELL_GT3E,         core_funcs),
+       ICPU(INTEL_FAM6_ATOM_AIRMONT,           airmont_funcs),
+       ICPU(INTEL_FAM6_SKYLAKE_MOBILE,         core_funcs),
+       ICPU(INTEL_FAM6_BROADWELL_X,            core_funcs),
+       ICPU(INTEL_FAM6_SKYLAKE_DESKTOP,        core_funcs),
+       ICPU(INTEL_FAM6_BROADWELL_XEON_D,       core_funcs),
+       ICPU(INTEL_FAM6_XEON_PHI_KNL,           knl_funcs),
+       ICPU(INTEL_FAM6_XEON_PHI_KNM,           knl_funcs),
+       ICPU(INTEL_FAM6_ATOM_GOLDMONT,          bxt_funcs),
+       ICPU(INTEL_FAM6_ATOM_GEMINI_LAKE,       bxt_funcs),
        {}
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids);
 
 static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = {
-       ICPU(INTEL_FAM6_BROADWELL_XEON_D, core_params),
-       ICPU(INTEL_FAM6_BROADWELL_X, core_params),
-       ICPU(INTEL_FAM6_SKYLAKE_X, core_params),
+       ICPU(INTEL_FAM6_BROADWELL_XEON_D, core_funcs),
+       ICPU(INTEL_FAM6_BROADWELL_X, core_funcs),
+       ICPU(INTEL_FAM6_SKYLAKE_X, core_funcs),
        {}
 };
 
 static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[] = {
-       ICPU(INTEL_FAM6_KABYLAKE_DESKTOP, core_params),
+       ICPU(INTEL_FAM6_KABYLAKE_DESKTOP, core_funcs),
        {}
 };
 
+static bool pid_in_use(void);
+
 static int intel_pstate_init_cpu(unsigned int cpunum)
 {
        struct cpudata *cpu;
@@ -1979,18 +1888,11 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
        cpu = all_cpu_data[cpunum];
 
        if (!cpu) {
-               unsigned int size = sizeof(struct cpudata);
-
-               if (per_cpu_limits)
-                       size += sizeof(struct perf_limits);
-
-               cpu = kzalloc(size, GFP_KERNEL);
+               cpu = kzalloc(sizeof(*cpu), GFP_KERNEL);
                if (!cpu)
                        return -ENOMEM;
 
                all_cpu_data[cpunum] = cpu;
-               if (per_cpu_limits)
-                       cpu->perf_limits = (struct perf_limits *)(cpu + 1);
 
                cpu->epp_default = -EINVAL;
                cpu->epp_powersave = -EINVAL;
@@ -2009,14 +1911,12 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
                        intel_pstate_disable_ee(cpunum);
 
                intel_pstate_hwp_enable(cpu);
-               pid_params.sample_rate_ms = 50;
-               pid_params.sample_rate_ns = 50 * NSEC_PER_MSEC;
+       } else if (pid_in_use()) {
+               intel_pstate_pid_reset(cpu);
        }
 
        intel_pstate_get_cpu_pstates(cpu);
 
-       intel_pstate_busy_pid_reset(cpu);
-
        pr_debug("controlling: cpu %d\n", cpunum);
 
        return 0;
@@ -2039,7 +1939,7 @@ static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
        /* Prevent intel_pstate_update_util() from using stale data. */
        cpu->sample.time = 0;
        cpufreq_add_update_util_hook(cpu_num, &cpu->update_util,
-                                    intel_pstate_update_util);
+                                    pstate_funcs.update_util);
        cpu->update_util_set = true;
 }
 
@@ -2055,46 +1955,68 @@ static void intel_pstate_clear_update_util_hook(unsigned int cpu)
        synchronize_sched();
 }
 
+static int intel_pstate_get_max_freq(struct cpudata *cpu)
+{
+       return global.turbo_disabled || global.no_turbo ?
+                       cpu->pstate.max_freq : cpu->pstate.turbo_freq;
+}
+
 static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy,
-                                           struct perf_limits *limits)
+                                           struct cpudata *cpu)
 {
+       int max_freq = intel_pstate_get_max_freq(cpu);
        int32_t max_policy_perf, min_policy_perf;
 
-       max_policy_perf = div_ext_fp(policy->max, policy->cpuinfo.max_freq);
+       max_policy_perf = div_ext_fp(policy->max, max_freq);
        max_policy_perf = clamp_t(int32_t, max_policy_perf, 0, int_ext_tofp(1));
        if (policy->max == policy->min) {
                min_policy_perf = max_policy_perf;
        } else {
-               min_policy_perf = div_ext_fp(policy->min,
-                                            policy->cpuinfo.max_freq);
+               min_policy_perf = div_ext_fp(policy->min, max_freq);
                min_policy_perf = clamp_t(int32_t, min_policy_perf,
                                          0, max_policy_perf);
        }
 
        /* Normalize user input to [min_perf, max_perf] */
-       limits->min_perf = max(min_policy_perf,
-                              percent_ext_fp(limits->min_sysfs_pct));
-       limits->min_perf = min(limits->min_perf, max_policy_perf);
-       limits->max_perf = min(max_policy_perf,
-                              percent_ext_fp(limits->max_sysfs_pct));
-       limits->max_perf = max(min_policy_perf, limits->max_perf);
+       if (per_cpu_limits) {
+               cpu->min_perf = min_policy_perf;
+               cpu->max_perf = max_policy_perf;
+       } else {
+               int32_t global_min, global_max;
+
+               /* Global limits are in percent of the maximum turbo P-state. */
+               global_max = percent_ext_fp(global.max_perf_pct);
+               global_min = percent_ext_fp(global.min_perf_pct);
+               if (max_freq != cpu->pstate.turbo_freq) {
+                       int32_t turbo_factor;
+
+                       turbo_factor = div_ext_fp(cpu->pstate.turbo_pstate,
+                                                 cpu->pstate.max_pstate);
+                       global_min = mul_ext_fp(global_min, turbo_factor);
+                       global_max = mul_ext_fp(global_max, turbo_factor);
+               }
+               global_min = clamp_t(int32_t, global_min, 0, global_max);
+
+               cpu->min_perf = max(min_policy_perf, global_min);
+               cpu->min_perf = min(cpu->min_perf, max_policy_perf);
+               cpu->max_perf = min(max_policy_perf, global_max);
+               cpu->max_perf = max(min_policy_perf, cpu->max_perf);
 
-       /* Make sure min_perf <= max_perf */
-       limits->min_perf = min(limits->min_perf, limits->max_perf);
+               /* Make sure min_perf <= max_perf */
+               cpu->min_perf = min(cpu->min_perf, cpu->max_perf);
+       }
 
-       limits->max_perf = round_up(limits->max_perf, EXT_FRAC_BITS);
-       limits->min_perf = round_up(limits->min_perf, EXT_FRAC_BITS);
-       limits->max_perf_pct = fp_ext_toint(limits->max_perf * 100);
-       limits->min_perf_pct = fp_ext_toint(limits->min_perf * 100);
+       cpu->max_perf = round_up(cpu->max_perf, EXT_FRAC_BITS);
+       cpu->min_perf = round_up(cpu->min_perf, EXT_FRAC_BITS);
 
        pr_debug("cpu:%d max_perf_pct:%d min_perf_pct:%d\n", policy->cpu,
-                limits->max_perf_pct, limits->min_perf_pct);
+                fp_ext_toint(cpu->max_perf * 100),
+                fp_ext_toint(cpu->min_perf * 100));
 }
 
 static int intel_pstate_set_policy(struct cpufreq_policy *policy)
 {
        struct cpudata *cpu;
-       struct perf_limits *perf_limits = &global;
 
        if (!policy->cpuinfo.max_freq)
                return -ENODEV;
@@ -2105,19 +2027,9 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
        cpu = all_cpu_data[policy->cpu];
        cpu->policy = policy->policy;
 
-       if (cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate &&
-           policy->max < policy->cpuinfo.max_freq &&
-           policy->max > cpu->pstate.max_pstate * cpu->pstate.scaling) {
-               pr_debug("policy->max > max non turbo frequency\n");
-               policy->max = policy->cpuinfo.max_freq;
-       }
-
-       if (per_cpu_limits)
-               perf_limits = cpu->perf_limits;
-
        mutex_lock(&intel_pstate_limits_lock);
 
-       intel_pstate_update_perf_limits(policy, perf_limits);
+       intel_pstate_update_perf_limits(policy, cpu);
 
        if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) {
                /*
@@ -2130,38 +2042,38 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
 
        intel_pstate_set_update_util_hook(policy->cpu);
 
-       intel_pstate_hwp_set_policy(policy);
+       if (hwp_active)
+               intel_pstate_hwp_set(policy->cpu);
 
        mutex_unlock(&intel_pstate_limits_lock);
 
        return 0;
 }
 
+static void intel_pstate_adjust_policy_max(struct cpufreq_policy *policy,
+                                        struct cpudata *cpu)
+{
+       if (cpu->pstate.max_pstate_physical > cpu->pstate.max_pstate &&
+           policy->max < policy->cpuinfo.max_freq &&
+           policy->max > cpu->pstate.max_freq) {
+               pr_debug("policy->max > max non turbo frequency\n");
+               policy->max = policy->cpuinfo.max_freq;
+       }
+}
+
 static int intel_pstate_verify_policy(struct cpufreq_policy *policy)
 {
        struct cpudata *cpu = all_cpu_data[policy->cpu];
 
        update_turbo_state();
-       policy->cpuinfo.max_freq = global.turbo_disabled || global.no_turbo ?
-                                       cpu->pstate.max_freq :
-                                       cpu->pstate.turbo_freq;
-
-       cpufreq_verify_within_cpu_limits(policy);
+       cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
+                                    intel_pstate_get_max_freq(cpu));
 
        if (policy->policy != CPUFREQ_POLICY_POWERSAVE &&
            policy->policy != CPUFREQ_POLICY_PERFORMANCE)
                return -EINVAL;
 
-       /* When per-CPU limits are used, sysfs limits are not used */
-       if (!per_cpu_limits) {
-               unsigned int max_freq, min_freq;
-
-               max_freq = policy->cpuinfo.max_freq *
-                                       global.max_sysfs_pct / 100;
-               min_freq = policy->cpuinfo.max_freq *
-                                       global.min_sysfs_pct / 100;
-               cpufreq_verify_within_limits(policy, min_freq, max_freq);
-       }
+       intel_pstate_adjust_policy_max(policy, cpu);
 
        return 0;
 }
@@ -2202,8 +2114,8 @@ static int __intel_pstate_cpu_init(struct cpufreq_policy *policy)
 
        cpu = all_cpu_data[policy->cpu];
 
-       if (per_cpu_limits)
-               intel_pstate_init_limits(cpu->perf_limits);
+       cpu->max_perf = int_ext_tofp(1);
+       cpu->min_perf = 0;
 
        policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling;
        policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
@@ -2257,10 +2169,12 @@ static int intel_cpufreq_verify_policy(struct cpufreq_policy *policy)
        struct cpudata *cpu = all_cpu_data[policy->cpu];
 
        update_turbo_state();
-       policy->cpuinfo.max_freq = global.no_turbo || global.turbo_disabled ?
-                       cpu->pstate.max_freq : cpu->pstate.turbo_freq;
+       cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
+                                    intel_pstate_get_max_freq(cpu));
 
-       cpufreq_verify_within_cpu_limits(policy);
+       intel_pstate_adjust_policy_max(policy, cpu);
+
+       intel_pstate_update_perf_limits(policy, cpu);
 
        return 0;
 }
@@ -2324,6 +2238,7 @@ static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy)
                return ret;
 
        policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY;
+       policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY;
        /* This reflects the intel_pstate_get_cpu_pstates() setting. */
        policy->cur = policy->cpuinfo.min_freq;
 
@@ -2341,7 +2256,13 @@ static struct cpufreq_driver intel_cpufreq = {
        .name           = "intel_cpufreq",
 };
 
-static struct cpufreq_driver *intel_pstate_driver = &intel_pstate;
+static struct cpufreq_driver *default_driver = &intel_pstate;
+
+static bool pid_in_use(void)
+{
+       return intel_pstate_driver == &intel_pstate &&
+               pstate_funcs.update_util == intel_pstate_update_util_pid;
+}
 
 static void intel_pstate_driver_cleanup(void)
 {
@@ -2358,26 +2279,26 @@ static void intel_pstate_driver_cleanup(void)
                }
        }
        put_online_cpus();
+       intel_pstate_driver = NULL;
 }
 
-static int intel_pstate_register_driver(void)
+static int intel_pstate_register_driver(struct cpufreq_driver *driver)
 {
        int ret;
 
-       intel_pstate_init_limits(&global);
+       memset(&global, 0, sizeof(global));
+       global.max_perf_pct = 100;
 
+       intel_pstate_driver = driver;
        ret = cpufreq_register_driver(intel_pstate_driver);
        if (ret) {
                intel_pstate_driver_cleanup();
                return ret;
        }
 
-       mutex_lock(&intel_pstate_limits_lock);
-       driver_registered = true;
-       mutex_unlock(&intel_pstate_limits_lock);
+       global.min_perf_pct = min_perf_pct_min();
 
-       if (intel_pstate_driver == &intel_pstate && !hwp_active &&
-           pstate_funcs.get_target_pstate != get_target_pstate_use_cpu_load)
+       if (pid_in_use())
                intel_pstate_debug_expose_params();
 
        return 0;
@@ -2388,14 +2309,9 @@ static int intel_pstate_unregister_driver(void)
        if (hwp_active)
                return -EBUSY;
 
-       if (intel_pstate_driver == &intel_pstate && !hwp_active &&
-           pstate_funcs.get_target_pstate != get_target_pstate_use_cpu_load)
+       if (pid_in_use())
                intel_pstate_debug_hide_params();
 
-       mutex_lock(&intel_pstate_limits_lock);
-       driver_registered = false;
-       mutex_unlock(&intel_pstate_limits_lock);
-
        cpufreq_unregister_driver(intel_pstate_driver);
        intel_pstate_driver_cleanup();
 
@@ -2404,7 +2320,7 @@ static int intel_pstate_unregister_driver(void)
 
 static ssize_t intel_pstate_show_status(char *buf)
 {
-       if (!driver_registered)
+       if (!intel_pstate_driver)
                return sprintf(buf, "off\n");
 
        return sprintf(buf, "%s\n", intel_pstate_driver == &intel_pstate ?
@@ -2416,11 +2332,11 @@ static int intel_pstate_update_status(const char *buf, size_t size)
        int ret;
 
        if (size == 3 && !strncmp(buf, "off", size))
-               return driver_registered ?
+               return intel_pstate_driver ?
                        intel_pstate_unregister_driver() : -EINVAL;
 
        if (size == 6 && !strncmp(buf, "active", size)) {
-               if (driver_registered) {
+               if (intel_pstate_driver) {
                        if (intel_pstate_driver == &intel_pstate)
                                return 0;
 
@@ -2429,13 +2345,12 @@ static int intel_pstate_update_status(const char *buf, size_t size)
                                return ret;
                }
 
-               intel_pstate_driver = &intel_pstate;
-               return intel_pstate_register_driver();
+               return intel_pstate_register_driver(&intel_pstate);
        }
 
        if (size == 7 && !strncmp(buf, "passive", size)) {
-               if (driver_registered) {
-                       if (intel_pstate_driver != &intel_pstate)
+               if (intel_pstate_driver) {
+                       if (intel_pstate_driver == &intel_cpufreq)
                                return 0;
 
                        ret = intel_pstate_unregister_driver();
@@ -2443,8 +2358,7 @@ static int intel_pstate_update_status(const char *buf, size_t size)
                                return ret;
                }
 
-               intel_pstate_driver = &intel_cpufreq;
-               return intel_pstate_register_driver();
+               return intel_pstate_register_driver(&intel_cpufreq);
        }
 
        return -EINVAL;
@@ -2465,23 +2379,17 @@ static int __init intel_pstate_msrs_not_valid(void)
        return 0;
 }
 
-static void __init copy_pid_params(struct pstate_adjust_policy *policy)
-{
-       pid_params.sample_rate_ms = policy->sample_rate_ms;
-       pid_params.sample_rate_ns = pid_params.sample_rate_ms * NSEC_PER_MSEC;
-       pid_params.p_gain_pct = policy->p_gain_pct;
-       pid_params.i_gain_pct = policy->i_gain_pct;
-       pid_params.d_gain_pct = policy->d_gain_pct;
-       pid_params.deadband = policy->deadband;
-       pid_params.setpoint = policy->setpoint;
-}
-
 #ifdef CONFIG_ACPI
 static void intel_pstate_use_acpi_profile(void)
 {
-       if (acpi_gbl_FADT.preferred_profile == PM_MOBILE)
-               pstate_funcs.get_target_pstate =
-                               get_target_pstate_use_cpu_load;
+       switch (acpi_gbl_FADT.preferred_profile) {
+       case PM_MOBILE:
+       case PM_TABLET:
+       case PM_APPLIANCE_PC:
+       case PM_DESKTOP:
+       case PM_WORKSTATION:
+               pstate_funcs.update_util = intel_pstate_update_util;
+       }
 }
 #else
 static void intel_pstate_use_acpi_profile(void)
@@ -2498,7 +2406,7 @@ static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
        pstate_funcs.get_scaling = funcs->get_scaling;
        pstate_funcs.get_val   = funcs->get_val;
        pstate_funcs.get_vid   = funcs->get_vid;
-       pstate_funcs.get_target_pstate = funcs->get_target_pstate;
+       pstate_funcs.update_util = funcs->update_util;
 
        intel_pstate_use_acpi_profile();
 }
@@ -2637,28 +2545,30 @@ static const struct x86_cpu_id hwp_support_ids[] __initconst = {
 
 static int __init intel_pstate_init(void)
 {
-       const struct x86_cpu_id *id;
-       struct cpu_defaults *cpu_def;
-       int rc = 0;
+       int rc;
 
        if (no_load)
                return -ENODEV;
 
-       if (x86_match_cpu(hwp_support_ids) && !no_hwp) {
-               copy_cpu_funcs(&core_params.funcs);
-               hwp_active++;
-               intel_pstate.attr = hwp_cpufreq_attrs;
-               goto hwp_cpu_matched;
-       }
-
-       id = x86_match_cpu(intel_pstate_cpu_ids);
-       if (!id)
-               return -ENODEV;
+       if (x86_match_cpu(hwp_support_ids)) {
+               copy_cpu_funcs(&core_funcs);
+               if (no_hwp) {
+                       pstate_funcs.update_util = intel_pstate_update_util;
+               } else {
+                       hwp_active++;
+                       intel_pstate.attr = hwp_cpufreq_attrs;
+                       pstate_funcs.update_util = intel_pstate_update_util_hwp;
+                       goto hwp_cpu_matched;
+               }
+       } else {
+               const struct x86_cpu_id *id;
 
-       cpu_def = (struct cpu_defaults *)id->driver_data;
+               id = x86_match_cpu(intel_pstate_cpu_ids);
+               if (!id)
+                       return -ENODEV;
 
-       copy_pid_params(&cpu_def->pid_policy);
-       copy_cpu_funcs(&cpu_def->funcs);
+               copy_cpu_funcs((struct pstate_funcs *)id->driver_data);
+       }
 
        if (intel_pstate_msrs_not_valid())
                return -ENODEV;
@@ -2685,7 +2595,7 @@ hwp_cpu_matched:
        intel_pstate_sysfs_expose_params();
 
        mutex_lock(&intel_pstate_driver_lock);
-       rc = intel_pstate_register_driver();
+       rc = intel_pstate_register_driver(default_driver);
        mutex_unlock(&intel_pstate_driver_lock);
        if (rc)
                return rc;
@@ -2706,7 +2616,7 @@ static int __init intel_pstate_setup(char *str)
                no_load = 1;
        } else if (!strcmp(str, "passive")) {
                pr_info("Passive mode enabled\n");
-               intel_pstate_driver = &intel_cpufreq;
+               default_driver = &intel_cpufreq;
                no_hwp = 1;
        }
        if (!strcmp(str, "no_hwp")) {
index ab25b12..fd1886f 100644 (file)
@@ -573,14 +573,33 @@ static struct platform_driver mt8173_cpufreq_platdrv = {
        .probe          = mt8173_cpufreq_probe,
 };
 
-static int mt8173_cpufreq_driver_init(void)
+/* List of machines supported by this driver */
+static const struct of_device_id mt8173_cpufreq_machines[] __initconst = {
+       { .compatible = "mediatek,mt817x", },
+       { .compatible = "mediatek,mt8173", },
+       { .compatible = "mediatek,mt8176", },
+
+       { }
+};
+
+static int __init mt8173_cpufreq_driver_init(void)
 {
+       struct device_node *np;
+       const struct of_device_id *match;
        struct platform_device *pdev;
        int err;
 
-       if (!of_machine_is_compatible("mediatek,mt8173"))
+       np = of_find_node_by_path("/");
+       if (!np)
                return -ENODEV;
 
+       match = of_match_node(mt8173_cpufreq_machines, np);
+       of_node_put(np);
+       if (!match) {
+               pr_warn("Machine is not compatible with mt8173-cpufreq\n");
+               return -ENODEV;
+       }
+
        err = platform_driver_register(&mt8173_cpufreq_platdrv);
        if (err)
                return err;
index bfec1bc..e2ea433 100644 (file)
@@ -52,17 +52,27 @@ static u32 get_bus_freq(void)
 {
        struct device_node *soc;
        u32 sysfreq;
+       struct clk *pltclk;
+       int ret;
 
+       /* get platform freq by searching bus-frequency property */
        soc = of_find_node_by_type(NULL, "soc");
-       if (!soc)
-               return 0;
-
-       if (of_property_read_u32(soc, "bus-frequency", &sysfreq))
-               sysfreq = 0;
+       if (soc) {
+               ret = of_property_read_u32(soc, "bus-frequency", &sysfreq);
+               of_node_put(soc);
+               if (!ret)
+                       return sysfreq;
+       }
 
-       of_node_put(soc);
+       /* get platform freq by its clock name */
+       pltclk = clk_get(NULL, "cg-pll0-div1");
+       if (IS_ERR(pltclk)) {
+               pr_err("%s: can't get bus frequency %ld\n",
+                      __func__, PTR_ERR(pltclk));
+               return PTR_ERR(pltclk);
+       }
 
-       return sysfreq;
+       return clk_get_rate(pltclk);
 }
 
 static struct clk *cpu_to_clk(int cpu)
index 86628e2..719c3d9 100644 (file)
 
 static DEFINE_PER_CPU(struct clk, sh_cpuclk);
 
+struct cpufreq_target {
+       struct cpufreq_policy   *policy;
+       unsigned int            freq;
+};
+
 static unsigned int sh_cpufreq_get(unsigned int cpu)
 {
        return (clk_get_rate(&per_cpu(sh_cpuclk, cpu)) + 500) / 1000;
 }
 
-/*
- * Here we notify other drivers of the proposed change and the final change.
- */
-static int sh_cpufreq_target(struct cpufreq_policy *policy,
-                            unsigned int target_freq,
-                            unsigned int relation)
+static long __sh_cpufreq_target(void *arg)
 {
-       unsigned int cpu = policy->cpu;
+       struct cpufreq_target *target = arg;
+       struct cpufreq_policy *policy = target->policy;
+       int cpu = policy->cpu;
        struct clk *cpuclk = &per_cpu(sh_cpuclk, cpu);
-       cpumask_t cpus_allowed;
        struct cpufreq_freqs freqs;
        struct device *dev;
        long freq;
 
-       cpus_allowed = current->cpus_allowed;
-       set_cpus_allowed_ptr(current, cpumask_of(cpu));
-
-       BUG_ON(smp_processor_id() != cpu);
+       if (smp_processor_id() != cpu)
+               return -ENODEV;
 
        dev = get_cpu_device(cpu);
 
        /* Convert target_freq from kHz to Hz */
-       freq = clk_round_rate(cpuclk, target_freq * 1000);
+       freq = clk_round_rate(cpuclk, target->freq * 1000);
 
        if (freq < (policy->min * 1000) || freq > (policy->max * 1000))
                return -EINVAL;
 
-       dev_dbg(dev, "requested frequency %u Hz\n", target_freq * 1000);
+       dev_dbg(dev, "requested frequency %u Hz\n", target->freq * 1000);
 
        freqs.old       = sh_cpufreq_get(cpu);
        freqs.new       = (freq + 500) / 1000;
        freqs.flags     = 0;
 
-       cpufreq_freq_transition_begin(policy, &freqs);
-       set_cpus_allowed_ptr(current, &cpus_allowed);
+       cpufreq_freq_transition_begin(target->policy, &freqs);
        clk_set_rate(cpuclk, freq);
-       cpufreq_freq_transition_end(policy, &freqs, 0);
+       cpufreq_freq_transition_end(target->policy, &freqs, 0);
 
        dev_dbg(dev, "set frequency %lu Hz\n", freq);
-
        return 0;
 }
 
+/*
+ * Here we notify other drivers of the proposed change and the final change.
+ */
+static int sh_cpufreq_target(struct cpufreq_policy *policy,
+                            unsigned int target_freq,
+                            unsigned int relation)
+{
+       struct cpufreq_target data = { .policy = policy, .freq = target_freq };
+
+       return work_on_cpu(policy->cpu, __sh_cpufreq_target, &data);
+}
+
 static int sh_cpufreq_verify(struct cpufreq_policy *policy)
 {
        struct clk *cpuclk = &per_cpu(sh_cpuclk, policy->cpu);
index 35ddb6d..90f33ef 100644 (file)
@@ -118,10 +118,6 @@ static void us2e_transition(unsigned long estar, unsigned long new_bits,
                            unsigned long clock_tick,
                            unsigned long old_divisor, unsigned long divisor)
 {
-       unsigned long flags;
-
-       local_irq_save(flags);
-
        estar &= ~ESTAR_MODE_DIV_MASK;
 
        /* This is based upon the state transition diagram in the IIe manual.  */
@@ -152,8 +148,6 @@ static void us2e_transition(unsigned long estar, unsigned long new_bits,
        } else {
                BUG();
        }
-
-       local_irq_restore(flags);
 }
 
 static unsigned long index_to_estar_mode(unsigned int index)
@@ -229,48 +223,51 @@ static unsigned long estar_to_divisor(unsigned long estar)
        return ret;
 }
 
+static void __us2e_freq_get(void *arg)
+{
+       unsigned long *estar = arg;
+
+       *estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR);
+}
+
 static unsigned int us2e_freq_get(unsigned int cpu)
 {
-       cpumask_t cpus_allowed;
        unsigned long clock_tick, estar;
 
-       cpumask_copy(&cpus_allowed, &current->cpus_allowed);
-       set_cpus_allowed_ptr(current, cpumask_of(cpu));
-
        clock_tick = sparc64_get_clock_tick(cpu) / 1000;
-       estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR);
-
-       set_cpus_allowed_ptr(current, &cpus_allowed);
+       if (smp_call_function_single(cpu, __us2e_freq_get, &estar, 1))
+               return 0;
 
        return clock_tick / estar_to_divisor(estar);
 }
 
-static int us2e_freq_target(struct cpufreq_policy *policy, unsigned int index)
+static void __us2e_freq_target(void *arg)
 {
-       unsigned int cpu = policy->cpu;
+       unsigned int cpu = smp_processor_id();
+       unsigned int *index = arg;
        unsigned long new_bits, new_freq;
        unsigned long clock_tick, divisor, old_divisor, estar;
-       cpumask_t cpus_allowed;
-
-       cpumask_copy(&cpus_allowed, &current->cpus_allowed);
-       set_cpus_allowed_ptr(current, cpumask_of(cpu));
 
        new_freq = clock_tick = sparc64_get_clock_tick(cpu) / 1000;
-       new_bits = index_to_estar_mode(index);
-       divisor = index_to_divisor(index);
+       new_bits = index_to_estar_mode(*index);
+       divisor = index_to_divisor(*index);
        new_freq /= divisor;
 
        estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR);
 
        old_divisor = estar_to_divisor(estar);
 
-       if (old_divisor != divisor)
+       if (old_divisor != divisor) {
                us2e_transition(estar, new_bits, clock_tick * 1000,
                                old_divisor, divisor);
+       }
+}
 
-       set_cpus_allowed_ptr(current, &cpus_allowed);
+static int us2e_freq_target(struct cpufreq_policy *policy, unsigned int index)
+{
+       unsigned int cpu = policy->cpu;
 
-       return 0;
+       return smp_call_function_single(cpu, __us2e_freq_target, &index, 1);
 }
 
 static int __init us2e_freq_cpu_init(struct cpufreq_policy *policy)
index a8d86a4..30645b0 100644 (file)
@@ -35,22 +35,28 @@ static struct us3_freq_percpu_info *us3_freq_table;
 #define SAFARI_CFG_DIV_32      0x0000000080000000UL
 #define SAFARI_CFG_DIV_MASK    0x00000000C0000000UL
 
-static unsigned long read_safari_cfg(void)
+static void read_safari_cfg(void *arg)
 {
-       unsigned long ret;
+       unsigned long ret, *val = arg;
 
        __asm__ __volatile__("ldxa      [%%g0] %1, %0"
                             : "=&r" (ret)
                             : "i" (ASI_SAFARI_CONFIG));
-       return ret;
+       *val = ret;
 }
 
-static void write_safari_cfg(unsigned long val)
+static void update_safari_cfg(void *arg)
 {
+       unsigned long reg, *new_bits = arg;
+
+       read_safari_cfg(&reg);
+       reg &= ~SAFARI_CFG_DIV_MASK;
+       reg |= *new_bits;
+
        __asm__ __volatile__("stxa      %0, [%%g0] %1\n\t"
                             "membar    #Sync"
                             : /* no outputs */
-                            : "r" (val), "i" (ASI_SAFARI_CONFIG)
+                            : "r" (reg), "i" (ASI_SAFARI_CONFIG)
                             : "memory");
 }
 
@@ -78,29 +84,17 @@ static unsigned long get_current_freq(unsigned int cpu, unsigned long safari_cfg
 
 static unsigned int us3_freq_get(unsigned int cpu)
 {
-       cpumask_t cpus_allowed;
        unsigned long reg;
-       unsigned int ret;
-
-       cpumask_copy(&cpus_allowed, &current->cpus_allowed);
-       set_cpus_allowed_ptr(current, cpumask_of(cpu));
-
-       reg = read_safari_cfg();
-       ret = get_current_freq(cpu, reg);
-
-       set_cpus_allowed_ptr(current, &cpus_allowed);
 
-       return ret;
+       if (smp_call_function_single(cpu, read_safari_cfg, &reg, 1))
+               return 0;
+       return get_current_freq(cpu, reg);
 }
 
 static int us3_freq_target(struct cpufreq_policy *policy, unsigned int index)
 {
        unsigned int cpu = policy->cpu;
-       unsigned long new_bits, new_freq, reg;
-       cpumask_t cpus_allowed;
-
-       cpumask_copy(&cpus_allowed, &current->cpus_allowed);
-       set_cpus_allowed_ptr(current, cpumask_of(cpu));
+       unsigned long new_bits, new_freq;
 
        new_freq = sparc64_get_clock_tick(cpu) / 1000;
        switch (index) {
@@ -121,15 +115,7 @@ static int us3_freq_target(struct cpufreq_policy *policy, unsigned int index)
                BUG();
        }
 
-       reg = read_safari_cfg();
-
-       reg &= ~SAFARI_CFG_DIV_MASK;
-       reg |= new_bits;
-       write_safari_cfg(reg);
-
-       set_cpus_allowed_ptr(current, &cpus_allowed);
-
-       return 0;
+       return smp_call_function_single(cpu, update_safari_cfg, &new_bits, 1);
 }
 
 static int __init us3_freq_cpu_init(struct cpufreq_policy *policy)
diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c
new file mode 100644 (file)
index 0000000..fe78753
--- /dev/null
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/dma-mapping.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+
+#include <soc/tegra/bpmp.h>
+#include <soc/tegra/bpmp-abi.h>
+
+#define EDVD_CORE_VOLT_FREQ(core)              (0x20 + (core) * 0x4)
+#define EDVD_CORE_VOLT_FREQ_F_SHIFT            0
+#define EDVD_CORE_VOLT_FREQ_V_SHIFT            16
+
+struct tegra186_cpufreq_cluster_info {
+       unsigned long offset;
+       int cpus[4];
+       unsigned int bpmp_cluster_id;
+};
+
+#define NO_CPU -1
+static const struct tegra186_cpufreq_cluster_info tegra186_clusters[] = {
+       /* Denver cluster */
+       {
+               .offset = SZ_64K * 7,
+               .cpus = { 1, 2, NO_CPU, NO_CPU },
+               .bpmp_cluster_id = 0,
+       },
+       /* A57 cluster */
+       {
+               .offset = SZ_64K * 6,
+               .cpus = { 0, 3, 4, 5 },
+               .bpmp_cluster_id = 1,
+       },
+};
+
+struct tegra186_cpufreq_cluster {
+       const struct tegra186_cpufreq_cluster_info *info;
+       struct cpufreq_frequency_table *table;
+};
+
+struct tegra186_cpufreq_data {
+       void __iomem *regs;
+
+       size_t num_clusters;
+       struct tegra186_cpufreq_cluster *clusters;
+};
+
+static int tegra186_cpufreq_init(struct cpufreq_policy *policy)
+{
+       struct tegra186_cpufreq_data *data = cpufreq_get_driver_data();
+       unsigned int i;
+
+       for (i = 0; i < data->num_clusters; i++) {
+               struct tegra186_cpufreq_cluster *cluster = &data->clusters[i];
+               const struct tegra186_cpufreq_cluster_info *info =
+                       cluster->info;
+               int core;
+
+               for (core = 0; core < ARRAY_SIZE(info->cpus); core++) {
+                       if (info->cpus[core] == policy->cpu)
+                               break;
+               }
+               if (core == ARRAY_SIZE(info->cpus))
+                       continue;
+
+               policy->driver_data =
+                       data->regs + info->offset + EDVD_CORE_VOLT_FREQ(core);
+               cpufreq_table_validate_and_show(policy, cluster->table);
+       }
+
+       policy->cpuinfo.transition_latency = 300 * 1000;
+
+       return 0;
+}
+
+static int tegra186_cpufreq_set_target(struct cpufreq_policy *policy,
+                                      unsigned int index)
+{
+       struct cpufreq_frequency_table *tbl = policy->freq_table + index;
+       void __iomem *edvd_reg = policy->driver_data;
+       u32 edvd_val = tbl->driver_data;
+
+       writel(edvd_val, edvd_reg);
+
+       return 0;
+}
+
+static struct cpufreq_driver tegra186_cpufreq_driver = {
+       .name = "tegra186",
+       .flags = CPUFREQ_STICKY | CPUFREQ_HAVE_GOVERNOR_PER_POLICY,
+       .verify = cpufreq_generic_frequency_table_verify,
+       .target_index = tegra186_cpufreq_set_target,
+       .init = tegra186_cpufreq_init,
+       .attr = cpufreq_generic_attr,
+};
+
+static struct cpufreq_frequency_table *init_vhint_table(
+       struct platform_device *pdev, struct tegra_bpmp *bpmp,
+       unsigned int cluster_id)
+{
+       struct cpufreq_frequency_table *table;
+       struct mrq_cpu_vhint_request req;
+       struct tegra_bpmp_message msg;
+       struct cpu_vhint_data *data;
+       int err, i, j, num_rates = 0;
+       dma_addr_t phys;
+       void *virt;
+
+       virt = dma_alloc_coherent(bpmp->dev, sizeof(*data), &phys,
+                                 GFP_KERNEL | GFP_DMA32);
+       if (!virt)
+               return ERR_PTR(-ENOMEM);
+
+       data = (struct cpu_vhint_data *)virt;
+
+       memset(&req, 0, sizeof(req));
+       req.addr = phys;
+       req.cluster_id = cluster_id;
+
+       memset(&msg, 0, sizeof(msg));
+       msg.mrq = MRQ_CPU_VHINT;
+       msg.tx.data = &req;
+       msg.tx.size = sizeof(req);
+
+       err = tegra_bpmp_transfer(bpmp, &msg);
+       if (err) {
+               table = ERR_PTR(err);
+               goto free;
+       }
+
+       for (i = data->vfloor; i <= data->vceil; i++) {
+               u16 ndiv = data->ndiv[i];
+
+               if (ndiv < data->ndiv_min || ndiv > data->ndiv_max)
+                       continue;
+
+               /* Only store lowest voltage index for each rate */
+               if (i > 0 && ndiv == data->ndiv[i - 1])
+                       continue;
+
+               num_rates++;
+       }
+
+       table = devm_kcalloc(&pdev->dev, num_rates + 1, sizeof(*table),
+                            GFP_KERNEL);
+       if (!table) {
+               table = ERR_PTR(-ENOMEM);
+               goto free;
+       }
+
+       for (i = data->vfloor, j = 0; i <= data->vceil; i++) {
+               struct cpufreq_frequency_table *point;
+               u16 ndiv = data->ndiv[i];
+               u32 edvd_val = 0;
+
+               if (ndiv < data->ndiv_min || ndiv > data->ndiv_max)
+                       continue;
+
+               /* Only store lowest voltage index for each rate */
+               if (i > 0 && ndiv == data->ndiv[i - 1])
+                       continue;
+
+               edvd_val |= i << EDVD_CORE_VOLT_FREQ_V_SHIFT;
+               edvd_val |= ndiv << EDVD_CORE_VOLT_FREQ_F_SHIFT;
+
+               point = &table[j++];
+               point->driver_data = edvd_val;
+               point->frequency = data->ref_clk_hz * ndiv / data->pdiv /
+                       data->mdiv / 1000;
+       }
+
+       table[j].frequency = CPUFREQ_TABLE_END;
+
+free:
+       dma_free_coherent(bpmp->dev, sizeof(*data), virt, phys);
+
+       return table;
+}
+
+static int tegra186_cpufreq_probe(struct platform_device *pdev)
+{
+       struct tegra186_cpufreq_data *data;
+       struct tegra_bpmp *bpmp;
+       struct resource *res;
+       unsigned int i = 0, err;
+
+       data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
+
+       data->clusters = devm_kcalloc(&pdev->dev, ARRAY_SIZE(tegra186_clusters),
+                                     sizeof(*data->clusters), GFP_KERNEL);
+       if (!data->clusters)
+               return -ENOMEM;
+
+       data->num_clusters = ARRAY_SIZE(tegra186_clusters);
+
+       bpmp = tegra_bpmp_get(&pdev->dev);
+       if (IS_ERR(bpmp))
+               return PTR_ERR(bpmp);
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       data->regs = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(data->regs)) {
+               err = PTR_ERR(data->regs);
+               goto put_bpmp;
+       }
+
+       for (i = 0; i < data->num_clusters; i++) {
+               struct tegra186_cpufreq_cluster *cluster = &data->clusters[i];
+
+               cluster->info = &tegra186_clusters[i];
+               cluster->table = init_vhint_table(
+                       pdev, bpmp, cluster->info->bpmp_cluster_id);
+               if (IS_ERR(cluster->table)) {
+                       err = PTR_ERR(cluster->table);
+                       goto put_bpmp;
+               }
+       }
+
+       tegra_bpmp_put(bpmp);
+
+       tegra186_cpufreq_driver.driver_data = data;
+
+       err = cpufreq_register_driver(&tegra186_cpufreq_driver);
+       if (err)
+               return err;
+
+       return 0;
+
+put_bpmp:
+       tegra_bpmp_put(bpmp);
+
+       return err;
+}
+
+static int tegra186_cpufreq_remove(struct platform_device *pdev)
+{
+       cpufreq_unregister_driver(&tegra186_cpufreq_driver);
+
+       return 0;
+}
+
+static const struct of_device_id tegra186_cpufreq_of_match[] = {
+       { .compatible = "nvidia,tegra186-ccplex-cluster", },
+       { }
+};
+MODULE_DEVICE_TABLE(of, tegra186_cpufreq_of_match);
+
+static struct platform_driver tegra186_cpufreq_platform_driver = {
+       .driver = {
+               .name = "tegra186-cpufreq",
+               .of_match_table = tegra186_cpufreq_of_match,
+       },
+       .probe = tegra186_cpufreq_probe,
+       .remove = tegra186_cpufreq_remove,
+};
+module_platform_driver(tegra186_cpufreq_platform_driver);
+
+MODULE_AUTHOR("Mikko Perttunen <mperttunen@nvidia.com>");
+MODULE_DESCRIPTION("NVIDIA Tegra186 cpufreq driver");
+MODULE_LICENSE("GPL v2");
index 926ba98..12b9145 100644 (file)
@@ -118,7 +118,7 @@ static void __init cps_cpuidle_unregister(void)
 
 static int __init cps_cpuidle_init(void)
 {
-       int err, cpu, core, i;
+       int err, cpu, i;
        struct cpuidle_device *device;
 
        /* Detect supported states */
@@ -160,7 +160,6 @@ static int __init cps_cpuidle_init(void)
        }
 
        for_each_possible_cpu(cpu) {
-               core = cpu_data[cpu].core;
                device = &per_cpu(cpuidle_dev, cpu);
                device->cpu = cpu;
 #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
index cda8f62..12409a5 100644 (file)
@@ -56,10 +56,9 @@ static int snooze_loop(struct cpuidle_device *dev,
 
        snooze_exit_time = get_tb() + snooze_timeout;
        ppc64_runlatch_off();
+       HMT_very_low();
        while (!need_resched()) {
-               HMT_low();
-               HMT_very_low();
-               if (snooze_timeout_en && get_tb() > snooze_exit_time)
+               if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time)
                        break;
        }
 
@@ -215,11 +214,25 @@ static inline void add_powernv_state(int index, const char *name,
        stop_psscr_table[index].mask = psscr_mask;
 }
 
+/*
+ * Returns 0 if prop1_len == prop2_len. Else returns -1
+ */
+static inline int validate_dt_prop_sizes(const char *prop1, int prop1_len,
+                                        const char *prop2, int prop2_len)
+{
+       if (prop1_len == prop2_len)
+               return 0;
+
+       pr_warn("cpuidle-powernv: array sizes don't match for %s and %s\n",
+               prop1, prop2);
+       return -1;
+}
+
 static int powernv_add_idle_states(void)
 {
        struct device_node *power_mgt;
        int nr_idle_states = 1; /* Snooze */
-       int dt_idle_states;
+       int dt_idle_states, count;
        u32 latency_ns[CPUIDLE_STATE_MAX];
        u32 residency_ns[CPUIDLE_STATE_MAX];
        u32 flags[CPUIDLE_STATE_MAX];
@@ -244,6 +257,21 @@ static int powernv_add_idle_states(void)
                goto out;
        }
 
+       count = of_property_count_u32_elems(power_mgt,
+                                           "ibm,cpu-idle-state-latencies-ns");
+
+       if (validate_dt_prop_sizes("ibm,cpu-idle-state-flags", dt_idle_states,
+                                  "ibm,cpu-idle-state-latencies-ns",
+                                  count) != 0)
+               goto out;
+
+       count = of_property_count_strings(power_mgt,
+                                         "ibm,cpu-idle-state-names");
+       if (validate_dt_prop_sizes("ibm,cpu-idle-state-flags", dt_idle_states,
+                                  "ibm,cpu-idle-state-names",
+                                  count) != 0)
+               goto out;
+
        /*
         * Since snooze is used as first idle state, max idle states allowed is
         * CPUIDLE_STATE_MAX -1
@@ -278,6 +306,22 @@ static int powernv_add_idle_states(void)
        has_stop_states = (flags[0] &
                           (OPAL_PM_STOP_INST_FAST | OPAL_PM_STOP_INST_DEEP));
        if (has_stop_states) {
+               count = of_property_count_u64_elems(power_mgt,
+                                                   "ibm,cpu-idle-state-psscr");
+               if (validate_dt_prop_sizes("ibm,cpu-idle-state-flags",
+                                          dt_idle_states,
+                                          "ibm,cpu-idle-state-psscr",
+                                          count) != 0)
+                       goto out;
+
+               count = of_property_count_u64_elems(power_mgt,
+                                                   "ibm,cpu-idle-state-psscr-mask");
+               if (validate_dt_prop_sizes("ibm,cpu-idle-state-flags",
+                                          dt_idle_states,
+                                          "ibm,cpu-idle-state-psscr-mask",
+                                          count) != 0)
+                       goto out;
+
                if (of_property_read_u64_array(power_mgt,
                    "ibm,cpu-idle-state-psscr", psscr_val, dt_idle_states)) {
                        pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr in DT\n");
@@ -292,8 +336,21 @@ static int powernv_add_idle_states(void)
                }
        }
 
-       rc = of_property_read_u32_array(power_mgt,
-               "ibm,cpu-idle-state-residency-ns", residency_ns, dt_idle_states);
+       count = of_property_count_u32_elems(power_mgt,
+                                           "ibm,cpu-idle-state-residency-ns");
+
+       if (count < 0) {
+               rc = count;
+       } else if (validate_dt_prop_sizes("ibm,cpu-idle-state-flags",
+                                         dt_idle_states,
+                                         "ibm,cpu-idle-state-residency-ns",
+                                         count) != 0) {
+               goto out;
+       } else {
+               rc = of_property_read_u32_array(power_mgt,
+                                               "ibm,cpu-idle-state-residency-ns",
+                                               residency_ns, dt_idle_states);
+       }
 
        for (i = 0; i < dt_idle_states; i++) {
                unsigned int exit_latency, target_residency;
index 32100c4..49cbdcb 100644 (file)
@@ -506,7 +506,7 @@ static int caam_rsa_init_tfm(struct crypto_akcipher *tfm)
        ctx->dev = caam_jr_alloc();
 
        if (IS_ERR(ctx->dev)) {
-               dev_err(ctx->dev, "Job Ring Device allocation for transform failed\n");
+               pr_err("Job Ring Device allocation for transform failed\n");
                return PTR_ERR(ctx->dev);
        }
 
index fef39f9..5d7f73d 100644 (file)
@@ -281,7 +281,8 @@ static int deinstantiate_rng(struct device *ctrldev, int state_handle_mask)
                        /* Try to run it through DECO0 */
                        ret = run_descriptor_deco0(ctrldev, desc, &status);
 
-                       if (ret || status) {
+                       if (ret ||
+                           (status && status != JRSTA_SSRC_JUMP_HALT_CC)) {
                                dev_err(ctrldev,
                                        "Failed to deinstantiate RNG4 SH%d\n",
                                        sh_idx);
@@ -301,15 +302,13 @@ static int caam_remove(struct platform_device *pdev)
        struct device *ctrldev;
        struct caam_drv_private *ctrlpriv;
        struct caam_ctrl __iomem *ctrl;
-       int ring;
 
        ctrldev = &pdev->dev;
        ctrlpriv = dev_get_drvdata(ctrldev);
        ctrl = (struct caam_ctrl __iomem *)ctrlpriv->ctrl;
 
-       /* Remove platform devices for JobRs */
-       for (ring = 0; ring < ctrlpriv->total_jobrs; ring++)
-               of_device_unregister(ctrlpriv->jrpdev[ring]);
+       /* Remove platform devices under the crypto node */
+       of_platform_depopulate(ctrldev);
 
        /* De-initialize RNG state handles initialized by this driver. */
        if (ctrlpriv->rng4_sh_init)
@@ -418,10 +417,21 @@ DEFINE_SIMPLE_ATTRIBUTE(caam_fops_u32_ro, caam_debugfs_u32_get, NULL, "%llu\n");
 DEFINE_SIMPLE_ATTRIBUTE(caam_fops_u64_ro, caam_debugfs_u64_get, NULL, "%llu\n");
 #endif
 
+static const struct of_device_id caam_match[] = {
+       {
+               .compatible = "fsl,sec-v4.0",
+       },
+       {
+               .compatible = "fsl,sec4.0",
+       },
+       {},
+};
+MODULE_DEVICE_TABLE(of, caam_match);
+
 /* Probe routine for CAAM top (controller) level */
 static int caam_probe(struct platform_device *pdev)
 {
-       int ret, ring, ridx, rspec, gen_sk, ent_delay = RTSDCTL_ENT_DLY_MIN;
+       int ret, ring, gen_sk, ent_delay = RTSDCTL_ENT_DLY_MIN;
        u64 caam_id;
        struct device *dev;
        struct device_node *nprop, *np;
@@ -597,47 +607,24 @@ static int caam_probe(struct platform_device *pdev)
                goto iounmap_ctrl;
        }
 
-       /*
-        * Detect and enable JobRs
-        * First, find out how many ring spec'ed, allocate references
-        * for all, then go probe each one.
-        */
-       rspec = 0;
-       for_each_available_child_of_node(nprop, np)
-               if (of_device_is_compatible(np, "fsl,sec-v4.0-job-ring") ||
-                   of_device_is_compatible(np, "fsl,sec4.0-job-ring"))
-                       rspec++;
-
-       ctrlpriv->jrpdev = devm_kcalloc(&pdev->dev, rspec,
-                                       sizeof(*ctrlpriv->jrpdev), GFP_KERNEL);
-       if (ctrlpriv->jrpdev == NULL) {
-               ret = -ENOMEM;
+       ret = of_platform_populate(nprop, caam_match, NULL, dev);
+       if (ret) {
+               dev_err(dev, "JR platform devices creation error\n");
                goto iounmap_ctrl;
        }
 
        ring = 0;
-       ridx = 0;
-       ctrlpriv->total_jobrs = 0;
        for_each_available_child_of_node(nprop, np)
                if (of_device_is_compatible(np, "fsl,sec-v4.0-job-ring") ||
                    of_device_is_compatible(np, "fsl,sec4.0-job-ring")) {
-                       ctrlpriv->jrpdev[ring] =
-                               of_platform_device_create(np, NULL, dev);
-                       if (!ctrlpriv->jrpdev[ring]) {
-                               pr_warn("JR physical index %d: Platform device creation error\n",
-                                       ridx);
-                               ridx++;
-                               continue;
-                       }
                        ctrlpriv->jr[ring] = (struct caam_job_ring __iomem __force *)
                                             ((__force uint8_t *)ctrl +
-                                            (ridx + JR_BLOCK_NUMBER) *
+                                            (ring + JR_BLOCK_NUMBER) *
                                              BLOCK_OFFSET
                                             );
                        ctrlpriv->total_jobrs++;
                        ring++;
-                       ridx++;
-       }
+               }
 
        /* Check to see if QI present. If so, enable */
        ctrlpriv->qi_present =
@@ -847,17 +834,6 @@ disable_caam_ipg:
        return ret;
 }
 
-static struct of_device_id caam_match[] = {
-       {
-               .compatible = "fsl,sec-v4.0",
-       },
-       {
-               .compatible = "fsl,sec4.0",
-       },
-       {},
-};
-MODULE_DEVICE_TABLE(of, caam_match);
-
 static struct platform_driver caam_driver = {
        .driver = {
                .name = "caam",
index e2bcacc..dbed8ba 100644 (file)
@@ -66,7 +66,6 @@ struct caam_drv_private_jr {
 struct caam_drv_private {
 
        struct device *dev;
-       struct platform_device **jrpdev; /* Alloc'ed array per sub-device */
        struct platform_device *pdev;
 
        /* Physical-presence section */
index c5aac25..4ecb77a 100644 (file)
@@ -65,6 +65,11 @@ struct spu_queue {
        struct list_head        list;
 };
 
+struct spu_qreg {
+       struct spu_queue        *queue;
+       unsigned long           type;
+};
+
 static struct spu_queue **cpu_to_cwq;
 static struct spu_queue **cpu_to_mau;
 
@@ -1631,31 +1636,27 @@ static void queue_cache_destroy(void)
        kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_CWQ - 1]);
 }
 
-static int spu_queue_register(struct spu_queue *p, unsigned long q_type)
+static long spu_queue_register_workfn(void *arg)
 {
-       cpumask_var_t old_allowed;
+       struct spu_qreg *qr = arg;
+       struct spu_queue *p = qr->queue;
+       unsigned long q_type = qr->type;
        unsigned long hv_ret;
 
-       if (cpumask_empty(&p->sharing))
-               return -EINVAL;
-
-       if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
-               return -ENOMEM;
-
-       cpumask_copy(old_allowed, &current->cpus_allowed);
-
-       set_cpus_allowed_ptr(current, &p->sharing);
-
        hv_ret = sun4v_ncs_qconf(q_type, __pa(p->q),
                                 CWQ_NUM_ENTRIES, &p->qhandle);
        if (!hv_ret)
                sun4v_ncs_sethead_marker(p->qhandle, 0);
 
-       set_cpus_allowed_ptr(current, old_allowed);
+       return hv_ret ? -EINVAL : 0;
+}
 
-       free_cpumask_var(old_allowed);
+static int spu_queue_register(struct spu_queue *p, unsigned long q_type)
+{
+       int cpu = cpumask_any_and(&p->sharing, cpu_online_mask);
+       struct spu_qreg qr = { .queue = p, .type = q_type };
 
-       return (hv_ret ? -EINVAL : 0);
+       return work_on_cpu_safe(cpu, spu_queue_register_workfn, &qr);
 }
 
 static int spu_queue_setup(struct spu_queue *p)
index 3e2ab3b..9e95bf9 100644 (file)
@@ -2,6 +2,7 @@ menuconfig DEV_DAX
        tristate "DAX: direct access to differentiated memory"
        default m if NVDIMM_DAX
        depends on TRANSPARENT_HUGEPAGE
+       select SRCU
        help
          Support raw access to differentiated (persistence, bandwidth,
          latency...) memory via an mmap(2) capable character
index 80c6db2..806f180 100644 (file)
@@ -25,6 +25,7 @@
 #include "dax.h"
 
 static dev_t dax_devt;
+DEFINE_STATIC_SRCU(dax_srcu);
 static struct class *dax_class;
 static DEFINE_IDA(dax_minor_ida);
 static int nr_dax = CONFIG_NR_DEV_DAX;
@@ -60,7 +61,7 @@ struct dax_region {
  * @region - parent region
  * @dev - device backing the character device
  * @cdev - core chardev data
- * @alive - !alive + rcu grace period == no new mappings can be established
+ * @alive - !alive + srcu grace period == no new mappings can be established
  * @id - child id in the region
  * @num_resources - number of physical address extents in this device
  * @res - array of physical address ranges
@@ -569,7 +570,7 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
 static int dax_dev_huge_fault(struct vm_fault *vmf,
                enum page_entry_size pe_size)
 {
-       int rc;
+       int rc, id;
        struct file *filp = vmf->vma->vm_file;
        struct dax_dev *dax_dev = filp->private_data;
 
@@ -578,7 +579,7 @@ static int dax_dev_huge_fault(struct vm_fault *vmf,
                        ? "write" : "read",
                        vmf->vma->vm_start, vmf->vma->vm_end);
 
-       rcu_read_lock();
+       id = srcu_read_lock(&dax_srcu);
        switch (pe_size) {
        case PE_SIZE_PTE:
                rc = __dax_dev_pte_fault(dax_dev, vmf);
@@ -592,7 +593,7 @@ static int dax_dev_huge_fault(struct vm_fault *vmf,
        default:
                return VM_FAULT_FALLBACK;
        }
-       rcu_read_unlock();
+       srcu_read_unlock(&dax_srcu, id);
 
        return rc;
 }
@@ -713,11 +714,11 @@ static void unregister_dax_dev(void *dev)
         * Note, rcu is not protecting the liveness of dax_dev, rcu is
         * ensuring that any fault handlers that might have seen
         * dax_dev->alive == true, have completed.  Any fault handlers
-        * that start after synchronize_rcu() has started will abort
+        * that start after synchronize_srcu() has started will abort
         * upon seeing dax_dev->alive == false.
         */
        dax_dev->alive = false;
-       synchronize_rcu();
+       synchronize_srcu(&dax_srcu);
        unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1);
        cdev_del(cdev);
        device_unregister(dev);
index 71576b8..a4f2fa1 100644 (file)
 #define DEVFREQ_GOV_SUSPEND                    0x4
 #define DEVFREQ_GOV_RESUME                     0x5
 
+/**
+ * struct devfreq_governor - Devfreq policy governor
+ * @node:              list node - contains registered devfreq governors
+ * @name:              Governor's name
+ * @immutable:         Immutable flag for governor. If the value is 1,
+ *                     this govenror is never changeable to other governor.
+ * @get_target_freq:   Returns desired operating frequency for the device.
+ *                     Basically, get_target_freq will run
+ *                     devfreq_dev_profile.get_dev_status() to get the
+ *                     status of the device (load = busy_time / total_time).
+ *                     If no_central_polling is set, this callback is called
+ *                     only with update_devfreq() notified by OPP.
+ * @event_handler:      Callback for devfreq core framework to notify events
+ *                      to governors. Events include per device governor
+ *                      init and exit, opp changes out of devfreq, suspend
+ *                      and resume of per device devfreq during device idle.
+ *
+ * Note that the callbacks are called with devfreq->lock locked by devfreq.
+ */
+struct devfreq_governor {
+       struct list_head node;
+
+       const char name[DEVFREQ_NAME_LEN];
+       const unsigned int immutable;
+       int (*get_target_freq)(struct devfreq *this, unsigned long *freq);
+       int (*event_handler)(struct devfreq *devfreq,
+                               unsigned int event, void *data);
+};
+
 /* Caution: devfreq->lock must be locked before calling update_devfreq */
 extern int update_devfreq(struct devfreq *devfreq);
 
index 4773f28..96afb2a 100644 (file)
@@ -10,26 +10,16 @@ config EDAC_SUPPORT
        bool
 
 menuconfig EDAC
-       bool "EDAC (Error Detection And Correction) reporting"
-       depends on HAS_IOMEM && EDAC_SUPPORT
+       tristate "EDAC (Error Detection And Correction) reporting"
+       depends on HAS_IOMEM && EDAC_SUPPORT && RAS
        help
-         EDAC is designed to report errors in the core system.
-         These are low-level errors that are reported in the CPU or
-         supporting chipset or other subsystems:
+         EDAC is a subsystem along with hardware-specific drivers designed to
+         report hardware errors. These are low-level errors that are reported
+         in the CPU or supporting chipset or other subsystems:
          memory errors, cache errors, PCI errors, thermal throttling, etc..
          If unsure, select 'Y'.
 
-         If this code is reporting problems on your system, please
-         see the EDAC project web pages for more information at:
-
-         <http://bluesmoke.sourceforge.net/>
-
-         and:
-
-         <http://buttersideup.com/edacwiki>
-
-         There is also a mailing list for the EDAC project, which can
-         be found via the sourceforge page.
+         The mailing list for the EDAC project is linux-edac@vger.kernel.org.
 
 if EDAC
 
@@ -62,21 +52,9 @@ config EDAC_DECODE_MCE
          which occur really early upon boot, before the module infrastructure
          has been initialized.
 
-config EDAC_MM_EDAC
-       tristate "Main Memory EDAC (Error Detection And Correction) reporting"
-       select RAS
-       help
-         Some systems are able to detect and correct errors in main
-         memory.  EDAC can report statistics on memory error
-         detection and correction (EDAC - or commonly referred to ECC
-         errors).  EDAC will also try to decode where these errors
-         occurred so that a particular failing memory module can be
-         replaced.  If unsure, select 'Y'.
-
 config EDAC_GHES
        bool "Output ACPI APEI/GHES BIOS detected errors via EDAC"
-       depends on ACPI_APEI_GHES && (EDAC_MM_EDAC=y)
-       default y
+       depends on ACPI_APEI_GHES && (EDAC=y)
        help
          Not all machines support hardware-driven error report. Some of those
          provide a BIOS-driven error report mechanism via ACPI, using the
@@ -98,7 +76,7 @@ config EDAC_GHES
 
 config EDAC_AMD64
        tristate "AMD64 (Opteron, Athlon64)"
-       depends on EDAC_MM_EDAC && AMD_NB && EDAC_DECODE_MCE
+       depends on AMD_NB && EDAC_DECODE_MCE
        help
          Support for error detection and correction of DRAM ECC errors on
          the AMD64 families (>= K8) of memory controllers.
@@ -124,28 +102,28 @@ config EDAC_AMD64_ERROR_INJECTION
 
 config EDAC_AMD76X
        tristate "AMD 76x (760, 762, 768)"
-       depends on EDAC_MM_EDAC && PCI && X86_32
+       depends on PCI && X86_32
        help
          Support for error detection and correction on the AMD 76x
          series of chipsets used with the Athlon processor.
 
 config EDAC_E7XXX
        tristate "Intel e7xxx (e7205, e7500, e7501, e7505)"
-       depends on EDAC_MM_EDAC && PCI && X86_32
+       depends on PCI && X86_32
        help
          Support for error detection and correction on the Intel
          E7205, E7500, E7501 and E7505 server chipsets.
 
 config EDAC_E752X
        tristate "Intel e752x (e7520, e7525, e7320) and 3100"
-       depends on EDAC_MM_EDAC && PCI && X86
+       depends on PCI && X86
        help
          Support for error detection and correction on the Intel
          E7520, E7525, E7320 server chipsets.
 
 config EDAC_I82443BXGX
        tristate "Intel 82443BX/GX (440BX/GX)"
-       depends on EDAC_MM_EDAC && PCI && X86_32
+       depends on PCI && X86_32
        depends on BROKEN
        help
          Support for error detection and correction on the Intel
@@ -153,56 +131,56 @@ config EDAC_I82443BXGX
 
 config EDAC_I82875P
        tristate "Intel 82875p (D82875P, E7210)"
-       depends on EDAC_MM_EDAC && PCI && X86_32
+       depends on PCI && X86_32
        help
          Support for error detection and correction on the Intel
          DP82785P and E7210 server chipsets.
 
 config EDAC_I82975X
        tristate "Intel 82975x (D82975x)"
-       depends on EDAC_MM_EDAC && PCI && X86
+       depends on PCI && X86
        help
          Support for error detection and correction on the Intel
          DP82975x server chipsets.
 
 config EDAC_I3000
        tristate "Intel 3000/3010"
-       depends on EDAC_MM_EDAC && PCI && X86
+       depends on PCI && X86
        help
          Support for error detection and correction on the Intel
          3000 and 3010 server chipsets.
 
 config EDAC_I3200
        tristate "Intel 3200"
-       depends on EDAC_MM_EDAC && PCI && X86
+       depends on PCI && X86
        help
          Support for error detection and correction on the Intel
          3200 and 3210 server chipsets.
 
 config EDAC_IE31200
        tristate "Intel e312xx"
-       depends on EDAC_MM_EDAC && PCI && X86
+       depends on PCI && X86
        help
          Support for error detection and correction on the Intel
          E3-1200 based DRAM controllers.
 
 config EDAC_X38
        tristate "Intel X38"
-       depends on EDAC_MM_EDAC && PCI && X86
+       depends on PCI && X86
        help
          Support for error detection and correction on the Intel
          X38 server chipsets.
 
 config EDAC_I5400
        tristate "Intel 5400 (Seaburg) chipsets"
-       depends on EDAC_MM_EDAC && PCI && X86
+       depends on PCI && X86
        help
          Support for error detection and correction on the Intel
          i5400 MCH chipset (Seaburg).
 
 config EDAC_I7CORE
        tristate "Intel i7 Core (Nehalem) processors"
-       depends on EDAC_MM_EDAC && PCI && X86 && X86_MCE_INTEL
+       depends on PCI && X86 && X86_MCE_INTEL
        help
          Support for error detection and correction on the Intel
          i7 Core (Nehalem) Integrated Memory Controller that exists on
@@ -211,58 +189,56 @@ config EDAC_I7CORE
 
 config EDAC_I82860
        tristate "Intel 82860"
-       depends on EDAC_MM_EDAC && PCI && X86_32
+       depends on PCI && X86_32
        help
          Support for error detection and correction on the Intel
          82860 chipset.
 
 config EDAC_R82600
        tristate "Radisys 82600 embedded chipset"
-       depends on EDAC_MM_EDAC && PCI && X86_32
+       depends on PCI && X86_32
        help
          Support for error detection and correction on the Radisys
          82600 embedded chipset.
 
 config EDAC_I5000
        tristate "Intel Greencreek/Blackford chipset"
-       depends on EDAC_MM_EDAC && X86 && PCI
+       depends on X86 && PCI
        help
          Support for error detection and correction on the Intel
          Greencreek/Blackford chipsets.
 
 config EDAC_I5100
        tristate "Intel San Clemente MCH"
-       depends on EDAC_MM_EDAC && X86 && PCI
+       depends on X86 && PCI
        help
          Support for error detection and correction on the Intel
          San Clemente MCH.
 
 config EDAC_I7300
        tristate "Intel Clarksboro MCH"
-       depends on EDAC_MM_EDAC && X86 && PCI
+       depends on X86 && PCI
        help
          Support for error detection and correction on the Intel
          Clarksboro MCH (Intel 7300 chipset).
 
 config EDAC_SBRIDGE
        tristate "Intel Sandy-Bridge/Ivy-Bridge/Haswell Integrated MC"
-       depends on EDAC_MM_EDAC && PCI && X86_64 && X86_MCE_INTEL
-       depends on PCI_MMCONFIG
+       depends on PCI && X86_64 && X86_MCE_INTEL && PCI_MMCONFIG
        help
          Support for error detection and correction on the Intel
          Sandy Bridge, Ivy Bridge and Haswell Integrated Memory Controllers.
 
 config EDAC_SKX
        tristate "Intel Skylake server Integrated MC"
-       depends on EDAC_MM_EDAC && PCI && X86_64 && X86_MCE_INTEL
-       depends on PCI_MMCONFIG
+       depends on PCI && X86_64 && X86_MCE_INTEL && PCI_MMCONFIG
        help
          Support for error detection and correction on the Intel
          Skylake server Integrated Memory Controllers.
 
 config EDAC_PND2
        tristate "Intel Pondicherry2"
-       depends on EDAC_MM_EDAC && PCI && X86_64 && X86_MCE_INTEL
+       depends on PCI && X86_64 && X86_MCE_INTEL
        help
          Support for error detection and correction on the Intel
          Pondicherry2 Integrated Memory Controller. This SoC IP is
@@ -271,36 +247,35 @@ config EDAC_PND2
 
 config EDAC_MPC85XX
        tristate "Freescale MPC83xx / MPC85xx"
-       depends on EDAC_MM_EDAC && FSL_SOC
+       depends on FSL_SOC
        help
          Support for error detection and correction on the Freescale
          MPC8349, MPC8560, MPC8540, MPC8548, T4240
 
 config EDAC_LAYERSCAPE
        tristate "Freescale Layerscape DDR"
-       depends on EDAC_MM_EDAC && ARCH_LAYERSCAPE
+       depends on ARCH_LAYERSCAPE
        help
          Support for error detection and correction on Freescale memory
          controllers on Layerscape SoCs.
 
 config EDAC_MV64X60
        tristate "Marvell MV64x60"
-       depends on EDAC_MM_EDAC && MV64X60
+       depends on MV64X60
        help
          Support for error detection and correction on the Marvell
          MV64360 and MV64460 chipsets.
 
 config EDAC_PASEMI
        tristate "PA Semi PWRficient"
-       depends on EDAC_MM_EDAC && PCI
-       depends on PPC_PASEMI
+       depends on PPC_PASEMI && PCI
        help
          Support for error detection and correction on PA Semi
          PWRficient.
 
 config EDAC_CELL
        tristate "Cell Broadband Engine memory controller"
-       depends on EDAC_MM_EDAC && PPC_CELL_COMMON
+       depends on PPC_CELL_COMMON
        help
          Support for error detection and correction on the
          Cell Broadband Engine internal memory controller
@@ -308,7 +283,7 @@ config EDAC_CELL
 
 config EDAC_PPC4XX
        tristate "PPC4xx IBM DDR2 Memory Controller"
-       depends on EDAC_MM_EDAC && 4xx
+       depends on 4xx
        help
          This enables support for EDAC on the ECC memory used
          with the IBM DDR2 memory controller found in various
@@ -317,7 +292,7 @@ config EDAC_PPC4XX
 
 config EDAC_AMD8131
        tristate "AMD8131 HyperTransport PCI-X Tunnel"
-       depends on EDAC_MM_EDAC && PCI && PPC_MAPLE
+       depends on PCI && PPC_MAPLE
        help
          Support for error detection and correction on the
          AMD8131 HyperTransport PCI-X Tunnel chip.
@@ -326,7 +301,7 @@ config EDAC_AMD8131
 
 config EDAC_AMD8111
        tristate "AMD8111 HyperTransport I/O Hub"
-       depends on EDAC_MM_EDAC && PCI && PPC_MAPLE
+       depends on PCI && PPC_MAPLE
        help
          Support for error detection and correction on the
          AMD8111 HyperTransport I/O Hub chip.
@@ -335,7 +310,7 @@ config EDAC_AMD8111
 
 config EDAC_CPC925
        tristate "IBM CPC925 Memory Controller (PPC970FX)"
-       depends on EDAC_MM_EDAC && PPC64
+       depends on PPC64
        help
          Support for error detection and correction on the
          IBM CPC925 Bridge and Memory Controller, which is
@@ -344,7 +319,7 @@ config EDAC_CPC925
 
 config EDAC_TILE
        tristate "Tilera Memory Controller"
-       depends on EDAC_MM_EDAC && TILE
+       depends on TILE
        default y
        help
          Support for error detection and correction on the
@@ -352,49 +327,59 @@ config EDAC_TILE
 
 config EDAC_HIGHBANK_MC
        tristate "Highbank Memory Controller"
-       depends on EDAC_MM_EDAC && ARCH_HIGHBANK
+       depends on ARCH_HIGHBANK
        help
          Support for error detection and correction on the
          Calxeda Highbank memory controller.
 
 config EDAC_HIGHBANK_L2
        tristate "Highbank L2 Cache"
-       depends on EDAC_MM_EDAC && ARCH_HIGHBANK
+       depends on ARCH_HIGHBANK
        help
          Support for error detection and correction on the
          Calxeda Highbank L2 cache.
 
 config EDAC_OCTEON_PC
        tristate "Cavium Octeon Primary Caches"
-       depends on EDAC_MM_EDAC && CPU_CAVIUM_OCTEON
+       depends on CPU_CAVIUM_OCTEON
        help
          Support for error detection and correction on the primary caches of
          the cnMIPS cores of Cavium Octeon family SOCs.
 
 config EDAC_OCTEON_L2C
        tristate "Cavium Octeon Secondary Caches (L2C)"
-       depends on EDAC_MM_EDAC && CAVIUM_OCTEON_SOC
+       depends on CAVIUM_OCTEON_SOC
        help
          Support for error detection and correction on the
          Cavium Octeon family of SOCs.
 
 config EDAC_OCTEON_LMC
        tristate "Cavium Octeon DRAM Memory Controller (LMC)"
-       depends on EDAC_MM_EDAC && CAVIUM_OCTEON_SOC
+       depends on CAVIUM_OCTEON_SOC
        help
          Support for error detection and correction on the
          Cavium Octeon family of SOCs.
 
 config EDAC_OCTEON_PCI
        tristate "Cavium Octeon PCI Controller"
-       depends on EDAC_MM_EDAC && PCI && CAVIUM_OCTEON_SOC
+       depends on PCI && CAVIUM_OCTEON_SOC
        help
          Support for error detection and correction on the
          Cavium Octeon family of SOCs.
 
+config EDAC_THUNDERX
+       tristate "Cavium ThunderX EDAC"
+       depends on ARM64
+       depends on PCI
+       help
+         Support for error detection and correction on the
+         Cavium ThunderX memory controllers (LMC), Cache
+         Coherent Processor Interconnect (CCPI) and L2 cache
+         blocks (TAD, CBC, MCI).
+
 config EDAC_ALTERA
        bool "Altera SOCFPGA ECC"
-       depends on EDAC_MM_EDAC=y && ARCH_SOCFPGA
+       depends on EDAC=y && ARCH_SOCFPGA
        help
          Support for error detection and correction on the
          Altera SOCs. This must be selected for SDRAM ECC.
@@ -460,14 +445,14 @@ config EDAC_ALTERA_SDMMC
 
 config EDAC_SYNOPSYS
        tristate "Synopsys DDR Memory Controller"
-       depends on EDAC_MM_EDAC && ARCH_ZYNQ
+       depends on ARCH_ZYNQ
        help
          Support for error detection and correction on the Synopsys DDR
          memory controller.
 
 config EDAC_XGENE
        tristate "APM X-Gene SoC"
-       depends on EDAC_MM_EDAC && (ARM64 || COMPILE_TEST)
+       depends on (ARM64 || COMPILE_TEST)
        help
          Support for error detection and correction on the
          APM X-Gene family of SOCs.
index 587107e..0fd9ffa 100644 (file)
@@ -6,8 +6,7 @@
 # GNU General Public License.
 #
 
-obj-$(CONFIG_EDAC)                     := edac_stub.o
-obj-$(CONFIG_EDAC_MM_EDAC)             += edac_core.o
+obj-$(CONFIG_EDAC)                     := edac_core.o
 
 edac_core-y    := edac_mc.o edac_device.o edac_mc_sysfs.o
 edac_core-y    += edac_module.o edac_device_sysfs.o wq.o
@@ -67,13 +66,14 @@ obj-$(CONFIG_EDAC_AMD8131)          += amd8131_edac.o
 
 obj-$(CONFIG_EDAC_TILE)                        += tile_edac.o
 
-obj-$(CONFIG_EDAC_HIGHBANK_MC) += highbank_mc_edac.o
-obj-$(CONFIG_EDAC_HIGHBANK_L2) += highbank_l2_edac.o
+obj-$(CONFIG_EDAC_HIGHBANK_MC)         += highbank_mc_edac.o
+obj-$(CONFIG_EDAC_HIGHBANK_L2)         += highbank_l2_edac.o
 
 obj-$(CONFIG_EDAC_OCTEON_PC)           += octeon_edac-pc.o
 obj-$(CONFIG_EDAC_OCTEON_L2C)          += octeon_edac-l2c.o
 obj-$(CONFIG_EDAC_OCTEON_LMC)          += octeon_edac-lmc.o
 obj-$(CONFIG_EDAC_OCTEON_PCI)          += octeon_edac-pci.o
+obj-$(CONFIG_EDAC_THUNDERX)            += thunderx_edac.o
 
 obj-$(CONFIG_EDAC_ALTERA)              += altera_edac.o
 obj-$(CONFIG_EDAC_SYNOPSYS)            += synopsys_edac.o
index c5a5b91..7717b09 100644 (file)
@@ -1023,13 +1023,23 @@ out:
        return ret;
 }
 
+/*
+ * Check the machine's root compatible for "altr,socfpga-arria10".
+ * The result of of_machine_is_compatible() is handed back unchanged.
+ */
+static int socfpga_is_a10(void)
+{
+       int is_a10 = of_machine_is_compatible("altr,socfpga-arria10");
+
+       return is_a10;
+}
+
 static int validate_parent_available(struct device_node *np);
 static const struct of_device_id altr_edac_a10_device_of_match[];
 static int __init __maybe_unused altr_init_a10_ecc_device_type(char *compat)
 {
        int irq;
-       struct device_node *child, *np = of_find_compatible_node(NULL, NULL,
-                                       "altr,socfpga-a10-ecc-manager");
+       struct device_node *child, *np;
+
+       if (!socfpga_is_a10())
+               return -ENODEV;
+
+       np = of_find_compatible_node(NULL, NULL,
+                                    "altr,socfpga-a10-ecc-manager");
        if (!np) {
                edac_printk(KERN_ERR, EDAC_DEVICE, "ECC Manager not found\n");
                return -ENODEV;
@@ -1545,8 +1555,12 @@ static const struct edac_device_prv_data a10_sdmmceccb_data = {
 static int __init socfpga_init_sdmmc_ecc(void)
 {
        int rc = -ENODEV;
-       struct device_node *child = of_find_compatible_node(NULL, NULL,
-                                               "altr,socfpga-sdmmc-ecc");
+       struct device_node *child;
+
+       if (!socfpga_is_a10())
+               return -ENODEV;
+
+       child = of_find_compatible_node(NULL, NULL, "altr,socfpga-sdmmc-ecc");
        if (!child) {
                edac_printk(KERN_WARNING, EDAC_DEVICE, "SDMMC node not found\n");
                return -ENODEV;
index e5573c5..4800721 100644 (file)
 #define edac_atomic_scrub(va, size) do { } while (0)
 #endif
 
+int edac_op_state = EDAC_OPSTATE_INVAL;
+EXPORT_SYMBOL_GPL(edac_op_state);
+
+static int edac_report = EDAC_REPORTING_ENABLED;
+
 /* lock to memory controller's control array */
 static DEFINE_MUTEX(mem_ctls_mutex);
 static LIST_HEAD(mc_devices);
@@ -52,6 +57,65 @@ static void const *edac_mc_owner;
 
 static struct bus_type mc_bus[EDAC_MAX_MCS];
 
+/* Accessor for the current reporting mode held in edac_report. */
+int edac_get_report_status(void)
+{
+       const int status = edac_report;
+
+       return status;
+}
+EXPORT_SYMBOL_GPL(edac_get_report_status);
+
+/*
+ * Change the reporting mode. Only the three EDAC_REPORTING_* values are
+ * accepted; any other value is ignored and the current mode is kept.
+ */
+void edac_set_report_status(int new)
+{
+       switch (new) {
+       case EDAC_REPORTING_ENABLED:
+       case EDAC_REPORTING_DISABLED:
+       case EDAC_REPORTING_FORCE:
+               edac_report = new;
+               break;
+       default:
+               break;
+       }
+}
+EXPORT_SYMBOL_GPL(edac_set_report_status);
+
+/*
+ * .set handler of the edac_report parameter.
+ *
+ * Matches @str by prefix against "on", "off" and "force" and stores the
+ * corresponding EDAC_REPORTING_* mode. A string matching none of them
+ * leaves the mode untouched and is still reported as success; only a NULL
+ * @str yields -EINVAL. @kp is unused.
+ */
+static int edac_report_set(const char *str, const struct kernel_param *kp)
+{
+       if (!str)
+               return -EINVAL;
+
+       if (strncmp(str, "on", 2) == 0) {
+               edac_report = EDAC_REPORTING_ENABLED;
+               return 0;
+       }
+
+       if (strncmp(str, "off", 3) == 0) {
+               edac_report = EDAC_REPORTING_DISABLED;
+               return 0;
+       }
+
+       if (strncmp(str, "force", 5) == 0)
+               edac_report = EDAC_REPORTING_FORCE;
+
+       return 0;
+}
+
+/*
+ * .get handler of the edac_report parameter.
+ *
+ * Writes "on", "off" or "force" into @buffer according to the current mode
+ * and returns the number of characters written, or -EINVAL when the mode
+ * is none of the three known values. @kp is unused.
+ */
+static int edac_report_get(char *buffer, const struct kernel_param *kp)
+{
+       switch (edac_report) {
+       case EDAC_REPORTING_ENABLED:
+               return sprintf(buffer, "on");
+       case EDAC_REPORTING_DISABLED:
+               return sprintf(buffer, "off");
+       case EDAC_REPORTING_FORCE:
+               return sprintf(buffer, "force");
+       default:
+               return -EINVAL;
+       }
+}
+
+/* Parameter callbacks translating between the text form and edac_report. */
+static const struct kernel_param_ops edac_report_ops = {
+       .set = edac_report_set,
+       .get = edac_report_get,
+};
+
+/* Register "edac_report" as a parameter with mode 0644, backed by the ops above. */
+module_param_cb(edac_report, &edac_report_ops, &edac_report, 0644);
+
 unsigned edac_dimm_info_location(struct dimm_info *dimm, char *buf,
                                 unsigned len)
 {
@@ -505,22 +569,6 @@ struct mem_ctl_info *find_mci_by_dev(struct device *dev)
 EXPORT_SYMBOL_GPL(find_mci_by_dev);
 
 /*
- * handler for EDAC to check if NMI type handler has asserted interrupt
- */
-static int edac_mc_assert_error_check_and_clear(void)
-{
-       int old_state;
-
-       if (edac_op_state == EDAC_OPSTATE_POLL)
-               return 1;
-
-       old_state = edac_err_assert;
-       edac_err_assert = 0;
-
-       return old_state;
-}
-
-/*
  * edac_mc_workq_function
  *     performs the operation scheduled by a workq request
  */
@@ -536,7 +584,7 @@ static void edac_mc_workq_function(struct work_struct *work_req)
                return;
        }
 
-       if (edac_mc_assert_error_check_and_clear())
+       if (edac_op_state == EDAC_OPSTATE_POLL)
                mci->edac_check(mci);
 
        mutex_unlock(&mem_ctls_mutex);
@@ -601,7 +649,6 @@ static int add_mc_to_global_list(struct mem_ctl_info *mci)
        }
 
        list_add_tail_rcu(&mci->link, insert_before);
-       atomic_inc(&edac_handlers);
        return 0;
 
 fail0:
@@ -619,7 +666,6 @@ fail1:
 
 static int del_mc_from_global_list(struct mem_ctl_info *mci)
 {
-       int handlers = atomic_dec_return(&edac_handlers);
        list_del_rcu(&mci->link);
 
        /* these are for safe removal of devices from global list while
@@ -628,7 +674,7 @@ static int del_mc_from_global_list(struct mem_ctl_info *mci)
        synchronize_rcu();
        INIT_LIST_HEAD(&mci->link);
 
-       return handlers;
+       return list_empty(&mc_devices);
 }
 
 struct mem_ctl_info *edac_mc_find(int idx)
@@ -763,7 +809,7 @@ struct mem_ctl_info *edac_mc_del_mc(struct device *dev)
        /* mark MCI offline: */
        mci->op_state = OP_OFFLINE;
 
-       if (!del_mc_from_global_list(mci))
+       if (del_mc_from_global_list(mci))
                edac_mc_owner = NULL;
 
        mutex_unlock(&mem_ctls_mutex);
@@ -1195,10 +1241,13 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 
        /* Report the error via the trace interface */
        grain_bits = fls_long(e->grain) + 1;
-       trace_mc_event(type, e->msg, e->label, e->error_count,
-                      mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
-                      (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
-                      grain_bits, e->syndrome, e->other_detail);
+
+       if (IS_ENABLED(CONFIG_RAS))
+               trace_mc_event(type, e->msg, e->label, e->error_count,
+                              mci->mc_idx, e->top_layer, e->mid_layer,
+                              e->low_layer,
+                              (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
+                              grain_bits, e->syndrome, e->other_detail);
 
        edac_raw_mc_handle_error(type, mci, e);
 }
diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c
deleted file mode 100644 (file)
index 952e411..0000000
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * common EDAC components that must be in kernel
- *
- * Author: Dave Jiang <djiang@mvista.com>
- *
- * 2007 (c) MontaVista Software, Inc.
- * 2010 (c) Advanced Micro Devices Inc.
- *         Borislav Petkov <bp@alien8.de>
- *
- * This file is licensed under the terms of the GNU General Public
- * License version 2. This program is licensed "as is" without any
- * warranty of any kind, whether express or implied.
- *
- */
-#include <linux/module.h>
-#include <linux/edac.h>
-#include <linux/atomic.h>
-#include <linux/device.h>
-
-int edac_op_state = EDAC_OPSTATE_INVAL;
-EXPORT_SYMBOL_GPL(edac_op_state);
-
-atomic_t edac_handlers = ATOMIC_INIT(0);
-EXPORT_SYMBOL_GPL(edac_handlers);
-
-int edac_err_assert = 0;
-EXPORT_SYMBOL_GPL(edac_err_assert);
-
-int edac_report_status = EDAC_REPORTING_ENABLED;
-EXPORT_SYMBOL_GPL(edac_report_status);
-
-static int __init edac_report_setup(char *str)
-{
-       if (!str)
-               return -EINVAL;
-
-       if (!strncmp(str, "on", 2))
-               set_edac_report_status(EDAC_REPORTING_ENABLED);
-       else if (!strncmp(str, "off", 3))
-               set_edac_report_status(EDAC_REPORTING_DISABLED);
-       else if (!strncmp(str, "force", 5))
-               set_edac_report_status(EDAC_REPORTING_FORCE);
-
-       return 0;
-}
-__setup("edac_report=", edac_report_setup);
-
-/*
- * called to determine if there is an EDAC driver interested in
- * knowing an event (such as NMI) occurred
- */
-int edac_handler_set(void)
-{
-       if (edac_op_state == EDAC_OPSTATE_POLL)
-               return 0;
-
-       return atomic_read(&edac_handlers);
-}
-EXPORT_SYMBOL_GPL(edac_handler_set);
-
-/*
- * handler for NMI type of interrupts to assert error
- */
-void edac_atomic_assert_error(void)
-{
-       edac_err_assert++;
-}
-EXPORT_SYMBOL_GPL(edac_atomic_assert_error);
index 928e0db..1cad5a9 100644 (file)
@@ -1349,7 +1349,7 @@ static int pnd2_mce_check_error(struct notifier_block *nb, unsigned long val, vo
        struct dram_addr daddr;
        char *type;
 
-       if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
+       if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
                return NOTIFY_DONE;
 
        mci = pnd2_mci;
index a65ea44..ea21cb6 100644 (file)
@@ -3075,7 +3075,7 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val,
        struct sbridge_pvt *pvt;
        char *type;
 
-       if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
+       if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
                return NOTIFY_DONE;
 
        mci = get_mci_for_node_id(mce->socketid);
@@ -3441,7 +3441,7 @@ static int __init sbridge_init(void)
 
        if (rc >= 0) {
                mce_register_decode_chain(&sbridge_mce_dec);
-               if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
+               if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
                        sbridge_printk(KERN_WARNING, "Loading driver, error reporting disabled.\n");
                return 0;
        }
index 1159dba..64bef6c 100644 (file)
@@ -971,7 +971,7 @@ static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
        struct mem_ctl_info *mci;
        char *type;
 
-       if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
+       if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
                return NOTIFY_DONE;
 
        /* ignore unless this is memory related with an address */
diff --git a/drivers/edac/thunderx_edac.c b/drivers/edac/thunderx_edac.c
new file mode 100644 (file)
index 0000000..86d585c
--- /dev/null
@@ -0,0 +1,2174 @@
+/*
+ * Cavium ThunderX memory controller kernel module
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright Cavium, Inc. (C) 2015-2017. All rights reserved.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/edac.h>
+#include <linux/interrupt.h>
+#include <linux/string.h>
+#include <linux/stop_machine.h>
+#include <linux/delay.h>
+#include <linux/sizes.h>
+#include <linux/atomic.h>
+#include <linux/bitfield.h>
+#include <linux/circ_buf.h>
+
+#include <asm/page.h>
+
+#include "edac_module.h"
+
+#define phys_to_pfn(phys)      (PFN_DOWN(phys))
+
+#define THUNDERX_NODE          GENMASK(45, 44)
+
+/*
+ * Error classes stored in error_descr.type. Values start at 1 because
+ * decode_register() treats a zero type as the end of a descriptor table.
+ * decode_register() labels ERR_CORRECTED as "Corrected" and every other
+ * class as "Uncorrected".
+ */
+enum {
+       ERR_CORRECTED   = 1,
+       ERR_UNCORRECTED = 2,
+       ERR_UNKNOWN     = 3,
+};
+
+#define MAX_SYNDROME_REGS 4
+
+/* Container for up to MAX_SYNDROME_REGS raw 64-bit register values. */
+struct error_syndrome {
+       u64 reg[MAX_SYNDROME_REGS];
+};
+
+/*
+ * One entry of an error description table consumed by decode_register().
+ * A table ends with an entry whose type, mask or descr is zero/NULL.
+ */
+struct error_descr {
+       int     type;           /* one of the ERR_* classes */
+       u64     mask;           /* register bit(s) that signal this error */
+       char    *descr;         /* text printed when a masked bit is set */
+};
+
+/*
+ * decode_register - render the errors flagged in a register as text
+ * @str:   output buffer
+ * @size:  size of @str in bytes
+ * @descr: table of error descriptors, terminated by an entry whose type,
+ *         mask or descr is zero/NULL
+ * @reg:   register value to decode
+ *
+ * For every table entry whose mask intersects @reg, appends
+ * "\n\t<Corrected|Uncorrected>, <description>" to @str. The result is
+ * always NUL-terminated (when @size is non-zero) and is an empty string
+ * when no entry matches. Decoding stops as soon as the buffer is full.
+ */
+static void decode_register(char *str, size_t size,
+                          const struct error_descr *descr,
+                          const uint64_t reg)
+{
+       int ret;
+
+       if (!size)
+               return;
+
+       /* Make sure the caller gets a valid string even if nothing matches. */
+       str[0] = '\0';
+
+       while (descr->type && descr->mask && descr->descr) {
+               if (reg & descr->mask) {
+                       ret = snprintf(str, size, "\n\t%s, %s",
+                                      descr->type == ERR_CORRECTED ?
+                                        "Corrected" : "Uncorrected",
+                                      descr->descr);
+                       /*
+                        * snprintf() reports the length it wanted to write.
+                        * On truncation that exceeds the space left, so
+                        * advancing by it would move past the buffer and
+                        * wrap the unsigned remaining size.
+                        */
+                       if (ret < 0 || (size_t)ret >= size)
+                               break;
+
+                       str += ret;
+                       size -= ret;
+               }
+               descr++;
+       }
+}
+
+/*
+ * Extract @width bits of @data starting at bit @pos (bit 0 is the LSB).
+ *
+ * The mask is built in unsigned long so that fields of 31 bits and more do
+ * not overflow a signed int shift. A width covering the whole word (or
+ * more) keeps every bit that remains after the right shift.
+ */
+static unsigned long get_bits(unsigned long data, int pos, int width)
+{
+       unsigned long mask = ~0UL;
+
+       if (width < (int)(8 * sizeof(data)))
+               mask = (1UL << width) - 1;
+
+       return (data >> pos) & mask;
+}
+
+#define L2C_CTL                        0x87E080800000
+#define L2C_CTL_DISIDXALIAS    BIT(0)
+
+#define PCI_DEVICE_ID_THUNDER_LMC 0xa022
+
+#define LMC_FADR               0x20
+/*
+ * Field extractors for a value read from LMC_FADR. The argument is
+ * parenthesised so that an expression (e.g. "a | b") can be passed safely.
+ * Presumably these are the DIMM, rank, bank, row and column of the failing
+ * address -- confirm against the hardware manual.
+ */
+#define LMC_FADR_FDIMM(x)      (((x) >> 37) & 0x1)
+#define LMC_FADR_FBUNK(x)      (((x) >> 36) & 0x1)
+#define LMC_FADR_FBANK(x)      (((x) >> 32) & 0xf)
+#define LMC_FADR_FROW(x)       (((x) >> 14) & 0xffff)
+#define LMC_FADR_FCOL(x)       (((x) >> 0) & 0x1fff)
+
+#define LMC_NXM_FADR           0x28
+#define LMC_ECC_SYND           0x38
+
+#define LMC_ECC_PARITY_TEST    0x108
+
+#define LMC_INT_W1S            0x150
+
+#define LMC_INT_ENA_W1C                0x158
+#define LMC_INT_ENA_W1S                0x160
+
+#define LMC_CONFIG             0x188
+
+#define LMC_CONFIG_BG2         BIT(62)
+#define LMC_CONFIG_RANK_ENA    BIT(42)
+#define LMC_CONFIG_PBANK_LSB(x)        (((x) >> 5) & 0xF)
+#define LMC_CONFIG_ROW_LSB(x)  (((x) >> 2) & 0x7)
+
+#define LMC_CONTROL            0x190
+#define LMC_CONTROL_XOR_BANK   BIT(16)
+
+#define LMC_INT                        0x1F0
+
+#define LMC_INT_DDR_ERR                BIT(11)
+#define LMC_INT_DED_ERR                (0xFUL << 5)
+#define LMC_INT_SEC_ERR         (0xFUL << 1)
+#define LMC_INT_NXM_WR_MASK    BIT(0)
+
+#define LMC_DDR_PLL_CTL                0x258
+#define LMC_DDR_PLL_CTL_DDR4   BIT(29)
+
+#define LMC_FADR_SCRAMBLED     0x330
+
+#define LMC_INT_UE              (LMC_INT_DDR_ERR | LMC_INT_DED_ERR | \
+                                LMC_INT_NXM_WR_MASK)
+
+#define LMC_INT_CE             (LMC_INT_SEC_ERR)
+
+/*
+ * Description of the LMC_INT error bits and their severity. The layout
+ * (all-zero last entry) is the one decode_register() expects.
+ */
+static const struct error_descr lmc_errors[] = {
+       {
+               .descr = "Single-bit ECC error",
+               .mask  = LMC_INT_SEC_ERR,
+               .type  = ERR_CORRECTED,
+       },
+       {
+               .descr = "DDR chip error",
+               .mask  = LMC_INT_DDR_ERR,
+               .type  = ERR_UNCORRECTED,
+       },
+       {
+               .descr = "Double-bit ECC error",
+               .mask  = LMC_INT_DED_ERR,
+               .type  = ERR_UNCORRECTED,
+       },
+       {
+               .descr = "Non-existent memory write",
+               .mask  = LMC_INT_NXM_WR_MASK,
+               .type  = ERR_UNCORRECTED,
+       },
+       /* terminator */
+       { .type = 0, .mask = 0, .descr = NULL },
+};
+
+/*
+ * Individual interrupt-enable bits; presumably for the LMC_INT_ENA_W1C /
+ * LMC_INT_ENA_W1S registers -- confirm against the hardware manual.
+ */
+#define LMC_INT_EN_DDR_ERROR_ALERT_ENA BIT(5)
+#define LMC_INT_EN_DLCRAM_DED_ERR      BIT(4)
+#define LMC_INT_EN_DLCRAM_SEC_ERR      BIT(3)
+#define LMC_INT_INTR_DED_ENA           BIT(2)
+#define LMC_INT_INTR_SEC_ENA           BIT(1)
+#define LMC_INT_INTR_NXM_WR_ENA                BIT(0)
+
+/* Bits 5..0, i.e. all six enable bits listed above. */
+#define LMC_INT_ENA_ALL                        GENMASK(5, 0)
+
+/*
+ * NOTE(review): LMC_DDR_PLL_CTL, LMC_DDR_PLL_CTL_DDR4 and LMC_CONTROL
+ * repeat, with identical values, definitions made earlier in this file.
+ * Only LMC_CONTROL_RDIMM is new in this group.
+ */
+#define LMC_DDR_PLL_CTL                0x258
+#define LMC_DDR_PLL_CTL_DDR4   BIT(29)
+
+#define LMC_CONTROL            0x190
+#define LMC_CONTROL_RDIMM      BIT(0)
+
+/* NOTE(review): same offset (0x330) as LMC_FADR_SCRAMBLED defined earlier. */
+#define LMC_SCRAM_FADR         0x330
+
+#define LMC_CHAR_MASK0         0x228
+#define LMC_CHAR_MASK2         0x238
+
+#define RING_ENTRIES   8
+
+/*
+ * Description of one debugfs file: its name, permission bits and the file
+ * operations implementing it. Instances are generated by DEBUGFS_STRUCT().
+ */
+struct debugfs_entry {
+       const char *name;
+       umode_t mode;
+       const struct file_operations fops;
+};
+
+/*
+ * One recorded set of LMC register values; struct thunderx_lmc keeps
+ * RING_ENTRIES of these. The field names mirror the LMC_INT, LMC_FADR,
+ * LMC_NXM_FADR, LMC_SCRAM_FADR and LMC_ECC_SYND register offsets;
+ * presumably each field is a snapshot of that register -- confirm against
+ * the interrupt handler.
+ */
+struct lmc_err_ctx {
+       u64 reg_int;
+       u64 reg_fadr;
+       u64 reg_nxm_fadr;
+       u64 reg_scram_fadr;
+       u64 reg_ecc_synd;
+};
+
+/* Per-device private state of one ThunderX LMC (memory controller). */
+struct thunderx_lmc {
+       void __iomem *regs;             /* mapped LMC register block */
+       struct pci_dev *pdev;
+       struct msix_entry msix_ent;
+
+       atomic_t ecc_int;
+
+       /*
+        * 64-bit values; presumably the ones exposed through the debugfs
+        * field attributes (DEBUGFS_FIELD_ATTR) -- confirm at the users.
+        */
+       u64 mask0;
+       u64 mask2;
+       u64 parity_test;
+       u64 node;
+
+       /*
+        * Address-layout parameters; presumably bit positions/widths used
+        * to translate between physical and DRAM addresses -- TODO confirm.
+        */
+       int xbits;
+       int bank_width;
+       int pbank_lsb;
+       int dimm_lsb;
+       int rank_lsb;
+       int bank_lsb;
+       int row_lsb;
+       int col_hi_lsb;
+
+       int xor_bank;
+       int l2c_alias;
+
+       struct page *mem;
+
+       /*
+        * Array of RING_ENTRIES error records with head/tail positions;
+        * presumably used as a circular buffer -- confirm at the users.
+        */
+       struct lmc_err_ctx err_ctx[RING_ENTRIES];
+       unsigned long ring_head;
+       unsigned long ring_tail;
+};
+
+/*
+ * Wrap position @pos into a ring of @size entries by masking with
+ * (size - 1); this only yields a valid index when @size is a power of two.
+ * Both arguments are parenthesised so expressions may be passed.
+ */
+#define ring_pos(pos, size) ((pos) & ((size) - 1))
+
+/*
+ * Define a static struct debugfs_entry named debugfs_<_name>.
+ * @_name:  file name (stringified) and suffix of the variable name
+ * @_mode:  permission bits, checked at build time by
+ *          VERIFY_OCTAL_PERMISSIONS()
+ * @_write: write handler stored in fops.write
+ * @_read:  read handler stored in fops.read
+ * Open and llseek use simple_open() and generic_file_llseek().
+ */
+#define DEBUGFS_STRUCT(_name, _mode, _write, _read)                        \
+static struct debugfs_entry debugfs_##_name = {                                    \
+       .name = __stringify(_name),                                         \
+       .mode = VERIFY_OCTAL_PERMISSIONS(_mode),                            \
+       .fops = {                                                           \
+               .open = simple_open,                                        \
+               .write = _write,                                            \
+               .read  = _read,                                             \
+               .llseek = generic_file_llseek,                              \
+       },                                                                  \
+}
+
+/*
+ * Generate debugfs read/write handlers plus the debugfs_<_field> entry
+ * (mode 0600) for a u64 member @_field of struct thunderx_<_type>.
+ *
+ * read:  formats the field as "0x%016llx" and copies it to user space.
+ *        The formatting is bounded by the size of the local buffer (not by
+ *        the user-supplied count) and only the formatted characters are
+ *        exposed, so no uninitialised stack bytes reach user space.
+ * write: parses an unsigned 64-bit number (base auto-detected) from user
+ *        space into the field; returns @count on success or the parser's
+ *        error code.
+ */
+#define DEBUGFS_FIELD_ATTR(_type, _field)                                  \
+static ssize_t thunderx_##_type##_##_field##_read(struct file *file,       \
+                                           char __user *data,              \
+                                           size_t count, loff_t *ppos)     \
+{                                                                          \
+       struct thunderx_##_type *pdata = file->private_data;                \
+       char buf[20];                                                       \
+       int len;                                                            \
+                                                                           \
+       len = scnprintf(buf, sizeof(buf), "0x%016llx", pdata->_field);      \
+       return simple_read_from_buffer(data, count, ppos, buf, len);        \
+}                                                                          \
+                                                                           \
+static ssize_t thunderx_##_type##_##_field##_write(struct file *file,      \
+                                            const char __user *data,       \
+                                            size_t count, loff_t *ppos)    \
+{                                                                          \
+       struct thunderx_##_type *pdata = file->private_data;                \
+       int res;                                                            \
+                                                                           \
+       res = kstrtoull_from_user(data, count, 0, &pdata->_field);          \
+                                                                           \
+       return res ? res : count;                                           \
+}                                                                          \
+                                                                           \
+DEBUGFS_STRUCT(_field, 0600,                                               \
+                  thunderx_##_type##_##_field##_write,                     \
+                  thunderx_##_type##_##_field##_read)                      \
+
+/*
+ * Generate debugfs accessors for the 64-bit device register at offset @_reg
+ * from pdata->regs, where pdata is the struct thunderx_<_type> found in
+ * file->private_data:
+ *
+ *  - thunderx_<_type>_<_name>_read() reads the register with readq() and
+ *    returns it formatted as "0x%016llx".
+ *  - thunderx_<_type>_<_name>_write() parses a number from user space with
+ *    kstrtoull_from_user() (base auto-detected) and writes it to the
+ *    register with writeq(); it returns @count on success or the parse
+ *    error, in which case the register is left untouched.
+ *  - a debugfs_<_name> entry with mode 0600 wiring both handlers up.
+ *
+ * Formatting is bounded by the size of the local buffer and only the bytes
+ * actually produced are exposed to user space, so no uninitialized stack
+ * bytes are copied out.
+ */
+#define DEBUGFS_REG_ATTR(_type, _name, _reg)                               \
+static ssize_t thunderx_##_type##_##_name##_read(struct file *file,        \
+                                          char __user *data,               \
+                                          size_t count, loff_t *ppos)      \
+{                                                                          \
+       struct thunderx_##_type *pdata = file->private_data;                \
+       char buf[20];                                                       \
+       int len;                                                            \
+                                                                           \
+       len = scnprintf(buf, sizeof(buf), "0x%016llx",                      \
+                       readq(pdata->regs + _reg));                         \
+       return simple_read_from_buffer(data, count, ppos, buf, len);        \
+}                                                                          \
+                                                                           \
+static ssize_t thunderx_##_type##_##_name##_write(struct file *file,       \
+                                           const char __user *data,        \
+                                           size_t count, loff_t *ppos)     \
+{                                                                          \
+       struct thunderx_##_type *pdata = file->private_data;                \
+       u64 val;                                                            \
+       int res;                                                            \
+                                                                           \
+       res = kstrtoull_from_user(data, count, 0, &val);                    \
+                                                                           \
+       if (!res) {                                                         \
+               writeq(val, pdata->regs + _reg);                            \
+               res = count;                                                \
+       }                                                                   \
+                                                                           \
+       return res;                                                         \
+}                                                                          \
+                                                                           \
+DEBUGFS_STRUCT(_name, 0600,                                                \
+              thunderx_##_type##_##_name##_write,                          \
+              thunderx_##_type##_##_name##_read)
+
+/* Shorthand: field-backed debugfs entry for a member of struct thunderx_lmc */
+#define LMC_DEBUGFS_ENT(_field)        DEBUGFS_FIELD_ATTR(lmc, _field)
+
+/*
+ * To get an ECC error injected, the following steps are needed:
+ * - Set up the ECC injection by writing the appropriate parameters:
+ *     echo <bit mask value> > /sys/kernel/debug/<device number>/mask0
+ *     echo <bit mask value> > /sys/kernel/debug/<device number>/mask2
+ *     echo 0x802 > /sys/kernel/debug/<device number>/parity_test
+ * - Do the actual injection:
+ *     echo 1 > /sys/kernel/debug/<device number>/inject_ecc
+ */
+/*
+ * debugfs write handler for "inject_int".
+ *
+ * Parses an unsigned 64-bit number from the user buffer (base auto-detected)
+ * and writes it to the LMC_INT_W1S register of the controller stored in
+ * file->private_data.
+ *
+ * Returns @count on success, or the kstrtoull_from_user() error code, in
+ * which case the register is not touched.
+ */
+static ssize_t thunderx_lmc_inject_int_write(struct file *file,
+                                            const char __user *data,
+                                            size_t count, loff_t *ppos)
+{
+       struct thunderx_lmc *lmc = file->private_data;
+       u64 int_bits;
+       int err;
+
+       err = kstrtoull_from_user(data, count, 0, &int_bits);
+       if (err)
+               return err;
+
+       /* Trigger the interrupt */
+       writeq(int_bits, lmc->regs + LMC_INT_W1S);
+
+       return count;
+}
+
+/*
+ * debugfs read handler for "int_w1c".
+ *
+ * Reads the LMC_INT register of the controller stored in
+ * file->private_data and returns it to user space formatted as
+ * "0x%016llx". Only the formatted characters are exposed; the unused tail
+ * of the on-stack buffer is never copied out.
+ *
+ * Returns whatever simple_read_from_buffer() returns.
+ */
+static ssize_t thunderx_lmc_int_read(struct file *file,
+                                    char __user *data,
+                                    size_t count, loff_t *ppos)
+{
+       struct thunderx_lmc *lmc = file->private_data;
+       char buf[20];
+       u64 lmc_int = readq(lmc->regs + LMC_INT);
+       int len;
+
+       len = scnprintf(buf, sizeof(buf), "0x%016llx", lmc_int);
+       return simple_read_from_buffer(data, count, ppos, buf, len);
+}
+
+#define TEST_PATTERN 0xa5
+
+/*
+ * Callback run through stop_machine() by thunderx_lmc_inject_ecc_write().
+ *
+ * @arg: the struct thunderx_lmc whose ->mem page is used as the target.
+ *
+ * Programs the controller with the user-supplied mask0/mask2/parity_test
+ * values, fills memory with TEST_PATTERN and then walks the page cache line
+ * by cache line through four cache-maintenance passes (see the comments on
+ * each loop). Always returns 0.
+ */
+static int inject_ecc_fn(void *arg)
+{
+       struct thunderx_lmc *lmc = arg;
+       uintptr_t addr, phys;
+       unsigned int cline_size = cache_line_size();
+       const unsigned int lines = PAGE_SIZE / cline_size;
+       unsigned int i, cl_idx;
+
+       addr = (uintptr_t)page_address(lmc->mem);
+       phys = (uintptr_t)page_to_phys(lmc->mem);
+
+       /*
+        * Take bits [6:4] of the physical address and place them in
+        * bits [10:8] of parity_test, replacing whatever was there.
+        */
+       cl_idx = (phys & 0x7f) >> 4;
+       lmc->parity_test &= ~(7ULL << 8);
+       lmc->parity_test |= (cl_idx << 8);
+
+       writeq(lmc->mask0, lmc->regs + LMC_CHAR_MASK0);
+       writeq(lmc->mask2, lmc->regs + LMC_CHAR_MASK2);
+       writeq(lmc->parity_test, lmc->regs + LMC_ECC_PARITY_TEST);
+
+       /* Read the registers back; presumably to make sure the writes landed */
+       readq(lmc->regs + LMC_CHAR_MASK0);
+       readq(lmc->regs + LMC_CHAR_MASK2);
+       readq(lmc->regs + LMC_ECC_PARITY_TEST);
+
+       for (i = 0; i < lines; i++) {
+               /*
+                * NOTE(review): the memset always targets the start of the
+                * page (addr), while the flush below walks addr + i *
+                * cline_size. Confirm whether only the first line is meant
+                * to carry the pattern.
+                */
+               memset((void *)addr, TEST_PATTERN, cline_size);
+               barrier();
+
+               /*
+                * Flush L1 cachelines to the PoC (L2).
+                * This will cause cacheline eviction to the L2.
+                */
+               asm volatile("dc civac, %0\n"
+                            "dsb sy\n"
+                            : : "r"(addr + i * cline_size));
+       }
+
+       for (i = 0; i < lines; i++) {
+               /*
+                * Flush L2 cachelines to the DRAM.
+                * This will cause cacheline eviction to the DRAM
+                * and ECC corruption according to the masks set.
+                */
+               __asm__ volatile("sys #0,c11,C1,#2, %0\n"
+                                : : "r"(phys + i * cline_size));
+       }
+
+       for (i = 0; i < lines; i++) {
+               /*
+                * Invalidate L2 cachelines.
+                * The subsequent load will cause cacheline fetch
+                * from the DRAM and an error interrupt
+                */
+               __asm__ volatile("sys #0,c11,C1,#1, %0"
+                                : : "r"(phys + i * cline_size));
+       }
+
+       for (i = 0; i < lines; i++) {
+               /*
+                * Invalidate L1 cachelines.
+                * The subsequent load will cause cacheline fetch
+                * from the L2 and/or DRAM
+                */
+               asm volatile("dc ivac, %0\n"
+                            "dsb sy\n"
+                            : : "r"(addr + i * cline_size));
+       }
+
+       return 0;
+}
+
+/*
+ * debugfs write handler for "inject_ecc".
+ *
+ * The written data itself is not parsed; any write starts an injection run
+ * on the controller stored in file->private_data:
+ *
+ *  1. clear lmc->ecc_int and allocate one page on the controller's node,
+ *  2. repeatedly run inject_ecc_fn() under stop_machine() and then load
+ *     the page back one cache line at a time, until the error ISR sets
+ *     lmc->ecc_int or the retry budget is exhausted,
+ *  3. release the page again.
+ *
+ * The cache-line sized bounce buffer is allocated with kmalloc() rather
+ * than as a variable-length array on the kernel stack.
+ *
+ * Returns @count on completion (whether or not an interrupt was seen), or
+ * -ENOMEM if the page or the bounce buffer cannot be allocated.
+ */
+static ssize_t thunderx_lmc_inject_ecc_write(struct file *file,
+                                            const char __user *data,
+                                            size_t count, loff_t *ppos)
+{
+       struct thunderx_lmc *lmc = file->private_data;
+       unsigned int cline_size = cache_line_size();
+       unsigned int offs, timeout = 100000;
+       void *addr;
+       u8 *tmp;
+
+       atomic_set(&lmc->ecc_int, 0);
+
+       lmc->mem = alloc_pages_node(lmc->node, GFP_KERNEL, 0);
+       if (!lmc->mem)
+               return -ENOMEM;
+
+       tmp = kmalloc(cline_size, GFP_KERNEL);
+       if (!tmp) {
+               __free_pages(lmc->mem, 0);
+               return -ENOMEM;
+       }
+
+       /* Ordinary kernel mapping of the page, not an I/O mapping */
+       addr = page_address(lmc->mem);
+
+       while (!atomic_read(&lmc->ecc_int) && timeout--) {
+               stop_machine(inject_ecc_fn, lmc, NULL);
+
+               for (offs = 0; offs < PAGE_SIZE; offs += cline_size) {
+                       /*
+                        * Do a load from the previously rigged location
+                        * This should generate an error interrupt.
+                        */
+                       memcpy(tmp, addr + offs, cline_size);
+                       asm volatile("dsb ld\n");
+               }
+       }
+
+       kfree(tmp);
+       __free_pages(lmc->mem, 0);
+
+       return count;
+}
+
+/* Field-backed entries: values consumed later by inject_ecc_fn() */
+LMC_DEBUGFS_ENT(mask0);
+LMC_DEBUGFS_ENT(mask2);
+LMC_DEBUGFS_ENT(parity_test);
+
+/* Write-only triggers and a read-only view of the LMC_INT register */
+DEBUGFS_STRUCT(inject_int, 0200, thunderx_lmc_inject_int_write, NULL);
+DEBUGFS_STRUCT(inject_ecc, 0200, thunderx_lmc_inject_ecc_write, NULL);
+DEBUGFS_STRUCT(int_w1c, 0400, NULL, thunderx_lmc_int_read);
+
+/*
+ * All LMC debugfs entries, in the order they are created by
+ * thunderx_create_debugfs_nodes(). The table is only used inside this file,
+ * so it is static to keep it out of the global namespace.
+ */
+static struct debugfs_entry *lmc_dfs_ents[] = {
+       &debugfs_mask0,
+       &debugfs_mask2,
+       &debugfs_parity_test,
+       &debugfs_inject_ecc,
+       &debugfs_inject_int,
+       &debugfs_int_w1c,
+};
+
+/*
+ * Create one debugfs file per entry of @attrs below @parent.
+ *
+ * @parent: directory to create the files in
+ * @attrs:  array of entry descriptors (name, mode, file operations)
+ * @data:   private data handed to edac_debugfs_create_file() for every file
+ * @num:    number of elements in @attrs
+ *
+ * Returns 0 without doing anything when CONFIG_EDAC_DEBUG is disabled,
+ * -ENOENT when @parent is NULL, and otherwise the number of files created;
+ * creation stops at the first entry for which edac_debugfs_create_file()
+ * returns NULL.
+ */
+static int thunderx_create_debugfs_nodes(struct dentry *parent,
+                                         struct debugfs_entry *attrs[],
+                                         void *data,
+                                         size_t num)
+{
+       int created = 0;
+
+       if (!IS_ENABLED(CONFIG_EDAC_DEBUG))
+               return 0;
+
+       if (!parent)
+               return -ENOENT;
+
+       while (created < num) {
+               struct debugfs_entry *attr = attrs[created];
+
+               if (!edac_debugfs_create_file(attr->name, attr->mode,
+                                             parent, data, &attr->fops))
+                       break;
+
+               created++;
+       }
+
+       return created;
+}
+
+/*
+ * Assemble an address from the fields of an LMC failing-address register
+ * value.
+ *
+ * @faddr: raw register value; the threaded ISR passes the saved LMC_FADR
+ * @lmc:   controller whose bit layout (dimm_lsb, rank_lsb, row_lsb,
+ *         col_hi_lsb, bank_lsb, bank_width, xbits, xor_bank, l2c_alias)
+ *         was set up in thunderx_lmc_probe()
+ *
+ * Returns the assembled address.
+ */
+static phys_addr_t thunderx_faddr_to_phys(u64 faddr, struct thunderx_lmc *lmc)
+{
+       phys_addr_t addr = 0;
+       int bank, xbits;
+
+       /*
+        * NOTE(review): the shift is evaluated in the type of lmc->node,
+        * which is declared outside this view; if that type is narrower
+        * than 64 bits, shifting by 40 overflows - confirm.
+        */
+       addr |= lmc->node << 40;
+       addr |= LMC_FADR_FDIMM(faddr) << lmc->dimm_lsb;
+       addr |= LMC_FADR_FBUNK(faddr) << lmc->rank_lsb;
+       addr |= LMC_FADR_FROW(faddr) << lmc->row_lsb;
+       /* Column bits above the low four go to col_hi_lsb */
+       addr |= (LMC_FADR_FCOL(faddr) >> 4) << lmc->col_hi_lsb;
+
+       /*
+        * NOTE(review): bank is shifted by bank_lsb here and shifted by
+        * bank_lsb again when it is ORed into addr below - confirm the
+        * double shift is intended.
+        */
+       bank = LMC_FADR_FBANK(faddr) << lmc->bank_lsb;
+
+       if (lmc->xor_bank)
+               bank ^= get_bits(addr, 12 + lmc->xbits, lmc->bank_width);
+
+       addr |= bank << lmc->bank_lsb;
+
+       /* Start from the PCI function number of this controller */
+       xbits = PCI_FUNC(lmc->pdev->devfn);
+
+       if (lmc->l2c_alias)
+               xbits ^= get_bits(addr, 20, lmc->xbits) ^
+                        get_bits(addr, 12, lmc->xbits);
+
+       addr |= xbits << 7;
+
+       return addr;
+}
+
+/*
+ * Count the Cavium ThunderX LMC PCI devices present in the system.
+ *
+ * @node: NUMA node to count for. It is only honoured when CONFIG_NUMA is
+ *        enabled; otherwise every matching device is counted.
+ *
+ * Returns the number of matching devices.
+ */
+static unsigned int thunderx_get_num_lmcs(unsigned int node)
+{
+       unsigned int count = 0;
+       struct pci_dev *dev = NULL;
+
+       /* Passing the previous device back continues the search after it */
+       while ((dev = pci_get_device(PCI_VENDOR_ID_CAVIUM,
+                                    PCI_DEVICE_ID_THUNDER_LMC,
+                                    dev)) != NULL) {
+#ifdef CONFIG_NUMA
+               if (dev->dev.numa_node != node)
+                       continue;
+#endif
+               count++;
+       }
+
+       return count;
+}
+
+#define LMC_MESSAGE_SIZE       120
+#define LMC_OTHER_SIZE         (50 * ARRAY_SIZE(lmc_errors))
+
+/*
+ * Primary (hard IRQ) handler for the LMC error interrupt.
+ *
+ * @irq:    interrupt number (unused)
+ * @dev_id: the struct mem_ctl_info registered with the interrupt
+ *
+ * Snapshots the error registers into the next slot of the lmc->err_ctx ring
+ * and defers reporting to thunderx_lmc_threaded_isr() by returning
+ * IRQ_WAKE_THREAD.
+ *
+ * NOTE(review): ring_head is advanced without checking ring_tail, so a slot
+ * that has not been consumed yet can be overwritten - confirm this is
+ * acceptable.
+ */
+static irqreturn_t thunderx_lmc_err_isr(int irq, void *dev_id)
+{
+       struct mem_ctl_info *mci = dev_id;
+       struct thunderx_lmc *lmc = mci->pvt_info;
+
+       unsigned long head = ring_pos(lmc->ring_head, ARRAY_SIZE(lmc->err_ctx));
+       struct lmc_err_ctx *ctx = &lmc->err_ctx[head];
+
+       /*
+        * Overwrite the registers programmed by inject_ecc_fn(); presumably
+        * this turns error injection off again - confirm the meaning of 0x2.
+        */
+       writeq(0, lmc->regs + LMC_CHAR_MASK0);
+       writeq(0, lmc->regs + LMC_CHAR_MASK2);
+       writeq(0x2, lmc->regs + LMC_ECC_PARITY_TEST);
+
+       ctx->reg_int = readq(lmc->regs + LMC_INT);
+       ctx->reg_fadr = readq(lmc->regs + LMC_FADR);
+       ctx->reg_nxm_fadr = readq(lmc->regs + LMC_NXM_FADR);
+       ctx->reg_scram_fadr = readq(lmc->regs + LMC_SCRAM_FADR);
+       ctx->reg_ecc_synd = readq(lmc->regs + LMC_ECC_SYND);
+
+       lmc->ring_head++;
+
+       /* Lets the polling loop in thunderx_lmc_inject_ecc_write() stop */
+       atomic_set(&lmc->ecc_int, 1);
+
+       /* Clear the interrupt */
+       writeq(ctx->reg_int, lmc->regs + LMC_INT);
+
+       return IRQ_WAKE_THREAD;
+}
+
+/*
+ * Threaded half of the LMC error interrupt.
+ *
+ * @irq:    interrupt number (unused)
+ * @dev_id: the struct mem_ctl_info registered with the interrupt
+ *
+ * Drains the lmc->err_ctx ring filled by thunderx_lmc_err_isr(): for every
+ * pending snapshot it logs the raw registers, builds a location message and
+ * a decoded description, and reports the event through
+ * edac_mc_handle_error().
+ *
+ * Returns IRQ_HANDLED once the ring has been drained, or IRQ_NONE if the
+ * message buffers cannot be allocated (the ring is left untouched then).
+ */
+static irqreturn_t thunderx_lmc_threaded_isr(int irq, void *dev_id)
+{
+       struct mem_ctl_info *mci = dev_id;
+       struct thunderx_lmc *lmc = mci->pvt_info;
+       phys_addr_t phys_addr;
+
+       unsigned long tail;
+       struct lmc_err_ctx *ctx;
+
+       irqreturn_t ret = IRQ_NONE;
+
+       char *msg;
+       char *other;
+
+       msg = kmalloc(LMC_MESSAGE_SIZE, GFP_KERNEL);
+       other =  kmalloc(LMC_OTHER_SIZE, GFP_KERNEL);
+
+       /* kfree(NULL) is a no-op, so a partial allocation is handled too */
+       if (!msg || !other)
+               goto err_free;
+
+       while (CIRC_CNT(lmc->ring_head, lmc->ring_tail,
+               ARRAY_SIZE(lmc->err_ctx))) {
+               tail = ring_pos(lmc->ring_tail, ARRAY_SIZE(lmc->err_ctx));
+
+               ctx = &lmc->err_ctx[tail];
+
+               dev_dbg(&lmc->pdev->dev, "LMC_INT: %016llx\n",
+                       ctx->reg_int);
+               dev_dbg(&lmc->pdev->dev, "LMC_FADR: %016llx\n",
+                       ctx->reg_fadr);
+               dev_dbg(&lmc->pdev->dev, "LMC_NXM_FADR: %016llx\n",
+                       ctx->reg_nxm_fadr);
+               dev_dbg(&lmc->pdev->dev, "LMC_SCRAM_FADR: %016llx\n",
+                       ctx->reg_scram_fadr);
+               dev_dbg(&lmc->pdev->dev, "LMC_ECC_SYND: %016llx\n",
+                       ctx->reg_ecc_synd);
+
+               /* The location text is taken from the LMC_SCRAM_FADR snapshot */
+               snprintf(msg, LMC_MESSAGE_SIZE,
+                        "DIMM %lld rank %lld bank %lld row %lld col %lld",
+                        LMC_FADR_FDIMM(ctx->reg_scram_fadr),
+                        LMC_FADR_FBUNK(ctx->reg_scram_fadr),
+                        LMC_FADR_FBANK(ctx->reg_scram_fadr),
+                        LMC_FADR_FROW(ctx->reg_scram_fadr),
+                        LMC_FADR_FCOL(ctx->reg_scram_fadr));
+
+               decode_register(other, LMC_OTHER_SIZE, lmc_errors,
+                               ctx->reg_int);
+
+               /* The reported address is derived from the LMC_FADR snapshot */
+               phys_addr = thunderx_faddr_to_phys(ctx->reg_fadr, lmc);
+
+               /* An uncorrected error takes precedence over a corrected one */
+               if (ctx->reg_int & LMC_INT_UE)
+                       edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
+                                            phys_to_pfn(phys_addr),
+                                            offset_in_page(phys_addr),
+                                            0, -1, -1, -1, msg, other);
+               else if (ctx->reg_int & LMC_INT_CE)
+                       edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
+                                            phys_to_pfn(phys_addr),
+                                            offset_in_page(phys_addr),
+                                            0, -1, -1, -1, msg, other);
+
+               lmc->ring_tail++;
+       }
+
+       ret = IRQ_HANDLED;
+
+err_free:
+       kfree(msg);
+       kfree(other);
+
+       return ret;
+}
+
+#ifdef CONFIG_PM
+/*
+ * PCI suspend callback: save the PCI state, disable the device and put it
+ * into the power state chosen by pci_choose_state() for @state.
+ * Always returns 0; the results of the PCI calls are not checked.
+ */
+static int thunderx_lmc_suspend(struct pci_dev *pdev, pm_message_t state)
+{
+       pci_save_state(pdev);
+       pci_disable_device(pdev);
+
+       pci_set_power_state(pdev, pci_choose_state(pdev, state));
+
+       return 0;
+}
+
+/*
+ * PCI resume callback: put the device back into D0, disable wake-up from
+ * D0 and restore the PCI state saved by thunderx_lmc_suspend().
+ * Always returns 0; the results of the PCI calls are not checked.
+ *
+ * NOTE(review): suspend calls pci_disable_device() but no matching enable
+ * is visible here - confirm whether one is needed.
+ */
+static int thunderx_lmc_resume(struct pci_dev *pdev)
+{
+       pci_set_power_state(pdev, PCI_D0);
+       pci_enable_wake(pdev, PCI_D0, 0);
+       pci_restore_state(pdev);
+
+       return 0;
+}
+#endif
+
+/* PCI IDs handled by the LMC driver; the all-zero entry ends the table */
+static const struct pci_device_id thunderx_lmc_pci_tbl[] = {
+       { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVICE_ID_THUNDER_LMC) },
+       { 0, },
+};
+
+/*
+ * Derive the memory-controller index used for edac_mc_alloc() from a PCI
+ * device: the PCI function number, plus the device's NUMA node (negative
+ * node ids are treated as 0) shifted left by three bits.
+ */
+static inline int pci_dev_to_mc_idx(struct pci_dev *pdev)
+{
+       int func = PCI_FUNC(pdev->devfn);
+       int nid = dev_to_node(&pdev->dev);
+
+       return func + (max(nid, 0) << 3);
+}
+
+/*
+ * Probe one ThunderX LMC (memory controller) PCI device.
+ *
+ * @pdev: the PCI device being bound
+ * @id:   matching entry of thunderx_lmc_pci_tbl (unused)
+ *
+ * Enables the device and maps BAR 0, allocates an EDAC memory controller
+ * with the driver state as private data, derives the memory type and the
+ * address-bit layout from the controller registers, hooks up the MSI-X
+ * error interrupt, registers the controller with the EDAC core, enables
+ * the error interrupts and, with CONFIG_EDAC_DEBUG, creates the debugfs
+ * files.
+ *
+ * Returns 0 on success or a negative errno. Every failure after the EDAC
+ * controller has been allocated frees it again and clears the driver data.
+ */
+static int thunderx_lmc_probe(struct pci_dev *pdev,
+                               const struct pci_device_id *id)
+{
+       struct thunderx_lmc *lmc;
+       struct edac_mc_layer layer;
+       struct mem_ctl_info *mci;
+       u64 lmc_control, lmc_ddr_pll_ctl, lmc_config;
+       int ret;
+       u64 lmc_int;
+       void __iomem *l2c_ioaddr;
+
+       layer.type = EDAC_MC_LAYER_SLOT;
+       layer.size = 2;
+       layer.is_virt_csrow = false;
+
+       ret = pcim_enable_device(pdev);
+       if (ret) {
+               dev_err(&pdev->dev, "Cannot enable PCI device: %d\n", ret);
+               return ret;
+       }
+
+       ret = pcim_iomap_regions(pdev, BIT(0), "thunderx_lmc");
+       if (ret) {
+               dev_err(&pdev->dev, "Cannot map PCI resources: %d\n", ret);
+               return ret;
+       }
+
+       mci = edac_mc_alloc(pci_dev_to_mc_idx(pdev), 1, &layer,
+                           sizeof(struct thunderx_lmc));
+       if (!mci)
+               return -ENOMEM;
+
+       mci->pdev = &pdev->dev;
+       lmc = mci->pvt_info;
+
+       pci_set_drvdata(pdev, mci);
+
+       lmc->regs = pcim_iomap_table(pdev)[0];
+
+       lmc_control = readq(lmc->regs + LMC_CONTROL);
+       lmc_ddr_pll_ctl = readq(lmc->regs + LMC_DDR_PLL_CTL);
+       lmc_config = readq(lmc->regs + LMC_CONFIG);
+
+       /* Registered vs. unregistered DIMMs, DDR4 vs. DDR3 */
+       if (lmc_control & LMC_CONTROL_RDIMM) {
+               mci->mtype_cap = FIELD_GET(LMC_DDR_PLL_CTL_DDR4,
+                                          lmc_ddr_pll_ctl) ?
+                               MEM_RDDR4 : MEM_RDDR3;
+       } else {
+               mci->mtype_cap = FIELD_GET(LMC_DDR_PLL_CTL_DDR4,
+                                          lmc_ddr_pll_ctl) ?
+                               MEM_DDR4 : MEM_DDR3;
+       }
+
+       mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED;
+       mci->edac_cap = EDAC_FLAG_SECDED;
+
+       mci->mod_name = "thunderx-lmc";
+       mci->mod_ver = "1";
+       mci->ctl_name = "thunderx-lmc";
+       mci->dev_name = dev_name(&pdev->dev);
+       mci->scrub_mode = SCRUB_NONE;
+
+       lmc->pdev = pdev;
+       lmc->msix_ent.entry = 0;
+
+       lmc->ring_head = 0;
+       lmc->ring_tail = 0;
+
+       ret = pci_enable_msix_exact(pdev, &lmc->msix_ent, 1);
+       if (ret) {
+               dev_err(&pdev->dev, "Cannot enable interrupt: %d\n", ret);
+               goto err_free;
+       }
+
+       ret = devm_request_threaded_irq(&pdev->dev, lmc->msix_ent.vector,
+                                       thunderx_lmc_err_isr,
+                                       thunderx_lmc_threaded_isr, 0,
+                                       "[EDAC] ThunderX LMC", mci);
+       if (ret) {
+               dev_err(&pdev->dev, "Cannot set ISR: %d\n", ret);
+               goto err_free;
+       }
+
+       lmc->node = FIELD_GET(THUNDERX_NODE, pci_resource_start(pdev, 0));
+
+       /* Address-bit layout consumed by thunderx_faddr_to_phys() */
+       lmc->xbits = thunderx_get_num_lmcs(lmc->node) >> 1;
+       lmc->bank_width = (FIELD_GET(LMC_DDR_PLL_CTL_DDR4, lmc_ddr_pll_ctl) &&
+                          FIELD_GET(LMC_CONFIG_BG2, lmc_config)) ? 4 : 3;
+
+       lmc->pbank_lsb = (lmc_config >> 5) & 0xf;
+       lmc->dimm_lsb  = 28 + lmc->pbank_lsb + lmc->xbits;
+       lmc->rank_lsb = lmc->dimm_lsb;
+       lmc->rank_lsb -= FIELD_GET(LMC_CONFIG_RANK_ENA, lmc_config) ? 1 : 0;
+       lmc->bank_lsb = 7 + lmc->xbits;
+       lmc->row_lsb = 14 + LMC_CONFIG_ROW_LSB(lmc_config) + lmc->xbits;
+
+       lmc->col_hi_lsb = lmc->bank_lsb + lmc->bank_width;
+
+       lmc->xor_bank = lmc_control & LMC_CONTROL_XOR_BANK;
+
+       /* Temporarily map L2C_CTL of this node to read the alias setting */
+       l2c_ioaddr = ioremap(L2C_CTL | FIELD_PREP(THUNDERX_NODE, lmc->node),
+                            PAGE_SIZE);
+
+       if (!l2c_ioaddr) {
+               dev_err(&pdev->dev, "Cannot map L2C_CTL\n");
+               /*
+                * ret still holds 0 from the successful IRQ request; set a
+                * real error code so the failure is not reported as success
+                * after mci has been freed.
+                */
+               ret = -ENOMEM;
+               goto err_free;
+       }
+
+       lmc->l2c_alias = !(readq(l2c_ioaddr) & L2C_CTL_DISIDXALIAS);
+
+       iounmap(l2c_ioaddr);
+
+       ret = edac_mc_add_mc(mci);
+       if (ret) {
+               dev_err(&pdev->dev, "Cannot add the MC: %d\n", ret);
+               goto err_free;
+       }
+
+       /* Write back whatever is pending in LMC_INT before enabling */
+       lmc_int = readq(lmc->regs + LMC_INT);
+       writeq(lmc_int, lmc->regs + LMC_INT);
+
+       writeq(LMC_INT_ENA_ALL, lmc->regs + LMC_INT_ENA_W1S);
+
+       if (IS_ENABLED(CONFIG_EDAC_DEBUG)) {
+               ret = thunderx_create_debugfs_nodes(mci->debugfs,
+                                                   lmc_dfs_ents,
+                                                   lmc,
+                                                   ARRAY_SIZE(lmc_dfs_ents));
+
+               /* Missing debugfs files are only worth a warning */
+               if (ret != ARRAY_SIZE(lmc_dfs_ents)) {
+                       dev_warn(&pdev->dev, "Error creating debugfs entries: %d%s\n",
+                                ret, ret >= 0 ? " created" : "");
+               }
+       }
+
+       return 0;
+
+err_free:
+       pci_set_drvdata(pdev, NULL);
+       edac_mc_free(mci);
+
+       return ret;
+}
+
+/*
+ * PCI remove callback: disable the LMC error interrupts by writing
+ * LMC_INT_ENA_ALL to the enable-W1C register, then unregister the memory
+ * controller from the EDAC core and free it.
+ */
+static void thunderx_lmc_remove(struct pci_dev *pdev)
+{
+       struct mem_ctl_info *mci = pci_get_drvdata(pdev);
+       struct thunderx_lmc *lmc = mci->pvt_info;
+
+       /* Mask the interrupts before the controller goes away */
+       writeq(LMC_INT_ENA_ALL, lmc->regs + LMC_INT_ENA_W1C);
+
+       edac_mc_del_mc(&pdev->dev);
+       edac_mc_free(mci);
+}
+
+MODULE_DEVICE_TABLE(pci, thunderx_lmc_pci_tbl);
+
+/*
+ * PCI driver glue for the LMC: probe/remove always, suspend/resume only
+ * when CONFIG_PM is enabled.
+ */
+static struct pci_driver thunderx_lmc_driver = {
+       .name     = "thunderx_lmc_edac",
+       .probe    = thunderx_lmc_probe,
+       .remove   = thunderx_lmc_remove,
+#ifdef CONFIG_PM
+       .suspend  = thunderx_lmc_suspend,
+       .resume   = thunderx_lmc_resume,
+#endif
+       .id_table = thunderx_lmc_pci_tbl,
+};
+
+/*---------------------- OCX driver ---------------------------------*/
+
+#define PCI_DEVICE_ID_THUNDER_OCX 0xa013
+
+#define OCX_LINK_INTS          3
+#define OCX_INTS               (OCX_LINK_INTS + 1)
+#define OCX_RX_LANES           24
+#define OCX_RX_LANE_STATS      15
+
+#define OCX_COM_INT            0x100
+#define OCX_COM_INT_W1S                0x108
+#define OCX_COM_INT_ENA_W1S    0x110
+#define OCX_COM_INT_ENA_W1C    0x118
+
+#define OCX_COM_IO_BADID               BIT(54)
+#define OCX_COM_MEM_BADID              BIT(53)
+#define OCX_COM_COPR_BADID             BIT(52)
+#define OCX_COM_WIN_REQ_BADID          BIT(51)
+#define OCX_COM_WIN_REQ_TOUT           BIT(50)
+#define OCX_COM_RX_LANE                        GENMASK(23, 0)
+
+#define OCX_COM_INT_CE                 (OCX_COM_IO_BADID      | \
+                                        OCX_COM_MEM_BADID     | \
+                                        OCX_COM_COPR_BADID    | \
+                                        OCX_COM_WIN_REQ_BADID | \
+                                        OCX_COM_WIN_REQ_TOUT)
+
+/*
+ * Descriptions of the error bits in OCX_COM_INT, passed to
+ * decode_register(). All of them are classified as corrected; the table
+ * ends with an all-zero entry.
+ */
+static const struct error_descr ocx_com_errors[] = {
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = OCX_COM_IO_BADID,
+               .descr = "Invalid IO transaction node ID",
+       },
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = OCX_COM_MEM_BADID,
+               .descr = "Invalid memory transaction node ID",
+       },
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = OCX_COM_COPR_BADID,
+               .descr = "Invalid coprocessor transaction node ID",
+       },
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = OCX_COM_WIN_REQ_BADID,
+               .descr = "Invalid SLI transaction node ID",
+       },
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = OCX_COM_WIN_REQ_TOUT,
+               .descr = "Window/core request timeout",
+       },
+       {0, 0, NULL},
+};
+
+#define OCX_COM_LINKX_INT(x)           (0x120 + (x) * 8)
+#define OCX_COM_LINKX_INT_W1S(x)       (0x140 + (x) * 8)
+#define OCX_COM_LINKX_INT_ENA_W1S(x)   (0x160 + (x) * 8)
+#define OCX_COM_LINKX_INT_ENA_W1C(x)   (0x180 + (x) * 8)
+
+#define OCX_COM_LINK_BAD_WORD                  BIT(13)
+#define OCX_COM_LINK_ALIGN_FAIL                        BIT(12)
+#define OCX_COM_LINK_ALIGN_DONE                        BIT(11)
+#define OCX_COM_LINK_UP                                BIT(10)
+#define OCX_COM_LINK_STOP                      BIT(9)
+#define OCX_COM_LINK_BLK_ERR                   BIT(8)
+#define OCX_COM_LINK_REINIT                    BIT(7)
+#define OCX_COM_LINK_LNK_DATA                  BIT(6)
+#define OCX_COM_LINK_RXFIFO_DBE                        BIT(5)
+#define OCX_COM_LINK_RXFIFO_SBE                        BIT(4)
+#define OCX_COM_LINK_TXFIFO_DBE                        BIT(3)
+#define OCX_COM_LINK_TXFIFO_SBE                        BIT(2)
+#define OCX_COM_LINK_REPLAY_DBE                        BIT(1)
+#define OCX_COM_LINK_REPLAY_SBE                        BIT(0)
+
+/*
+ * Descriptions of the error bits in the per-link OCX_COM_LINKX_INT
+ * registers: corrected (single-bit and link-level) errors first, then
+ * uncorrected ones. The table ends with an all-zero entry.
+ */
+static const struct error_descr ocx_com_link_errors[] = {
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = OCX_COM_LINK_REPLAY_SBE,
+               .descr = "Replay buffer single-bit error",
+       },
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = OCX_COM_LINK_TXFIFO_SBE,
+               .descr = "TX FIFO single-bit error",
+       },
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = OCX_COM_LINK_RXFIFO_SBE,
+               .descr = "RX FIFO single-bit error",
+       },
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = OCX_COM_LINK_BLK_ERR,
+               .descr = "Block code error",
+       },
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = OCX_COM_LINK_ALIGN_FAIL,
+               .descr = "Link alignment failure",
+       },
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = OCX_COM_LINK_BAD_WORD,
+               .descr = "Bad code word",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = OCX_COM_LINK_REPLAY_DBE,
+               .descr = "Replay buffer double-bit error",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = OCX_COM_LINK_TXFIFO_DBE,
+               .descr = "TX FIFO double-bit error",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = OCX_COM_LINK_RXFIFO_DBE,
+               .descr = "RX FIFO double-bit error",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = OCX_COM_LINK_STOP,
+               .descr = "Link stopped",
+       },
+       {0, 0, NULL},
+};
+
+#define OCX_COM_LINK_INT_UE       (OCX_COM_LINK_REPLAY_DBE | \
+                                  OCX_COM_LINK_TXFIFO_DBE | \
+                                  OCX_COM_LINK_RXFIFO_DBE | \
+                                  OCX_COM_LINK_STOP)
+
+#define OCX_COM_LINK_INT_CE       (OCX_COM_LINK_REPLAY_SBE | \
+                                  OCX_COM_LINK_TXFIFO_SBE | \
+                                  OCX_COM_LINK_RXFIFO_SBE | \
+                                  OCX_COM_LINK_BLK_ERR    | \
+                                  OCX_COM_LINK_ALIGN_FAIL | \
+                                  OCX_COM_LINK_BAD_WORD)
+
+#define OCX_LNE_INT(x)                 (0x8018 + (x) * 0x100)
+#define OCX_LNE_INT_EN(x)              (0x8020 + (x) * 0x100)
+#define OCX_LNE_BAD_CNT(x)             (0x8028 + (x) * 0x100)
+#define OCX_LNE_CFG(x)                 (0x8000 + (x) * 0x100)
+#define OCX_LNE_STAT(x, y)             (0x8040 + (x) * 0x100 + (y) * 8)
+
+#define OCX_LNE_CFG_RX_BDRY_LOCK_DIS           BIT(8)
+#define OCX_LNE_CFG_RX_STAT_WRAP_DIS           BIT(2)
+#define OCX_LNE_CFG_RX_STAT_RDCLR              BIT(1)
+#define OCX_LNE_CFG_RX_STAT_ENA                        BIT(0)
+
+
+#define OCX_LANE_BAD_64B67B                    BIT(8)
+#define OCX_LANE_DSKEW_FIFO_OVFL               BIT(5)
+#define OCX_LANE_SCRM_SYNC_LOSS                        BIT(4)
+#define OCX_LANE_UKWN_CNTL_WORD                        BIT(3)
+#define OCX_LANE_CRC32_ERR                     BIT(2)
+#define OCX_LANE_BDRY_SYNC_LOSS                        BIT(1)
+#define OCX_LANE_SERDES_LOCK_LOSS              BIT(0)
+
+#define OCX_COM_LANE_INT_UE       (0)
+#define OCX_COM_LANE_INT_CE       (OCX_LANE_SERDES_LOCK_LOSS | \
+                                  OCX_LANE_BDRY_SYNC_LOSS   | \
+                                  OCX_LANE_CRC32_ERR        | \
+                                  OCX_LANE_UKWN_CNTL_WORD   | \
+                                  OCX_LANE_SCRM_SYNC_LOSS   | \
+                                  OCX_LANE_DSKEW_FIFO_OVFL  | \
+                                  OCX_LANE_BAD_64B67B)
+
+/*
+ * Descriptions of the error bits in the per-lane OCX_LNE_INT registers.
+ * All of them are classified as corrected; the table ends with an all-zero
+ * entry.
+ */
+static const struct error_descr ocx_lane_errors[] = {
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = OCX_LANE_SERDES_LOCK_LOSS,
+               .descr = "RX SerDes lock lost",
+       },
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = OCX_LANE_BDRY_SYNC_LOSS,
+               .descr = "RX word boundary lost",
+       },
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = OCX_LANE_CRC32_ERR,
+               .descr = "CRC32 error",
+       },
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = OCX_LANE_UKWN_CNTL_WORD,
+               .descr = "Unknown control word",
+       },
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = OCX_LANE_SCRM_SYNC_LOSS,
+               .descr = "Scrambler synchronization lost",
+       },
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = OCX_LANE_DSKEW_FIFO_OVFL,
+               .descr = "RX deskew FIFO overflow",
+       },
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = OCX_LANE_BAD_64B67B,
+               .descr = "Bad 64B/67B codeword",
+       },
+       {0, 0, NULL},
+};
+
+#define OCX_LNE_INT_ENA_ALL            (GENMASK(9, 8) | GENMASK(6, 0))
+#define OCX_COM_INT_ENA_ALL            (GENMASK(54, 50) | GENMASK(23, 0))
+#define OCX_COM_LINKX_INT_ENA_ALL      (GENMASK(13, 12) | \
+                                        GENMASK(9, 7) | GENMASK(5, 0))
+
+#define OCX_TLKX_ECC_CTL(x)            (0x10018 + (x) * 0x2000)
+#define OCX_RLKX_ECC_CTL(x)            (0x18018 + (x) * 0x2000)
+
+/*
+ * Snapshot of one OCX COM interrupt, filled by thunderx_ocx_com_isr():
+ * the OCX_COM_INT value plus, for every RX lane, its OCX_LNE_INT value and
+ * statistics register 11.
+ */
+struct ocx_com_err_ctx {
+       u64 reg_com_int;
+       u64 reg_lane_int[OCX_RX_LANES];
+       u64 reg_lane_stat11[OCX_RX_LANES];
+};
+
+/*
+ * Snapshot of one OCX link interrupt: presumably the OCX_COM_LINKX_INT
+ * value and the number of the link it was read from (the code filling it
+ * is outside this view).
+ */
+struct ocx_link_err_ctx {
+       u64 reg_com_link_int;
+       int link;
+};
+
+/* Per-device state of the OCX driver */
+struct thunderx_ocx {
+       /* Mapped device registers; all OCX_* offsets are relative to this */
+       void __iomem *regs;
+       int com_link;
+       struct pci_dev *pdev;
+       struct edac_device_ctl_info *edac_dev;
+
+       struct dentry *debugfs;
+       /* One MSI-X entry per interrupt; the ISRs recover this struct from it */
+       struct msix_entry msix_ent[OCX_INTS];
+
+       /* Rings of interrupt snapshots, indexed via ring_pos() */
+       struct ocx_com_err_ctx com_err_ctx[RING_ENTRIES];
+       struct ocx_link_err_ctx link_err_ctx[RING_ENTRIES];
+
+       /* Free-running producer/consumer counters of the COM ring */
+       unsigned long com_ring_head;
+       unsigned long com_ring_tail;
+
+       /* Free-running producer/consumer counters of the link ring */
+       unsigned long link_ring_head;
+       unsigned long link_ring_tail;
+};
+
+#define OCX_MESSAGE_SIZE       SZ_1K
+#define OCX_OTHER_SIZE         (50 * ARRAY_SIZE(ocx_com_link_errors))
+
+/*
+ * Primary handler of the OCX COM interrupt. It only snapshots and
+ * acknowledges the hardware state and returns IRQ_WAKE_THREAD; the
+ * snapshots are consumed from the ring by thunderx_ocx_com_threaded_isr().
+ *
+ * @irq:    interrupt number (unused)
+ * @irq_id: pointer to the struct msix_entry inside struct thunderx_ocx
+ *          that belongs to this interrupt
+ *
+ * NOTE(review): com_ring_head is advanced without checking com_ring_tail,
+ * so an unconsumed slot can be overwritten - confirm this is acceptable.
+ */
+static irqreturn_t thunderx_ocx_com_isr(int irq, void *irq_id)
+{
+       struct msix_entry *msix = irq_id;
+       /* msix->entry is the index of this entry within ocx->msix_ent[] */
+       struct thunderx_ocx *ocx = container_of(msix, struct thunderx_ocx,
+                                               msix_ent[msix->entry]);
+
+       int lane;
+       unsigned long head = ring_pos(ocx->com_ring_head,
+                                     ARRAY_SIZE(ocx->com_err_ctx));
+       struct ocx_com_err_ctx *ctx = &ocx->com_err_ctx[head];
+
+       ctx->reg_com_int = readq(ocx->regs + OCX_COM_INT);
+
+       for (lane = 0; lane < OCX_RX_LANES; lane++) {
+               ctx->reg_lane_int[lane] =
+                       readq(ocx->regs + OCX_LNE_INT(lane));
+               ctx->reg_lane_stat11[lane] =
+                       readq(ocx->regs + OCX_LNE_STAT(lane, 11));
+
+               /* Write the value back; presumably write-1-to-clear */
+               writeq(ctx->reg_lane_int[lane], ocx->regs + OCX_LNE_INT(lane));
+       }
+
+       /* Same write-back for the COM interrupt register itself */
+       writeq(ctx->reg_com_int, ocx->regs + OCX_COM_INT);
+
+       ocx->com_ring_head++;
+
+       return IRQ_WAKE_THREAD;
+}
+
+/*
+ * Threaded half of the OCX common interrupt.
+ *
+ * Drains the com_err_ctx ring filled by thunderx_ocx_com_isr(). For every
+ * pending entry it builds a text report (the raw OCX_COM_INT value, its
+ * decode_register() rendering and, for each lane whose bit is set in
+ * OCX_COM_INT, the captured lane registers and their decoding) and hands
+ * it to edac_device_handle_ce() when a bit of OCX_COM_INT_CE is set.
+ *
+ * Returns IRQ_HANDLED once the ring is drained, or IRQ_NONE when the
+ * report buffers cannot be allocated (the ring is then left untouched).
+ */
+static irqreturn_t thunderx_ocx_com_threaded_isr(int irq, void *irq_id)
+{
+       struct msix_entry *msix = irq_id;
+       struct thunderx_ocx *ocx = container_of(msix, struct thunderx_ocx,
+                                               msix_ent[msix->entry]);
+
+       irqreturn_t ret = IRQ_NONE;
+
+       unsigned long tail;
+       struct ocx_com_err_ctx *ctx;
+       int lane;
+       char *msg;
+       char *other;
+
+       msg = kmalloc(OCX_MESSAGE_SIZE, GFP_KERNEL);
+       other = kmalloc(OCX_OTHER_SIZE, GFP_KERNEL);
+
+       if (!msg || !other)
+               goto err_free;
+
+       while (CIRC_CNT(ocx->com_ring_head, ocx->com_ring_tail,
+                       ARRAY_SIZE(ocx->com_err_ctx))) {
+               tail = ring_pos(ocx->com_ring_tail,
+                               ARRAY_SIZE(ocx->com_err_ctx));
+               ctx = &ocx->com_err_ctx[tail];
+
+               snprintf(msg, OCX_MESSAGE_SIZE, "%s: OCX_COM_INT: %016llx",
+                       ocx->edac_dev->ctl_name, ctx->reg_com_int);
+
+               decode_register(other, OCX_OTHER_SIZE,
+                               ocx_com_errors, ctx->reg_com_int);
+
+               /*
+                * strlcat() takes the total size of the destination, so the
+                * report can never grow past OCX_MESSAGE_SIZE. strncat()
+                * would treat the size as the number of bytes to append.
+                */
+               strlcat(msg, other, OCX_MESSAGE_SIZE);
+
+               /* Bit <lane> of OCX_COM_INT flags an event on that lane */
+               for (lane = 0; lane < OCX_RX_LANES; lane++)
+                       if (ctx->reg_com_int & BIT(lane)) {
+                               snprintf(other, OCX_OTHER_SIZE,
+                                        "\n\tOCX_LNE_INT[%02d]: %016llx OCX_LNE_STAT11[%02d]: %016llx",
+                                        lane, ctx->reg_lane_int[lane],
+                                        lane, ctx->reg_lane_stat11[lane]);
+
+                               strlcat(msg, other, OCX_MESSAGE_SIZE);
+
+                               decode_register(other, OCX_OTHER_SIZE,
+                                               ocx_lane_errors,
+                                               ctx->reg_lane_int[lane]);
+                               strlcat(msg, other, OCX_MESSAGE_SIZE);
+                       }
+
+               if (ctx->reg_com_int & OCX_COM_INT_CE)
+                       edac_device_handle_ce(ocx->edac_dev, 0, 0, msg);
+
+               ocx->com_ring_tail++;
+       }
+
+       ret = IRQ_HANDLED;
+
+err_free:
+       /* kfree(NULL) is a no-op, so a partial allocation is fine here */
+       kfree(other);
+       kfree(msg);
+
+       return ret;
+}
+
+/*
+ * Primary (hard-IRQ) handler for the per-link OCX interrupts.
+ *
+ * The MSI-X entry index doubles as the link number. The handler stores the
+ * link number and the value of OCX_COM_LINKX_INT(link) in the link_err_ctx
+ * slot selected by the ring head, writes that value back to the register,
+ * advances the head and wakes thunderx_ocx_lnk_threaded_isr().
+ *
+ * NOTE(review): the write-back presumably clears the pending bits
+ * (write-one-to-clear) - confirm against the hardware manual.
+ */
+static irqreturn_t thunderx_ocx_lnk_isr(int irq, void *irq_id)
+{
+       struct msix_entry *msix = irq_id;
+       struct thunderx_ocx *ocx = container_of(msix, struct thunderx_ocx,
+                                               msix_ent[msix->entry]);
+       unsigned long head = ring_pos(ocx->link_ring_head,
+                                     ARRAY_SIZE(ocx->link_err_ctx));
+       struct ocx_link_err_ctx *ctx = &ocx->link_err_ctx[head];
+
+       ctx->link = msix->entry;
+       ctx->reg_com_link_int = readq(ocx->regs + OCX_COM_LINKX_INT(ctx->link));
+
+       writeq(ctx->reg_com_link_int, ocx->regs + OCX_COM_LINKX_INT(ctx->link));
+
+       /* Publish the slot to the threaded handler */
+       ocx->link_ring_head++;
+
+       return IRQ_WAKE_THREAD;
+}
+
+/*
+ * Threaded half of the per-link OCX interrupts.
+ *
+ * Drains the link_err_ctx ring filled by thunderx_ocx_lnk_isr(). Every
+ * pending entry is rendered as text (raw OCX_COM_LINK_INT value plus its
+ * decode_register() rendering) and reported as uncorrectable when a bit of
+ * OCX_COM_LINK_INT_UE is set, otherwise as correctable when a bit of
+ * OCX_COM_LINK_INT_CE is set.
+ *
+ * Returns IRQ_HANDLED once the ring is drained, or IRQ_NONE when the
+ * report buffers cannot be allocated (the ring is then left untouched).
+ */
+static irqreturn_t thunderx_ocx_lnk_threaded_isr(int irq, void *irq_id)
+{
+       struct msix_entry *msix = irq_id;
+       struct thunderx_ocx *ocx = container_of(msix, struct thunderx_ocx,
+                                               msix_ent[msix->entry]);
+       irqreturn_t ret = IRQ_NONE;
+       unsigned long tail;
+       struct ocx_link_err_ctx *ctx;
+
+       char *msg;
+       char *other;
+
+       msg = kmalloc(OCX_MESSAGE_SIZE, GFP_KERNEL);
+       other = kmalloc(OCX_OTHER_SIZE, GFP_KERNEL);
+
+       if (!msg || !other)
+               goto err_free;
+
+       while (CIRC_CNT(ocx->link_ring_head, ocx->link_ring_tail,
+                       ARRAY_SIZE(ocx->link_err_ctx))) {
+               /*
+                * Consume from the tail. Indexing with the head would read
+                * the slot the hard IRQ is going to fill next, not the
+                * oldest pending entry.
+                */
+               tail = ring_pos(ocx->link_ring_tail,
+                               ARRAY_SIZE(ocx->link_err_ctx));
+
+               ctx = &ocx->link_err_ctx[tail];
+
+               snprintf(msg, OCX_MESSAGE_SIZE,
+                        "%s: OCX_COM_LINK_INT[%d]: %016llx",
+                        ocx->edac_dev->ctl_name,
+                        ctx->link, ctx->reg_com_link_int);
+
+               decode_register(other, OCX_OTHER_SIZE,
+                               ocx_com_link_errors, ctx->reg_com_link_int);
+
+               /* Bounded by the total size of msg, unlike strncat() */
+               strlcat(msg, other, OCX_MESSAGE_SIZE);
+
+               if (ctx->reg_com_link_int & OCX_COM_LINK_INT_UE)
+                       edac_device_handle_ue(ocx->edac_dev, 0, 0, msg);
+               else if (ctx->reg_com_link_int & OCX_COM_LINK_INT_CE)
+                       edac_device_handle_ce(ocx->edac_dev, 0, 0, msg);
+
+               ocx->link_ring_tail++;
+       }
+
+       ret = IRQ_HANDLED;
+err_free:
+       /* kfree(NULL) is a no-op, so a partial allocation is fine here */
+       kfree(other);
+       kfree(msg);
+
+       return ret;
+}
+
+/*
+ * Shorthand for DEBUGFS_REG_ATTR() (defined elsewhere in this file) with the
+ * "ocx" device kind. Each use below defines a debugfs_<name> object bound to
+ * the given register offset; ocx_dfs_ents collects their addresses.
+ */
+#define OCX_DEBUGFS_ATTR(_name, _reg)  DEBUGFS_REG_ATTR(ocx, _name, _reg)
+
+/* Per-link ECC control registers, links 0-2 */
+OCX_DEBUGFS_ATTR(tlk0_ecc_ctl, OCX_TLKX_ECC_CTL(0));
+OCX_DEBUGFS_ATTR(tlk1_ecc_ctl, OCX_TLKX_ECC_CTL(1));
+OCX_DEBUGFS_ATTR(tlk2_ecc_ctl, OCX_TLKX_ECC_CTL(2));
+
+OCX_DEBUGFS_ATTR(rlk0_ecc_ctl, OCX_RLKX_ECC_CTL(0));
+OCX_DEBUGFS_ATTR(rlk1_ecc_ctl, OCX_RLKX_ECC_CTL(1));
+OCX_DEBUGFS_ATTR(rlk2_ecc_ctl, OCX_RLKX_ECC_CTL(2));
+
+/* Per-link interrupt registers (W1S aliases), links 0-2 */
+OCX_DEBUGFS_ATTR(com_link0_int, OCX_COM_LINKX_INT_W1S(0));
+OCX_DEBUGFS_ATTR(com_link1_int, OCX_COM_LINKX_INT_W1S(1));
+OCX_DEBUGFS_ATTR(com_link2_int, OCX_COM_LINKX_INT_W1S(2));
+
+/* OCX_LNE_BAD_CNT registers, lanes 0-23 */
+OCX_DEBUGFS_ATTR(lne00_badcnt, OCX_LNE_BAD_CNT(0));
+OCX_DEBUGFS_ATTR(lne01_badcnt, OCX_LNE_BAD_CNT(1));
+OCX_DEBUGFS_ATTR(lne02_badcnt, OCX_LNE_BAD_CNT(2));
+OCX_DEBUGFS_ATTR(lne03_badcnt, OCX_LNE_BAD_CNT(3));
+OCX_DEBUGFS_ATTR(lne04_badcnt, OCX_LNE_BAD_CNT(4));
+OCX_DEBUGFS_ATTR(lne05_badcnt, OCX_LNE_BAD_CNT(5));
+OCX_DEBUGFS_ATTR(lne06_badcnt, OCX_LNE_BAD_CNT(6));
+OCX_DEBUGFS_ATTR(lne07_badcnt, OCX_LNE_BAD_CNT(7));
+
+OCX_DEBUGFS_ATTR(lne08_badcnt, OCX_LNE_BAD_CNT(8));
+OCX_DEBUGFS_ATTR(lne09_badcnt, OCX_LNE_BAD_CNT(9));
+OCX_DEBUGFS_ATTR(lne10_badcnt, OCX_LNE_BAD_CNT(10));
+OCX_DEBUGFS_ATTR(lne11_badcnt, OCX_LNE_BAD_CNT(11));
+OCX_DEBUGFS_ATTR(lne12_badcnt, OCX_LNE_BAD_CNT(12));
+OCX_DEBUGFS_ATTR(lne13_badcnt, OCX_LNE_BAD_CNT(13));
+OCX_DEBUGFS_ATTR(lne14_badcnt, OCX_LNE_BAD_CNT(14));
+OCX_DEBUGFS_ATTR(lne15_badcnt, OCX_LNE_BAD_CNT(15));
+
+OCX_DEBUGFS_ATTR(lne16_badcnt, OCX_LNE_BAD_CNT(16));
+OCX_DEBUGFS_ATTR(lne17_badcnt, OCX_LNE_BAD_CNT(17));
+OCX_DEBUGFS_ATTR(lne18_badcnt, OCX_LNE_BAD_CNT(18));
+OCX_DEBUGFS_ATTR(lne19_badcnt, OCX_LNE_BAD_CNT(19));
+OCX_DEBUGFS_ATTR(lne20_badcnt, OCX_LNE_BAD_CNT(20));
+OCX_DEBUGFS_ATTR(lne21_badcnt, OCX_LNE_BAD_CNT(21));
+OCX_DEBUGFS_ATTR(lne22_badcnt, OCX_LNE_BAD_CNT(22));
+OCX_DEBUGFS_ATTR(lne23_badcnt, OCX_LNE_BAD_CNT(23));
+
+/* Common interrupt register (W1S alias) */
+OCX_DEBUGFS_ATTR(com_int, OCX_COM_INT_W1S);
+
+/*
+ * All OCX debugfs entries. thunderx_ocx_probe() passes this array to
+ * thunderx_create_debugfs_nodes() when CONFIG_EDAC_DEBUG is enabled.
+ */
+struct debugfs_entry *ocx_dfs_ents[] = {
+       &debugfs_tlk0_ecc_ctl,
+       &debugfs_tlk1_ecc_ctl,
+       &debugfs_tlk2_ecc_ctl,
+
+       &debugfs_rlk0_ecc_ctl,
+       &debugfs_rlk1_ecc_ctl,
+       &debugfs_rlk2_ecc_ctl,
+
+       &debugfs_com_link0_int,
+       &debugfs_com_link1_int,
+       &debugfs_com_link2_int,
+
+       &debugfs_lne00_badcnt,
+       &debugfs_lne01_badcnt,
+       &debugfs_lne02_badcnt,
+       &debugfs_lne03_badcnt,
+       &debugfs_lne04_badcnt,
+       &debugfs_lne05_badcnt,
+       &debugfs_lne06_badcnt,
+       &debugfs_lne07_badcnt,
+       &debugfs_lne08_badcnt,
+       &debugfs_lne09_badcnt,
+       &debugfs_lne10_badcnt,
+       &debugfs_lne11_badcnt,
+       &debugfs_lne12_badcnt,
+       &debugfs_lne13_badcnt,
+       &debugfs_lne14_badcnt,
+       &debugfs_lne15_badcnt,
+       &debugfs_lne16_badcnt,
+       &debugfs_lne17_badcnt,
+       &debugfs_lne18_badcnt,
+       &debugfs_lne19_badcnt,
+       &debugfs_lne20_badcnt,
+       &debugfs_lne21_badcnt,
+       &debugfs_lne22_badcnt,
+       &debugfs_lne23_badcnt,
+
+       &debugfs_com_int,
+};
+
+/* PCI IDs claimed by the OCX driver; the zeroed entry terminates the table */
+static const struct pci_device_id thunderx_ocx_pci_tbl[] = {
+       { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVICE_ID_THUNDER_OCX) },
+       { 0, },
+};
+
+/*
+ * Reset the RX statistics of every OCX lane.
+ *
+ * For each lane, OCX_LNE_CFG is rewritten with RX_STAT_RDCLR set and
+ * RX_STAT_ENA cleared, then every OCX_LNE_STAT register of the lane is
+ * read once and the value discarded.
+ *
+ * NOTE(review): the reads presumably clear the counters because of the
+ * read-clear mode just selected - confirm against the hardware manual.
+ */
+static void thunderx_ocx_clearstats(struct thunderx_ocx *ocx)
+{
+       int lane, stat;
+       /*
+        * readq() returns 64 bits. Keeping the value in an int would drop
+        * the upper half and sign-extend bit 31 on the write-back.
+        */
+       u64 cfg;
+
+       for (lane = 0; lane < OCX_RX_LANES; lane++) {
+               cfg = readq(ocx->regs + OCX_LNE_CFG(lane));
+               cfg |= OCX_LNE_CFG_RX_STAT_RDCLR;
+               cfg &= ~OCX_LNE_CFG_RX_STAT_ENA;
+               writeq(cfg, ocx->regs + OCX_LNE_CFG(lane));
+
+               for (stat = 0; stat < OCX_RX_LANE_STATS; stat++)
+                       readq(ocx->regs + OCX_LNE_STAT(lane, stat));
+       }
+}
+
+/*
+ * PCI probe callback of the OCX EDAC driver.
+ *
+ * Enables the device, maps BAR 0, allocates an EDAC device whose private
+ * area is a struct thunderx_ocx, sets up OCX_INTS MSI-X vectors with
+ * threaded handlers, registers the EDAC device (plus debugfs nodes when
+ * CONFIG_EDAC_DEBUG is enabled), clears the lane statistics and finally
+ * enables the lane, link and common interrupts in the hardware.
+ *
+ * Returns 0 on success or a negative errno. On failure after the EDAC
+ * allocation the control structure is freed again; the PCI and IRQ
+ * resources are device-managed (pcim_ and devm_ helpers).
+ */
+static int thunderx_ocx_probe(struct pci_dev *pdev,
+                             const struct pci_device_id *id)
+{
+       struct thunderx_ocx *ocx;
+       struct edac_device_ctl_info *edac_dev;
+       char name[32];
+       int idx;
+       int i;
+       int ret;
+       u64 reg;
+
+       ret = pcim_enable_device(pdev);
+       if (ret) {
+               dev_err(&pdev->dev, "Cannot enable PCI device: %d\n", ret);
+               return ret;
+       }
+
+       ret = pcim_iomap_regions(pdev, BIT(0), "thunderx_ocx");
+       if (ret) {
+               dev_err(&pdev->dev, "Cannot map PCI resources: %d\n", ret);
+               return ret;
+       }
+
+       idx = edac_device_alloc_index();
+       snprintf(name, sizeof(name), "OCX%d", idx);
+       edac_dev = edac_device_alloc_ctl_info(sizeof(struct thunderx_ocx),
+                                             name, 1, "CCPI", 1,
+                                             0, NULL, 0, idx);
+       if (!edac_dev) {
+               /* ret is still 0 here, so it must not be printed */
+               dev_err(&pdev->dev, "Cannot allocate EDAC device\n");
+               return -ENOMEM;
+       }
+       ocx = edac_dev->pvt_info;
+       ocx->edac_dev = edac_dev;
+       ocx->com_ring_head = 0;
+       ocx->com_ring_tail = 0;
+       ocx->link_ring_head = 0;
+       ocx->link_ring_tail = 0;
+
+       ocx->regs = pcim_iomap_table(pdev)[0];
+       if (!ocx->regs) {
+               /* ret is still 0 here, so it must not be printed */
+               dev_err(&pdev->dev, "Cannot map PCI resources\n");
+               ret = -ENODEV;
+               goto err_free;
+       }
+
+       ocx->pdev = pdev;
+
+       for (i = 0; i < OCX_INTS; i++) {
+               ocx->msix_ent[i].entry = i;
+               ocx->msix_ent[i].vector = 0;
+       }
+
+       ret = pci_enable_msix_exact(pdev, ocx->msix_ent, OCX_INTS);
+       if (ret) {
+               dev_err(&pdev->dev, "Cannot enable interrupt: %d\n", ret);
+               goto err_free;
+       }
+
+       /*
+        * Vector 3 is the common interrupt; all other vectors are link
+        * interrupts. &msix_ent[i] is the cookie the handlers use to find
+        * both the vector index and the owning thunderx_ocx.
+        */
+       for (i = 0; i < OCX_INTS; i++) {
+               ret = devm_request_threaded_irq(&pdev->dev,
+                                               ocx->msix_ent[i].vector,
+                                               (i == 3) ?
+                                                thunderx_ocx_com_isr :
+                                                thunderx_ocx_lnk_isr,
+                                               (i == 3) ?
+                                                thunderx_ocx_com_threaded_isr :
+                                                thunderx_ocx_lnk_threaded_isr,
+                                               0, "[EDAC] ThunderX OCX",
+                                               &ocx->msix_ent[i]);
+               if (ret)
+                       goto err_free;
+       }
+
+       edac_dev->dev = &pdev->dev;
+       edac_dev->dev_name = dev_name(&pdev->dev);
+       edac_dev->mod_name = "thunderx-ocx";
+       edac_dev->ctl_name = "thunderx-ocx";
+
+       ret = edac_device_add_device(edac_dev);
+       if (ret) {
+               dev_err(&pdev->dev, "Cannot add EDAC device: %d\n", ret);
+               goto err_free;
+       }
+
+       if (IS_ENABLED(CONFIG_EDAC_DEBUG)) {
+               ocx->debugfs = edac_debugfs_create_dir(pdev->dev.kobj.name);
+
+               /* Failing to create debugfs nodes only warns */
+               ret = thunderx_create_debugfs_nodes(ocx->debugfs,
+                                                   ocx_dfs_ents,
+                                                   ocx,
+                                                   ARRAY_SIZE(ocx_dfs_ents));
+               if (ret != ARRAY_SIZE(ocx_dfs_ents)) {
+                       dev_warn(&pdev->dev, "Error creating debugfs entries: %d%s\n",
+                                ret, ret >= 0 ? " created" : "");
+               }
+       }
+
+       pci_set_drvdata(pdev, edac_dev);
+
+       thunderx_ocx_clearstats(ocx);
+
+       /* Enable the lane interrupts and write back whatever is pending */
+       for (i = 0; i < OCX_RX_LANES; i++) {
+               writeq(OCX_LNE_INT_ENA_ALL,
+                      ocx->regs + OCX_LNE_INT_EN(i));
+
+               reg = readq(ocx->regs + OCX_LNE_INT(i));
+               writeq(reg, ocx->regs + OCX_LNE_INT(i));
+
+       }
+
+       /* Same for the per-link interrupts */
+       for (i = 0; i < OCX_LINK_INTS; i++) {
+               reg = readq(ocx->regs + OCX_COM_LINKX_INT(i));
+               writeq(reg, ocx->regs + OCX_COM_LINKX_INT(i));
+
+               writeq(OCX_COM_LINKX_INT_ENA_ALL,
+                      ocx->regs + OCX_COM_LINKX_INT_ENA_W1S(i));
+       }
+
+       /* ... and for the common interrupt */
+       reg = readq(ocx->regs + OCX_COM_INT);
+       writeq(reg, ocx->regs + OCX_COM_INT);
+
+       writeq(OCX_COM_INT_ENA_ALL, ocx->regs + OCX_COM_INT_ENA_W1S);
+
+       return 0;
+err_free:
+       edac_device_free_ctl_info(edac_dev);
+
+       return ret;
+}
+
+/*
+ * PCI remove callback of the OCX EDAC driver.
+ *
+ * Disables the common and per-link interrupts through the W1C enable
+ * registers, removes the debugfs directory and unregisters and frees the
+ * EDAC device that was stored as driver data in probe.
+ */
+static void thunderx_ocx_remove(struct pci_dev *pdev)
+{
+       struct edac_device_ctl_info *edac_dev = pci_get_drvdata(pdev);
+       struct thunderx_ocx *ocx = edac_dev->pvt_info;
+       int i;
+
+       writeq(OCX_COM_INT_ENA_ALL, ocx->regs + OCX_COM_INT_ENA_W1C);
+
+       /*
+        * Mirror probe, which enables OCX_LINK_INTS link interrupts.
+        * Iterating up to OCX_INTS would also touch the slot of the common
+        * interrupt vector, which has no per-link enable register.
+        */
+       for (i = 0; i < OCX_LINK_INTS; i++) {
+               writeq(OCX_COM_LINKX_INT_ENA_ALL,
+                      ocx->regs + OCX_COM_LINKX_INT_ENA_W1C(i));
+       }
+
+       edac_debugfs_remove_recursive(ocx->debugfs);
+
+       edac_device_del_device(&pdev->dev);
+       edac_device_free_ctl_info(edac_dev);
+}
+
+MODULE_DEVICE_TABLE(pci, thunderx_ocx_pci_tbl);
+
+/* OCX PCI driver; registered and unregistered by the module init/exit code */
+static struct pci_driver thunderx_ocx_driver = {
+       .name     = "thunderx_ocx_edac",
+       .probe    = thunderx_ocx_probe,
+       .remove   = thunderx_ocx_remove,
+       .id_table = thunderx_ocx_pci_tbl,
+};
+
+/*---------------------- L2C driver ---------------------------------*/
+
+#define PCI_DEVICE_ID_THUNDER_L2C_TAD 0xa02e
+#define PCI_DEVICE_ID_THUNDER_L2C_CBC 0xa02f
+#define PCI_DEVICE_ID_THUNDER_L2C_MCI 0xa030
+
+#define L2C_TAD_INT_W1C                0x40000
+#define L2C_TAD_INT_W1S                0x40008
+
+#define L2C_TAD_INT_ENA_W1C    0x40020
+#define L2C_TAD_INT_ENA_W1S    0x40028
+
+
+#define L2C_TAD_INT_L2DDBE      BIT(1)
+#define L2C_TAD_INT_SBFSBE      BIT(2)
+#define L2C_TAD_INT_SBFDBE      BIT(3)
+#define L2C_TAD_INT_FBFSBE      BIT(4)
+#define L2C_TAD_INT_FBFDBE      BIT(5)
+#define L2C_TAD_INT_TAGDBE      BIT(9)
+#define L2C_TAD_INT_RDDISLMC    BIT(15)
+#define L2C_TAD_INT_WRDISLMC    BIT(16)
+#define L2C_TAD_INT_LFBTO       BIT(17)
+#define L2C_TAD_INT_GSYNCTO     BIT(18)
+#define L2C_TAD_INT_RTGSBE      BIT(32)
+#define L2C_TAD_INT_RTGDBE      BIT(33)
+#define L2C_TAD_INT_RDDISOCI    BIT(34)
+#define L2C_TAD_INT_WRDISOCI    BIT(35)
+
+/* Bits for which thunderx_l2c_tad_isr() also captures L2C_TAD_TQD_ERR */
+#define L2C_TAD_INT_ECC                (L2C_TAD_INT_L2DDBE | \
+                                L2C_TAD_INT_SBFSBE | L2C_TAD_INT_SBFDBE | \
+                                L2C_TAD_INT_FBFSBE | L2C_TAD_INT_FBFDBE)
+
+/* Bits reported through edac_device_handle_ce() */
+#define L2C_TAD_INT_CE          (L2C_TAD_INT_SBFSBE | \
+                                L2C_TAD_INT_FBFSBE)
+
+/* Bits reported through edac_device_handle_ue(); checked before the CE bits */
+#define L2C_TAD_INT_UE          (L2C_TAD_INT_L2DDBE | \
+                                L2C_TAD_INT_SBFDBE | \
+                                L2C_TAD_INT_FBFDBE | \
+                                L2C_TAD_INT_TAGDBE | \
+                                L2C_TAD_INT_RTGDBE | \
+                                L2C_TAD_INT_WRDISOCI | \
+                                L2C_TAD_INT_RDDISOCI | \
+                                L2C_TAD_INT_WRDISLMC | \
+                                L2C_TAD_INT_RDDISLMC | \
+                                L2C_TAD_INT_LFBTO    | \
+                                L2C_TAD_INT_GSYNCTO)
+
+/*
+ * Decode table for L2C_TAD_INT: one entry per interrupt bit with its
+ * severity and a human-readable description. thunderx_l2c_threaded_isr()
+ * passes it to decode_register(); the all-zero entry terminates the table.
+ * Its length also determines L2C_OTHER_SIZE.
+ */
+static const struct error_descr l2_tad_errors[] = {
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = L2C_TAD_INT_SBFSBE,
+               .descr = "SBF single-bit error",
+       },
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = L2C_TAD_INT_FBFSBE,
+               .descr = "FBF single-bit error",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = L2C_TAD_INT_L2DDBE,
+               .descr = "L2D double-bit error",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = L2C_TAD_INT_SBFDBE,
+               .descr = "SBF double-bit error",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = L2C_TAD_INT_FBFDBE,
+               .descr = "FBF double-bit error",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = L2C_TAD_INT_TAGDBE,
+               .descr = "TAG double-bit error",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = L2C_TAD_INT_RTGDBE,
+               .descr = "RTG double-bit error",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = L2C_TAD_INT_WRDISOCI,
+               .descr = "Write to a disabled CCPI",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = L2C_TAD_INT_RDDISOCI,
+               .descr = "Read from a disabled CCPI",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = L2C_TAD_INT_WRDISLMC,
+               .descr = "Write to a disabled LMC",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = L2C_TAD_INT_RDDISLMC,
+               .descr = "Read from a disabled LMC",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = L2C_TAD_INT_LFBTO,
+               .descr = "LFB entry timeout",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = L2C_TAD_INT_GSYNCTO,
+               .descr = "Global sync CCPI timeout",
+       },
+       {0, 0, NULL},
+};
+
+/* Bit groups used by thunderx_l2c_tad_isr() and by the enable mask below */
+#define L2C_TAD_INT_TAG                (L2C_TAD_INT_TAGDBE)
+
+#define L2C_TAD_INT_RTG                (L2C_TAD_INT_RTGDBE)
+
+#define L2C_TAD_INT_DISLMC     (L2C_TAD_INT_WRDISLMC | L2C_TAD_INT_RDDISLMC)
+
+#define L2C_TAD_INT_DISOCI     (L2C_TAD_INT_WRDISOCI | L2C_TAD_INT_RDDISOCI)
+
+/*
+ * Mask written to L2C_TAD_INT_ENA_W1S in probe and to L2C_TAD_INT_ENA_W1C in
+ * remove.
+ *
+ * NOTE(review): L2C_TAD_INT_GSYNCTO is part of L2C_TAD_INT_UE and
+ * L2C_TAD_INT_RTGSBE is defined, but neither is enabled here - confirm that
+ * this is intentional.
+ */
+#define L2C_TAD_INT_ENA_ALL    (L2C_TAD_INT_ECC | L2C_TAD_INT_TAG | \
+                                L2C_TAD_INT_RTG | \
+                                L2C_TAD_INT_DISLMC | L2C_TAD_INT_DISOCI | \
+                                L2C_TAD_INT_LFBTO)
+
+#define L2C_TAD_TIMETWO                0x50000
+#define L2C_TAD_TIMEOUT                0x50100
+#define L2C_TAD_ERR            0x60000
+#define L2C_TAD_TQD_ERR                0x60100
+#define L2C_TAD_TTG_ERR                0x60200
+
+
+#define L2C_CBC_INT_W1C                0x60000
+
+#define L2C_CBC_INT_RSDSBE      BIT(0)
+#define L2C_CBC_INT_RSDDBE      BIT(1)
+
+#define L2C_CBC_INT_RSD                 (L2C_CBC_INT_RSDSBE | L2C_CBC_INT_RSDDBE)
+
+#define L2C_CBC_INT_MIBSBE      BIT(4)
+#define L2C_CBC_INT_MIBDBE      BIT(5)
+
+#define L2C_CBC_INT_MIB                 (L2C_CBC_INT_MIBSBE | L2C_CBC_INT_MIBDBE)
+
+#define L2C_CBC_INT_IORDDISOCI  BIT(6)
+#define L2C_CBC_INT_IOWRDISOCI  BIT(7)
+
+#define L2C_CBC_INT_IODISOCI    (L2C_CBC_INT_IORDDISOCI | \
+                                 L2C_CBC_INT_IOWRDISOCI)
+
+#define L2C_CBC_INT_CE          (L2C_CBC_INT_RSDSBE | L2C_CBC_INT_MIBSBE)
+#define L2C_CBC_INT_UE          (L2C_CBC_INT_RSDDBE | L2C_CBC_INT_MIBDBE)
+
+
+/*
+ * Decode table for L2C_CBC_INT: one entry per interrupt bit with its
+ * severity and description. thunderx_l2c_threaded_isr() passes it to
+ * decode_register(); the all-zero entry terminates the table.
+ */
+static const struct error_descr l2_cbc_errors[] = {
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = L2C_CBC_INT_RSDSBE,
+               .descr = "RSD single-bit error",
+       },
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = L2C_CBC_INT_MIBSBE,
+               .descr = "MIB single-bit error",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = L2C_CBC_INT_RSDDBE,
+               .descr = "RSD double-bit error",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = L2C_CBC_INT_MIBDBE,
+               .descr = "MIB double-bit error",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = L2C_CBC_INT_IORDDISOCI,
+               .descr = "Read from a disabled CCPI",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = L2C_CBC_INT_IOWRDISOCI,
+               .descr = "Write to a disabled CCPI",
+       },
+       {0, 0, NULL},
+};
+
+#define L2C_CBC_INT_W1S                0x60008
+#define L2C_CBC_INT_ENA_W1C    0x60020
+
+#define L2C_CBC_INT_ENA_ALL     (L2C_CBC_INT_RSD | L2C_CBC_INT_MIB | \
+                                 L2C_CBC_INT_IODISOCI)
+
+#define L2C_CBC_INT_ENA_W1S    0x60028
+
+#define L2C_CBC_IODISOCIERR    0x80008
+#define L2C_CBC_IOCERR         0x80010
+#define L2C_CBC_RSDERR         0x80018
+#define L2C_CBC_MIBERR         0x80020
+
+
+#define L2C_MCI_INT_W1C                0x0
+
+#define L2C_MCI_INT_VBFSBE      BIT(0)
+#define L2C_MCI_INT_VBFDBE      BIT(1)
+
+/*
+ * Decode table for L2C_MCI_INT. thunderx_l2c_threaded_isr() passes it to
+ * decode_register(); the all-zero entry terminates the table.
+ */
+static const struct error_descr l2_mci_errors[] = {
+       {
+               .type  = ERR_CORRECTED,
+               .mask  = L2C_MCI_INT_VBFSBE,
+               .descr = "VBF single-bit error",
+       },
+       {
+               .type  = ERR_UNCORRECTED,
+               .mask  = L2C_MCI_INT_VBFDBE,
+               .descr = "VBF double-bit error",
+       },
+       {0, 0, NULL},
+};
+
+#define L2C_MCI_INT_W1S                0x8
+#define L2C_MCI_INT_ENA_W1C    0x20
+
+#define L2C_MCI_INT_ENA_ALL     (L2C_MCI_INT_VBFSBE | L2C_MCI_INT_VBFDBE)
+
+#define L2C_MCI_INT_ENA_W1S    0x28
+
+#define L2C_MCI_ERR            0x10000
+
+/* Size limit used when building the complete L2C report text */
+#define L2C_MESSAGE_SIZE       SZ_1K
+/* Size limit for decode_register() output: 50 bytes per l2_tad_errors entry */
+#define L2C_OTHER_SIZE         (50 * ARRAY_SIZE(l2_tad_errors))
+
+/*
+ * One ring entry of L2C interrupt state. Filled in by the TAD/CBC/MCI
+ * primary handlers and consumed by thunderx_l2c_threaded_isr().
+ */
+struct l2c_err_ctx {
+       /* Name of the auxiliary register captured in reg_ext (string literal) */
+       char *reg_ext_name;
+       /* Value of the unit's interrupt register */
+       u64  reg_int;
+       /* Value of the auxiliary error register selected by the handler */
+       u64  reg_ext;
+};
+
+/*
+ * Per-device state shared by the L2C TAD, CBC and MCI devices. It lives in
+ * the pvt_info area of the edac_device_ctl_info allocated in
+ * thunderx_l2c_probe().
+ */
+struct thunderx_l2c {
+       /* Mapping of PCI BAR 0 (pcim_iomap_table(pdev)[0]) */
+       void __iomem *regs;
+       /* Owning PCI device; its device ID selects the TAD/CBC/MCI variant */
+       struct pci_dev *pdev;
+       struct edac_device_ctl_info *edac_dev;
+
+       /* debugfs directory created in probe when CONFIG_EDAC_DEBUG is on */
+       struct dentry *debugfs;
+
+       int index;
+
+       /* Single MSI-X entry; its address is the dev_id cookie of the IRQ */
+       struct msix_entry msix_ent;
+
+       /* Ring passing snapshots from the hard IRQ to the threaded handler */
+       struct l2c_err_ctx err_ctx[RING_ENTRIES];
+       /* Free-running producer/consumer counters for err_ctx */
+       unsigned long ring_head;
+       unsigned long ring_tail;
+};
+
+/*
+ * Primary (hard-IRQ) handler for L2C TAD devices, selected in
+ * thunderx_l2c_probe().
+ *
+ * Stores the value of the TAD interrupt register in the err_ctx slot at the
+ * ring head, together with one auxiliary register chosen by the first bit
+ * group that matches. It then writes the interrupt value back to
+ * L2C_TAD_INT_W1C, advances the head and wakes thunderx_l2c_threaded_isr().
+ *
+ * NOTE(review): if none of the groups below match (for example only an RTG,
+ * DISLMC or GSYNCTO bit is set), reg_ext_name and reg_ext keep whatever the
+ * slot held before - confirm that this is acceptable.
+ */
+static irqreturn_t thunderx_l2c_tad_isr(int irq, void *irq_id)
+{
+       struct msix_entry *msix = irq_id;
+       struct thunderx_l2c *tad = container_of(msix, struct thunderx_l2c,
+                                               msix_ent);
+
+       unsigned long head = ring_pos(tad->ring_head, ARRAY_SIZE(tad->err_ctx));
+       struct l2c_err_ctx *ctx = &tad->err_ctx[head];
+
+       ctx->reg_int = readq(tad->regs + L2C_TAD_INT_W1C);
+
+       /* Only the first matching group contributes an auxiliary register */
+       if (ctx->reg_int & L2C_TAD_INT_ECC) {
+               ctx->reg_ext_name = "TQD_ERR";
+               ctx->reg_ext = readq(tad->regs + L2C_TAD_TQD_ERR);
+       } else if (ctx->reg_int & L2C_TAD_INT_TAG) {
+               ctx->reg_ext_name = "TTG_ERR";
+               ctx->reg_ext = readq(tad->regs + L2C_TAD_TTG_ERR);
+       } else if (ctx->reg_int & L2C_TAD_INT_LFBTO) {
+               ctx->reg_ext_name = "TIMEOUT";
+               ctx->reg_ext = readq(tad->regs + L2C_TAD_TIMEOUT);
+       } else if (ctx->reg_int & L2C_TAD_INT_DISOCI) {
+               ctx->reg_ext_name = "ERR";
+               ctx->reg_ext = readq(tad->regs + L2C_TAD_ERR);
+       }
+
+       writeq(ctx->reg_int, tad->regs + L2C_TAD_INT_W1C);
+
+       /* Publish the slot to the threaded handler */
+       tad->ring_head++;
+
+       return IRQ_WAKE_THREAD;
+}
+
+/*
+ * Primary (hard-IRQ) handler for L2C CBC devices, selected in
+ * thunderx_l2c_probe().
+ *
+ * Stores the value of the CBC interrupt register in the err_ctx slot at the
+ * ring head, together with the auxiliary register of the first matching bit
+ * group (RSD, MIB or IODISOCI). It then writes the interrupt value back to
+ * L2C_CBC_INT_W1C, advances the head and wakes thunderx_l2c_threaded_isr().
+ *
+ * NOTE(review): when no group matches, reg_ext_name and reg_ext keep the
+ * previous contents of the slot.
+ */
+static irqreturn_t thunderx_l2c_cbc_isr(int irq, void *irq_id)
+{
+       struct msix_entry *msix = irq_id;
+       struct thunderx_l2c *cbc = container_of(msix, struct thunderx_l2c,
+                                               msix_ent);
+
+       unsigned long head = ring_pos(cbc->ring_head, ARRAY_SIZE(cbc->err_ctx));
+       struct l2c_err_ctx *ctx = &cbc->err_ctx[head];
+
+       ctx->reg_int = readq(cbc->regs + L2C_CBC_INT_W1C);
+
+       if (ctx->reg_int & L2C_CBC_INT_RSD) {
+               ctx->reg_ext_name = "RSDERR";
+               ctx->reg_ext = readq(cbc->regs + L2C_CBC_RSDERR);
+       } else if (ctx->reg_int & L2C_CBC_INT_MIB) {
+               ctx->reg_ext_name = "MIBERR";
+               ctx->reg_ext = readq(cbc->regs + L2C_CBC_MIBERR);
+       } else if (ctx->reg_int & L2C_CBC_INT_IODISOCI) {
+               ctx->reg_ext_name = "IODISOCIERR";
+               ctx->reg_ext = readq(cbc->regs + L2C_CBC_IODISOCIERR);
+       }
+
+       writeq(ctx->reg_int, cbc->regs + L2C_CBC_INT_W1C);
+
+       /* Publish the slot to the threaded handler */
+       cbc->ring_head++;
+
+       return IRQ_WAKE_THREAD;
+}
+
+/*
+ * Primary (hard-IRQ) handler for L2C MCI devices, selected in
+ * thunderx_l2c_probe().
+ *
+ * Stores the MCI interrupt register and L2C_MCI_ERR in the err_ctx slot at
+ * the ring head, writes the interrupt value back to L2C_MCI_INT_W1C,
+ * advances the head and wakes thunderx_l2c_threaded_isr().
+ */
+static irqreturn_t thunderx_l2c_mci_isr(int irq, void *irq_id)
+{
+       struct msix_entry *msix = irq_id;
+       struct thunderx_l2c *mci = container_of(msix, struct thunderx_l2c,
+                                               msix_ent);
+
+       unsigned long head = ring_pos(mci->ring_head, ARRAY_SIZE(mci->err_ctx));
+       struct l2c_err_ctx *ctx = &mci->err_ctx[head];
+
+       ctx->reg_int = readq(mci->regs + L2C_MCI_INT_W1C);
+       ctx->reg_ext = readq(mci->regs + L2C_MCI_ERR);
+
+       writeq(ctx->reg_int, mci->regs + L2C_MCI_INT_W1C);
+
+       ctx->reg_ext_name = "ERR";
+
+       /* Publish the slot to the threaded handler */
+       mci->ring_head++;
+
+       return IRQ_WAKE_THREAD;
+}
+
+/*
+ * Threaded handler shared by the L2C TAD, CBC and MCI devices.
+ *
+ * Picks the register name, UE/CE masks and decode table that match the PCI
+ * device ID, then drains the err_ctx ring filled by the primary handler.
+ * Every pending entry is rendered as text and reported as uncorrectable
+ * when a UE bit is set, otherwise as correctable when a CE bit is set.
+ *
+ * Returns IRQ_HANDLED once the ring is drained. Returns IRQ_NONE when the
+ * report buffers cannot be allocated or the device ID is not supported.
+ * The buffers are released on every path.
+ */
+static irqreturn_t thunderx_l2c_threaded_isr(int irq, void *irq_id)
+{
+       struct msix_entry *msix = irq_id;
+       struct thunderx_l2c *l2c = container_of(msix, struct thunderx_l2c,
+                                               msix_ent);
+
+       unsigned long tail;
+       struct l2c_err_ctx *ctx;
+       irqreturn_t ret = IRQ_NONE;
+
+       u64 mask_ue, mask_ce;
+       const struct error_descr *l2_errors;
+       char *reg_int_name;
+
+       char *msg;
+       char *other;
+
+       /*
+        * Allocate with the same L2C_* sizes that bound the snprintf(),
+        * decode_register() and strlcat() calls below.
+        */
+       msg = kmalloc(L2C_MESSAGE_SIZE, GFP_KERNEL);
+       other = kmalloc(L2C_OTHER_SIZE, GFP_KERNEL);
+
+       if (!msg || !other)
+               goto err_free;
+
+       switch (l2c->pdev->device) {
+       case PCI_DEVICE_ID_THUNDER_L2C_TAD:
+               reg_int_name = "L2C_TAD_INT";
+               mask_ue = L2C_TAD_INT_UE;
+               mask_ce = L2C_TAD_INT_CE;
+               l2_errors = l2_tad_errors;
+               break;
+       case PCI_DEVICE_ID_THUNDER_L2C_CBC:
+               reg_int_name = "L2C_CBC_INT";
+               mask_ue = L2C_CBC_INT_UE;
+               mask_ce = L2C_CBC_INT_CE;
+               l2_errors = l2_cbc_errors;
+               break;
+       case PCI_DEVICE_ID_THUNDER_L2C_MCI:
+               reg_int_name = "L2C_MCI_INT";
+               mask_ue = L2C_MCI_INT_VBFDBE;
+               mask_ce = L2C_MCI_INT_VBFSBE;
+               l2_errors = l2_mci_errors;
+               break;
+       default:
+               dev_err(&l2c->pdev->dev, "Unsupported device: %04x\n",
+                       l2c->pdev->device);
+               /* ret is still IRQ_NONE; leave through the cleanup path */
+               goto err_free;
+       }
+
+       while (CIRC_CNT(l2c->ring_head, l2c->ring_tail,
+                       ARRAY_SIZE(l2c->err_ctx))) {
+               /* Look up the slot again for every entry that is consumed */
+               tail = ring_pos(l2c->ring_tail, ARRAY_SIZE(l2c->err_ctx));
+               ctx = &l2c->err_ctx[tail];
+
+               snprintf(msg, L2C_MESSAGE_SIZE,
+                        "%s: %s: %016llx, %s: %016llx",
+                        l2c->edac_dev->ctl_name, reg_int_name, ctx->reg_int,
+                        ctx->reg_ext_name, ctx->reg_ext);
+
+               decode_register(other, L2C_OTHER_SIZE, l2_errors, ctx->reg_int);
+
+               /* Bounded by the total size of msg, unlike strncat() */
+               strlcat(msg, other, L2C_MESSAGE_SIZE);
+
+               if (ctx->reg_int & mask_ue)
+                       edac_device_handle_ue(l2c->edac_dev, 0, 0, msg);
+               else if (ctx->reg_int & mask_ce)
+                       edac_device_handle_ce(l2c->edac_dev, 0, 0, msg);
+
+               l2c->ring_tail++;
+       }
+
+       ret = IRQ_HANDLED;
+
+err_free:
+       /* kfree(NULL) is a no-op, so a partial allocation is fine here */
+       kfree(other);
+       kfree(msg);
+
+       return ret;
+}
+
+/*
+ * Shorthand for DEBUGFS_REG_ATTR() (defined elsewhere in this file) with the
+ * "l2c" device kind. Each use defines a debugfs_<name> object bound to the
+ * W1S alias of one unit's interrupt register.
+ */
+#define L2C_DEBUGFS_ATTR(_name, _reg)  DEBUGFS_REG_ATTR(l2c, _name, _reg)
+
+L2C_DEBUGFS_ATTR(tad_int, L2C_TAD_INT_W1S);
+
+/* debugfs entries created for TAD devices by thunderx_l2c_probe() */
+struct debugfs_entry *l2c_tad_dfs_ents[] = {
+       &debugfs_tad_int,
+};
+
+L2C_DEBUGFS_ATTR(cbc_int, L2C_CBC_INT_W1S);
+
+/* debugfs entries created for CBC devices by thunderx_l2c_probe() */
+struct debugfs_entry *l2c_cbc_dfs_ents[] = {
+       &debugfs_cbc_int,
+};
+
+L2C_DEBUGFS_ATTR(mci_int, L2C_MCI_INT_W1S);
+
+/* debugfs entries created for MCI devices by thunderx_l2c_probe() */
+struct debugfs_entry *l2c_mci_dfs_ents[] = {
+       &debugfs_mci_int,
+};
+
+/*
+ * PCI IDs claimed by the L2C driver (TAD, CBC and MCI units); the zeroed
+ * entry terminates the table.
+ */
+static const struct pci_device_id thunderx_l2c_pci_tbl[] = {
+       { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVICE_ID_THUNDER_L2C_TAD), },
+       { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVICE_ID_THUNDER_L2C_CBC), },
+       { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVICE_ID_THUNDER_L2C_MCI), },
+       { 0, },
+};
+
+/*
+ * PCI probe callback shared by the L2C TAD, CBC and MCI devices.
+ *
+ * Enables the device, maps BAR 0 and selects the per-variant primary IRQ
+ * handler, debugfs entries, EDAC name format and interrupt-enable
+ * register/mask from the PCI device ID. It then allocates an EDAC device
+ * whose private area is a struct thunderx_l2c, sets up one MSI-X vector
+ * with a threaded handler, registers the EDAC device (plus debugfs nodes
+ * when CONFIG_EDAC_DEBUG is enabled) and finally enables the interrupts in
+ * the hardware.
+ *
+ * Returns 0 on success or a negative errno. On failure after the EDAC
+ * allocation the control structure is freed again; the PCI and IRQ
+ * resources are device-managed (pcim_ and devm_ helpers).
+ */
+static int thunderx_l2c_probe(struct pci_dev *pdev,
+                             const struct pci_device_id *id)
+{
+       struct thunderx_l2c *l2c;
+       struct edac_device_ctl_info *edac_dev;
+       struct debugfs_entry **l2c_devattr;
+       size_t dfs_entries;
+       irqreturn_t (*thunderx_l2c_isr)(int, void *) = NULL;
+       char name[32];
+       const char *fmt;
+       u64 reg_en_offs, reg_en_mask;
+       int idx;
+       int ret;
+
+       ret = pcim_enable_device(pdev);
+       if (ret) {
+               dev_err(&pdev->dev, "Cannot enable PCI device: %d\n", ret);
+               return ret;
+       }
+
+       ret = pcim_iomap_regions(pdev, BIT(0), "thunderx_l2c");
+       if (ret) {
+               dev_err(&pdev->dev, "Cannot map PCI resources: %d\n", ret);
+               return ret;
+       }
+
+       switch (pdev->device) {
+       case PCI_DEVICE_ID_THUNDER_L2C_TAD:
+               thunderx_l2c_isr = thunderx_l2c_tad_isr;
+               l2c_devattr = l2c_tad_dfs_ents;
+               dfs_entries = ARRAY_SIZE(l2c_tad_dfs_ents);
+               fmt = "L2C-TAD%d";
+               reg_en_offs = L2C_TAD_INT_ENA_W1S;
+               reg_en_mask = L2C_TAD_INT_ENA_ALL;
+               break;
+       case PCI_DEVICE_ID_THUNDER_L2C_CBC:
+               thunderx_l2c_isr = thunderx_l2c_cbc_isr;
+               l2c_devattr = l2c_cbc_dfs_ents;
+               dfs_entries = ARRAY_SIZE(l2c_cbc_dfs_ents);
+               fmt = "L2C-CBC%d";
+               reg_en_offs = L2C_CBC_INT_ENA_W1S;
+               reg_en_mask = L2C_CBC_INT_ENA_ALL;
+               break;
+       case PCI_DEVICE_ID_THUNDER_L2C_MCI:
+               thunderx_l2c_isr = thunderx_l2c_mci_isr;
+               l2c_devattr = l2c_mci_dfs_ents;
+               dfs_entries = ARRAY_SIZE(l2c_mci_dfs_ents);
+               fmt = "L2C-MCI%d";
+               reg_en_offs = L2C_MCI_INT_ENA_W1S;
+               reg_en_mask = L2C_MCI_INT_ENA_ALL;
+               break;
+       default:
+               //Should never ever get here
+               dev_err(&pdev->dev, "Unsupported PCI device: %04x\n",
+                       pdev->device);
+               return -EINVAL;
+       }
+
+       idx = edac_device_alloc_index();
+       snprintf(name, sizeof(name), fmt, idx);
+
+       edac_dev = edac_device_alloc_ctl_info(sizeof(struct thunderx_l2c),
+                                             name, 1, "L2C", 1, 0,
+                                             NULL, 0, idx);
+       if (!edac_dev) {
+               dev_err(&pdev->dev, "Cannot allocate EDAC device\n");
+               return -ENOMEM;
+       }
+
+       l2c = edac_dev->pvt_info;
+       l2c->edac_dev = edac_dev;
+
+       l2c->regs = pcim_iomap_table(pdev)[0];
+       if (!l2c->regs) {
+               dev_err(&pdev->dev, "Cannot map PCI resources\n");
+               ret = -ENODEV;
+               goto err_free;
+       }
+
+       l2c->pdev = pdev;
+
+       l2c->ring_head = 0;
+       l2c->ring_tail = 0;
+
+       l2c->msix_ent.entry = 0;
+       l2c->msix_ent.vector = 0;
+
+       ret = pci_enable_msix_exact(pdev, &l2c->msix_ent, 1);
+       if (ret) {
+               dev_err(&pdev->dev, "Cannot enable interrupt: %d\n", ret);
+               goto err_free;
+       }
+
+       /* &l2c->msix_ent is the cookie the handlers use to find l2c */
+       ret = devm_request_threaded_irq(&pdev->dev, l2c->msix_ent.vector,
+                                       thunderx_l2c_isr,
+                                       thunderx_l2c_threaded_isr,
+                                       0, "[EDAC] ThunderX L2C",
+                                       &l2c->msix_ent);
+       if (ret)
+               goto err_free;
+
+       edac_dev->dev = &pdev->dev;
+       edac_dev->dev_name = dev_name(&pdev->dev);
+       edac_dev->mod_name = "thunderx-l2c";
+       edac_dev->ctl_name = "thunderx-l2c";
+
+       ret = edac_device_add_device(edac_dev);
+       if (ret) {
+               dev_err(&pdev->dev, "Cannot add EDAC device: %d\n", ret);
+               goto err_free;
+       }
+
+       if (IS_ENABLED(CONFIG_EDAC_DEBUG)) {
+               l2c->debugfs = edac_debugfs_create_dir(pdev->dev.kobj.name);
+
+               /*
+                * Keep the result: the check below compares the number of
+                * nodes actually created with the number requested.
+                * Failing to create debugfs nodes only warns.
+                */
+               ret = thunderx_create_debugfs_nodes(l2c->debugfs, l2c_devattr,
+                                                   l2c, dfs_entries);
+
+               if (ret != dfs_entries) {
+                       dev_warn(&pdev->dev, "Error creating debugfs entries: %d%s\n",
+                                ret, ret >= 0 ? " created" : "");
+               }
+       }
+
+       pci_set_drvdata(pdev, edac_dev);
+
+       /* Everything is in place; let the hardware raise interrupts */
+       writeq(reg_en_mask, l2c->regs + reg_en_offs);
+
+       return 0;
+
+err_free:
+       edac_device_free_ctl_info(edac_dev);
+
+       return ret;
+}
+
+/*
+ * PCI remove callback shared by the L2C TAD, CBC and MCI devices.
+ *
+ * Writes the variant's enable mask to its W1C enable register, then tears
+ * down debugfs and unregisters and frees the EDAC device stored as driver
+ * data. An unknown device ID skips the register write.
+ */
+static void thunderx_l2c_remove(struct pci_dev *pdev)
+{
+       struct edac_device_ctl_info *edac_dev = pci_get_drvdata(pdev);
+       struct thunderx_l2c *l2c = edac_dev->pvt_info;
+
+       if (pdev->device == PCI_DEVICE_ID_THUNDER_L2C_TAD)
+               writeq(L2C_TAD_INT_ENA_ALL, l2c->regs + L2C_TAD_INT_ENA_W1C);
+       else if (pdev->device == PCI_DEVICE_ID_THUNDER_L2C_CBC)
+               writeq(L2C_CBC_INT_ENA_ALL, l2c->regs + L2C_CBC_INT_ENA_W1C);
+       else if (pdev->device == PCI_DEVICE_ID_THUNDER_L2C_MCI)
+               writeq(L2C_MCI_INT_ENA_ALL, l2c->regs + L2C_MCI_INT_ENA_W1C);
+
+       edac_debugfs_remove_recursive(l2c->debugfs);
+
+       edac_device_del_device(&pdev->dev);
+       edac_device_free_ctl_info(edac_dev);
+}
+
+MODULE_DEVICE_TABLE(pci, thunderx_l2c_pci_tbl);
+
+/* L2C PCI driver; registered and unregistered by the module init/exit code */
+static struct pci_driver thunderx_l2c_driver = {
+       .name     = "thunderx_l2c_edac",
+       .probe    = thunderx_l2c_probe,
+       .remove   = thunderx_l2c_remove,
+       .id_table = thunderx_l2c_pci_tbl,
+};
+
+/*
+ * Module entry point: registers the LMC, OCX and L2C PCI drivers in that
+ * order. When a registration fails, the drivers registered before it are
+ * unregistered in reverse order and the error code is returned.
+ */
+static int __init thunderx_edac_init(void)
+{
+       int err;
+
+       err = pci_register_driver(&thunderx_lmc_driver);
+       if (err)
+               return err;
+
+       err = pci_register_driver(&thunderx_ocx_driver);
+       if (err) {
+               pci_unregister_driver(&thunderx_lmc_driver);
+               return err;
+       }
+
+       err = pci_register_driver(&thunderx_l2c_driver);
+       if (err) {
+               pci_unregister_driver(&thunderx_ocx_driver);
+               pci_unregister_driver(&thunderx_lmc_driver);
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Module exit point: unregisters the three PCI drivers in the reverse of
+ * the order used by thunderx_edac_init().
+ */
+static void __exit thunderx_edac_exit(void)
+{
+       pci_unregister_driver(&thunderx_l2c_driver);
+       pci_unregister_driver(&thunderx_ocx_driver);
+       pci_unregister_driver(&thunderx_lmc_driver);
+}
+
+module_init(thunderx_edac_init);
+module_exit(thunderx_edac_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Cavium, Inc.");
+MODULE_DESCRIPTION("EDAC Driver for Cavium ThunderX");
index b40eb18..186fd73 100644 (file)
@@ -50,6 +50,13 @@ static void devm_extcon_dev_notifier_unreg(struct device *dev, void *res)
        extcon_unregister_notifier(this->edev, this->id, this->nb);
 }
 
+static void devm_extcon_dev_notifier_all_unreg(struct device *dev, void *res)
+{
+       struct extcon_dev_notifier_devres *this = res;
+
+       extcon_unregister_notifier_all(this->edev, this->nb);
+}
+
 /**
  * devm_extcon_dev_allocate - Allocate managed extcon device
  * @dev:               device owning the extcon device being created
@@ -214,3 +221,57 @@ void devm_extcon_unregister_notifier(struct device *dev,
                               devm_extcon_dev_match, edev));
 }
 EXPORT_SYMBOL(devm_extcon_unregister_notifier);
+
+/**
+ * devm_extcon_register_notifier_all()
+ *             - Resource-managed extcon_register_notifier_all()
+ * @dev:       device to allocate extcon device
+ * @edev:      the extcon device that has the external connector.
+ * @nb:                a notifier block to be registered.
+ *
+ * This function manages automatically the notifier of extcon device using
+ * device resource management and simplify the control of unregistering
+ * the notifier of extcon device. To get more information, refer to that function.
+ *
+ * Returns 0 if success or negative error number if failure.
+ */
+int devm_extcon_register_notifier_all(struct device *dev, struct extcon_dev *edev,
+                               struct notifier_block *nb)
+{
+       struct extcon_dev_notifier_devres *ptr;
+       int ret;
+
+       ptr = devres_alloc(devm_extcon_dev_notifier_all_unreg, sizeof(*ptr),
+                               GFP_KERNEL);
+       if (!ptr)
+               return -ENOMEM;
+
+       ret = extcon_register_notifier_all(edev, nb);
+       if (ret) {
+               devres_free(ptr);
+               return ret;
+       }
+
+       ptr->edev = edev;
+       ptr->nb = nb;
+       devres_add(dev, ptr);
+
+       return 0;
+}
+EXPORT_SYMBOL(devm_extcon_register_notifier_all);
+
+/**
+ * devm_extcon_unregister_notifier_all()
+ *             - Resource-managed extcon_unregister_notifier_all()
+ * @dev:       device to allocate extcon device
+ * @edev:      the extcon device that has the external connector.
+ * @nb:                a notifier block to be unregistered.
+ */
+void devm_extcon_unregister_notifier_all(struct device *dev,
+                               struct extcon_dev *edev,
+                               struct notifier_block *nb)
+{
+       WARN_ON(devres_release(dev, devm_extcon_dev_notifier_all_unreg,
+                              devm_extcon_dev_match, edev));
+}
+EXPORT_SYMBOL(devm_extcon_unregister_notifier_all);
index 09ac5e7..e775054 100644 (file)
@@ -448,8 +448,19 @@ int extcon_sync(struct extcon_dev *edev, unsigned int id)
        spin_lock_irqsave(&edev->lock, flags);
 
        state = !!(edev->state & BIT(index));
+
+       /*
+        * Call functions in a raw notifier chain for the specific one
+        * external connector.
+        */
        raw_notifier_call_chain(&edev->nh[index], state, edev);
 
+       /*
+        * Call functions in a raw notifier chain for the all supported
+        * external connectors.
+        */
+       raw_notifier_call_chain(&edev->nh_all, state, edev);
+
        /* This could be in interrupt handler */
        prop_buf = (char *)get_zeroed_page(GFP_ATOMIC);
        if (!prop_buf) {
@@ -954,6 +965,59 @@ int extcon_unregister_notifier(struct extcon_dev *edev, unsigned int id,
 }
 EXPORT_SYMBOL_GPL(extcon_unregister_notifier);
 
+/**
+ * extcon_register_notifier_all() - Register a notifier block for all connectors
+ * @edev:      the extcon device that has the external connector.
+ * @nb:                a notifier block to be registered.
+ *
+ * This function registers a notifier block in order to receive the state
+ * changes of all supported external connectors from the extcon device.
+ * The second parameter given to the callback of nb (val) is
+ * the current state and the third parameter is the edev pointer.
+ *
+ * Returns 0 if success or error number if fail
+ */
+int extcon_register_notifier_all(struct extcon_dev *edev,
+                               struct notifier_block *nb)
+{
+       unsigned long flags;
+       int ret;
+
+       if (!edev || !nb)
+               return -EINVAL;
+
+       spin_lock_irqsave(&edev->lock, flags);
+       ret = raw_notifier_chain_register(&edev->nh_all, nb);
+       spin_unlock_irqrestore(&edev->lock, flags);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(extcon_register_notifier_all);
+
+/**
+ * extcon_unregister_notifier_all() - Unregister a notifier block from extcon.
+ * @edev:      the extcon device that has the external connector.
+ * @nb:                a notifier block to be unregistered.
+ *
+ * Returns 0 if success or error number if fail
+ */
+int extcon_unregister_notifier_all(struct extcon_dev *edev,
+                               struct notifier_block *nb)
+{
+       unsigned long flags;
+       int ret;
+
+       if (!edev || !nb)
+               return -EINVAL;
+
+       spin_lock_irqsave(&edev->lock, flags);
+       ret = raw_notifier_chain_unregister(&edev->nh_all, nb);
+       spin_unlock_irqrestore(&edev->lock, flags);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(extcon_unregister_notifier_all);
+
 static struct attribute *extcon_attrs[] = {
        &dev_attr_state.attr,
        &dev_attr_name.attr,
@@ -1212,6 +1276,8 @@ int extcon_dev_register(struct extcon_dev *edev)
        for (index = 0; index < edev->max_supported; index++)
                RAW_INIT_NOTIFIER_HEAD(&edev->nh[index]);
 
+       RAW_INIT_NOTIFIER_HEAD(&edev->nh_all);
+
        dev_set_drvdata(&edev->dev, edev);
        edev->state = 0;
 
index 993ddcc..dddddcf 100644 (file)
@@ -21,6 +21,8 @@
  * @dev:               Device of this extcon.
  * @state:             Attach/detach state of this extcon. Do not provide at
  *                     register-time.
+ * @nh_all:            Notifier for the state change events for all supported
+ *                     external connectors from this extcon.
  * @nh:                        Notifier for the state change events from this extcon
  * @entry:             To support list of extcon devices so that users can
  *                     search for extcon devices based on the extcon name.
@@ -43,6 +45,7 @@ struct extcon_dev {
 
        /* Internal data. Please do not set. */
        struct device dev;
+       struct raw_notifier_head nh_all;
        struct raw_notifier_head *nh;
        struct list_head entry;
        int max_supported;
index ad67342..0329d31 100644 (file)
@@ -9,6 +9,7 @@
 #
 KASAN_SANITIZE_runtime-wrappers.o      := n
 
+obj-$(CONFIG_ACPI_BGRT)                += efi-bgrt.o
 obj-$(CONFIG_EFI)                      += efi.o vars.o reboot.o memattr.o
 obj-$(CONFIG_EFI)                      += capsule.o memmap.o
 obj-$(CONFIG_EFI_VARS)                 += efivars.o
index f402ba2..6b5acef 100644 (file)
@@ -274,9 +274,9 @@ static int efi_pstore_write(enum pstore_type_id type,
        for (i = 0; i < DUMP_NAME_LEN; i++)
                efi_name[i] = name[i];
 
-       efivar_entry_set_safe(efi_name, vendor, PSTORE_EFI_ATTRIBUTES,
-                             !pstore_cannot_block_path(reason),
-                             size, psi->buf);
+       ret = efivar_entry_set_safe(efi_name, vendor, PSTORE_EFI_ATTRIBUTES,
+                                   !pstore_cannot_block_path(reason),
+                                   size, psi->buf);
 
        if (reason == KMSG_DUMP_OOPS)
                efivar_run_worker();
index d4056c6..8181ac1 100644 (file)
 
 #include "efistub.h"
 
-bool __nokaslr;
+/*
+ * This is the base address at which to start allocating virtual memory ranges
+ * for UEFI Runtime Services. This is in the low TTBR0 range so that we can use
+ * any allocation we choose, and eliminate the risk of a conflict after kexec.
+ * The value chosen is the largest non-zero power of 2 suitable for this purpose
+ * both on 32-bit and 64-bit ARM CPUs, to maximize the likelihood that it can
+ * be mapped efficiently.
+ * Since 32-bit ARM could potentially execute with a 1G/3G user/kernel split,
+ * map everything below 1 GB. (512 MB is a reasonable upper bound for the
+ * entire footprint of the UEFI runtime services memory regions)
+ */
+#define EFI_RT_VIRTUAL_BASE    SZ_512M
+#define EFI_RT_VIRTUAL_SIZE    SZ_512M
+
+#ifdef CONFIG_ARM64
+# define EFI_RT_VIRTUAL_LIMIT  TASK_SIZE_64
+#else
+# define EFI_RT_VIRTUAL_LIMIT  TASK_SIZE
+#endif
+
+static u64 virtmap_base = EFI_RT_VIRTUAL_BASE;
 
 efi_status_t efi_open_volume(efi_system_table_t *sys_table_arg,
                             void *__image, void **__fh)
@@ -118,8 +138,6 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
        if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
                goto fail;
 
-       pr_efi(sys_table, "Booting Linux Kernel...\n");
-
        status = check_platform_features(sys_table);
        if (status != EFI_SUCCESS)
                goto fail;
@@ -153,17 +171,15 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
                goto fail;
        }
 
-       /* check whether 'nokaslr' was passed on the command line */
-       if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
-               static const u8 default_cmdline[] = CONFIG_CMDLINE;
-               const u8 *str, *cmdline = cmdline_ptr;
+       if (IS_ENABLED(CONFIG_CMDLINE_EXTEND) ||
+           IS_ENABLED(CONFIG_CMDLINE_FORCE) ||
+           cmdline_size == 0)
+               efi_parse_options(CONFIG_CMDLINE);
 
-               if (IS_ENABLED(CONFIG_CMDLINE_FORCE))
-                       cmdline = default_cmdline;
-               str = strstr(cmdline, "nokaslr");
-               if (str == cmdline || (str > cmdline && *(str - 1) == ' '))
-                       __nokaslr = true;
-       }
+       if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && cmdline_size > 0)
+               efi_parse_options(cmdline_ptr);
+
+       pr_efi(sys_table, "Booting Linux Kernel...\n");
 
        si = setup_graphics(sys_table);
 
@@ -176,10 +192,6 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
                goto fail_free_cmdline;
        }
 
-       status = efi_parse_options(cmdline_ptr);
-       if (status != EFI_SUCCESS)
-               pr_efi_err(sys_table, "Failed to parse EFI cmdline options\n");
-
        secure_boot = efi_get_secureboot(sys_table);
 
        /*
@@ -213,8 +225,9 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
        if (!fdt_addr)
                pr_efi(sys_table, "Generating empty DTB\n");
 
-       status = handle_cmdline_files(sys_table, image, cmdline_ptr,
-                                     "initrd=", dram_base + SZ_512M,
+       status = handle_cmdline_files(sys_table, image, cmdline_ptr, "initrd=",
+                                     efi_get_max_initrd_addr(dram_base,
+                                                             *image_addr),
                                      (unsigned long *)&initrd_addr,
                                      (unsigned long *)&initrd_size);
        if (status != EFI_SUCCESS)
@@ -222,9 +235,29 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
 
        efi_random_get_seed(sys_table);
 
+       if (!nokaslr()) {
+               /*
+                * Randomize the base of the UEFI runtime services region.
+                * Preserve the 2 MB alignment of the region by taking a
+                * shift of 21 bit positions into account when scaling
+                * the headroom value using a 32-bit random value.
+                */
+               static const u64 headroom = EFI_RT_VIRTUAL_LIMIT -
+                                           EFI_RT_VIRTUAL_BASE -
+                                           EFI_RT_VIRTUAL_SIZE;
+               u32 rnd;
+
+               status = efi_get_random_bytes(sys_table, sizeof(rnd),
+                                             (u8 *)&rnd);
+               if (status == EFI_SUCCESS) {
+                       virtmap_base = EFI_RT_VIRTUAL_BASE +
+                                      (((headroom >> 21) * rnd) >> (32 - 21));
+               }
+       }
+
        new_fdt_addr = fdt_addr;
        status = allocate_new_fdt_and_exit_boot(sys_table, handle,
-                               &new_fdt_addr, dram_base + MAX_FDT_OFFSET,
+                               &new_fdt_addr, efi_get_max_fdt_addr(dram_base),
                                initrd_addr, initrd_size, cmdline_ptr,
                                fdt_addr, fdt_size);
 
@@ -251,18 +284,6 @@ fail:
        return EFI_ERROR;
 }
 
-/*
- * This is the base address at which to start allocating virtual memory ranges
- * for UEFI Runtime Services. This is in the low TTBR0 range so that we can use
- * any allocation we choose, and eliminate the risk of a conflict after kexec.
- * The value chosen is the largest non-zero power of 2 suitable for this purpose
- * both on 32-bit and 64-bit ARM CPUs, to maximize the likelihood that it can
- * be mapped efficiently.
- * Since 32-bit ARM could potentially execute with a 1G/3G user/kernel split,
- * map everything below 1 GB.
- */
-#define EFI_RT_VIRTUAL_BASE    SZ_512M
-
 static int cmp_mem_desc(const void *l, const void *r)
 {
        const efi_memory_desc_t *left = l, *right = r;
@@ -312,7 +333,7 @@ void efi_get_virtmap(efi_memory_desc_t *memory_map, unsigned long map_size,
                     unsigned long desc_size, efi_memory_desc_t *runtime_map,
                     int *count)
 {
-       u64 efi_virt_base = EFI_RT_VIRTUAL_BASE;
+       u64 efi_virt_base = virtmap_base;
        efi_memory_desc_t *in, *prev = NULL, *out = runtime_map;
        int l;
 
index e1f0b28..becbda4 100644 (file)
@@ -9,6 +9,8 @@
 #include <linux/efi.h>
 #include <asm/efi.h>
 
+#include "efistub.h"
+
 efi_status_t check_platform_features(efi_system_table_t *sys_table_arg)
 {
        int block;
@@ -63,6 +65,132 @@ void free_screen_info(efi_system_table_t *sys_table_arg, struct screen_info *si)
        efi_call_early(free_pool, si);
 }
 
+static efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg,
+                                       unsigned long dram_base,
+                                       unsigned long *reserve_addr,
+                                       unsigned long *reserve_size)
+{
+       efi_physical_addr_t alloc_addr;
+       efi_memory_desc_t *memory_map;
+       unsigned long nr_pages, map_size, desc_size, buff_size;
+       efi_status_t status;
+       unsigned long l;
+
+       struct efi_boot_memmap map = {
+               .map            = &memory_map,
+               .map_size       = &map_size,
+               .desc_size      = &desc_size,
+               .desc_ver       = NULL,
+               .key_ptr        = NULL,
+               .buff_size      = &buff_size,
+       };
+
+       /*
+        * Reserve memory for the uncompressed kernel image. This is
+        * all that prevents any future allocations from conflicting
+        * with the kernel. Since we can't tell from the compressed
+        * image how much DRAM the kernel actually uses (due to BSS
+        * size uncertainty) we allocate the maximum possible size.
+        * Do this very early, as prints can cause memory allocations
+        * that may conflict with this.
+        */
+       alloc_addr = dram_base + MAX_UNCOMP_KERNEL_SIZE;
+       nr_pages = MAX_UNCOMP_KERNEL_SIZE / EFI_PAGE_SIZE;
+       status = efi_call_early(allocate_pages, EFI_ALLOCATE_MAX_ADDRESS,
+                               EFI_BOOT_SERVICES_DATA, nr_pages, &alloc_addr);
+       if (status == EFI_SUCCESS) {
+               if (alloc_addr == dram_base) {
+                       *reserve_addr = alloc_addr;
+                       *reserve_size = MAX_UNCOMP_KERNEL_SIZE;
+                       return EFI_SUCCESS;
+               }
+               /*
+                * If we end up here, the allocation succeeded but starts below
+                * dram_base. This can only occur if the real base of DRAM is
+                * not a multiple of 128 MB, in which case dram_base will have
+                * been rounded up. Since this implies that a part of the region
+                * was already occupied, we need to fall through to the code
+                * below to ensure that the existing allocations don't conflict.
+                * For this reason, we use EFI_BOOT_SERVICES_DATA above and not
+                * EFI_LOADER_DATA, which we wouldn't be able to distinguish from
+                * allocations that we want to disallow.
+                */
+       }
+
+       /*
+        * If the allocation above failed, we may still be able to proceed:
+        * if the only allocations in the region are of types that will be
+        * released to the OS after ExitBootServices(), the decompressor can
+        * safely overwrite them.
+        */
+       status = efi_get_memory_map(sys_table_arg, &map);
+       if (status != EFI_SUCCESS) {
+               pr_efi_err(sys_table_arg,
+                          "reserve_kernel_base(): Unable to retrieve memory map.\n");
+               return status;
+       }
+
+       for (l = 0; l < map_size; l += desc_size) {
+               efi_memory_desc_t *desc;
+               u64 start, end;
+
+               desc = (void *)memory_map + l;
+               start = desc->phys_addr;
+               end = start + desc->num_pages * EFI_PAGE_SIZE;
+
+               /* Skip if entry does not intersect with region */
+               if (start >= dram_base + MAX_UNCOMP_KERNEL_SIZE ||
+                   end <= dram_base)
+                       continue;
+
+               switch (desc->type) {
+               case EFI_BOOT_SERVICES_CODE:
+               case EFI_BOOT_SERVICES_DATA:
+                       /* Ignore types that are released to the OS anyway */
+                       continue;
+
+               case EFI_CONVENTIONAL_MEMORY:
+                       /*
+                        * Reserve the intersection between this entry and the
+                        * region.
+                        */
+                       start = max(start, (u64)dram_base);
+                       end = min(end, (u64)dram_base + MAX_UNCOMP_KERNEL_SIZE);
+
+                       status = efi_call_early(allocate_pages,
+                                               EFI_ALLOCATE_ADDRESS,
+                                               EFI_LOADER_DATA,
+                                               (end - start) / EFI_PAGE_SIZE,
+                                               &start);
+                       if (status != EFI_SUCCESS) {
+                               pr_efi_err(sys_table_arg,
+                                       "reserve_kernel_base(): alloc failed.\n");
+                               goto out;
+                       }
+                       break;
+
+               case EFI_LOADER_CODE:
+               case EFI_LOADER_DATA:
+                       /*
+                        * These regions may be released and reallocated for
+                        * another purpose (including EFI_RUNTIME_SERVICE_DATA)
+                        * at any time during the execution of the OS loader,
+                        * so we cannot consider them as safe.
+                        */
+               default:
+                       /*
+                        * Treat any other allocation in the region as unsafe */
+                       status = EFI_OUT_OF_RESOURCES;
+                       goto out;
+               }
+       }
+
+       status = EFI_SUCCESS;
+out:
+       efi_call_early(free_pool, memory_map);
+       return status;
+}
+
 efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
                                 unsigned long *image_addr,
                                 unsigned long *image_size,
@@ -71,10 +199,7 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
                                 unsigned long dram_base,
                                 efi_loaded_image_t *image)
 {
-       unsigned long nr_pages;
        efi_status_t status;
-       /* Use alloc_addr to tranlsate between types */
-       efi_physical_addr_t alloc_addr;
 
        /*
         * Verify that the DRAM base address is compatible with the ARM
@@ -85,27 +210,12 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
         */
        dram_base = round_up(dram_base, SZ_128M);
 
-       /*
-        * Reserve memory for the uncompressed kernel image. This is
-        * all that prevents any future allocations from conflicting
-        * with the kernel. Since we can't tell from the compressed
-        * image how much DRAM the kernel actually uses (due to BSS
-        * size uncertainty) we allocate the maximum possible size.
-        * Do this very early, as prints can cause memory allocations
-        * that may conflict with this.
-        */
-       alloc_addr = dram_base;
-       *reserve_size = MAX_UNCOMP_KERNEL_SIZE;
-       nr_pages = round_up(*reserve_size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
-       status = sys_table->boottime->allocate_pages(EFI_ALLOCATE_ADDRESS,
-                                                    EFI_LOADER_DATA,
-                                                    nr_pages, &alloc_addr);
+       status = reserve_kernel_base(sys_table, dram_base, reserve_addr,
+                                    reserve_size);
        if (status != EFI_SUCCESS) {
-               *reserve_size = 0;
                pr_efi_err(sys_table, "Unable to allocate memory for uncompressed kernel.\n");
                return status;
        }
-       *reserve_addr = alloc_addr;
 
        /*
         * Relocate the zImage, so that it appears in the lowest 128 MB
index eae693e..b4c2589 100644 (file)
@@ -16,8 +16,6 @@
 
 #include "efistub.h"
 
-extern bool __nokaslr;
-
 efi_status_t check_platform_features(efi_system_table_t *sys_table_arg)
 {
        u64 tg;
@@ -52,7 +50,7 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table_arg,
        u64 phys_seed = 0;
 
        if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
-               if (!__nokaslr) {
+               if (!nokaslr()) {
                        status = efi_get_random_bytes(sys_table_arg,
                                                      sizeof(phys_seed),
                                                      (u8 *)&phys_seed);
index 919822b..b018436 100644 (file)
 
 static unsigned long __chunk_size = EFI_READ_CHUNK_SIZE;
 
+static int __section(.data) __nokaslr;
+static int __section(.data) __quiet;
+
+int __pure nokaslr(void)
+{
+       return __nokaslr;
+}
+int __pure is_quiet(void)
+{
+       return __quiet;
+}
+
 #define EFI_MMAP_NR_SLACK_SLOTS        8
 
 struct file_info {
@@ -409,17 +421,17 @@ static efi_status_t efi_file_close(void *handle)
  * environments, first in the early boot environment of the EFI boot
  * stub, and subsequently during the kernel boot.
  */
-efi_status_t efi_parse_options(char *cmdline)
+efi_status_t efi_parse_options(char const *cmdline)
 {
        char *str;
 
-       /*
-        * Currently, the only efi= option we look for is 'nochunk', which
-        * is intended to work around known issues on certain x86 UEFI
-        * versions. So ignore for now on other architectures.
-        */
-       if (!IS_ENABLED(CONFIG_X86))
-               return EFI_SUCCESS;
+       str = strstr(cmdline, "nokaslr");
+       if (str == cmdline || (str && str > cmdline && *(str - 1) == ' '))
+               __nokaslr = 1;
+
+       str = strstr(cmdline, "quiet");
+       if (str == cmdline || (str && str > cmdline && *(str - 1) == ' '))
+               __quiet = 1;
 
        /*
         * If no EFI parameters were specified on the cmdline we've got
@@ -436,14 +448,14 @@ efi_status_t efi_parse_options(char *cmdline)
         * Remember, because efi= is also used by the kernel we need to
         * skip over arguments we don't understand.
         */
-       while (*str) {
+       while (*str && *str != ' ') {
                if (!strncmp(str, "nochunk", 7)) {
                        str += strlen("nochunk");
                        __chunk_size = -1UL;
                }
 
                /* Group words together, delimited by "," */
-               while (*str && *str != ',')
+               while (*str && *str != ' ' && *str != ',')
                        str++;
 
                if (*str == ',')
index 71c4d0e..83f268c 100644 (file)
 #define EFI_ALLOC_ALIGN                EFI_PAGE_SIZE
 #endif
 
+extern int __pure nokaslr(void);
+extern int __pure is_quiet(void);
+
+#define pr_efi(sys_table, msg)         do {                            \
+       if (!is_quiet()) efi_printk(sys_table, "EFI stub: "msg);        \
+} while (0)
+
+#define pr_efi_err(sys_table, msg) efi_printk(sys_table, "EFI stub: ERROR: "msg)
+
 void efi_char16_printk(efi_system_table_t *, efi_char16_t *);
 
 efi_status_t efi_open_volume(efi_system_table_t *sys_table_arg, void *__image,
index 260c4b4..41f457b 100644 (file)
@@ -206,6 +206,10 @@ static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg,
        return update_fdt_memmap(p->new_fdt_addr, map);
 }
 
+#ifndef MAX_FDT_SIZE
+#define MAX_FDT_SIZE   SZ_2M
+#endif
+
 /*
  * Allocate memory for a new FDT, then add EFI, commandline, and
  * initrd related fields to the FDT.  This routine increases the
@@ -233,7 +237,6 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table,
        u32 desc_ver;
        unsigned long mmap_key;
        efi_memory_desc_t *memory_map, *runtime_map;
-       unsigned long new_fdt_size;
        efi_status_t status;
        int runtime_entry_count = 0;
        struct efi_boot_memmap map;
@@ -262,41 +265,29 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table,
               "Exiting boot services and installing virtual address map...\n");
 
        map.map = &memory_map;
+       status = efi_high_alloc(sys_table, MAX_FDT_SIZE, EFI_FDT_ALIGN,
+                               new_fdt_addr, max_addr);
+       if (status != EFI_SUCCESS) {
+               pr_efi_err(sys_table,
+                          "Unable to allocate memory for new device tree.\n");
+               goto fail;
+       }
+
        /*
-        * Estimate size of new FDT, and allocate memory for it. We
-        * will allocate a bigger buffer if this ends up being too
-        * small, so a rough guess is OK here.
+        * Now that we have done our final memory allocation (and free)
+        * we can get the memory map key needed for exit_boot_services().
         */
-       new_fdt_size = fdt_size + EFI_PAGE_SIZE;
-       while (1) {
-               status = efi_high_alloc(sys_table, new_fdt_size, EFI_FDT_ALIGN,
-                                       new_fdt_addr, max_addr);
-               if (status != EFI_SUCCESS) {
-                       pr_efi_err(sys_table, "Unable to allocate memory for new device tree.\n");
-                       goto fail;
-               }
-
-               status = update_fdt(sys_table,
-                                   (void *)fdt_addr, fdt_size,
-                                   (void *)*new_fdt_addr, new_fdt_size,
-                                   cmdline_ptr, initrd_addr, initrd_size);
+       status = efi_get_memory_map(sys_table, &map);
+       if (status != EFI_SUCCESS)
+               goto fail_free_new_fdt;
 
-               /* Succeeding the first time is the expected case. */
-               if (status == EFI_SUCCESS)
-                       break;
+       status = update_fdt(sys_table, (void *)fdt_addr, fdt_size,
+                           (void *)*new_fdt_addr, MAX_FDT_SIZE, cmdline_ptr,
+                           initrd_addr, initrd_size);
 
-               if (status == EFI_BUFFER_TOO_SMALL) {
-                       /*
-                        * We need to allocate more space for the new
-                        * device tree, so free existing buffer that is
-                        * too small.
-                        */
-                       efi_free(sys_table, new_fdt_size, *new_fdt_addr);
-                       new_fdt_size += EFI_PAGE_SIZE;
-               } else {
-                       pr_efi_err(sys_table, "Unable to construct new device tree.\n");
-                       goto fail_free_new_fdt;
-               }
+       if (status != EFI_SUCCESS) {
+               pr_efi_err(sys_table, "Unable to construct new device tree.\n");
+               goto fail_free_new_fdt;
        }
 
        priv.runtime_map = runtime_map;
@@ -340,7 +331,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table,
        pr_efi_err(sys_table, "Exit boot services failed.\n");
 
 fail_free_new_fdt:
-       efi_free(sys_table, new_fdt_size, *new_fdt_addr);
+       efi_free(sys_table, MAX_FDT_SIZE, *new_fdt_addr);
 
 fail:
        sys_table->boottime->free_pool(runtime_map);
index 932742e..24c461d 100644 (file)
@@ -149,7 +149,8 @@ setup_gop32(efi_system_table_t *sys_table_arg, struct screen_info *si,
 
                status = __gop_query32(sys_table_arg, gop32, &info, &size,
                                       &current_fb_base);
-               if (status == EFI_SUCCESS && (!first_gop || conout_found)) {
+               if (status == EFI_SUCCESS && (!first_gop || conout_found) &&
+                   info->pixel_format != PIXEL_BLT_ONLY) {
                        /*
                         * Systems that use the UEFI Console Splitter may
                         * provide multiple GOP devices, not all of which are
@@ -266,7 +267,8 @@ setup_gop64(efi_system_table_t *sys_table_arg, struct screen_info *si,
 
                status = __gop_query64(sys_table_arg, gop64, &info, &size,
                                       &current_fb_base);
-               if (status == EFI_SUCCESS && (!first_gop || conout_found)) {
+               if (status == EFI_SUCCESS && (!first_gop || conout_found) &&
+                   info->pixel_format != PIXEL_BLT_ONLY) {
                        /*
                         * Systems that use the UEFI Console Splitter may
                         * provide multiple GOP devices, not all of which are
index 5da36e5..8c34d50 100644 (file)
@@ -12,6 +12,8 @@
 #include <linux/efi.h>
 #include <asm/efi.h>
 
+#include "efistub.h"
+
 /* BIOS variables */
 static const efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID;
 static const efi_char16_t const efi_SecureBoot_name[] = {
index da48819..b78d923 100644 (file)
@@ -1317,7 +1317,7 @@ int etnaviv_gpu_submit(struct etnaviv_gpu *gpu,
        if (!fence) {
                event_free(gpu, event);
                ret = -ENOMEM;
-               goto out_pm_put;
+               goto out_unlock;
        }
 
        gpu->event[event].fence = fence;
@@ -1357,6 +1357,7 @@ int etnaviv_gpu_submit(struct etnaviv_gpu *gpu,
        hangcheck_timer_reset(gpu);
        ret = 0;
 
+out_unlock:
        mutex_unlock(&gpu->lock);
 
 out_pm_put:
index b7d7721..40af17e 100644 (file)
@@ -285,9 +285,6 @@ int intel_vgpu_emulate_cfg_write(struct intel_vgpu *vgpu, unsigned int offset,
 {
        int ret;
 
-       if (vgpu->failsafe)
-               return 0;
-
        if (WARN_ON(bytes > 4))
                return -EINVAL;
 
index f1f426a..d186c15 100644 (file)
@@ -775,7 +775,8 @@ static void init_vgpu_execlist(struct intel_vgpu *vgpu, int ring_id)
                        _EL_OFFSET_STATUS_PTR);
 
        ctx_status_ptr.dw = vgpu_vreg(vgpu, ctx_status_ptr_reg);
-       ctx_status_ptr.read_ptr = ctx_status_ptr.write_ptr = 0x7;
+       ctx_status_ptr.read_ptr = 0;
+       ctx_status_ptr.write_ptr = 0x7;
        vgpu_vreg(vgpu, ctx_status_ptr_reg) = ctx_status_ptr.dw;
 }
 
index 933a7c2..dce8d15 100644 (file)
@@ -75,11 +75,11 @@ static int expose_firmware_sysfs(struct intel_gvt *gvt)
        struct gvt_firmware_header *h;
        void *firmware;
        void *p;
-       unsigned long size;
+       unsigned long size, crc32_start;
        int i;
        int ret;
 
-       size = sizeof(*h) + info->mmio_size + info->cfg_space_size - 1;
+       size = sizeof(*h) + info->mmio_size + info->cfg_space_size;
        firmware = vzalloc(size);
        if (!firmware)
                return -ENOMEM;
@@ -112,6 +112,9 @@ static int expose_firmware_sysfs(struct intel_gvt *gvt)
 
        memcpy(gvt->firmware.mmio, p, info->mmio_size);
 
+       crc32_start = offsetof(struct gvt_firmware_header, crc32) + 4;
+       h->crc32 = crc32_le(0, firmware + crc32_start, size - crc32_start);
+
        firmware_attr.size = size;
        firmware_attr.private = firmware;
 
@@ -234,7 +237,7 @@ int intel_gvt_load_firmware(struct intel_gvt *gvt)
 
        firmware->mmio = mem;
 
-       sprintf(path, "%s/vid_0x%04x_did_0x%04x_rid_0x%04x.golden_hw_state",
+       sprintf(path, "%s/vid_0x%04x_did_0x%04x_rid_0x%02x.golden_hw_state",
                 GVT_FIRMWARE_PATH, pdev->vendor, pdev->device,
                 pdev->revision);
 
index 3b9d59e..ef3baa0 100644 (file)
@@ -52,6 +52,8 @@ static const struct intel_gvt_ops intel_gvt_ops = {
        .vgpu_create = intel_gvt_create_vgpu,
        .vgpu_destroy = intel_gvt_destroy_vgpu,
        .vgpu_reset = intel_gvt_reset_vgpu,
+       .vgpu_activate = intel_gvt_activate_vgpu,
+       .vgpu_deactivate = intel_gvt_deactivate_vgpu,
 };
 
 /**
index 6dfc48b..becae2f 100644 (file)
@@ -382,7 +382,8 @@ void intel_gvt_destroy_vgpu(struct intel_vgpu *vgpu);
 void intel_gvt_reset_vgpu_locked(struct intel_vgpu *vgpu, bool dmlr,
                                 unsigned int engine_mask);
 void intel_gvt_reset_vgpu(struct intel_vgpu *vgpu);
-
+void intel_gvt_activate_vgpu(struct intel_vgpu *vgpu);
+void intel_gvt_deactivate_vgpu(struct intel_vgpu *vgpu);
 
 /* validating GM functions */
 #define vgpu_gmadr_is_aperture(vgpu, gmadr) \
@@ -449,6 +450,8 @@ struct intel_gvt_ops {
                                struct intel_vgpu_type *);
        void (*vgpu_destroy)(struct intel_vgpu *);
        void (*vgpu_reset)(struct intel_vgpu *);
+       void (*vgpu_activate)(struct intel_vgpu *);
+       void (*vgpu_deactivate)(struct intel_vgpu *);
 };
 
 
index d641214..e466259 100644 (file)
@@ -544,6 +544,8 @@ static int intel_vgpu_open(struct mdev_device *mdev)
        if (ret)
                goto undo_group;
 
+       intel_gvt_ops->vgpu_activate(vgpu);
+
        atomic_set(&vgpu->vdev.released, 0);
        return ret;
 
@@ -569,6 +571,8 @@ static void __intel_vgpu_release(struct intel_vgpu *vgpu)
        if (atomic_cmpxchg(&vgpu->vdev.released, 0, 1))
                return;
 
+       intel_gvt_ops->vgpu_deactivate(vgpu);
+
        ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_IOMMU_NOTIFY,
                                        &vgpu->vdev.iommu_notifier);
        WARN(ret, "vfio_unregister_notifier for iommu failed: %d\n", ret);
@@ -1340,13 +1344,6 @@ static int kvmgt_guest_init(struct mdev_device *mdev)
 
 static bool kvmgt_guest_exit(struct kvmgt_guest_info *info)
 {
-       struct intel_vgpu *vgpu = info->vgpu;
-
-       if (!info) {
-               gvt_vgpu_err("kvmgt_guest_info invalid\n");
-               return false;
-       }
-
        kvm_page_track_unregister_notifier(info->kvm, &info->track_node);
        kvm_put_kvm(info->kvm);
        kvmgt_protect_table_destroy(info);
index 41cfa5c..649ef28 100644 (file)
@@ -72,7 +72,7 @@ static struct {
        char *name;
 } vgpu_types[] = {
 /* Fixed vGPU type table */
-       { MB_TO_BYTES(64), MB_TO_BYTES(512), 4, GVT_EDID_1024_768, "8" },
+       { MB_TO_BYTES(64), MB_TO_BYTES(384), 4, GVT_EDID_1024_768, "8" },
        { MB_TO_BYTES(128), MB_TO_BYTES(512), 4, GVT_EDID_1920_1200, "4" },
        { MB_TO_BYTES(256), MB_TO_BYTES(1024), 4, GVT_EDID_1920_1200, "2" },
        { MB_TO_BYTES(512), MB_TO_BYTES(2048), 4, GVT_EDID_1920_1200, "1" },
@@ -179,20 +179,34 @@ static void intel_gvt_update_vgpu_types(struct intel_gvt *gvt)
 }
 
 /**
- * intel_gvt_destroy_vgpu - destroy a virtual GPU
+ * intel_gvt_activate_vgpu - activate a virtual GPU
  * @vgpu: virtual GPU
  *
- * This function is called when user wants to destroy a virtual GPU.
+ * This function is called when user wants to activate a virtual GPU.
  *
  */
-void intel_gvt_destroy_vgpu(struct intel_vgpu *vgpu)
+void intel_gvt_activate_vgpu(struct intel_vgpu *vgpu)
+{
+       mutex_lock(&vgpu->gvt->lock);
+       vgpu->active = true;
+       mutex_unlock(&vgpu->gvt->lock);
+}
+
+/**
+ * intel_gvt_deactivate_vgpu - deactivate a virtual GPU
+ * @vgpu: virtual GPU
+ *
+ * This function is called when user wants to deactivate a virtual GPU.
+ * All virtual GPU runtime information will be destroyed.
+ *
+ */
+void intel_gvt_deactivate_vgpu(struct intel_vgpu *vgpu)
 {
        struct intel_gvt *gvt = vgpu->gvt;
 
        mutex_lock(&gvt->lock);
 
        vgpu->active = false;
-       idr_remove(&gvt->vgpu_idr, vgpu->id);
 
        if (atomic_read(&vgpu->running_workload_num)) {
                mutex_unlock(&gvt->lock);
@@ -201,6 +215,26 @@ void intel_gvt_destroy_vgpu(struct intel_vgpu *vgpu)
        }
 
        intel_vgpu_stop_schedule(vgpu);
+
+       mutex_unlock(&gvt->lock);
+}
+
+/**
+ * intel_gvt_destroy_vgpu - destroy a virtual GPU
+ * @vgpu: virtual GPU
+ *
+ * This function is called when user wants to destroy a virtual GPU.
+ *
+ */
+void intel_gvt_destroy_vgpu(struct intel_vgpu *vgpu)
+{
+       struct intel_gvt *gvt = vgpu->gvt;
+
+       mutex_lock(&gvt->lock);
+
+       WARN(vgpu->active, "vGPU is still active!\n");
+
+       idr_remove(&gvt->vgpu_idr, vgpu->id);
        intel_vgpu_clean_sched_policy(vgpu);
        intel_vgpu_clean_gvt_context(vgpu);
        intel_vgpu_clean_execlist(vgpu);
@@ -277,7 +311,6 @@ static struct intel_vgpu *__intel_gvt_create_vgpu(struct intel_gvt *gvt,
        if (ret)
                goto out_clean_shadow_ctx;
 
-       vgpu->active = true;
        mutex_unlock(&gvt->lock);
 
        return vgpu;
index 1c75402..5c089b3 100644 (file)
@@ -1434,8 +1434,6 @@ static int i915_drm_suspend(struct drm_device *dev)
                goto out;
        }
 
-       intel_guc_suspend(dev_priv);
-
        intel_display_suspend(dev);
 
        intel_dp_mst_suspend(dev);
index 1e53c31..46fcd8b 100644 (file)
@@ -806,6 +806,7 @@ struct intel_csr {
        func(has_resource_streamer); \
        func(has_runtime_pm); \
        func(has_snoop); \
+       func(unfenced_needs_alignment); \
        func(cursor_needs_physical); \
        func(hws_needs_physical); \
        func(overlay_needs_physical); \
index 67b1fc5..fe531f9 100644 (file)
@@ -4348,6 +4348,8 @@ int i915_gem_suspend(struct drm_i915_private *dev_priv)
        i915_gem_context_lost(dev_priv);
        mutex_unlock(&dev->struct_mutex);
 
+       intel_guc_suspend(dev_priv);
+
        cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
        cancel_delayed_work_sync(&dev_priv->gt.retire_work);
 
index 30e0675..15a15d0 100644 (file)
@@ -888,6 +888,7 @@ i915_gem_execbuffer_reserve(struct intel_engine_cs *engine,
        struct list_head ordered_vmas;
        struct list_head pinned_vmas;
        bool has_fenced_gpu_access = INTEL_GEN(engine->i915) < 4;
+       bool needs_unfenced_map = INTEL_INFO(engine->i915)->unfenced_needs_alignment;
        int retry;
 
        vm = list_first_entry(vmas, struct i915_vma, exec_list)->vm;
@@ -908,7 +909,8 @@ i915_gem_execbuffer_reserve(struct intel_engine_cs *engine,
                if (!has_fenced_gpu_access)
                        entry->flags &= ~EXEC_OBJECT_NEEDS_FENCE;
                need_fence =
-                       entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
+                       (entry->flags & EXEC_OBJECT_NEEDS_FENCE ||
+                        needs_unfenced_map) &&
                        i915_gem_object_is_tiled(obj);
                need_mappable = need_fence || need_reloc_mappable(vma);
 
index 2801a4d..96e45a4 100644 (file)
@@ -2704,7 +2704,7 @@ void i915_gem_gtt_finish_pages(struct drm_i915_gem_object *obj,
        struct i915_ggtt *ggtt = &dev_priv->ggtt;
 
        if (unlikely(ggtt->do_idle_maps)) {
-               if (i915_gem_wait_for_idle(dev_priv, I915_WAIT_LOCKED)) {
+               if (i915_gem_wait_for_idle(dev_priv, 0)) {
                        DRM_ERROR("Failed to wait for idle; VT'd may hang.\n");
                        /* Wait a bit, in hopes it avoids the hang */
                        udelay(10);
index e7c3c03..da70bfe 100644 (file)
@@ -37,6 +37,17 @@ static const char *i915_fence_get_driver_name(struct dma_fence *fence)
 
 static const char *i915_fence_get_timeline_name(struct dma_fence *fence)
 {
+       /* The timeline struct (as part of the ppgtt underneath a context)
+        * may be freed when the request is no longer in use by the GPU.
+        * We could extend the life of a context to beyond that of all
+        * fences, possibly keeping the hw resource around indefinitely,
+        * or we just give them a false name. Since
+        * dma_fence_ops.get_timeline_name is a debug feature, the occasional
+        * lie seems justifiable.
+        */
+       if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
+               return "signaled";
+
        return to_request(fence)->timeline->common->name;
 }
 
index d5d2b4c..70b3832 100644 (file)
@@ -53,6 +53,17 @@ static bool i915_gem_shrinker_lock(struct drm_device *dev, bool *unlock)
        BUG();
 }
 
+static void i915_gem_shrinker_unlock(struct drm_device *dev, bool unlock)
+{
+       if (!unlock)
+               return;
+
+       mutex_unlock(&dev->struct_mutex);
+
+       /* expedite the RCU grace period to free some request slabs */
+       synchronize_rcu_expedited();
+}
+
 static bool any_vma_pinned(struct drm_i915_gem_object *obj)
 {
        struct i915_vma *vma;
@@ -232,11 +243,8 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
                intel_runtime_pm_put(dev_priv);
 
        i915_gem_retire_requests(dev_priv);
-       if (unlock)
-               mutex_unlock(&dev_priv->drm.struct_mutex);
 
-       /* expedite the RCU grace period to free some request slabs */
-       synchronize_rcu_expedited();
+       i915_gem_shrinker_unlock(&dev_priv->drm, unlock);
 
        return count;
 }
@@ -293,8 +301,7 @@ i915_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc)
                        count += obj->base.size >> PAGE_SHIFT;
        }
 
-       if (unlock)
-               mutex_unlock(&dev->struct_mutex);
+       i915_gem_shrinker_unlock(dev, unlock);
 
        return count;
 }
@@ -321,8 +328,8 @@ i915_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc)
                                         sc->nr_to_scan - freed,
                                         I915_SHRINK_BOUND |
                                         I915_SHRINK_UNBOUND);
-       if (unlock)
-               mutex_unlock(&dev->struct_mutex);
+
+       i915_gem_shrinker_unlock(dev, unlock);
 
        return freed;
 }
@@ -364,8 +371,7 @@ i915_gem_shrinker_unlock_uninterruptible(struct drm_i915_private *dev_priv,
                                         struct shrinker_lock_uninterruptible *slu)
 {
        dev_priv->mm.interruptible = slu->was_interruptible;
-       if (slu->unlock)
-               mutex_unlock(&dev_priv->drm.struct_mutex);
+       i915_gem_shrinker_unlock(&dev_priv->drm, slu->unlock);
 }
 
 static int
index ecb487b..9bbbd4e 100644 (file)
@@ -60,6 +60,7 @@
        .has_overlay = 1, .overlay_needs_physical = 1, \
        .has_gmch_display = 1, \
        .hws_needs_physical = 1, \
+       .unfenced_needs_alignment = 1, \
        .ring_mask = RENDER_RING, \
        GEN_DEFAULT_PIPEOFFSETS, \
        CURSOR_OFFSETS
@@ -101,6 +102,7 @@ static const struct intel_device_info intel_i915g_info = {
        .platform = INTEL_I915G, .cursor_needs_physical = 1,
        .has_overlay = 1, .overlay_needs_physical = 1,
        .hws_needs_physical = 1,
+       .unfenced_needs_alignment = 1,
 };
 
 static const struct intel_device_info intel_i915gm_info = {
@@ -112,6 +114,7 @@ static const struct intel_device_info intel_i915gm_info = {
        .supports_tv = 1,
        .has_fbc = 1,
        .hws_needs_physical = 1,
+       .unfenced_needs_alignment = 1,
 };
 
 static const struct intel_device_info intel_i945g_info = {
@@ -120,6 +123,7 @@ static const struct intel_device_info intel_i945g_info = {
        .has_hotplug = 1, .cursor_needs_physical = 1,
        .has_overlay = 1, .overlay_needs_physical = 1,
        .hws_needs_physical = 1,
+       .unfenced_needs_alignment = 1,
 };
 
 static const struct intel_device_info intel_i945gm_info = {
@@ -130,6 +134,7 @@ static const struct intel_device_info intel_i945gm_info = {
        .supports_tv = 1,
        .has_fbc = 1,
        .hws_needs_physical = 1,
+       .unfenced_needs_alignment = 1,
 };
 
 static const struct intel_device_info intel_g33_info = {
index a1b7eec..70964ca 100644 (file)
@@ -1705,7 +1705,7 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
         */
        if (WARN_ON(stream->sample_flags != props->sample_flags)) {
                ret = -ENODEV;
-               goto err_alloc;
+               goto err_flags;
        }
 
        list_add(&stream->link, &dev_priv->perf.streams);
@@ -1728,6 +1728,7 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
 
 err_open:
        list_del(&stream->link);
+err_flags:
        if (stream->ops->destroy)
                stream->ops->destroy(stream);
 err_alloc:
@@ -1793,6 +1794,11 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
                if (ret)
                        return ret;
 
+               if (id == 0 || id >= DRM_I915_PERF_PROP_MAX) {
+                       DRM_DEBUG("Unknown i915 perf property ID\n");
+                       return -EINVAL;
+               }
+
                switch ((enum drm_i915_perf_property_id)id) {
                case DRM_I915_PERF_PROP_CTX_HANDLE:
                        props->single_context = 1;
@@ -1862,9 +1868,8 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
                        props->oa_periodic = true;
                        props->oa_period_exponent = value;
                        break;
-               default:
+               case DRM_I915_PERF_PROP_MAX:
                        MISSING_CASE(id);
-                       DRM_DEBUG("Unknown i915 perf property ID\n");
                        return -EINVAL;
                }
 
index 471af3b..47517a0 100644 (file)
@@ -670,15 +670,14 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
 static struct intel_engine_cs *
 pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked)
 {
-       struct intel_engine_cs *engine;
+       struct intel_engine_cs *engine =
+               container_of(pt, struct drm_i915_gem_request, priotree)->engine;
+
+       GEM_BUG_ON(!locked);
 
-       engine = container_of(pt,
-                             struct drm_i915_gem_request,
-                             priotree)->engine;
        if (engine != locked) {
-               if (locked)
-                       spin_unlock_irq(&locked->timeline->lock);
-               spin_lock_irq(&engine->timeline->lock);
+               spin_unlock(&locked->timeline->lock);
+               spin_lock(&engine->timeline->lock);
        }
 
        return engine;
@@ -686,7 +685,7 @@ pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked)
 
 static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
 {
-       struct intel_engine_cs *engine = NULL;
+       struct intel_engine_cs *engine;
        struct i915_dependency *dep, *p;
        struct i915_dependency stack;
        LIST_HEAD(dfs);
@@ -720,26 +719,23 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
        list_for_each_entry_safe(dep, p, &dfs, dfs_link) {
                struct i915_priotree *pt = dep->signaler;
 
-               list_for_each_entry(p, &pt->signalers_list, signal_link)
+               /* Within an engine, there can be no cycle, but we may
+                * refer to the same dependency chain multiple times
+                * (redundant dependencies are not eliminated) and across
+                * engines.
+                */
+               list_for_each_entry(p, &pt->signalers_list, signal_link) {
+                       GEM_BUG_ON(p->signaler->priority < pt->priority);
                        if (prio > READ_ONCE(p->signaler->priority))
                                list_move_tail(&p->dfs_link, &dfs);
+               }
 
                list_safe_reset_next(dep, p, dfs_link);
-               if (!RB_EMPTY_NODE(&pt->node))
-                       continue;
-
-               engine = pt_lock_engine(pt, engine);
-
-               /* If it is not already in the rbtree, we can update the
-                * priority inplace and skip over it (and its dependencies)
-                * if it is referenced *again* as we descend the dfs.
-                */
-               if (prio > pt->priority && RB_EMPTY_NODE(&pt->node)) {
-                       pt->priority = prio;
-                       list_del_init(&dep->dfs_link);
-               }
        }
 
+       engine = request->engine;
+       spin_lock_irq(&engine->timeline->lock);
+
        /* Fifo and depth-first replacement ensure our deps execute before us */
        list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
                struct i915_priotree *pt = dep->signaler;
@@ -751,16 +747,15 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
                if (prio <= pt->priority)
                        continue;
 
-               GEM_BUG_ON(RB_EMPTY_NODE(&pt->node));
-
                pt->priority = prio;
-               rb_erase(&pt->node, &engine->execlist_queue);
-               if (insert_request(pt, &engine->execlist_queue))
-                       engine->execlist_first = &pt->node;
+               if (!RB_EMPTY_NODE(&pt->node)) {
+                       rb_erase(&pt->node, &engine->execlist_queue);
+                       if (insert_request(pt, &engine->execlist_queue))
+                               engine->execlist_first = &pt->node;
+               }
        }
 
-       if (engine)
-               spin_unlock_irq(&engine->timeline->lock);
+       spin_unlock_irq(&engine->timeline->lock);
 
        /* XXX Do we need to preempt to make room for us and our deps? */
 }
@@ -1440,7 +1435,9 @@ static void reset_common_ring(struct intel_engine_cs *engine,
        GEM_BUG_ON(request->ctx != port[0].request->ctx);
 
        /* Reset WaIdleLiteRestore:bdw,skl as well */
-       request->tail = request->wa_tail - WA_TAIL_DWORDS * sizeof(u32);
+       request->tail =
+               intel_ring_wrap(request->ring,
+                               request->wa_tail - WA_TAIL_DWORDS*sizeof(u32));
 }
 
 static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
index 13dccb1..8cb2078 100644 (file)
@@ -521,11 +521,17 @@ static inline void intel_ring_advance(struct intel_ring *ring)
         */
 }
 
+static inline u32
+intel_ring_wrap(const struct intel_ring *ring, u32 pos)
+{
+       return pos & (ring->size - 1);
+}
+
 static inline u32 intel_ring_offset(struct intel_ring *ring, void *addr)
 {
        /* Don't write ring->size (equivalent to 0) as that hangs some GPUs. */
        u32 offset = addr - ring->vaddr;
-       return offset & (ring->size - 1);
+       return intel_ring_wrap(ring, offset);
 }
 
 int __intel_ring_space(int head, int tail, int size);
index 0b4440f..a9182d5 100644 (file)
@@ -995,7 +995,6 @@ nv50_wndw_atomic_destroy_state(struct drm_plane *plane,
 {
        struct nv50_wndw_atom *asyw = nv50_wndw_atom(state);
        __drm_atomic_helper_plane_destroy_state(&asyw->state);
-       dma_fence_put(asyw->state.fence);
        kfree(asyw);
 }
 
@@ -1007,7 +1006,6 @@ nv50_wndw_atomic_duplicate_state(struct drm_plane *plane)
        if (!(asyw = kmalloc(sizeof(*asyw), GFP_KERNEL)))
                return NULL;
        __drm_atomic_helper_plane_duplicate_state(plane, &asyw->state);
-       asyw->state.fence = NULL;
        asyw->interval = 1;
        asyw->sema = armw->sema;
        asyw->ntfy = armw->ntfy;
@@ -2036,6 +2034,7 @@ nv50_head_atomic_check_mode(struct nv50_head *head, struct nv50_head_atom *asyh)
        u32 vbackp  = (mode->vtotal - mode->vsync_end) * vscan / ilace;
        u32 hfrontp =  mode->hsync_start - mode->hdisplay;
        u32 vfrontp = (mode->vsync_start - mode->vdisplay) * vscan / ilace;
+       u32 blankus;
        struct nv50_head_mode *m = &asyh->mode;
 
        m->h.active = mode->htotal;
@@ -2049,9 +2048,10 @@ nv50_head_atomic_check_mode(struct nv50_head *head, struct nv50_head_atom *asyh)
        m->v.blanks = m->v.active - vfrontp - 1;
 
        /*XXX: Safe underestimate, even "0" works */
-       m->v.blankus = (m->v.active - mode->vdisplay - 2) * m->h.active;
-       m->v.blankus *= 1000;
-       m->v.blankus /= mode->clock;
+       blankus = (m->v.active - mode->vdisplay - 2) * m->h.active;
+       blankus *= 1000;
+       blankus /= mode->clock;
+       m->v.blankus = blankus;
 
        if (mode->flags & DRM_MODE_FLAG_INTERLACE) {
                m->v.blank2e =  m->v.active + m->v.synce + vbackp;
index 273562d..3b86a73 100644 (file)
@@ -714,7 +714,7 @@ nv4a_chipset = {
        .i2c = nv04_i2c_new,
        .imem = nv40_instmem_new,
        .mc = nv44_mc_new,
-       .mmu = nv44_mmu_new,
+       .mmu = nv04_mmu_new,
        .pci = nv40_pci_new,
        .therm = nv40_therm_new,
        .timer = nv41_timer_new,
@@ -2271,6 +2271,35 @@ nv136_chipset = {
        .fifo = gp100_fifo_new,
 };
 
+static const struct nvkm_device_chip
+nv137_chipset = {
+       .name = "GP107",
+       .bar = gf100_bar_new,
+       .bios = nvkm_bios_new,
+       .bus = gf100_bus_new,
+       .devinit = gm200_devinit_new,
+       .fb = gp102_fb_new,
+       .fuse = gm107_fuse_new,
+       .gpio = gk104_gpio_new,
+       .i2c = gm200_i2c_new,
+       .ibus = gm200_ibus_new,
+       .imem = nv50_instmem_new,
+       .ltc = gp100_ltc_new,
+       .mc = gp100_mc_new,
+       .mmu = gf100_mmu_new,
+       .pci = gp100_pci_new,
+       .pmu = gp102_pmu_new,
+       .timer = gk20a_timer_new,
+       .top = gk104_top_new,
+       .ce[0] = gp102_ce_new,
+       .ce[1] = gp102_ce_new,
+       .ce[2] = gp102_ce_new,
+       .ce[3] = gp102_ce_new,
+       .disp = gp102_disp_new,
+       .dma = gf119_dma_new,
+       .fifo = gp100_fifo_new,
+};
+
 static int
 nvkm_device_event_ctor(struct nvkm_object *object, void *data, u32 size,
                       struct nvkm_notify *notify)
@@ -2708,6 +2737,7 @@ nvkm_device_ctor(const struct nvkm_device_func *func,
                case 0x132: device->chip = &nv132_chipset; break;
                case 0x134: device->chip = &nv134_chipset; break;
                case 0x136: device->chip = &nv136_chipset; break;
+               case 0x137: device->chip = &nv137_chipset; break;
                default:
                        nvdev_error(device, "unknown chipset (%08x)\n", boot0);
                        goto done;
index 003ac91..8a88952 100644 (file)
@@ -198,7 +198,7 @@ nv31_mpeg_intr(struct nvkm_engine *engine)
                }
 
                if (type == 0x00000010) {
-                       if (!nv31_mpeg_mthd(mpeg, mthd, data))
+                       if (nv31_mpeg_mthd(mpeg, mthd, data))
                                show &= ~0x01000000;
                }
        }
index e536f37..c3cf02e 100644 (file)
@@ -172,7 +172,7 @@ nv44_mpeg_intr(struct nvkm_engine *engine)
                }
 
                if (type == 0x00000010) {
-                       if (!nv44_mpeg_mthd(subdev->device, mthd, data))
+                       if (nv44_mpeg_mthd(subdev->device, mthd, data))
                                show &= ~0x01000000;
                }
        }
index 917dcb9..0c87b1a 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/fb.h>
 #include <linux/prefetch.h>
+#include <asm/unaligned.h>
 
 #include <drm/drmP.h>
 #include "udl_drv.h"
@@ -163,7 +164,7 @@ static void udl_compress_hline16(
                        const u8 *const start = pixel;
                        const uint16_t repeating_pixel_val16 = pixel_val16;
 
-                       *(uint16_t *)cmd = cpu_to_be16(pixel_val16);
+                       put_unaligned_be16(pixel_val16, cmd);
 
                        cmd += 2;
                        pixel += bpp;
index 63ec199..d162f0d 100644 (file)
@@ -819,8 +819,7 @@ static int hid_scan_report(struct hid_device *hid)
                hid->group = HID_GROUP_WACOM;
                break;
        case USB_VENDOR_ID_SYNAPTICS:
-               if (hid->group == HID_GROUP_GENERIC ||
-                   hid->group == HID_GROUP_MULTITOUCH_WIN_8)
+               if (hid->group == HID_GROUP_GENERIC)
                        if ((parser->scan_flags & HID_SCAN_FLAG_VENDOR_SPECIFIC)
                            && (parser->scan_flags & HID_SCAN_FLAG_GD_POINTER))
                                /*
@@ -2096,6 +2095,7 @@ static const struct hid_device_id hid_have_special_driver[] = {
        { HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UGEE_TABLET_45) },
        { HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_DRAWIMAGE_G3) },
        { HID_USB_DEVICE(USB_VENDOR_ID_UGTIZER, USB_DEVICE_ID_UGTIZER_TABLET_GP0610) },
+       { HID_USB_DEVICE(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_TABLET_EX07S) },
        { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP, USB_DEVICE_ID_SMARTJOY_PLUS) },
        { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP, USB_DEVICE_ID_SUPER_JOY_BOX_3) },
        { HID_USB_DEVICE(USB_VENDOR_ID_WISEGROUP, USB_DEVICE_ID_DUAL_USB_JOYPAD) },
index 4e2648c..b26c030 100644 (file)
 #define USB_DEVICE_ID_UGEE_TABLET_45           0x0045
 #define USB_DEVICE_ID_YIYNOVA_TABLET           0x004d
 
+#define USB_VENDOR_ID_UGEE             0x28bd
+#define USB_DEVICE_ID_UGEE_TABLET_EX07S                0x0071
+
 #define USB_VENDOR_ID_UNITEC   0x227d
 #define USB_DEVICE_ID_UNITEC_USB_TOUCH_0709    0x0709
 #define USB_DEVICE_ID_UNITEC_USB_TOUCH_0A19    0x0a19
index 1509d72..e3e6e5c 100644 (file)
@@ -977,6 +977,7 @@ static int uclogic_probe(struct hid_device *hdev,
                }
                break;
        case USB_DEVICE_ID_UGTIZER_TABLET_GP0610:
+       case USB_DEVICE_ID_UGEE_TABLET_EX07S:
                /* If this is the pen interface */
                if (intf->cur_altsetting->desc.bInterfaceNumber == 1) {
                        rc = uclogic_tablet_enable(hdev);
@@ -1069,6 +1070,7 @@ static const struct hid_device_id uclogic_devices[] = {
        { HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UGEE_TABLET_45) },
        { HID_USB_DEVICE(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_DRAWIMAGE_G3) },
        { HID_USB_DEVICE(USB_VENDOR_ID_UGTIZER, USB_DEVICE_ID_UGTIZER_TABLET_GP0610) },
+       { HID_USB_DEVICE(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_TABLET_EX07S) },
        { }
 };
 MODULE_DEVICE_TABLE(hid, uclogic_devices);
index 94250c2..c68ac65 100644 (file)
@@ -2006,7 +2006,7 @@ static void wacom_wac_pen_event(struct hid_device *hdev, struct hid_field *field
                return;
        case HID_DG_TOOLSERIALNUMBER:
                wacom_wac->serial[0] = (wacom_wac->serial[0] & ~0xFFFFFFFFULL);
-               wacom_wac->serial[0] |= value;
+               wacom_wac->serial[0] |= (__u32)value;
                return;
        case WACOM_HID_WD_SENSE:
                wacom_wac->hid_data.sense_state = value;
@@ -2176,6 +2176,16 @@ static void wacom_wac_finger_usage_mapping(struct hid_device *hdev,
                wacom_wac->hid_data.cc_index = field->index;
                wacom_wac->hid_data.cc_value_index = usage->usage_index;
                break;
+       case HID_DG_CONTACTID:
+               if ((field->logical_maximum - field->logical_minimum) < touch_max) {
+                       /*
+                        * The HID descriptor for G11 sensors leaves logical
+                        * maximum set to '1' despite it being a multitouch
+                        * device. Override to a sensible number.
+                        */
+                       field->logical_maximum = 255;
+               }
+               break;
        }
 }
 
index 7ef8196..26b0510 100644 (file)
@@ -980,7 +980,7 @@ static int ssip_pn_xmit(struct sk_buff *skb, struct net_device *dev)
                goto drop;
        /* Pad to 32-bits - FIXME: Revisit*/
        if ((skb->len & 3) && skb_pad(skb, 4 - (skb->len & 3)))
-               goto drop;
+               goto inc_dropped;
 
        /*
         * Modem sends Phonet messages over SSI with its own endianess...
@@ -1032,8 +1032,9 @@ static int ssip_pn_xmit(struct sk_buff *skb, struct net_device *dev)
 drop2:
        hsi_free_msg(msg);
 drop:
-       dev->stats.tx_dropped++;
        dev_kfree_skb(skb);
+inc_dropped:
+       dev->stats.tx_dropped++;
 
        return 0;
 }
index 0649d53..22d5eaf 100644 (file)
@@ -341,6 +341,15 @@ config SENSORS_ASB100
          This driver can also be built as a module.  If so, the module
          will be called asb100.
 
+config SENSORS_ASPEED
+       tristate "ASPEED AST2400/AST2500 PWM and Fan tach driver"
+       help
+         This driver provides support for ASPEED AST2400/AST2500 PWM
+         and Fan Tacho controllers.
+
+         This driver can also be built as a module. If so, the module
+         will be called aspeed_pwm_tacho.
+
 config SENSORS_ATXP1
        tristate "Attansic ATXP1 VID controller"
        depends on I2C
@@ -1643,16 +1652,6 @@ config SENSORS_TMP421
          This driver can also be built as a module.  If so, the module
          will be called tmp421.
 
-config SENSORS_TWL4030_MADC
-       tristate "Texas Instruments TWL4030 MADC Hwmon"
-       depends on TWL4030_MADC
-       help
-       If you say yes here you get hwmon support for triton
-       TWL4030-MADC.
-
-       This driver can also be built as a module. If so it will be called
-       twl4030-madc-hwmon.
-
 config SENSORS_VEXPRESS
        tristate "Versatile Express"
        depends on VEXPRESS_CONFIG
index 5509edf..d4641a9 100644 (file)
@@ -46,6 +46,7 @@ obj-$(CONFIG_SENSORS_ADT7475) += adt7475.o
 obj-$(CONFIG_SENSORS_APPLESMC) += applesmc.o
 obj-$(CONFIG_SENSORS_ARM_SCPI) += scpi-hwmon.o
 obj-$(CONFIG_SENSORS_ASC7621)  += asc7621.o
+obj-$(CONFIG_SENSORS_ASPEED)   += aspeed-pwm-tacho.o
 obj-$(CONFIG_SENSORS_ATXP1)    += atxp1.o
 obj-$(CONFIG_SENSORS_CORETEMP) += coretemp.o
 obj-$(CONFIG_SENSORS_DA9052_ADC)+= da9052-hwmon.o
@@ -157,7 +158,6 @@ obj-$(CONFIG_SENSORS_TMP103)        += tmp103.o
 obj-$(CONFIG_SENSORS_TMP108)   += tmp108.o
 obj-$(CONFIG_SENSORS_TMP401)   += tmp401.o
 obj-$(CONFIG_SENSORS_TMP421)   += tmp421.o
-obj-$(CONFIG_SENSORS_TWL4030_MADC)+= twl4030-madc-hwmon.o
 obj-$(CONFIG_SENSORS_VEXPRESS) += vexpress-hwmon.o
 obj-$(CONFIG_SENSORS_VIA_CPUTEMP)+= via-cputemp.o
 obj-$(CONFIG_SENSORS_VIA686A)  += via686a.o
index 763490a..cec227f 100644 (file)
@@ -217,9 +217,16 @@ static const struct i2c_device_id ad7414_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, ad7414_id);
 
+static const struct of_device_id ad7414_of_match[] = {
+       { .compatible = "ad,ad7414" },
+       { },
+};
+MODULE_DEVICE_TABLE(of, ad7414_of_match);
+
 static struct i2c_driver ad7414_driver = {
        .driver = {
                .name   = "ad7414",
+               .of_match_table = of_match_ptr(ad7414_of_match),
        },
        .probe  = ad7414_probe,
        .id_table = ad7414_id,
index bbe3a5c..a557b46 100644 (file)
@@ -546,10 +546,17 @@ static const struct i2c_device_id adc128_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, adc128_id);
 
+/*
+ * Device-tree match table. It is hooked into the driver through
+ * .of_match_table and exported with MODULE_DEVICE_TABLE(of, ...); the empty
+ * entry terminates the list.
+ */
+static const struct of_device_id adc128_of_match[] = {
+       { .compatible = "ti,adc128d818" },
+       { },
+};
+MODULE_DEVICE_TABLE(of, adc128_of_match);
+
 static struct i2c_driver adc128_driver = {
        .class          = I2C_CLASS_HWMON,
        .driver = {
                .name   = "adc128d818",
+               .of_match_table = of_match_ptr(adc128_of_match),
        },
        .probe          = adc128_probe,
        .remove         = adc128_remove,
index 2b3105c..5140c27 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/hwmon-sysfs.h>
 #include <linux/err.h>
 #include <linux/mutex.h>
+#include <linux/of_device.h>
 #include <linux/of.h>
 
 #include <linux/i2c/ads1015.h>
@@ -268,7 +269,12 @@ static int ads1015_probe(struct i2c_client *client,
                            GFP_KERNEL);
        if (!data)
                return -ENOMEM;
-       data->id = id->driver_data;
+
+       if (client->dev.of_node)
+               data->id = (enum ads1015_chips)
+                       of_device_get_match_data(&client->dev);
+       else
+               data->id = id->driver_data;
        i2c_set_clientdata(client, data);
        mutex_init(&data->update_lock);
 
@@ -303,9 +309,23 @@ static const struct i2c_device_id ads1015_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, ads1015_id);
 
+/*
+ * Device-tree match table. Each entry's .data carries the chip enum value
+ * cast to a pointer; ads1015_probe() reads it back with
+ * of_device_get_match_data() when the client has an OF node. The empty entry
+ * terminates the list.
+ */
+static const struct of_device_id ads1015_of_match[] = {
+       {
+               .compatible = "ti,ads1015",
+               .data = (void *)ads1015
+       },
+       {
+               .compatible = "ti,ads1115",
+               .data = (void *)ads1115
+       },
+       { },
+};
+MODULE_DEVICE_TABLE(of, ads1015_of_match);
+
 static struct i2c_driver ads1015_driver = {
        .driver = {
                .name = "ads1015",
+               .of_match_table = of_match_ptr(ads1015_of_match),
        },
        .probe = ads1015_probe,
        .remove = ads1015_remove,
index ee396ff..898607b 100644 (file)
 #include <linux/i2c.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/of_device.h>
 #include <linux/platform_data/ads7828.h>
 #include <linux/regmap.h>
 #include <linux/slab.h>
+#include <linux/regulator/consumer.h>
 
 /* The ADS7828 registers */
 #define ADS7828_CMD_SD_SE      0x80    /* Single ended inputs */
@@ -118,9 +120,12 @@ static int ads7828_probe(struct i2c_client *client,
        struct ads7828_data *data;
        struct device *hwmon_dev;
        unsigned int vref_mv = ADS7828_INT_VREF_MV;
+       unsigned int vref_uv;
        bool diff_input = false;
        bool ext_vref = false;
        unsigned int regval;
+       enum ads7828_chips chip;
+       struct regulator *reg;
 
        data = devm_kzalloc(dev, sizeof(struct ads7828_data), GFP_KERNEL);
        if (!data)
@@ -131,14 +136,32 @@ static int ads7828_probe(struct i2c_client *client,
                ext_vref = pdata->ext_vref;
                if (ext_vref && pdata->vref_mv)
                        vref_mv = pdata->vref_mv;
+       } else if (dev->of_node) {
+               diff_input = of_property_read_bool(dev->of_node,
+                                                  "ti,differential-input");
+               reg = devm_regulator_get_optional(dev, "vref");
+               if (!IS_ERR(reg)) {
+                       vref_uv = regulator_get_voltage(reg);
+                       vref_mv = DIV_ROUND_CLOSEST(vref_uv, 1000);
+                       if (vref_mv < ADS7828_EXT_VREF_MV_MIN ||
+                           vref_mv > ADS7828_EXT_VREF_MV_MAX)
+                               return -EINVAL;
+                       ext_vref = true;
+               }
        }
 
+       if (client->dev.of_node)
+               chip = (enum ads7828_chips)
+                       of_device_get_match_data(&client->dev);
+       else
+               chip = id->driver_data;
+
        /* Bound Vref with min/max values */
        vref_mv = clamp_val(vref_mv, ADS7828_EXT_VREF_MV_MIN,
                            ADS7828_EXT_VREF_MV_MAX);
 
        /* ADS7828 uses 12-bit samples, while ADS7830 is 8-bit */
-       if (id->driver_data == ads7828) {
+       if (chip == ads7828) {
                data->lsb_resol = DIV_ROUND_CLOSEST(vref_mv * 1000, 4096);
                data->regmap = devm_regmap_init_i2c(client,
                                                    &ads2828_regmap_config);
@@ -177,9 +200,23 @@ static const struct i2c_device_id ads7828_device_ids[] = {
 };
 MODULE_DEVICE_TABLE(i2c, ads7828_device_ids);
 
+/*
+ * Device-tree match table. Each entry's .data carries the chip enum value
+ * cast to a pointer; ads7828_probe() reads it back with
+ * of_device_get_match_data() when the client has an OF node. The empty entry
+ * terminates the list.
+ */
+static const struct of_device_id ads7828_of_match[] = {
+       {
+               .compatible = "ti,ads7828",
+               .data = (void *)ads7828
+       },
+       {
+               .compatible = "ti,ads7830",
+               .data = (void *)ads7830
+       },
+       { },
+};
+MODULE_DEVICE_TABLE(of, ads7828_of_match);
+
 static struct i2c_driver ads7828_driver = {
        .driver = {
                .name = "ads7828",
+               .of_match_table = of_match_ptr(ads7828_of_match),
        },
 
        .id_table = ads7828_device_ids,
index c646670..c803e3c 100644 (file)
@@ -13,6 +13,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/of_device.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/i2c.h>
@@ -58,6 +59,8 @@
 #define REG_VENDID             0x3E
 #define REG_DEVID2             0x3F
 
+#define REG_CONFIG1            0x40
+
 #define REG_STATUS1            0x41
 #define REG_STATUS2            0x42
 
@@ -161,6 +164,27 @@ static const struct i2c_device_id adt7475_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, adt7475_id);
 
+/*
+ * Device-tree match table. Each entry's .data carries the chip enum value
+ * cast to a pointer; adt7475_probe() reads it back with
+ * of_device_get_match_data() when the client has an OF node. The empty entry
+ * terminates the list.
+ */
+static const struct of_device_id adt7475_of_match[] = {
+       {
+               .compatible = "adi,adt7473",
+               .data = (void *)adt7473
+       },
+       {
+               .compatible = "adi,adt7475",
+               .data = (void *)adt7475
+       },
+       {
+               .compatible = "adi,adt7476",
+               .data = (void *)adt7476
+       },
+       {
+               .compatible = "adi,adt7490",
+               .data = (void *)adt7490
+       },
+       { },
+};
+MODULE_DEVICE_TABLE(of, adt7475_of_match);
+
 struct adt7475_data {
        struct device *hwmon_dev;
        struct mutex lock;
@@ -1250,6 +1274,7 @@ static void adt7475_remove_files(struct i2c_client *client,
 static int adt7475_probe(struct i2c_client *client,
                         const struct i2c_device_id *id)
 {
+       enum chips chip;
        static const char * const names[] = {
                [adt7473] = "ADT7473",
                [adt7475] = "ADT7475",
@@ -1268,8 +1293,13 @@ static int adt7475_probe(struct i2c_client *client,
        mutex_init(&data->lock);
        i2c_set_clientdata(client, data);
 
+       if (client->dev.of_node)
+               chip = (enum chips)of_device_get_match_data(&client->dev);
+       else
+               chip = id->driver_data;
+
        /* Initialize device-specific values */
-       switch (id->driver_data) {
+       switch (chip) {
        case adt7476:
                data->has_voltage = 0x0e;       /* in1 to in3 */
                revision = adt7475_read(REG_DEVID2) & 0x07;
@@ -1343,6 +1373,17 @@ static int adt7475_probe(struct i2c_client *client,
        for (i = 0; i < ADT7475_PWM_COUNT; i++)
                adt7475_read_pwm(client, i);
 
+       /* Start monitoring */
+       switch (chip) {
+       case adt7475:
+       case adt7476:
+               i2c_smbus_write_byte_data(client, REG_CONFIG1,
+                                         adt7475_read(REG_CONFIG1) | 0x01);
+               break;
+       default:
+               break;
+       }
+
        ret = sysfs_create_group(&client->dev.kobj, &adt7475_attr_group);
        if (ret)
                return ret;
@@ -1428,6 +1469,7 @@ static struct i2c_driver adt7475_driver = {
        .class          = I2C_CLASS_HWMON,
        .driver = {
                .name   = "adt7475",
+               .of_match_table = of_match_ptr(adt7475_of_match),
        },
        .probe          = adt7475_probe,
        .remove         = adt7475_remove,
diff --git a/drivers/hwmon/aspeed-pwm-tacho.c b/drivers/hwmon/aspeed-pwm-tacho.c
new file mode 100644 (file)
index 0000000..48403a2
--- /dev/null
@@ -0,0 +1,835 @@
+/*
+ * Copyright (c) 2016 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 or later as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/clk.h>
+#include <linux/gpio/consumer.h>
+#include <linux/delay.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of_platform.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/sysfs.h>
+#include <linux/regmap.h>
+
+/* ASPEED PWM & FAN Tach Register Definition */
+#define ASPEED_PTCR_CTRL               0x00
+#define ASPEED_PTCR_CLK_CTRL           0x04
+#define ASPEED_PTCR_DUTY0_CTRL         0x08
+#define ASPEED_PTCR_DUTY1_CTRL         0x0c
+#define ASPEED_PTCR_TYPEM_CTRL         0x10
+#define ASPEED_PTCR_TYPEM_CTRL1                0x14
+#define ASPEED_PTCR_TYPEN_CTRL         0x18
+#define ASPEED_PTCR_TYPEN_CTRL1                0x1c
+#define ASPEED_PTCR_TACH_SOURCE                0x20
+#define ASPEED_PTCR_TRIGGER            0x28
+#define ASPEED_PTCR_RESULT             0x2c
+#define ASPEED_PTCR_INTR_CTRL          0x30
+#define ASPEED_PTCR_INTR_STS           0x34
+#define ASPEED_PTCR_TYPEM_LIMIT                0x38
+#define ASPEED_PTCR_TYPEN_LIMIT                0x3C
+#define ASPEED_PTCR_CTRL_EXT           0x40
+#define ASPEED_PTCR_CLK_CTRL_EXT       0x44
+#define ASPEED_PTCR_DUTY2_CTRL         0x48
+#define ASPEED_PTCR_DUTY3_CTRL         0x4c
+#define ASPEED_PTCR_TYPEO_CTRL         0x50
+#define ASPEED_PTCR_TYPEO_CTRL1                0x54
+#define ASPEED_PTCR_TACH_SOURCE_EXT    0x60
+#define ASPEED_PTCR_TYPEO_LIMIT                0x78
+
+/* ASPEED_PTCR_CTRL : 0x00 - General Control Register */
+#define ASPEED_PTCR_CTRL_SET_PWMD_TYPE_PART1   15
+#define ASPEED_PTCR_CTRL_SET_PWMD_TYPE_PART2   6
+#define ASPEED_PTCR_CTRL_SET_PWMD_TYPE_MASK    (BIT(7) | BIT(15))
+
+#define ASPEED_PTCR_CTRL_SET_PWMC_TYPE_PART1   14
+#define ASPEED_PTCR_CTRL_SET_PWMC_TYPE_PART2   5
+#define ASPEED_PTCR_CTRL_SET_PWMC_TYPE_MASK    (BIT(6) | BIT(14))
+
+#define ASPEED_PTCR_CTRL_SET_PWMB_TYPE_PART1   13
+#define ASPEED_PTCR_CTRL_SET_PWMB_TYPE_PART2   4
+#define ASPEED_PTCR_CTRL_SET_PWMB_TYPE_MASK    (BIT(5) | BIT(13))
+
+#define ASPEED_PTCR_CTRL_SET_PWMA_TYPE_PART1   12
+#define ASPEED_PTCR_CTRL_SET_PWMA_TYPE_PART2   3
+#define ASPEED_PTCR_CTRL_SET_PWMA_TYPE_MASK    (BIT(4) | BIT(12))
+
+#define        ASPEED_PTCR_CTRL_FAN_NUM_EN(x)  BIT(16 + (x))
+
+#define        ASPEED_PTCR_CTRL_PWMD_EN        BIT(11)
+#define        ASPEED_PTCR_CTRL_PWMC_EN        BIT(10)
+#define        ASPEED_PTCR_CTRL_PWMB_EN        BIT(9)
+#define        ASPEED_PTCR_CTRL_PWMA_EN        BIT(8)
+
+#define        ASPEED_PTCR_CTRL_CLK_SRC        BIT(1)
+#define        ASPEED_PTCR_CTRL_CLK_EN         BIT(0)
+
+/* ASPEED_PTCR_CLK_CTRL : 0x04 - Clock Control Register */
+/* TYPE N */
+#define ASPEED_PTCR_CLK_CTRL_TYPEN_MASK                GENMASK(31, 16)
+#define ASPEED_PTCR_CLK_CTRL_TYPEN_UNIT                24
+#define ASPEED_PTCR_CLK_CTRL_TYPEN_H           20
+#define ASPEED_PTCR_CLK_CTRL_TYPEN_L           16
+/* TYPE M */
+#define ASPEED_PTCR_CLK_CTRL_TYPEM_MASK         GENMASK(15, 0)
+#define ASPEED_PTCR_CLK_CTRL_TYPEM_UNIT                8
+#define ASPEED_PTCR_CLK_CTRL_TYPEM_H           4
+#define ASPEED_PTCR_CLK_CTRL_TYPEM_L           0
+
+/*
+ * ASPEED_PTCR_DUTY0/1/2/3_CTRL : 0x08/0x0C/0x48/0x4C - PWM-FAN duty control
+ * 0/1/2/3 register
+ */
+#define DUTY_CTRL_PWM2_FALL_POINT      24
+#define DUTY_CTRL_PWM2_RISE_POINT      16
+#define DUTY_CTRL_PWM2_RISE_FALL_MASK  GENMASK(31, 16)
+#define DUTY_CTRL_PWM1_FALL_POINT      8
+#define DUTY_CTRL_PWM1_RISE_POINT      0
+#define DUTY_CTRL_PWM1_RISE_FALL_MASK   GENMASK(15, 0)
+
+/* ASPEED_PTCR_TYPEM/N/O_CTRL : 0x10/0x18/0x50 - Type M/N/O Ctrl 0 Register */
+#define TYPE_CTRL_FAN_MASK             (GENMASK(5, 1) | GENMASK(31, 16))
+#define TYPE_CTRL_FAN1_MASK            GENMASK(31, 0)
+#define TYPE_CTRL_FAN_PERIOD           16
+#define TYPE_CTRL_FAN_MODE             4
+#define TYPE_CTRL_FAN_DIVISION         1
+#define TYPE_CTRL_FAN_TYPE_EN          1
+
+/* ASPEED_PTCR_TACH_SOURCE : 0x20/0x60 - Tach Source Register */
+/* bit [0,1] at 0x20, bit [2] at 0x60 */
+#define TACH_PWM_SOURCE_BIT01(x)       ((x) * 2)
+#define TACH_PWM_SOURCE_BIT2(x)                ((x) * 2)
+#define TACH_PWM_SOURCE_MASK_BIT01(x)  (0x3 << ((x) * 2))
+#define TACH_PWM_SOURCE_MASK_BIT2(x)   BIT((x) * 2)
+
+/* ASPEED_PTCR_RESULT : 0x2c - Result Register */
+#define RESULT_STATUS_MASK             BIT(31)
+#define RESULT_VALUE_MASK              0xfffff
+
+/* ASPEED_PTCR_CTRL_EXT : 0x40 - General Control Extension #1 Register */
+#define ASPEED_PTCR_CTRL_SET_PWMH_TYPE_PART1   15
+#define ASPEED_PTCR_CTRL_SET_PWMH_TYPE_PART2   6
+#define ASPEED_PTCR_CTRL_SET_PWMH_TYPE_MASK    (BIT(7) | BIT(15))
+
+#define ASPEED_PTCR_CTRL_SET_PWMG_TYPE_PART1   14
+#define ASPEED_PTCR_CTRL_SET_PWMG_TYPE_PART2   5
+#define ASPEED_PTCR_CTRL_SET_PWMG_TYPE_MASK    (BIT(6) | BIT(14))
+
+#define ASPEED_PTCR_CTRL_SET_PWMF_TYPE_PART1   13
+#define ASPEED_PTCR_CTRL_SET_PWMF_TYPE_PART2   4
+#define ASPEED_PTCR_CTRL_SET_PWMF_TYPE_MASK    (BIT(5) | BIT(13))
+
+#define ASPEED_PTCR_CTRL_SET_PWME_TYPE_PART1   12
+#define ASPEED_PTCR_CTRL_SET_PWME_TYPE_PART2   3
+#define ASPEED_PTCR_CTRL_SET_PWME_TYPE_MASK    (BIT(4) | BIT(12))
+
+#define        ASPEED_PTCR_CTRL_PWMH_EN        BIT(11)
+#define        ASPEED_PTCR_CTRL_PWMG_EN        BIT(10)
+#define        ASPEED_PTCR_CTRL_PWMF_EN        BIT(9)
+#define        ASPEED_PTCR_CTRL_PWME_EN        BIT(8)
+
+/* ASPEED_PTCR_CLK_CTRL_EXT : 0x44 - Clock Control Extension #1 Register */
+/* TYPE O */
+#define ASPEED_PTCR_CLK_CTRL_TYPEO_MASK         GENMASK(15, 0)
+#define ASPEED_PTCR_CLK_CTRL_TYPEO_UNIT                8
+#define ASPEED_PTCR_CLK_CTRL_TYPEO_H           4
+#define ASPEED_PTCR_CLK_CTRL_TYPEO_L           0
+
+#define PWM_MAX 255
+
+#define M_PWM_DIV_H 0x00
+#define M_PWM_DIV_L 0x05
+#define M_PWM_PERIOD 0x5F
+#define M_TACH_CLK_DIV 0x00
+#define M_TACH_MODE 0x00
+#define M_TACH_UNIT 0x1000
+#define INIT_FAN_CTRL 0xFF
+
+/*
+ * Per-device driver state.
+ *
+ * The 3-entry arrays are indexed by fan type (enum type), the 8-entry arrays
+ * by PWM port (enum pwm_port) and the 16-entry arrays by fan tach channel.
+ */
+struct aspeed_pwm_tacho_data {
+       /* Register accessor used by every aspeed_set_*() / rpm helper. */
+       struct regmap *regmap;
+       /* Source clock rate used by the tach period and RPM calculations. */
+       unsigned long clk_freq;
+       /* Visibility of the pwmN / fanN_input sysfs attributes. */
+       bool pwm_present[8];
+       bool fan_tach_present[16];
+       /* Per-type PWM clock settings (unit/period and H/L dividers). */
+       u8 type_pwm_clock_unit[3];
+       u8 type_pwm_clock_division_h[3];
+       u8 type_pwm_clock_division_l[3];
+       /* Per-type tach divider selector and tach unit. */
+       u8 type_fan_tach_clock_division[3];
+       u16 type_fan_tach_unit[3];
+       /* Fan type assigned to each PWM port. */
+       u8 pwm_port_type[8];
+       /* Last duty value (0..PWM_MAX) written through sysfs for each port. */
+       u8 pwm_port_fan_ctrl[8];
+       /* PWM port that each tach channel is associated with. */
+       u8 fan_tach_ch_source[16];
+       /* Attribute groups; NOTE(review): consumer is outside this chunk. */
+       const struct attribute_group *groups[3];
+};
+
+/* Fan types; the values index type_params[] and the per-type priv arrays. */
+enum type { TYPEM, TYPEN, TYPEO };
+
+/*
+ * Register layout for one fan type.
+ *
+ * l_value, h_value and unit_value are bit offsets (shift counts) inside the
+ * clock control register clk_ctrl_reg, and clk_ctrl_mask covers those fields.
+ * ctrl_reg and ctrl_reg1 are the type's two tach control registers.
+ */
+struct type_params {
+       u32 l_value;
+       u32 h_value;
+       u32 unit_value;
+       u32 clk_ctrl_mask;
+       u32 clk_ctrl_reg;
+       u32 ctrl_reg;
+       u32 ctrl_reg1;
+};
+
+/*
+ * Per-type register description, indexed by enum type. Types M and N share
+ * ASPEED_PTCR_CLK_CTRL (low and high half-word respectively); type O uses the
+ * extension register ASPEED_PTCR_CLK_CTRL_EXT.
+ */
+static const struct type_params type_params[] = {
+       [TYPEM] = {
+               .l_value = ASPEED_PTCR_CLK_CTRL_TYPEM_L,
+               .h_value = ASPEED_PTCR_CLK_CTRL_TYPEM_H,
+               .unit_value = ASPEED_PTCR_CLK_CTRL_TYPEM_UNIT,
+               .clk_ctrl_mask = ASPEED_PTCR_CLK_CTRL_TYPEM_MASK,
+               .clk_ctrl_reg = ASPEED_PTCR_CLK_CTRL,
+               .ctrl_reg = ASPEED_PTCR_TYPEM_CTRL,
+               .ctrl_reg1 = ASPEED_PTCR_TYPEM_CTRL1,
+       },
+       [TYPEN] = {
+               .l_value = ASPEED_PTCR_CLK_CTRL_TYPEN_L,
+               .h_value = ASPEED_PTCR_CLK_CTRL_TYPEN_H,
+               .unit_value = ASPEED_PTCR_CLK_CTRL_TYPEN_UNIT,
+               .clk_ctrl_mask = ASPEED_PTCR_CLK_CTRL_TYPEN_MASK,
+               .clk_ctrl_reg = ASPEED_PTCR_CLK_CTRL,
+               .ctrl_reg = ASPEED_PTCR_TYPEN_CTRL,
+               .ctrl_reg1 = ASPEED_PTCR_TYPEN_CTRL1,
+       },
+       [TYPEO] = {
+               .l_value = ASPEED_PTCR_CLK_CTRL_TYPEO_L,
+               .h_value = ASPEED_PTCR_CLK_CTRL_TYPEO_H,
+               .unit_value = ASPEED_PTCR_CLK_CTRL_TYPEO_UNIT,
+               .clk_ctrl_mask = ASPEED_PTCR_CLK_CTRL_TYPEO_MASK,
+               .clk_ctrl_reg = ASPEED_PTCR_CLK_CTRL_EXT,
+               .ctrl_reg = ASPEED_PTCR_TYPEO_CTRL,
+               .ctrl_reg1 = ASPEED_PTCR_TYPEO_CTRL1,
+       }
+};
+
+/* PWM ports; the values index pwm_port_params[] and the per-port priv arrays. */
+enum pwm_port { PWMA, PWMB, PWMC, PWMD, PWME, PWMF, PWMG, PWMH };
+
+/*
+ * Register layout for one PWM port.
+ *
+ * pwm_en is the enable bit and type_mask the type-select bits, both inside
+ * ctrl_reg. type_part1/type_part2 are the shift counts applied to bit 0 and
+ * bit 1 of the type value. duty_ctrl_rise_point/duty_ctrl_fall_point are the
+ * shift counts of the rising/falling fields inside duty_ctrl_reg, which
+ * duty_ctrl_rise_fall_mask covers.
+ */
+struct pwm_port_params {
+       u32 pwm_en;
+       u32 ctrl_reg;
+       u32 type_part1;
+       u32 type_part2;
+       u32 type_mask;
+       u32 duty_ctrl_rise_point;
+       u32 duty_ctrl_fall_point;
+       u32 duty_ctrl_reg;
+       u32 duty_ctrl_rise_fall_mask;
+};
+
+/*
+ * Per-port register description, indexed by enum pwm_port. Ports A-D are
+ * controlled through ASPEED_PTCR_CTRL, ports E-H through
+ * ASPEED_PTCR_CTRL_EXT. Each duty register serves two ports: the first of
+ * the pair uses the PWM1 (low half-word) fields, the second the PWM2 (high
+ * half-word) fields.
+ */
+static const struct pwm_port_params pwm_port_params[] = {
+       [PWMA] = {
+               .pwm_en = ASPEED_PTCR_CTRL_PWMA_EN,
+               .ctrl_reg = ASPEED_PTCR_CTRL,
+               .type_part1 = ASPEED_PTCR_CTRL_SET_PWMA_TYPE_PART1,
+               .type_part2 = ASPEED_PTCR_CTRL_SET_PWMA_TYPE_PART2,
+               .type_mask = ASPEED_PTCR_CTRL_SET_PWMA_TYPE_MASK,
+               .duty_ctrl_rise_point = DUTY_CTRL_PWM1_RISE_POINT,
+               .duty_ctrl_fall_point = DUTY_CTRL_PWM1_FALL_POINT,
+               .duty_ctrl_reg = ASPEED_PTCR_DUTY0_CTRL,
+               .duty_ctrl_rise_fall_mask = DUTY_CTRL_PWM1_RISE_FALL_MASK,
+       },
+       [PWMB] = {
+               .pwm_en = ASPEED_PTCR_CTRL_PWMB_EN,
+               .ctrl_reg = ASPEED_PTCR_CTRL,
+               .type_part1 = ASPEED_PTCR_CTRL_SET_PWMB_TYPE_PART1,
+               .type_part2 = ASPEED_PTCR_CTRL_SET_PWMB_TYPE_PART2,
+               .type_mask = ASPEED_PTCR_CTRL_SET_PWMB_TYPE_MASK,
+               .duty_ctrl_rise_point = DUTY_CTRL_PWM2_RISE_POINT,
+               .duty_ctrl_fall_point = DUTY_CTRL_PWM2_FALL_POINT,
+               .duty_ctrl_reg = ASPEED_PTCR_DUTY0_CTRL,
+               .duty_ctrl_rise_fall_mask = DUTY_CTRL_PWM2_RISE_FALL_MASK,
+       },
+       [PWMC] = {
+               .pwm_en = ASPEED_PTCR_CTRL_PWMC_EN,
+               .ctrl_reg = ASPEED_PTCR_CTRL,
+               .type_part1 = ASPEED_PTCR_CTRL_SET_PWMC_TYPE_PART1,
+               .type_part2 = ASPEED_PTCR_CTRL_SET_PWMC_TYPE_PART2,
+               .type_mask = ASPEED_PTCR_CTRL_SET_PWMC_TYPE_MASK,
+               .duty_ctrl_rise_point = DUTY_CTRL_PWM1_RISE_POINT,
+               .duty_ctrl_fall_point = DUTY_CTRL_PWM1_FALL_POINT,
+               .duty_ctrl_reg = ASPEED_PTCR_DUTY1_CTRL,
+               .duty_ctrl_rise_fall_mask = DUTY_CTRL_PWM1_RISE_FALL_MASK,
+       },
+       [PWMD] = {
+               .pwm_en = ASPEED_PTCR_CTRL_PWMD_EN,
+               .ctrl_reg = ASPEED_PTCR_CTRL,
+               .type_part1 = ASPEED_PTCR_CTRL_SET_PWMD_TYPE_PART1,
+               .type_part2 = ASPEED_PTCR_CTRL_SET_PWMD_TYPE_PART2,
+               .type_mask = ASPEED_PTCR_CTRL_SET_PWMD_TYPE_MASK,
+               .duty_ctrl_rise_point = DUTY_CTRL_PWM2_RISE_POINT,
+               .duty_ctrl_fall_point = DUTY_CTRL_PWM2_FALL_POINT,
+               .duty_ctrl_reg = ASPEED_PTCR_DUTY1_CTRL,
+               .duty_ctrl_rise_fall_mask = DUTY_CTRL_PWM2_RISE_FALL_MASK,
+       },
+       [PWME] = {
+               .pwm_en = ASPEED_PTCR_CTRL_PWME_EN,
+               .ctrl_reg = ASPEED_PTCR_CTRL_EXT,
+               .type_part1 = ASPEED_PTCR_CTRL_SET_PWME_TYPE_PART1,
+               .type_part2 = ASPEED_PTCR_CTRL_SET_PWME_TYPE_PART2,
+               .type_mask = ASPEED_PTCR_CTRL_SET_PWME_TYPE_MASK,
+               .duty_ctrl_rise_point = DUTY_CTRL_PWM1_RISE_POINT,
+               .duty_ctrl_fall_point = DUTY_CTRL_PWM1_FALL_POINT,
+               .duty_ctrl_reg = ASPEED_PTCR_DUTY2_CTRL,
+               .duty_ctrl_rise_fall_mask = DUTY_CTRL_PWM1_RISE_FALL_MASK,
+       },
+       [PWMF] = {
+               .pwm_en = ASPEED_PTCR_CTRL_PWMF_EN,
+               .ctrl_reg = ASPEED_PTCR_CTRL_EXT,
+               .type_part1 = ASPEED_PTCR_CTRL_SET_PWMF_TYPE_PART1,
+               .type_part2 = ASPEED_PTCR_CTRL_SET_PWMF_TYPE_PART2,
+               .type_mask = ASPEED_PTCR_CTRL_SET_PWMF_TYPE_MASK,
+               .duty_ctrl_rise_point = DUTY_CTRL_PWM2_RISE_POINT,
+               .duty_ctrl_fall_point = DUTY_CTRL_PWM2_FALL_POINT,
+               .duty_ctrl_reg = ASPEED_PTCR_DUTY2_CTRL,
+               .duty_ctrl_rise_fall_mask = DUTY_CTRL_PWM2_RISE_FALL_MASK,
+       },
+       [PWMG] = {
+               .pwm_en = ASPEED_PTCR_CTRL_PWMG_EN,
+               .ctrl_reg = ASPEED_PTCR_CTRL_EXT,
+               .type_part1 = ASPEED_PTCR_CTRL_SET_PWMG_TYPE_PART1,
+               .type_part2 = ASPEED_PTCR_CTRL_SET_PWMG_TYPE_PART2,
+               .type_mask = ASPEED_PTCR_CTRL_SET_PWMG_TYPE_MASK,
+               .duty_ctrl_rise_point = DUTY_CTRL_PWM1_RISE_POINT,
+               .duty_ctrl_fall_point = DUTY_CTRL_PWM1_FALL_POINT,
+               .duty_ctrl_reg = ASPEED_PTCR_DUTY3_CTRL,
+               .duty_ctrl_rise_fall_mask = DUTY_CTRL_PWM1_RISE_FALL_MASK,
+       },
+       [PWMH] = {
+               .pwm_en = ASPEED_PTCR_CTRL_PWMH_EN,
+               .ctrl_reg = ASPEED_PTCR_CTRL_EXT,
+               .type_part1 = ASPEED_PTCR_CTRL_SET_PWMH_TYPE_PART1,
+               .type_part2 = ASPEED_PTCR_CTRL_SET_PWMH_TYPE_PART2,
+               .type_mask = ASPEED_PTCR_CTRL_SET_PWMH_TYPE_MASK,
+               .duty_ctrl_rise_point = DUTY_CTRL_PWM2_RISE_POINT,
+               .duty_ctrl_fall_point = DUTY_CTRL_PWM2_FALL_POINT,
+               .duty_ctrl_reg = ASPEED_PTCR_DUTY3_CTRL,
+               .duty_ctrl_rise_fall_mask = DUTY_CTRL_PWM2_RISE_FALL_MASK,
+       }
+};
+
+/*
+ * regmap write callback: store @val with writel() at @reg bytes past the
+ * MMIO base that was handed to regmap as @context. Always returns 0.
+ */
+static int regmap_aspeed_pwm_tacho_reg_write(void *context, unsigned int reg,
+                                            unsigned int val)
+{
+       void __iomem *addr = (void __iomem *)context + reg;
+
+       writel(val, addr);
+       return 0;
+}
+
+/*
+ * regmap read callback: fetch the register at @reg bytes past the MMIO base
+ * (@context) with readl() and store it in *@val. Always returns 0.
+ */
+static int regmap_aspeed_pwm_tacho_reg_read(void *context, unsigned int reg,
+                                           unsigned int *val)
+{
+       void __iomem *addr = (void __iomem *)context + reg;
+
+       *val = readl(addr);
+       return 0;
+}
+
+/*
+ * regmap description of the register block: 32-bit registers on a 4-byte
+ * stride, accessed through the MMIO callbacks above, with
+ * ASPEED_PTCR_TYPEO_LIMIT as the highest valid offset.
+ */
+static const struct regmap_config aspeed_pwm_tacho_regmap_config = {
+       .reg_read = regmap_aspeed_pwm_tacho_reg_read,
+       .reg_write = regmap_aspeed_pwm_tacho_reg_write,
+       .reg_bits = 32,
+       .reg_stride = 4,
+       .val_bits = 32,
+       .max_register = ASPEED_PTCR_TYPEO_LIMIT,
+       .fast_io = true,
+};
+
+/* Set (@val true) or clear the CLK_EN bit of the general control register. */
+static void aspeed_set_clock_enable(struct regmap *regmap, bool val)
+{
+       unsigned int bits = 0;
+
+       if (val)
+               bits = ASPEED_PTCR_CTRL_CLK_EN;
+
+       regmap_update_bits(regmap, ASPEED_PTCR_CTRL,
+                          ASPEED_PTCR_CTRL_CLK_EN, bits);
+}
+
+/*
+ * Set the CLK_SRC bit of the general control register when @val is non-zero,
+ * clear it otherwise.
+ */
+static void aspeed_set_clock_source(struct regmap *regmap, int val)
+{
+       unsigned int bits = 0;
+
+       if (val)
+               bits = ASPEED_PTCR_CTRL_CLK_SRC;
+
+       regmap_update_bits(regmap, ASPEED_PTCR_CTRL,
+                          ASPEED_PTCR_CTRL_CLK_SRC, bits);
+}
+
+/*
+ * Program the PWM clock fields (high divider, low divider, unit) of fan type
+ * @type into that type's clock control register.
+ *
+ * The u8 arguments are widened to u32 before shifting: the type N unit field
+ * sits at bit 24, so shifting a promoted int would move a set bit into the
+ * sign bit.
+ */
+static void aspeed_set_pwm_clock_values(struct regmap *regmap, u8 type,
+                                       u8 div_high, u8 div_low, u8 unit)
+{
+       const struct type_params *params = &type_params[type];
+       u32 reg_value = (((u32)div_high << params->h_value) |
+                        ((u32)div_low << params->l_value) |
+                        ((u32)unit << params->unit_value));
+
+       regmap_update_bits(regmap, params->clk_ctrl_reg,
+                          params->clk_ctrl_mask, reg_value);
+}
+
+/* Set or clear the enable bit of PWM port @pwm_port in its control register. */
+static void aspeed_set_pwm_port_enable(struct regmap *regmap, u8 pwm_port,
+                                      bool enable)
+{
+       const struct pwm_port_params *port = &pwm_port_params[pwm_port];
+       u32 bits = 0;
+
+       if (enable)
+               bits = port->pwm_en;
+
+       regmap_update_bits(regmap, port->ctrl_reg, port->pwm_en, bits);
+}
+
+/*
+ * Select the fan type of PWM port @pwm_port. The two type bits live in
+ * separate places of the control register: bit 0 of @type is shifted by
+ * type_part1 and bit 1 by type_part2.
+ */
+static void aspeed_set_pwm_port_type(struct regmap *regmap,
+                                    u8 pwm_port, u8 type)
+{
+       const struct pwm_port_params *port = &pwm_port_params[pwm_port];
+       u32 bit0 = (type & 0x1) << port->type_part1;
+       u32 bit1 = (type & 0x2) << port->type_part2;
+
+       regmap_update_bits(regmap, port->ctrl_reg, port->type_mask,
+                          bit0 | bit1);
+}
+
+/*
+ * Program the rising and falling points of PWM port @pwm_port into its duty
+ * control register.
+ *
+ * The u8 points are widened to u32 before shifting: the PWM2 falling field
+ * sits at bit 24, so shifting a promoted int would move a set bit into the
+ * sign bit.
+ */
+static void aspeed_set_pwm_port_duty_rising_falling(struct regmap *regmap,
+                                                   u8 pwm_port, u8 rising,
+                                                   u8 falling)
+{
+       const struct pwm_port_params *port = &pwm_port_params[pwm_port];
+       u32 reg_value = (u32)rising << port->duty_ctrl_rise_point;
+
+       reg_value |= (u32)falling << port->duty_ctrl_fall_point;
+
+       regmap_update_bits(regmap, port->duty_ctrl_reg,
+                          port->duty_ctrl_rise_fall_mask, reg_value);
+}
+
+/* Set or clear the enable bit in the control register of fan type @type. */
+static void aspeed_set_tacho_type_enable(struct regmap *regmap, u8 type,
+                                        bool enable)
+{
+       unsigned int bits = 0;
+
+       if (enable)
+               bits = TYPE_CTRL_FAN_TYPE_EN;
+
+       regmap_update_bits(regmap, type_params[type].ctrl_reg,
+                          TYPE_CTRL_FAN_TYPE_EN, bits);
+}
+
+/*
+ * Program tach mode, unit and clock division for fan type @type. The unit is
+ * written to the upper half-word of both the type's control register and its
+ * second control register.
+ *
+ * @unit is widened to u32 before shifting: a u16 with bit 15 set, shifted
+ * left by 16 as a promoted int, would move into the sign bit.
+ */
+static void aspeed_set_tacho_type_values(struct regmap *regmap, u8 type,
+                                        u8 mode, u16 unit, u8 division)
+{
+       u32 unit_bits = (u32)unit << TYPE_CTRL_FAN_PERIOD;
+       u32 reg_value = (((u32)mode << TYPE_CTRL_FAN_MODE) |
+                        unit_bits |
+                        ((u32)division << TYPE_CTRL_FAN_DIVISION));
+
+       regmap_update_bits(regmap, type_params[type].ctrl_reg,
+                          TYPE_CTRL_FAN_MASK, reg_value);
+       regmap_update_bits(regmap, type_params[type].ctrl_reg1,
+                          TYPE_CTRL_FAN1_MASK, (u32)unit << 16);
+}
+
+/*
+ * Set or clear the enable bit of tach channel @fan_tach_ch in the general
+ * control register.
+ */
+static void aspeed_set_fan_tach_ch_enable(struct regmap *regmap, u8 fan_tach_ch,
+                                         bool enable)
+{
+       unsigned int ch_bit = ASPEED_PTCR_CTRL_FAN_NUM_EN(fan_tach_ch);
+       unsigned int bits = 0;
+
+       if (enable)
+               bits = ch_bit;
+
+       regmap_update_bits(regmap, ASPEED_PTCR_CTRL, ch_bit, bits);
+}
+
+/*
+ * Select the PWM source of tach channel @fan_tach_ch. Bits 0-1 of
+ * @fan_tach_ch_source go to ASPEED_PTCR_TACH_SOURCE and bit 2 to
+ * ASPEED_PTCR_TACH_SOURCE_EXT, each at the channel's position.
+ *
+ * The values are widened to u32 before shifting: for channel 15 the shift is
+ * 30, which would push a promoted int into the sign bit.
+ */
+static void aspeed_set_fan_tach_ch_source(struct regmap *regmap, u8 fan_tach_ch,
+                                         u8 fan_tach_ch_source)
+{
+       u32 reg_value1 = ((u32)(fan_tach_ch_source & 0x3) <<
+                         TACH_PWM_SOURCE_BIT01(fan_tach_ch));
+       u32 reg_value2 = ((u32)((fan_tach_ch_source & 0x4) >> 2) <<
+                         TACH_PWM_SOURCE_BIT2(fan_tach_ch));
+
+       regmap_update_bits(regmap, ASPEED_PTCR_TACH_SOURCE,
+                          TACH_PWM_SOURCE_MASK_BIT01(fan_tach_ch),
+                          reg_value1);
+
+       regmap_update_bits(regmap, ASPEED_PTCR_TACH_SOURCE_EXT,
+                          TACH_PWM_SOURCE_MASK_BIT2(fan_tach_ch),
+                          reg_value2);
+}
+
+/*
+ * Apply duty value @fan_ctrl (0..PWM_MAX) to PWM port @index.
+ *
+ * The duty is scaled to the port type's PWM period (clock unit + 1). A
+ * scaled on-time of zero disables the port; an on-time equal to the whole
+ * period is programmed as falling point 0. The rising point is always 0.
+ */
+static void aspeed_set_pwm_port_fan_ctrl(struct aspeed_pwm_tacho_data *priv,
+                                        u8 index, u8 fan_ctrl)
+{
+       u8 type = priv->pwm_port_type[index];
+       u16 period = priv->type_pwm_clock_unit[type] + 1;
+       u16 dc_time_on = (fan_ctrl * period) / PWM_MAX;
+
+       if (dc_time_on == 0) {
+               aspeed_set_pwm_port_enable(priv->regmap, index, false);
+               return;
+       }
+
+       if (dc_time_on == period)
+               dc_time_on = 0;
+
+       aspeed_set_pwm_port_duty_rising_falling(priv->regmap, index, 0,
+                                               dc_time_on);
+       aspeed_set_pwm_port_enable(priv->regmap, index, true);
+}
+
+/*
+ * Compute clk_freq / (clk_unit * div_h * div_l * tacho_div * tacho_unit) for
+ * fan type @type, where
+ *   div_h     = 1 << type_pwm_clock_division_h,
+ *   div_l     = 1 if type_pwm_clock_division_l is 0, else twice that value,
+ *   tacho_div = 4 << (2 * type_fan_tach_clock_division).
+ * The caller converts the result to a wait time with 1000 / result.
+ *
+ * The derived divisors are kept in 32-bit variables and multiplied in
+ * 64 bits: stored back into a u8, "1 << 8" or "4 << 6" would truncate to 0
+ * and make the division below fault.
+ *
+ * Returns 0 when the divisor is zero or larger than the clock, so callers
+ * must check for 0 before dividing by the result.
+ */
+static u32 aspeed_get_fan_tach_ch_measure_period(struct aspeed_pwm_tacho_data
+                                                *priv, u8 type)
+{
+       u32 clk = priv->clk_freq;
+       u32 clk_unit = priv->type_pwm_clock_unit[type];
+       u32 div_h = 0x1U << priv->type_pwm_clock_division_h[type];
+       u32 div_l = priv->type_pwm_clock_division_l[type];
+       u32 tacho_unit = priv->type_fan_tach_unit[type];
+       u32 tacho_div = 0x4U << (priv->type_fan_tach_clock_division[type] * 2);
+       u64 divisor;
+
+       if (div_l == 0)
+               div_l = 1;
+       else
+               div_l = div_l * 2;
+
+       divisor = (u64)clk_unit * div_h * div_l * tacho_div * tacho_unit;
+
+       /* A divisor above clk yields 0 anyway; this also keeps it in 32 bits. */
+       if (divisor == 0 || divisor > clk)
+               return 0;
+
+       return clk / (u32)divisor;
+}
+
+/*
+ * Trigger a measurement on tach channel @fan_tach_ch, sleep for
+ * 1000 / measure-rate milliseconds, then convert the raw result to RPM:
+ *   rpm = clk_freq * 60 / (2 * raw * tach_div).
+ *
+ * Returns 0 when the raw reading is 0 or when the measure rate for the
+ * channel's fan type is 0 (the wait time cannot be derived; the original
+ * code divided by zero in that case).
+ *
+ * The final division is done in two steps, (clk * 30 / raw) / tach_div. For
+ * positive integers this equals the single division above, but it cannot
+ * fault when "2 * raw * tach_div" wraps to 0 in 32 bits.
+ */
+static u32 aspeed_get_fan_tach_ch_rpm(struct aspeed_pwm_tacho_data *priv,
+                                     u8 fan_tach_ch)
+{
+       u32 raw_data, tach_div, clk_source, rate, val = 0;
+       u8 fan_tach_ch_source, type;
+
+       fan_tach_ch_source = priv->fan_tach_ch_source[fan_tach_ch];
+       type = priv->pwm_port_type[fan_tach_ch_source];
+
+       /* Bail out before touching the hardware if no wait time exists. */
+       rate = aspeed_get_fan_tach_ch_measure_period(priv, type);
+       if (rate == 0)
+               return 0;
+
+       regmap_write(priv->regmap, ASPEED_PTCR_TRIGGER, 0);
+       regmap_write(priv->regmap, ASPEED_PTCR_TRIGGER, 0x1 << fan_tach_ch);
+
+       msleep(1000 / rate);
+
+       regmap_read(priv->regmap, ASPEED_PTCR_RESULT, &val);
+       raw_data = val & RESULT_VALUE_MASK;
+       if (raw_data == 0)
+               return 0;
+
+       tach_div = priv->type_fan_tach_clock_division[type];
+       tach_div = 0x4 << (tach_div * 2);
+       clk_source = priv->clk_freq;
+
+       return (clk_source * 30 / raw_data) / tach_div;
+}
+
+/*
+ * sysfs store handler for pwmN: parse a decimal duty value, reject anything
+ * outside 0..PWM_MAX with -EINVAL, and apply it to the port selected by the
+ * attribute index unless it equals the cached value. Returns @count on
+ * success or the kstrtol() error.
+ */
+static ssize_t set_pwm(struct device *dev, struct device_attribute *attr,
+                      const char *buf, size_t count)
+{
+       struct aspeed_pwm_tacho_data *priv = dev_get_drvdata(dev);
+       int index = to_sensor_dev_attr(attr)->index;
+       long fan_ctrl;
+       int ret;
+
+       ret = kstrtol(buf, 10, &fan_ctrl);
+       if (ret)
+               return ret;
+
+       if (!(fan_ctrl >= 0 && fan_ctrl <= PWM_MAX))
+               return -EINVAL;
+
+       if (priv->pwm_port_fan_ctrl[index] != fan_ctrl) {
+               priv->pwm_port_fan_ctrl[index] = fan_ctrl;
+               aspeed_set_pwm_port_fan_ctrl(priv, index, fan_ctrl);
+       }
+
+       return count;
+}
+
+/* sysfs show handler for pwmN: print the cached duty value of the port. */
+static ssize_t show_pwm(struct device *dev, struct device_attribute *attr,
+                       char *buf)
+{
+       struct aspeed_pwm_tacho_data *priv = dev_get_drvdata(dev);
+       int index = to_sensor_dev_attr(attr)->index;
+
+       return sprintf(buf, "%u\n", priv->pwm_port_fan_ctrl[index]);
+}
+
+/*
+ * sysfs show handler for fanN_input: run a tach measurement on the channel
+ * selected by the attribute index and print the resulting RPM.
+ */
+static ssize_t show_rpm(struct device *dev, struct device_attribute *attr,
+                       char *buf)
+{
+       struct aspeed_pwm_tacho_data *priv = dev_get_drvdata(dev);
+       int index = to_sensor_dev_attr(attr)->index;
+       u32 rpm = aspeed_get_fan_tach_ch_rpm(priv, index);
+
+       return sprintf(buf, "%u\n", rpm);
+}
+
+/*
+ * Visibility callback of the pwm attribute group: hide (mode 0) the
+ * attribute at @index when that PWM port is not marked present, otherwise
+ * keep the attribute's own mode.
+ */
+static umode_t pwm_is_visible(struct kobject *kobj,
+                             struct attribute *a, int index)
+{
+       struct device *dev = container_of(kobj, struct device, kobj);
+       struct aspeed_pwm_tacho_data *priv = dev_get_drvdata(dev);
+
+       return priv->pwm_present[index] ? a->mode : 0;
+}
+
+/*
+ * Visibility callback of the fan attribute group: hide (mode 0) the
+ * attribute at @index when that tach channel is not marked present,
+ * otherwise keep the attribute's own mode.
+ */
+static umode_t fan_dev_is_visible(struct kobject *kobj,
+                                 struct attribute *a, int index)
+{
+       struct device *dev = container_of(kobj, struct device, kobj);
+       struct aspeed_pwm_tacho_data *priv = dev_get_drvdata(dev);
+
+       return priv->fan_tach_present[index] ? a->mode : 0;
+}
+
+/*
+ * One read/write pwmN attribute per PWM port.  The last macro argument is
+ * the index handed to show_pwm()/set_pwm() and to pwm_is_visible().
+ */
+static SENSOR_DEVICE_ATTR(pwm0, 0644, show_pwm, set_pwm, 0);
+static SENSOR_DEVICE_ATTR(pwm1, 0644, show_pwm, set_pwm, 1);
+static SENSOR_DEVICE_ATTR(pwm2, 0644, show_pwm, set_pwm, 2);
+static SENSOR_DEVICE_ATTR(pwm3, 0644, show_pwm, set_pwm, 3);
+static SENSOR_DEVICE_ATTR(pwm4, 0644, show_pwm, set_pwm, 4);
+static SENSOR_DEVICE_ATTR(pwm5, 0644, show_pwm, set_pwm, 5);
+static SENSOR_DEVICE_ATTR(pwm6, 0644, show_pwm, set_pwm, 6);
+static SENSOR_DEVICE_ATTR(pwm7, 0644, show_pwm, set_pwm, 7);
+
+/* Table position must equal the attribute index used above. */
+static struct attribute *pwm_dev_attrs[] = {
+       &sensor_dev_attr_pwm0.dev_attr.attr,
+       &sensor_dev_attr_pwm1.dev_attr.attr,
+       &sensor_dev_attr_pwm2.dev_attr.attr,
+       &sensor_dev_attr_pwm3.dev_attr.attr,
+       &sensor_dev_attr_pwm4.dev_attr.attr,
+       &sensor_dev_attr_pwm5.dev_attr.attr,
+       &sensor_dev_attr_pwm6.dev_attr.attr,
+       &sensor_dev_attr_pwm7.dev_attr.attr,
+       NULL,   /* terminator */
+};
+
+/* sysfs group holding the pwmN files; absent ports are filtered out. */
+static const struct attribute_group pwm_dev_group = {
+       .is_visible = pwm_is_visible,
+       .attrs = pwm_dev_attrs,
+};
+
+/*
+ * One read-only fanN_input attribute per tachometer channel.  The last
+ * macro argument is the index handed to show_rpm() and to
+ * fan_dev_is_visible().
+ */
+static SENSOR_DEVICE_ATTR(fan0_input, 0444, show_rpm, NULL, 0);
+static SENSOR_DEVICE_ATTR(fan1_input, 0444, show_rpm, NULL, 1);
+static SENSOR_DEVICE_ATTR(fan2_input, 0444, show_rpm, NULL, 2);
+static SENSOR_DEVICE_ATTR(fan3_input, 0444, show_rpm, NULL, 3);
+static SENSOR_DEVICE_ATTR(fan4_input, 0444, show_rpm, NULL, 4);
+static SENSOR_DEVICE_ATTR(fan5_input, 0444, show_rpm, NULL, 5);
+static SENSOR_DEVICE_ATTR(fan6_input, 0444, show_rpm, NULL, 6);
+static SENSOR_DEVICE_ATTR(fan7_input, 0444, show_rpm, NULL, 7);
+static SENSOR_DEVICE_ATTR(fan8_input, 0444, show_rpm, NULL, 8);
+static SENSOR_DEVICE_ATTR(fan9_input, 0444, show_rpm, NULL, 9);
+static SENSOR_DEVICE_ATTR(fan10_input, 0444, show_rpm, NULL, 10);
+static SENSOR_DEVICE_ATTR(fan11_input, 0444, show_rpm, NULL, 11);
+static SENSOR_DEVICE_ATTR(fan12_input, 0444, show_rpm, NULL, 12);
+static SENSOR_DEVICE_ATTR(fan13_input, 0444, show_rpm, NULL, 13);
+static SENSOR_DEVICE_ATTR(fan14_input, 0444, show_rpm, NULL, 14);
+static SENSOR_DEVICE_ATTR(fan15_input, 0444, show_rpm, NULL, 15);
+
+/* Table position must equal the attribute index used above. */
+static struct attribute *fan_dev_attrs[] = {
+       &sensor_dev_attr_fan0_input.dev_attr.attr,
+       &sensor_dev_attr_fan1_input.dev_attr.attr,
+       &sensor_dev_attr_fan2_input.dev_attr.attr,
+       &sensor_dev_attr_fan3_input.dev_attr.attr,
+       &sensor_dev_attr_fan4_input.dev_attr.attr,
+       &sensor_dev_attr_fan5_input.dev_attr.attr,
+       &sensor_dev_attr_fan6_input.dev_attr.attr,
+       &sensor_dev_attr_fan7_input.dev_attr.attr,
+       &sensor_dev_attr_fan8_input.dev_attr.attr,
+       &sensor_dev_attr_fan9_input.dev_attr.attr,
+       &sensor_dev_attr_fan10_input.dev_attr.attr,
+       &sensor_dev_attr_fan11_input.dev_attr.attr,
+       &sensor_dev_attr_fan12_input.dev_attr.attr,
+       &sensor_dev_attr_fan13_input.dev_attr.attr,
+       &sensor_dev_attr_fan14_input.dev_attr.attr,
+       &sensor_dev_attr_fan15_input.dev_attr.attr,
+       NULL    /* terminator */
+};
+
+/* sysfs group holding the fanN_input files; absent channels are filtered out. */
+static const struct attribute_group fan_dev_group = {
+       .is_visible = fan_dev_is_visible,
+       .attrs = fan_dev_attrs,
+};
+
+/*
+ * Record the type M PWM clock and tachometer settings in the driver data
+ * and program them into the controller.
+ *
+ * The clock type is type M :
+ * The PWM frequency = 24MHz / (type M clock division L bit *
+ * type M clock division H bit * (type M PWM period bit + 1))
+ */
+static void aspeed_create_type(struct aspeed_pwm_tacho_data *priv)
+{
+       struct regmap *map = priv->regmap;
+
+       /* Bookkeeping first; none of the register helpers below read it. */
+       priv->type_pwm_clock_division_h[TYPEM] = M_PWM_DIV_H;
+       priv->type_pwm_clock_division_l[TYPEM] = M_PWM_DIV_L;
+       priv->type_pwm_clock_unit[TYPEM] = M_PWM_PERIOD;
+       priv->type_fan_tach_clock_division[TYPEM] = M_TACH_CLK_DIV;
+       priv->type_fan_tach_unit[TYPEM] = M_TACH_UNIT;
+
+       /* Register writes, in the same order as before. */
+       aspeed_set_pwm_clock_values(map, TYPEM, M_PWM_DIV_H,
+                                   M_PWM_DIV_L, M_PWM_PERIOD);
+       aspeed_set_tacho_type_enable(map, TYPEM, true);
+       aspeed_set_tacho_type_values(map, TYPEM, M_TACH_MODE,
+                                    M_TACH_UNIT, M_TACH_CLK_DIV);
+}
+
+/*
+ * Bring up one PWM port: enable it, bind it to clock type M and apply the
+ * initial fan control value, mirroring each setting in the driver data.
+ */
+static void aspeed_create_pwm_port(struct aspeed_pwm_tacho_data *priv,
+                                  u8 pwm_port)
+{
+       struct regmap *map = priv->regmap;
+
+       /* Enable the port and mark it present so its pwmN file shows up. */
+       aspeed_set_pwm_port_enable(map, pwm_port, true);
+       priv->pwm_present[pwm_port] = true;
+
+       /* Select the clock type before the duty cycle is programmed. */
+       priv->pwm_port_type[pwm_port] = TYPEM;
+       aspeed_set_pwm_port_type(map, pwm_port, TYPEM);
+
+       /* Start from the default fan control value. */
+       priv->pwm_port_fan_ctrl[pwm_port] = INIT_FAN_CTRL;
+       aspeed_set_pwm_port_fan_ctrl(priv, pwm_port, INIT_FAN_CTRL);
+}
+
+/*
+ * Enable a set of tachometer channels and tie them to a PWM source.
+ *
+ * @priv:        driver data
+ * @fan_tach_ch: array of tachometer channel numbers
+ * @count:       number of entries in @fan_tach_ch
+ * @pwm_source:  PWM port recorded and programmed as each channel's source
+ *
+ * Each listed channel is enabled in hardware, marked present (which makes
+ * its fanN_input attribute visible) and given @pwm_source as its source.
+ */
+static void aspeed_create_fan_tach_channel(struct aspeed_pwm_tacho_data *priv,
+                                          u8 *fan_tach_ch,
+                                          int count,
+                                          u8 pwm_source)
+{
+       int i;  /* int, not u8: a u8 counter could never reach count > 255 */
+
+       for (i = 0; i < count; i++) {
+               u8 ch = fan_tach_ch[i];
+
+               aspeed_set_fan_tach_ch_enable(priv->regmap, ch, true);
+               priv->fan_tach_present[ch] = true;
+               priv->fan_tach_ch_source[ch] = pwm_source;
+               aspeed_set_fan_tach_ch_source(priv->regmap, ch, pwm_source);
+       }
+}
+
+/*
+ * Set up one fan described by a device-tree child node.
+ *
+ * @dev:   device used for managed allocations
+ * @child: child node; "reg" selects the PWM port and "aspeed,fan-tach-ch"
+ *         lists the tachometer channels attached to that port
+ * @priv:  driver data
+ *
+ * Returns 0 on success, the error from reading a property, -EINVAL when a
+ * property is empty or holds an out-of-range value, or -ENOMEM.
+ */
+static int aspeed_create_fan(struct device *dev,
+                            struct device_node *child,
+                            struct aspeed_pwm_tacho_data *priv)
+{
+       u8 *fan_tach_ch;
+       u32 pwm_port;
+       int ret, count, i;
+
+       ret = of_property_read_u32(child, "reg", &pwm_port);
+       if (ret)
+               return ret;
+       /*
+        * Only ports that have a pwmN attribute can be exposed.  Reject
+        * anything else rather than truncating the value to u8 and using
+        * it as an index into the per-port state.
+        */
+       if (pwm_port >= ARRAY_SIZE(pwm_dev_attrs) - 1)
+               return -EINVAL;
+       aspeed_create_pwm_port(priv, (u8)pwm_port);
+
+       count = of_property_count_u8_elems(child, "aspeed,fan-tach-ch");
+       if (count < 1)
+               return -EINVAL;
+       fan_tach_ch = devm_kcalloc(dev, count, sizeof(*fan_tach_ch),
+                                  GFP_KERNEL);
+       if (!fan_tach_ch)
+               return -ENOMEM;
+       ret = of_property_read_u8_array(child, "aspeed,fan-tach-ch",
+                                       fan_tach_ch, count);
+       if (ret)
+               return ret;
+       /* Same reasoning as above, for the fanN_input attributes. */
+       for (i = 0; i < count; i++) {
+               if (fan_tach_ch[i] >= ARRAY_SIZE(fan_dev_attrs) - 1)
+                       return -EINVAL;
+       }
+       aspeed_create_fan_tach_channel(priv, fan_tach_ch, count, pwm_port);
+
+       return 0;
+}
+
+/*
+ * Platform probe: map the controller registers, wrap them in a regmap,
+ * configure the clocks and type M settings, create one fan per
+ * device-tree child node and register the hwmon device.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int aspeed_pwm_tacho_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct device_node *np, *child;
+       struct aspeed_pwm_tacho_data *priv;
+       void __iomem *regs;
+       struct resource *res;
+       struct device *hwmon;
+       struct clk *clk;
+       int ret;
+
+       np = dev->of_node;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       if (!res)
+               return -ENOENT;
+       regs = devm_ioremap_resource(dev, res);
+       if (IS_ERR(regs))
+               return PTR_ERR(regs);
+       priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+       priv->regmap = devm_regmap_init(dev, NULL, (__force void *)regs,
+                       &aspeed_pwm_tacho_regmap_config);
+       if (IS_ERR(priv->regmap))
+               return PTR_ERR(priv->regmap);
+       /* Clear both tach source registers before any channel is set up. */
+       regmap_write(priv->regmap, ASPEED_PTCR_TACH_SOURCE, 0);
+       regmap_write(priv->regmap, ASPEED_PTCR_TACH_SOURCE_EXT, 0);
+
+       clk = devm_clk_get(dev, NULL);
+       if (IS_ERR(clk))
+               /* Pass the real error on so -EPROBE_DEFER is not lost. */
+               return PTR_ERR(clk);
+       priv->clk_freq = clk_get_rate(clk);
+       aspeed_set_clock_enable(priv->regmap, true);
+       aspeed_set_clock_source(priv->regmap, 0);
+
+       aspeed_create_type(priv);
+
+       /*
+        * for_each_child_of_node() drops the reference on the previous
+        * child itself when it advances; only an early exit from the loop
+        * needs an explicit of_node_put().
+        */
+       for_each_child_of_node(np, child) {
+               ret = aspeed_create_fan(dev, child, priv);
+               if (ret) {
+                       of_node_put(child);
+                       return ret;
+               }
+       }
+       /* No of_node_put(np): no reference on np was taken here. */
+
+       priv->groups[0] = &pwm_dev_group;
+       priv->groups[1] = &fan_dev_group;
+       priv->groups[2] = NULL;
+       hwmon = devm_hwmon_device_register_with_groups(dev,
+                                                      "aspeed_pwm_tacho",
+                                                      priv, priv->groups);
+       return PTR_ERR_OR_ZERO(hwmon);
+}
+
+/* Device-tree compatible strings this driver binds to. */
+static const struct of_device_id of_pwm_tacho_match_table[] = {
+       { .compatible = "aspeed,ast2400-pwm-tacho" },
+       { .compatible = "aspeed,ast2500-pwm-tacho" },
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, of_pwm_tacho_match_table);
+
+/* Platform driver glue; only a probe callback is provided. */
+static struct platform_driver aspeed_pwm_tacho_driver = {
+       .driver = {
+               .name = "aspeed_pwm_tacho",
+               .of_match_table = of_pwm_tacho_match_table,
+       },
+       .probe = aspeed_pwm_tacho_probe,
+};
+
+module_platform_driver(aspeed_pwm_tacho_driver);
+
+MODULE_AUTHOR("Jaghathiswari Rankappagounder Natarajan <jaghu@google.com>");
+MODULE_DESCRIPTION("ASPEED PWM and Fan Tacho device driver");
+MODULE_LICENSE("GPL");
index 34704b0..3189246 100644 (file)
@@ -995,6 +995,13 @@ static struct dmi_system_id i8k_dmi_table[] __initdata = {
                },
                .driver_data = (void *)&i8k_config_data[DELL_XPS],
        },
+       {
+               .ident = "Dell XPS 15 9560",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "XPS 15 9560"),
+               },
+       },
        { }
 };
 
index 28375d5..dd6e17c 100644 (file)
@@ -186,7 +186,7 @@ static ssize_t hwmon_attr_show_string(struct device *dev,
                                      char *buf)
 {
        struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr);
-       char *s;
+       const char *s;
        int ret;
 
        ret = hattr->ops->read_string(dev, hattr->type, hattr->attr,
index 5378fde..aa0768c 100644 (file)
@@ -117,7 +117,7 @@ static long ina209_from_reg(const u8 reg, const u16 val)
        case INA209_SHUNT_VOLTAGE_POS_WARN:
        case INA209_SHUNT_VOLTAGE_NEG_WARN:
                /* LSB=10 uV. Convert to mV. */
-               return DIV_ROUND_CLOSEST(val, 100);
+               return DIV_ROUND_CLOSEST((s16)val, 100);
 
        case INA209_BUS_VOLTAGE:
        case INA209_BUS_VOLTAGE_MAX_PEAK:
@@ -146,7 +146,7 @@ static long ina209_from_reg(const u8 reg, const u16 val)
 
        case INA209_CURRENT:
                /* LSB=1 mA (selected). Is in mA */
-               return val;
+               return (s16)val;
        }
 
        /* programmer goofed */
@@ -608,11 +608,18 @@ static const struct i2c_device_id ina209_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, ina209_id);
 
+static const struct of_device_id ina209_of_match[] = {
+       { .compatible = "ti,ina209" },
+       { },
+};
+MODULE_DEVICE_TABLE(of, ina209_of_match);
+
 /* This is the driver that will be inserted */
 static struct i2c_driver ina209_driver = {
        .class          = I2C_CLASS_HWMON,
        .driver = {
                .name   = "ina209",
+               .of_match_table = of_match_ptr(ina209_of_match),
        },
        .probe          = ina209_probe,
        .remove         = ina209_remove,
index b24f1d3..62e38fa 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/hwmon.h>
 #include <linux/hwmon-sysfs.h>
 #include <linux/jiffies.h>
+#include <linux/of_device.h>
 #include <linux/of.h>
 #include <linux/delay.h>
 #include <linux/util_macros.h>
@@ -424,13 +425,19 @@ static int ina2xx_probe(struct i2c_client *client,
        struct device *hwmon_dev;
        u32 val;
        int ret, group = 0;
+       enum ina2xx_ids chip;
+
+       if (client->dev.of_node)
+               chip = (enum ina2xx_ids)of_device_get_match_data(&client->dev);
+       else
+               chip = id->driver_data;
 
        data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
        if (!data)
                return -ENOMEM;
 
        /* set the device type */
-       data->config = &ina2xx_config[id->driver_data];
+       data->config = &ina2xx_config[chip];
 
        if (of_property_read_u32(dev->of_node, "shunt-resistor", &val) < 0) {
                struct ina2xx_platform_data *pdata = dev_get_platdata(dev);
@@ -487,9 +494,35 @@ static const struct i2c_device_id ina2xx_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, ina2xx_id);
 
+static const struct of_device_id ina2xx_of_match[] = {
+       {
+               .compatible = "ti,ina219",
+               .data = (void *)ina219
+       },
+       {
+               .compatible = "ti,ina220",
+               .data = (void *)ina219
+       },
+       {
+               .compatible = "ti,ina226",
+               .data = (void *)ina226
+       },
+       {
+               .compatible = "ti,ina230",
+               .data = (void *)ina226
+       },
+       {
+               .compatible = "ti,ina231",
+               .data = (void *)ina226
+       },
+       { },
+};
+MODULE_DEVICE_TABLE(of, ina2xx_of_match);
+
 static struct i2c_driver ina2xx_driver = {
        .driver = {
                .name   = "ina2xx",
+               .of_match_table = of_match_ptr(ina2xx_of_match),
        },
        .probe          = ina2xx_probe,
        .id_table       = ina2xx_id,
index 2e19486..4c17709 100644 (file)
@@ -46,6 +46,7 @@
 #include <linux/hwmon.h>
 #include <linux/err.h>
 #include <linux/mutex.h>
+#include <linux/of_device.h>
 #include <linux/sysfs.h>
 #include <linux/types.h>
 
@@ -1115,6 +1116,10 @@ static int lm63_probe(struct i2c_client *client,
        mutex_init(&data->update_lock);
 
        /* Set the device type */
-       data->kind = id->driver_data;
+       /* Prefer the OF match data; fall back to the I2C id table entry. */
+       if (client->dev.of_node)
+               data->kind = (enum chips)of_device_get_match_data(&client->dev);
+       else
+               data->kind = id->driver_data;
        if (data->kind == lm64)
                data->temp2_offset = 16000;
@@ -1149,10 +1154,28 @@ static const struct i2c_device_id lm63_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, lm63_id);
 
+static const struct of_device_id lm63_of_match[] = {
+       {
+               .compatible = "national,lm63",
+               .data = (void *)lm63
+       },
+       {
+               .compatible = "national,lm64",
+               .data = (void *)lm64
+       },
+       {
+               .compatible = "national,lm96163",
+               .data = (void *)lm96163
+       },
+       { },
+};
+MODULE_DEVICE_TABLE(of, lm63_of_match);
+
 static struct i2c_driver lm63_driver = {
        .class          = I2C_CLASS_HWMON,
        .driver = {
                .name   = "lm63",
+               .of_match_table = of_match_ptr(lm63_of_match),
        },
        .probe          = lm63_probe,
        .id_table       = lm63_id,
index eff3b24..005ffb5 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/hwmon.h>
 #include <linux/hwmon-sysfs.h>
 #include <linux/err.h>
+#include <linux/of_device.h>
 #include <linux/of.h>
 #include <linux/regmap.h>
 #include "lm75.h"
@@ -273,7 +274,12 @@ lm75_probe(struct i2c_client *client, const struct i2c_device_id *id)
        int status, err;
        u8 set_mask, clr_mask;
        int new;
-       enum lm75_type kind = id->driver_data;
+       enum lm75_type kind;
+
+       if (client->dev.of_node)
+               kind = (enum lm75_type)of_device_get_match_data(&client->dev);
+       else
+               kind = id->driver_data;
 
        if (!i2c_check_functionality(client->adapter,
                        I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA))
@@ -424,6 +430,95 @@ static const struct i2c_device_id lm75_ids[] = {
 };
 MODULE_DEVICE_TABLE(i2c, lm75_ids);
 
+static const struct of_device_id lm75_of_match[] = {
+       {
+               .compatible = "adi,adt75",
+               .data = (void *)adt75
+       },
+       {
+               .compatible = "dallas,ds1775",
+               .data = (void *)ds1775
+       },
+       {
+               .compatible = "dallas,ds75",
+               .data = (void *)ds75
+       },
+       {
+               .compatible = "dallas,ds7505",
+               .data = (void *)ds7505
+       },
+       {
+               .compatible = "gmt,g751",
+               .data = (void *)g751
+       },
+       {
+               .compatible = "national,lm75",
+               .data = (void *)lm75
+       },
+       {
+               .compatible = "national,lm75a",
+               .data = (void *)lm75a
+       },
+       {
+               .compatible = "national,lm75b",
+               .data = (void *)lm75b
+       },
+       {
+               .compatible = "maxim,max6625",
+               .data = (void *)max6625
+       },
+       {
+               .compatible = "maxim,max6626",
+               .data = (void *)max6626
+       },
+       {
+               .compatible = "maxim,mcp980x",
+               .data = (void *)mcp980x
+       },
+       {
+               .compatible = "st,stds75",
+               .data = (void *)stds75
+       },
+       {
+               .compatible = "microchip,tcn75",
+               .data = (void *)tcn75
+       },
+       {
+               .compatible = "ti,tmp100",
+               .data = (void *)tmp100
+       },
+       {
+               .compatible = "ti,tmp101",
+               .data = (void *)tmp101
+       },
+       {
+               .compatible = "ti,tmp105",
+               .data = (void *)tmp105
+       },
+       {
+               .compatible = "ti,tmp112",
+               .data = (void *)tmp112
+       },
+       {
+               .compatible = "ti,tmp175",
+               .data = (void *)tmp175
+       },
+       {
+               .compatible = "ti,tmp275",
+               .data = (void *)tmp275
+       },
+       {
+               .compatible = "ti,tmp75",
+               .data = (void *)tmp75
+       },
+       {
+               .compatible = "ti,tmp75c",
+               .data = (void *)tmp75c
+       },
+       { },
+};
+MODULE_DEVICE_TABLE(of, lm75_of_match);
+
 #define LM75A_ID 0xA1
 
 /* Return 0 if detection is successful, -ENODEV otherwise */
@@ -560,6 +655,7 @@ static struct i2c_driver lm75_driver = {
        .class          = I2C_CLASS_HWMON,
        .driver = {
                .name   = "lm75",
+               .of_match_table = of_match_ptr(lm75_of_match),
                .pm     = LM75_DEV_PM_OPS,
        },
        .probe          = lm75_probe,
index 691469f..0a32587 100644 (file)
@@ -25,6 +25,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/of_device.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/jiffies.h>
@@ -1552,7 +1553,10 @@ static int lm85_probe(struct i2c_client *client, const struct i2c_device_id *id)
                return -ENOMEM;
 
        data->client = client;
-       data->type = id->driver_data;
+       if (client->dev.of_node)
+               data->type = (enum chips)of_device_get_match_data(&client->dev);
+       else
+               data->type = id->driver_data;
        mutex_init(&data->update_lock);
 
        /* Fill in the chip specific driver values */
@@ -1623,10 +1627,60 @@ static const struct i2c_device_id lm85_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, lm85_id);
 
+static const struct of_device_id lm85_of_match[] = {
+       {
+               .compatible = "adi,adm1027",
+               .data = (void *)adm1027
+       },
+       {
+               .compatible = "adi,adt7463",
+               .data = (void *)adt7463
+       },
+       {
+               .compatible = "adi,adt7468",
+               .data = (void *)adt7468
+       },
+       {
+               .compatible = "national,lm85",
+               .data = (void *)lm85
+       },
+       {
+               .compatible = "national,lm85b",
+               .data = (void *)lm85
+       },
+       {
+               .compatible = "national,lm85c",
+               .data = (void *)lm85
+       },
+       {
+               .compatible = "smsc,emc6d100",
+               .data = (void *)emc6d100
+       },
+       {
+               .compatible = "smsc,emc6d101",
+               .data = (void *)emc6d100
+       },
+       {
+               .compatible = "smsc,emc6d102",
+               .data = (void *)emc6d102
+       },
+       {
+               .compatible = "smsc,emc6d103",
+               .data = (void *)emc6d103
+       },
+       {
+               .compatible = "smsc,emc6d103s",
+               .data = (void *)emc6d103s
+       },
+       { },
+};
+MODULE_DEVICE_TABLE(of, lm85_of_match);
+
 static struct i2c_driver lm85_driver = {
        .class          = I2C_CLASS_HWMON,
        .driver = {
                .name   = "lm85",
+               .of_match_table = of_match_ptr(lm85_of_match),
        },
        .probe          = lm85_probe,
        .id_table       = lm85_id,
index e06faf9..b48d307 100644 (file)
@@ -66,6 +66,7 @@
 #include <linux/hwmon-vid.h>
 #include <linux/err.h>
 #include <linux/mutex.h>
+#include <linux/regulator/consumer.h>
 
 /*
  * Addresses to scan
@@ -74,8 +75,6 @@
 
 static const unsigned short normal_i2c[] = { 0x2c, 0x2d, 0x2e, I2C_CLIENT_END };
 
-enum chips { lm87, adm1024 };
-
 /*
  * The LM87 registers
  */
@@ -855,8 +854,26 @@ static int lm87_init_client(struct i2c_client *client)
 {
        struct lm87_data *data = i2c_get_clientdata(client);
        int rc;
-
-       if (dev_get_platdata(&client->dev)) {
+       struct device_node *of_node = client->dev.of_node;
+       u8 val = 0;
+       struct regulator *vcc = NULL;
+
+       if (of_node) {
+               if (of_property_read_bool(of_node, "has-temp3"))
+                       val |= CHAN_TEMP3;
+               if (of_property_read_bool(of_node, "has-in6"))
+                       val |= CHAN_NO_FAN(0);
+               if (of_property_read_bool(of_node, "has-in7"))
+                       val |= CHAN_NO_FAN(1);
+               vcc = devm_regulator_get_optional(&client->dev, "vcc");
+               if (!IS_ERR(vcc)) {
+                       if (regulator_get_voltage(vcc) == 5000000)
+                               val |= CHAN_VCC_5V;
+               }
+               data->channel = val;
+               lm87_write_value(client,
+                               LM87_REG_CHANNEL_MODE, data->channel);
+       } else if (dev_get_platdata(&client->dev)) {
                data->channel = *(u8 *)dev_get_platdata(&client->dev);
                lm87_write_value(client,
                                 LM87_REG_CHANNEL_MODE, data->channel);
@@ -962,16 +979,24 @@ static int lm87_probe(struct i2c_client *client, const struct i2c_device_id *id)
  */
 
 static const struct i2c_device_id lm87_id[] = {
-       { "lm87", lm87 },
-       { "adm1024", adm1024 },
+       { "lm87", 0 },
+       { "adm1024", 0 },
        { }
 };
 MODULE_DEVICE_TABLE(i2c, lm87_id);
 
+static const struct of_device_id lm87_of_match[] = {
+       { .compatible = "ti,lm87" },
+       { .compatible = "adi,adm1024" },
+       { },
+};
+MODULE_DEVICE_TABLE(of, lm87_of_match);
+
 static struct i2c_driver lm87_driver = {
        .class          = I2C_CLASS_HWMON,
        .driver = {
                .name   = "lm87",
+               .of_match_table = lm87_of_match,
        },
        .probe          = lm87_probe,
        .id_table       = lm87_id,
index aff5297..c2f411c 100644 (file)
@@ -92,6 +92,7 @@
 #include <linux/hwmon.h>
 #include <linux/err.h>
 #include <linux/mutex.h>
+#include <linux/of_device.h>
 #include <linux/sysfs.h>
 #include <linux/interrupt.h>
 #include <linux/regulator/consumer.h>
@@ -235,6 +236,99 @@ static const struct i2c_device_id lm90_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, lm90_id);
 
+static const struct of_device_id lm90_of_match[] = {
+       {
+               .compatible = "adi,adm1032",
+               .data = (void *)adm1032
+       },
+       {
+               .compatible = "adi,adt7461",
+               .data = (void *)adt7461
+       },
+       {
+               .compatible = "adi,adt7461a",
+               .data = (void *)adt7461
+       },
+       {
+               .compatible = "gmt,g781",
+               .data = (void *)g781
+       },
+       {
+               .compatible = "national,lm90",
+               .data = (void *)lm90
+       },
+       {
+               .compatible = "national,lm86",
+               .data = (void *)lm86
+       },
+       {
+               .compatible = "national,lm89",
+               .data = (void *)lm86
+       },
+       {
+               .compatible = "national,lm99",
+               .data = (void *)lm99
+       },
+       {
+               .compatible = "dallas,max6646",
+               .data = (void *)max6646
+       },
+       {
+               .compatible = "dallas,max6647",
+               .data = (void *)max6646
+       },
+       {
+               .compatible = "dallas,max6649",
+               .data = (void *)max6646
+       },
+       {
+               .compatible = "dallas,max6657",
+               .data = (void *)max6657
+       },
+       {
+               .compatible = "dallas,max6658",
+               .data = (void *)max6657
+       },
+       {
+               .compatible = "dallas,max6659",
+               .data = (void *)max6659
+       },
+       {
+               .compatible = "dallas,max6680",
+               .data = (void *)max6680
+       },
+       {
+               .compatible = "dallas,max6681",
+               .data = (void *)max6680
+       },
+       {
+               .compatible = "dallas,max6695",
+               .data = (void *)max6696
+       },
+       {
+               .compatible = "dallas,max6696",
+               .data = (void *)max6696
+       },
+       {
+               .compatible = "onnn,nct1008",
+               .data = (void *)adt7461
+       },
+       {
+               .compatible = "winbond,w83l771",
+               .data = (void *)w83l771
+       },
+       {
+               .compatible = "nxp,sa56004",
+               .data = (void *)sa56004
+       },
+       {
+               .compatible = "ti,tmp451",
+               .data = (void *)tmp451
+       },
+       { },
+};
+MODULE_DEVICE_TABLE(of, lm90_of_match);
+
 /*
  * chip type specific parameters
  */
@@ -1677,7 +1771,10 @@ static int lm90_probe(struct i2c_client *client,
        mutex_init(&data->update_lock);
 
        /* Set the device type */
-       data->kind = id->driver_data;
+       if (client->dev.of_node)
+               data->kind = (enum chips)of_device_get_match_data(&client->dev);
+       else
+               data->kind = id->driver_data;
        if (data->kind == adm1032) {
                if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE))
                        client->flags &= ~I2C_CLIENT_PEC;
@@ -1816,6 +1913,7 @@ static struct i2c_driver lm90_driver = {
        .class          = I2C_CLASS_HWMON,
        .driver = {
                .name   = "lm90",
+               .of_match_table = of_match_ptr(lm90_of_match),
        },
        .probe          = lm90_probe,
        .alert          = lm90_alert,
index a3bfd88..27cb06d 100644 (file)
@@ -622,10 +622,18 @@ static const struct i2c_device_id lm95245_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, lm95245_id);
 
+static const struct of_device_id lm95245_of_match[] = {
+       { .compatible = "national,lm95235" },
+       { .compatible = "national,lm95245" },
+       { },
+};
+MODULE_DEVICE_TABLE(of, lm95245_of_match);
+
 static struct i2c_driver lm95245_driver = {
        .class          = I2C_CLASS_HWMON,
        .driver = {
                .name   = "lm95245",
+               .of_match_table = of_match_ptr(lm95245_of_match),
        },
        .probe          = lm95245_probe,
        .id_table       = lm95245_id,
index f03a717..221fd14 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/hwmon-sysfs.h>
 #include <linux/err.h>
 #include <linux/mutex.h>
+#include <linux/of_device.h>
 #include <linux/of.h>
 
 #include <linux/platform_data/max6697.h>
@@ -632,7 +633,10 @@ static int max6697_probe(struct i2c_client *client,
        if (!data)
                return -ENOMEM;
 
-       data->type = id->driver_data;
+       if (client->dev.of_node)
+               data->type = (enum chips)of_device_get_match_data(&client->dev);
+       else
+               data->type = id->driver_data;
        data->chip = &max6697_chip_data[data->type];
        data->client = client;
        mutex_init(&data->update_lock);
@@ -662,10 +666,56 @@ static const struct i2c_device_id max6697_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, max6697_id);
 
+static const struct of_device_id max6697_of_match[] = {
+       {
+               .compatible = "maxim,max6581",
+               .data = (void *)max6581
+       },
+       {
+               .compatible = "maxim,max6602",
+               .data = (void *)max6602
+       },
+       {
+               .compatible = "maxim,max6622",
+               .data = (void *)max6622
+       },
+       {
+               .compatible = "maxim,max6636",
+               .data = (void *)max6636
+       },
+       {
+               .compatible = "maxim,max6689",
+               .data = (void *)max6689
+       },
+       {
+               .compatible = "maxim,max6693",
+               .data = (void *)max6693
+       },
+       {
+               .compatible = "maxim,max6694",
+               .data = (void *)max6694
+       },
+       {
+               .compatible = "maxim,max6697",
+               .data = (void *)max6697
+       },
+       {
+               .compatible = "maxim,max6698",
+               .data = (void *)max6698
+       },
+       {
+               .compatible = "maxim,max6699",
+               .data = (void *)max6699
+       },
+       { },
+};
+MODULE_DEVICE_TABLE(of, max6697_of_match);
+
 static struct i2c_driver max6697_driver = {
        .class = I2C_CLASS_HWMON,
        .driver = {
                .name   = "max6697",
+               .of_match_table = of_match_ptr(max6697_of_match),
        },
        .probe = max6697_probe,
        .id_table = max6697_id,
index 4ab5293..00d6995 100644 (file)
@@ -101,8 +101,8 @@ static const struct coefficients adm1075_coefficients[] = {
        [0] = { 27169, 0, -1 },         /* voltage */
        [1] = { 806, 20475, -1 },       /* current, irange25 */
        [2] = { 404, 20475, -1 },       /* current, irange50 */
-       [3] = { 0, -1, 8549 },          /* power, irange25 */
-       [4] = { 0, -1, 4279 },          /* power, irange50 */
+       [3] = { 8549, 0, -1 },          /* power, irange25 */
+       [4] = { 4279, 0, -1 },          /* power, irange50 */
 };
 
 static const struct coefficients adm1275_coefficients[] = {
index 3e3aa95..3518f0c 100644 (file)
@@ -21,6 +21,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/of_device.h>
 #include <linux/init.h>
 #include <linux/err.h>
 #include <linux/slab.h>
@@ -119,6 +120,35 @@ static const struct i2c_device_id ucd9000_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, ucd9000_id);
 
+static const struct of_device_id ucd9000_of_match[] = {
+       {
+               .compatible = "ti,ucd9000",
+               .data = (void *)ucd9000
+       },
+       {
+               .compatible = "ti,ucd90120",
+               .data = (void *)ucd90120
+       },
+       {
+               .compatible = "ti,ucd90124",
+               .data = (void *)ucd90124
+       },
+       {
+               .compatible = "ti,ucd90160",
+               .data = (void *)ucd90160
+       },
+       {
+               .compatible = "ti,ucd9090",
+               .data = (void *)ucd9090
+       },
+       {
+               .compatible = "ti,ucd90910",
+               .data = (void *)ucd90910
+       },
+       { },
+};
+MODULE_DEVICE_TABLE(of, ucd9000_of_match);
+
 static int ucd9000_probe(struct i2c_client *client,
                         const struct i2c_device_id *id)
 {
@@ -126,6 +156,7 @@ static int ucd9000_probe(struct i2c_client *client,
        struct ucd9000_data *data;
        struct pmbus_driver_info *info;
        const struct i2c_device_id *mid;
+       enum chips chip;
        int i, ret;
 
        if (!i2c_check_functionality(client->adapter,
@@ -151,7 +182,12 @@ static int ucd9000_probe(struct i2c_client *client,
                return -ENODEV;
        }
 
-       if (id->driver_data != ucd9000 && id->driver_data != mid->driver_data)
+       if (client->dev.of_node)
+               chip = (enum chips)of_device_get_match_data(&client->dev);
+       else
+               chip = id->driver_data;
+
+       if (chip != ucd9000 && chip != mid->driver_data)
                dev_notice(&client->dev,
                           "Device mismatch: Configured %s, detected %s\n",
                           id->name, mid->name);
@@ -234,6 +270,7 @@ static int ucd9000_probe(struct i2c_client *client,
 static struct i2c_driver ucd9000_driver = {
        .driver = {
                .name = "ucd9000",
+               .of_match_table = of_match_ptr(ucd9000_of_match),
        },
        .probe = ucd9000_probe,
        .remove = pmbus_do_remove,
index 033d6ac..a8712c5 100644 (file)
@@ -20,6 +20,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/of_device.h>
 #include <linux/init.h>
 #include <linux/err.h>
 #include <linux/slab.h>
@@ -46,12 +47,50 @@ static const struct i2c_device_id ucd9200_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, ucd9200_id);
 
+static const struct of_device_id ucd9200_of_match[] = {
+       {
+               .compatible = "ti,ucd9200",
+               .data = (void *)ucd9200 /* enum chips id, read back in probe */
+       },
+       {
+               .compatible = "ti,ucd9220",
+               .data = (void *)ucd9220
+       },
+       {
+               .compatible = "ti,ucd9222",
+               .data = (void *)ucd9222
+       },
+       {
+               .compatible = "ti,ucd9224",
+               .data = (void *)ucd9224
+       },
+       {
+               .compatible = "ti,ucd9240",
+               .data = (void *)ucd9240
+       },
+       {
+               .compatible = "ti,ucd9244",
+               .data = (void *)ucd9244
+       },
+       {
+               .compatible = "ti,ucd9246",
+               .data = (void *)ucd9246
+       },
+       {
+               .compatible = "ti,ucd9248",
+               .data = (void *)ucd9248
+       },
+       { },
+};
+MODULE_DEVICE_TABLE(of, ucd9200_of_match);
+
 static int ucd9200_probe(struct i2c_client *client,
                         const struct i2c_device_id *id)
 {
        u8 block_buffer[I2C_SMBUS_BLOCK_MAX + 1];
        struct pmbus_driver_info *info;
        const struct i2c_device_id *mid;
+       enum chips chip;
        int i, j, ret;
 
        if (!i2c_check_functionality(client->adapter,
@@ -76,7 +115,13 @@ static int ucd9200_probe(struct i2c_client *client,
                dev_err(&client->dev, "Unsupported device\n");
                return -ENODEV;
        }
-       if (id->driver_data != ucd9200 && id->driver_data != mid->driver_data)
+
+       if (client->dev.of_node)
+               chip = (enum chips)of_device_get_match_data(&client->dev);
+       else
+               chip = id->driver_data;
+
+       if (chip != ucd9200 && chip != mid->driver_data)
                dev_notice(&client->dev,
                           "Device mismatch: Configured %s, detected %s\n",
                           id->name, mid->name);
@@ -167,6 +212,7 @@ static int ucd9200_probe(struct i2c_client *client,
 static struct i2c_driver ucd9200_driver = {
        .driver = {
                .name = "ucd9200",
+               .of_match_table = of_match_ptr(ucd9200_of_match),
        },
        .probe = ucd9200_probe,
        .remove = pmbus_do_remove,
index 5545068..d56251d 100644 (file)
@@ -85,6 +85,12 @@ static const struct i2c_device_id stts751_id[] = {
        { }
 };
 
+static const struct of_device_id stts751_of_match[] = {
+       { .compatible = "stts751" },
+       { },
+};
+MODULE_DEVICE_TABLE(of, stts751_of_match);
+
 struct stts751_priv {
        struct device *dev;
        struct i2c_client *client;
@@ -819,6 +825,7 @@ static struct i2c_driver stts751_driver = {
        .class          = I2C_CLASS_HWMON,
        .driver = {
                .name   = DEVNAME,
+               .of_match_table = of_match_ptr(stts751_of_match),
        },
        .probe          = stts751_probe,
        .id_table       = stts751_id,
index 36bba2a..5eafbaa 100644 (file)
@@ -323,8 +323,15 @@ static const struct i2c_device_id tmp102_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, tmp102_id);
 
+static const struct of_device_id tmp102_of_match[] = {
+       { .compatible = "ti,tmp102" },
+       { },
+};
+MODULE_DEVICE_TABLE(of, tmp102_of_match);
+
 static struct i2c_driver tmp102_driver = {
        .driver.name    = DRIVER_NAME,
+       .driver.of_match_table = of_match_ptr(tmp102_of_match),
        .driver.pm      = &tmp102_dev_pm_ops,
        .probe          = tmp102_probe,
        .id_table       = tmp102_id,
index ad571ec..7f85b14 100644 (file)
@@ -150,8 +150,7 @@ static int tmp103_probe(struct i2c_client *client,
        return PTR_ERR_OR_ZERO(hwmon_dev);
 }
 
-#ifdef CONFIG_PM
-static int tmp103_suspend(struct device *dev)
+static int __maybe_unused tmp103_suspend(struct device *dev)
 {
        struct regmap *regmap = dev_get_drvdata(dev);
 
@@ -159,7 +158,7 @@ static int tmp103_suspend(struct device *dev)
                                  TMP103_CONF_SD_MASK, 0);
 }
 
-static int tmp103_resume(struct device *dev)
+static int __maybe_unused tmp103_resume(struct device *dev)
 {
        struct regmap *regmap = dev_get_drvdata(dev);
 
@@ -167,15 +166,7 @@ static int tmp103_resume(struct device *dev)
                                  TMP103_CONF_SD_MASK, TMP103_CONF_SD);
 }
 
-static const struct dev_pm_ops tmp103_dev_pm_ops = {
-       .suspend        = tmp103_suspend,
-       .resume         = tmp103_resume,
-};
-
-#define TMP103_DEV_PM_OPS (&tmp103_dev_pm_ops)
-#else
-#define        TMP103_DEV_PM_OPS NULL
-#endif /* CONFIG_PM */
+static SIMPLE_DEV_PM_OPS(tmp103_dev_pm_ops, tmp103_suspend, tmp103_resume);
 
 static const struct i2c_device_id tmp103_id[] = {
        { "tmp103", 0 },
@@ -183,10 +174,17 @@ static const struct i2c_device_id tmp103_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, tmp103_id);
 
+static const struct of_device_id tmp103_of_match[] = {
+       { .compatible = "ti,tmp103" },
+       { },
+};
+MODULE_DEVICE_TABLE(of, tmp103_of_match);
+
 static struct i2c_driver tmp103_driver = {
        .driver = {
                .name   = "tmp103",
-               .pm     = TMP103_DEV_PM_OPS,
+               .of_match_table = of_match_ptr(tmp103_of_match),
+               .pm     = &tmp103_dev_pm_ops,
        },
        .probe          = tmp103_probe,
        .id_table       = tmp103_id,
index bfb98b9..e363992 100644 (file)
@@ -29,6 +29,7 @@
 #include <linux/hwmon-sysfs.h>
 #include <linux/err.h>
 #include <linux/mutex.h>
+#include <linux/of_device.h>
 #include <linux/sysfs.h>
 
 /* Addresses to scan */
@@ -69,6 +70,31 @@ static const struct i2c_device_id tmp421_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, tmp421_id);
 
+static const struct of_device_id tmp421_of_match[] = {
+       {
+               .compatible = "ti,tmp421",
+               .data = (void *)2 /* channel count, stored in data->channels */
+       },
+       {
+               .compatible = "ti,tmp422",
+               .data = (void *)3
+       },
+       {
+               .compatible = "ti,tmp423",
+               .data = (void *)4
+       },
+       {
+               .compatible = "ti,tmp441",
+               .data = (void *)2
+       },
+       {
+               .compatible = "ti,tmp442",
+               .data = (void *)3
+       },
+       { },
+};
+MODULE_DEVICE_TABLE(of, tmp421_of_match);
+
 struct tmp421_data {
        struct i2c_client *client;
        struct mutex update_lock;
@@ -78,7 +104,7 @@ struct tmp421_data {
        struct hwmon_chip_info chip;
        char valid;
        unsigned long last_updated;
-       int channels;
+       unsigned long channels;
        u8 config;
        s16 temp[4];
 };
@@ -272,7 +298,11 @@ static int tmp421_probe(struct i2c_client *client,
                return -ENOMEM;
 
        mutex_init(&data->update_lock);
-       data->channels = id->driver_data;
+       if (client->dev.of_node)
+               data->channels = (unsigned long)
+                       of_device_get_match_data(&client->dev);
+       else
+               data->channels = id->driver_data;
        data->client = client;
 
        err = tmp421_init_client(client);
@@ -301,6 +331,7 @@ static struct i2c_driver tmp421_driver = {
        .class = I2C_CLASS_HWMON,
        .driver = {
                .name   = "tmp421",
+               .of_match_table = of_match_ptr(tmp421_of_match),
        },
        .probe = tmp421_probe,
        .id_table = tmp421_id,
diff --git a/drivers/hwmon/twl4030-madc-hwmon.c b/drivers/hwmon/twl4030-madc-hwmon.c
deleted file mode 100644 (file)
index b5caf7f..0000000
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- *
- * TWL4030 MADC Hwmon driver-This driver monitors the real time
- * conversion of analog signals like battery temperature,
- * battery type, battery level etc. User can ask for the conversion on a
- * particular channel using the sysfs nodes.
- *
- * Copyright (C) 2011 Texas Instruments Incorporated - http://www.ti.com/
- * J Keerthy <j-keerthy@ti.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
- * 02110-1301 USA
- *
- */
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/i2c/twl.h>
-#include <linux/device.h>
-#include <linux/platform_device.h>
-#include <linux/i2c/twl4030-madc.h>
-#include <linux/hwmon.h>
-#include <linux/hwmon-sysfs.h>
-#include <linux/stddef.h>
-#include <linux/sysfs.h>
-#include <linux/err.h>
-#include <linux/types.h>
-
-/*
- * sysfs hook function
- */
-static ssize_t madc_read(struct device *dev,
-                        struct device_attribute *devattr, char *buf)
-{
-       struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr);
-       struct twl4030_madc_request req = {
-               .channels = 1 << attr->index,
-               .method = TWL4030_MADC_SW2,
-               .type = TWL4030_MADC_WAIT,
-       };
-       long val;
-
-       val = twl4030_madc_conversion(&req);
-       if (val < 0)
-               return val;
-
-       return sprintf(buf, "%d\n", req.rbuf[attr->index]);
-}
-
-/* sysfs nodes to read individual channels from user side */
-static SENSOR_DEVICE_ATTR(in0_input, S_IRUGO, madc_read, NULL, 0);
-static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, madc_read, NULL, 1);
-static SENSOR_DEVICE_ATTR(in2_input, S_IRUGO, madc_read, NULL, 2);
-static SENSOR_DEVICE_ATTR(in3_input, S_IRUGO, madc_read, NULL, 3);
-static SENSOR_DEVICE_ATTR(in4_input, S_IRUGO, madc_read, NULL, 4);
-static SENSOR_DEVICE_ATTR(in5_input, S_IRUGO, madc_read, NULL, 5);
-static SENSOR_DEVICE_ATTR(in6_input, S_IRUGO, madc_read, NULL, 6);
-static SENSOR_DEVICE_ATTR(in7_input, S_IRUGO, madc_read, NULL, 7);
-static SENSOR_DEVICE_ATTR(in8_input, S_IRUGO, madc_read, NULL, 8);
-static SENSOR_DEVICE_ATTR(in9_input, S_IRUGO, madc_read, NULL, 9);
-static SENSOR_DEVICE_ATTR(curr10_input, S_IRUGO, madc_read, NULL, 10);
-static SENSOR_DEVICE_ATTR(in11_input, S_IRUGO, madc_read, NULL, 11);
-static SENSOR_DEVICE_ATTR(in12_input, S_IRUGO, madc_read, NULL, 12);
-static SENSOR_DEVICE_ATTR(in15_input, S_IRUGO, madc_read, NULL, 15);
-
-static struct attribute *twl4030_madc_attrs[] = {
-       &sensor_dev_attr_in0_input.dev_attr.attr,
-       &sensor_dev_attr_temp1_input.dev_attr.attr,
-       &sensor_dev_attr_in2_input.dev_attr.attr,
-       &sensor_dev_attr_in3_input.dev_attr.attr,
-       &sensor_dev_attr_in4_input.dev_attr.attr,
-       &sensor_dev_attr_in5_input.dev_attr.attr,
-       &sensor_dev_attr_in6_input.dev_attr.attr,
-       &sensor_dev_attr_in7_input.dev_attr.attr,
-       &sensor_dev_attr_in8_input.dev_attr.attr,
-       &sensor_dev_attr_in9_input.dev_attr.attr,
-       &sensor_dev_attr_curr10_input.dev_attr.attr,
-       &sensor_dev_attr_in11_input.dev_attr.attr,
-       &sensor_dev_attr_in12_input.dev_attr.attr,
-       &sensor_dev_attr_in15_input.dev_attr.attr,
-       NULL
-};
-ATTRIBUTE_GROUPS(twl4030_madc);
-
-static int twl4030_madc_hwmon_probe(struct platform_device *pdev)
-{
-       struct device *hwmon;
-
-       hwmon = devm_hwmon_device_register_with_groups(&pdev->dev,
-                                                      "twl4030_madc", NULL,
-                                                      twl4030_madc_groups);
-       return PTR_ERR_OR_ZERO(hwmon);
-}
-
-static struct platform_driver twl4030_madc_hwmon_driver = {
-       .probe = twl4030_madc_hwmon_probe,
-       .driver = {
-                  .name = "twl4030_madc_hwmon",
-                  },
-};
-
-module_platform_driver(twl4030_madc_hwmon_driver);
-
-MODULE_DESCRIPTION("TWL4030 ADC Hwmon driver");
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("J Keerthy");
-MODULE_ALIAS("platform:twl4030_madc_hwmon");
index ab346ed..ad68b6d 100644 (file)
@@ -135,11 +135,16 @@ superio_select(int ioreg, int ld)
        outb(ld, ioreg + 1);
 }
 
-static inline void
+static inline int
 superio_enter(int ioreg)
 {
+       if (!request_muxed_region(ioreg, 2, DRVNAME))
+               return -EBUSY;
+
        outb(0x87, ioreg);
        outb(0x87, ioreg);
+
+       return 0;
 }
 
 static inline void
@@ -148,6 +153,7 @@ superio_exit(int ioreg)
        outb(0xaa, ioreg);
        outb(0x02, ioreg);
        outb(0x02, ioreg + 1);
+       release_region(ioreg, 2);
 }
 
 /*
@@ -1970,8 +1976,6 @@ w83627ehf_check_fan_inputs(const struct w83627ehf_sio_data *sio_data,
                return;
        }
 
-       superio_enter(sio_data->sioreg);
-
        /* fan4 and fan5 share some pins with the GPIO and serial flash */
        if (sio_data->kind == nct6775) {
                /* On NCT6775, fan4 shares pins with the fdc interface */
@@ -2013,8 +2017,6 @@ w83627ehf_check_fan_inputs(const struct w83627ehf_sio_data *sio_data,
                fan4min = fan4pin;
        }
 
-       superio_exit(sio_data->sioreg);
-
        data->has_fan = data->has_fan_min = 0x03; /* fan1 and fan2 */
        data->has_fan |= (fan3pin << 2);
        data->has_fan_min |= (fan3pin << 2);
@@ -2352,7 +2354,11 @@ static int w83627ehf_probe(struct platform_device *pdev)
        w83627ehf_init_device(data, sio_data->kind);
 
        data->vrm = vid_which_vrm();
-       superio_enter(sio_data->sioreg);
+
+       err = superio_enter(sio_data->sioreg);
+       if (err)
+               goto exit_release;
+
        /* Read VID value */
        if (sio_data->kind == w83667hg || sio_data->kind == w83667hg_b ||
            sio_data->kind == nct6775 || sio_data->kind == nct6776) {
@@ -2364,8 +2370,10 @@ static int w83627ehf_probe(struct platform_device *pdev)
                superio_select(sio_data->sioreg, W83667HG_LD_VID);
                data->vid = superio_inb(sio_data->sioreg, 0xe3);
                err = device_create_file(dev, &dev_attr_cpu0_vid);
-               if (err)
+               if (err) {
+                       superio_exit(sio_data->sioreg);
                        goto exit_release;
+               }
        } else if (sio_data->kind != w83627uhg) {
                superio_select(sio_data->sioreg, W83627EHF_LD_HWM);
                if (superio_inb(sio_data->sioreg, SIO_REG_VID_CTRL) & 0x80) {
@@ -2401,8 +2409,10 @@ static int w83627ehf_probe(struct platform_device *pdev)
                                data->vid &= 0x3f;
 
                        err = device_create_file(dev, &dev_attr_cpu0_vid);
-                       if (err)
+                       if (err) {
+                               superio_exit(sio_data->sioreg);
                                goto exit_release;
+                       }
                } else {
                        dev_info(dev,
                                 "VID pins in output mode, CPU VID not available\n");
@@ -2424,10 +2434,10 @@ static int w83627ehf_probe(struct platform_device *pdev)
                pr_info("Enabled fan debounce for chip %s\n", data->name);
        }
 
-       superio_exit(sio_data->sioreg);
-
        w83627ehf_check_fan_inputs(sio_data, data);
 
+       superio_exit(sio_data->sioreg);
+
        /* Read fan clock dividers immediately */
        w83627ehf_update_fan_div_common(dev, data);
 
@@ -2712,8 +2722,11 @@ static int __init w83627ehf_find(int sioaddr, unsigned short *addr,
 
        u16 val;
        const char *sio_name;
+       int err;
 
-       superio_enter(sioaddr);
+       err = superio_enter(sioaddr);
+       if (err)
+               return err;
 
        if (force_id)
                val = force_id;
index d7325c6..979ea6e 100644 (file)
@@ -321,7 +321,7 @@ static int etb_set_buffer(struct coresight_device *csdev,
 
 static unsigned long etb_reset_buffer(struct coresight_device *csdev,
                                      struct perf_output_handle *handle,
-                                     void *sink_config, bool *lost)
+                                     void *sink_config)
 {
        unsigned long size = 0;
        struct cs_buffers *buf = sink_config;
@@ -343,7 +343,6 @@ static unsigned long etb_reset_buffer(struct coresight_device *csdev,
                 * resetting parameters here and squaring off with the ring
                 * buffer API in the tracer PMU is fine.
                 */
-               *lost = !!local_xchg(&buf->lost, 0);
                size = local_xchg(&buf->data_size, 0);
        }
 
@@ -385,7 +384,7 @@ static void etb_update_buffer(struct coresight_device *csdev,
                        (unsigned long)write_ptr);
 
                write_ptr &= ~(ETB_FRAME_SIZE_WORDS - 1);
-               local_inc(&buf->lost);
+               perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
        }
 
        /*
@@ -396,7 +395,7 @@ static void etb_update_buffer(struct coresight_device *csdev,
         */
        status = readl_relaxed(drvdata->base + ETB_STATUS_REG);
        if (status & ETB_STATUS_RAM_FULL) {
-               local_inc(&buf->lost);
+               perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
                to_read = capacity;
                read_ptr = write_ptr;
        } else {
@@ -429,7 +428,7 @@ static void etb_update_buffer(struct coresight_device *csdev,
                if (read_ptr > (drvdata->buffer_depth - 1))
                        read_ptr -= drvdata->buffer_depth;
                /* let the decoder know we've skipped ahead */
-               local_inc(&buf->lost);
+               perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
        }
 
        /* finally tell HW where we want to start reading from */
index 26cfac3..288a423 100644 (file)
@@ -302,7 +302,8 @@ out:
        return;
 
 fail_end_stop:
-       perf_aux_output_end(handle, 0, true);
+       perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
+       perf_aux_output_end(handle, 0);
 fail:
        event->hw.state = PERF_HES_STOPPED;
        goto out;
@@ -310,7 +311,6 @@ fail:
 
 static void etm_event_stop(struct perf_event *event, int mode)
 {
-       bool lost;
        int cpu = smp_processor_id();
        unsigned long size;
        struct coresight_device *sink, *csdev = per_cpu(csdev_src, cpu);
@@ -348,10 +348,9 @@ static void etm_event_stop(struct perf_event *event, int mode)
                        return;
 
                size = sink_ops(sink)->reset_buffer(sink, handle,
-                                                   event_data->snk_config,
-                                                   &lost);
+                                                   event_data->snk_config);
 
-               perf_aux_output_end(handle, size, lost);
+               perf_aux_output_end(handle, size);
        }
 
        /* Disabling the path make its elements available to other sessions */
index ef9d8e9..5f662d8 100644 (file)
@@ -76,7 +76,6 @@ enum cs_mode {
  * @nr_pages:  max number of pages granted to us
  * @offset:    offset within the current buffer
  * @data_size: how much we collected in this run
- * @lost:      other than zero if we had a HW buffer wrap around
  * @snapshot:  is this run in snapshot mode
  * @data_pages:        a handle the ring buffer
  */
@@ -85,7 +84,6 @@ struct cs_buffers {
        unsigned int            nr_pages;
        unsigned long           offset;
        local_t                 data_size;
-       local_t                 lost;
        bool                    snapshot;
        void                    **data_pages;
 };
index 1549436..aec61a6 100644 (file)
@@ -329,7 +329,7 @@ static int tmc_set_etf_buffer(struct coresight_device *csdev,
 
 static unsigned long tmc_reset_etf_buffer(struct coresight_device *csdev,
                                          struct perf_output_handle *handle,
-                                         void *sink_config, bool *lost)
+                                         void *sink_config)
 {
        long size = 0;
        struct cs_buffers *buf = sink_config;
@@ -350,7 +350,6 @@ static unsigned long tmc_reset_etf_buffer(struct coresight_device *csdev,
                 * resetting parameters here and squaring off with the ring
                 * buffer API in the tracer PMU is fine.
                 */
-               *lost = !!local_xchg(&buf->lost, 0);
                size = local_xchg(&buf->data_size, 0);
        }
 
@@ -389,7 +388,7 @@ static void tmc_update_etf_buffer(struct coresight_device *csdev,
         */
        status = readl_relaxed(drvdata->base + TMC_STS);
        if (status & TMC_STS_FULL) {
-               local_inc(&buf->lost);
+               perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
                to_read = drvdata->size;
        } else {
                to_read = CIRC_CNT(write_ptr, read_ptr, drvdata->size);
@@ -434,7 +433,7 @@ static void tmc_update_etf_buffer(struct coresight_device *csdev,
                        read_ptr -= drvdata->size;
                /* Tell the HW */
                writel_relaxed(read_ptr, drvdata->base + TMC_RRP);
-               local_inc(&buf->lost);
+               perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
        }
 
        cur = buf->cur;
index feb3006..5901937 100644 (file)
@@ -107,7 +107,8 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
        memcpy(scsi_req(rq)->cmd, pc->c, 12);
        if (drive->media == ide_tape)
                scsi_req(rq)->cmd[13] = REQ_IDETAPE_PC1;
-       error = blk_execute_rq(drive->queue, disk, rq, 0);
+       blk_execute_rq(drive->queue, disk, rq, 0);
+       error = scsi_req(rq)->result ? -EIO : 0;
 put_req:
        blk_put_request(rq);
        return error;
@@ -454,7 +455,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
                        debug_log("%s: I/O error\n", drive->name);
 
                        if (drive->media != ide_tape)
-                               pc->rq->errors++;
+                               scsi_req(pc->rq)->result++;
 
                        if (scsi_req(rq)->cmd[0] == REQUEST_SENSE) {
                                printk(KERN_ERR PFX "%s: I/O error in request "
@@ -488,13 +489,13 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
                        drive->failed_pc = NULL;
 
                if (ata_misc_request(rq)) {
-                       rq->errors = 0;
+                       scsi_req(rq)->result = 0;
                        error = 0;
                } else {
 
                        if (blk_rq_is_passthrough(rq) && uptodate <= 0) {
-                               if (rq->errors == 0)
-                                       rq->errors = -EIO;
+                               if (scsi_req(rq)->result == 0)
+                                       scsi_req(rq)->result = -EIO;
                        }
 
                        error = uptodate ? 0 : -EIO;
index 74f1b7d..07e5ff3 100644 (file)
@@ -247,10 +247,10 @@ static int ide_cd_breathe(ide_drive_t *drive, struct request *rq)
 
        struct cdrom_info *info = drive->driver_data;
 
-       if (!rq->errors)
+       if (!scsi_req(rq)->result)
                info->write_timeout = jiffies + ATAPI_WAIT_WRITE_BUSY;
 
-       rq->errors = 1;
+       scsi_req(rq)->result = 1;
 
        if (time_after(jiffies, info->write_timeout))
                return 0;
@@ -294,8 +294,8 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
        }
 
        /* if we have an error, pass CHECK_CONDITION as the SCSI status byte */
-       if (blk_rq_is_scsi(rq) && !rq->errors)
-               rq->errors = SAM_STAT_CHECK_CONDITION;
+       if (blk_rq_is_scsi(rq) && !scsi_req(rq)->result)
+               scsi_req(rq)->result = SAM_STAT_CHECK_CONDITION;
 
        if (blk_noretry_request(rq))
                do_end_request = 1;
@@ -325,7 +325,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
                 * Arrange to retry the request but be sure to give up if we've
                 * retried too many times.
                 */
-               if (++rq->errors > ERROR_MAX)
+               if (++scsi_req(rq)->result > ERROR_MAX)
                        do_end_request = 1;
                break;
        case ILLEGAL_REQUEST:
@@ -372,7 +372,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
                        /* go to the default handler for other errors */
                        ide_error(drive, "cdrom_decode_status", stat);
                        return 1;
-               } else if (++rq->errors > ERROR_MAX)
+               } else if (++scsi_req(rq)->result > ERROR_MAX)
                        /* we've racked up too many retries, abort */
                        do_end_request = 1;
        }
@@ -452,7 +452,8 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
                        }
                }
 
-               error = blk_execute_rq(drive->queue, info->disk, rq, 0);
+               blk_execute_rq(drive->queue, info->disk, rq, 0);
+               error = scsi_req(rq)->result ? -EIO : 0;
 
                if (buffer)
                        *bufflen = scsi_req(rq)->resid_len;
@@ -683,8 +684,8 @@ out_end:
                        if (cmd->nleft == 0)
                                uptodate = 1;
                } else {
-                       if (uptodate <= 0 && rq->errors == 0)
-                               rq->errors = -EIO;
+                       if (uptodate <= 0 && scsi_req(rq)->result == 0)
+                               scsi_req(rq)->result = -EIO;
                }
 
                if (uptodate == 0 && rq->bio)
@@ -1379,7 +1380,7 @@ static int ide_cdrom_prep_pc(struct request *rq)
         * appropriate action
         */
        if (c[0] == MODE_SENSE || c[0] == MODE_SELECT) {
-               rq->errors = ILLEGAL_REQUEST;
+               scsi_req(rq)->result = ILLEGAL_REQUEST;
                return BLKPREP_KILL;
        }
 
index 9fcefbc..55cd736 100644 (file)
@@ -307,7 +307,8 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
        scsi_req_init(rq);
        ide_req(rq)->type = ATA_PRIV_MISC;
        rq->rq_flags = RQF_QUIET;
-       ret = blk_execute_rq(drive->queue, cd->disk, rq, 0);
+       blk_execute_rq(drive->queue, cd->disk, rq, 0);
+       ret = scsi_req(rq)->result ? -EIO : 0;
        blk_put_request(rq);
        /*
         * A reset will unlock the door. If it was previously locked,
index a45dda5..9b69c32 100644 (file)
@@ -173,8 +173,8 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
        *(int *)&scsi_req(rq)->cmd[1] = arg;
        rq->special = setting->set;
 
-       if (blk_execute_rq(q, NULL, rq, 0))
-               ret = rq->errors;
+       blk_execute_rq(q, NULL, rq, 0);
+       ret = scsi_req(rq)->result;
        blk_put_request(rq);
 
        return ret;
@@ -186,7 +186,7 @@ ide_startstop_t ide_do_devset(ide_drive_t *drive, struct request *rq)
 
        err = setfunc(drive, *(int *)&scsi_req(rq)->cmd[1]);
        if (err)
-               rq->errors = err;
-       ide_complete_rq(drive, err, blk_rq_bytes(rq));
+               scsi_req(rq)->result = err;
+       ide_complete_rq(drive, 0, blk_rq_bytes(rq));
        return ide_stopped;
 }
index 1861597..7c06237 100644 (file)
@@ -470,7 +470,6 @@ ide_devset_get(multcount, mult_count);
 static int set_multcount(ide_drive_t *drive, int arg)
 {
        struct request *rq;
-       int error;
 
        if (arg < 0 || arg > (drive->id[ATA_ID_MAX_MULTSECT] & 0xff))
                return -EINVAL;
@@ -484,7 +483,7 @@ static int set_multcount(ide_drive_t *drive, int arg)
 
        drive->mult_req = arg;
        drive->special_flags |= IDE_SFLAG_SET_MULTMODE;
-       error = blk_execute_rq(drive->queue, NULL, rq, 0);
+       blk_execute_rq(drive->queue, NULL, rq, 0);
        blk_put_request(rq);
 
        return (drive->mult_count == arg) ? 0 : -EIO;
index 17a65ac..51c8122 100644 (file)
@@ -490,7 +490,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
         * make sure request is sane
         */
        if (hwif->rq)
-               hwif->rq->errors = 0;
+               scsi_req(hwif->rq)->result = 0;
        return ret;
 }
 
index cf3af68..4b7ffd7 100644 (file)
@@ -12,7 +12,7 @@ static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq,
        if ((stat & ATA_BUSY) ||
            ((stat & ATA_DF) && (drive->dev_flags & IDE_DFLAG_NOWERR) == 0)) {
                /* other bits are useless when BUSY */
-               rq->errors |= ERROR_RESET;
+               scsi_req(rq)->result |= ERROR_RESET;
        } else if (stat & ATA_ERR) {
                /* err has different meaning on cdrom and tape */
                if (err == ATA_ABORTED) {
@@ -25,10 +25,10 @@ static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq,
                        drive->crc_count++;
                } else if (err & (ATA_BBK | ATA_UNC)) {
                        /* retries won't help these */
-                       rq->errors = ERROR_MAX;
+                       scsi_req(rq)->result = ERROR_MAX;
                } else if (err & ATA_TRK0NF) {
                        /* help it find track zero */
-                       rq->errors |= ERROR_RECAL;
+                       scsi_req(rq)->result |= ERROR_RECAL;
                }
        }
 
@@ -39,23 +39,23 @@ static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq,
                ide_pad_transfer(drive, READ, nsect * SECTOR_SIZE);
        }
 
-       if (rq->errors >= ERROR_MAX || blk_noretry_request(rq)) {
+       if (scsi_req(rq)->result >= ERROR_MAX || blk_noretry_request(rq)) {
                ide_kill_rq(drive, rq);
                return ide_stopped;
        }
 
        if (hwif->tp_ops->read_status(hwif) & (ATA_BUSY | ATA_DRQ))
-               rq->errors |= ERROR_RESET;
+               scsi_req(rq)->result |= ERROR_RESET;
 
-       if ((rq->errors & ERROR_RESET) == ERROR_RESET) {
-               ++rq->errors;
+       if ((scsi_req(rq)->result & ERROR_RESET) == ERROR_RESET) {
+               ++scsi_req(rq)->result;
                return ide_do_reset(drive);
        }
 
-       if ((rq->errors & ERROR_RECAL) == ERROR_RECAL)
+       if ((scsi_req(rq)->result & ERROR_RECAL) == ERROR_RECAL)
                drive->special_flags |= IDE_SFLAG_RECALIBRATE;
 
-       ++rq->errors;
+       ++scsi_req(rq)->result;
 
        return ide_stopped;
 }
@@ -68,7 +68,7 @@ static ide_startstop_t ide_atapi_error(ide_drive_t *drive, struct request *rq,
        if ((stat & ATA_BUSY) ||
            ((stat & ATA_DF) && (drive->dev_flags & IDE_DFLAG_NOWERR) == 0)) {
                /* other bits are useless when BUSY */
-               rq->errors |= ERROR_RESET;
+               scsi_req(rq)->result |= ERROR_RESET;
        } else {
                /* add decoding error stuff */
        }
@@ -77,14 +77,14 @@ static ide_startstop_t ide_atapi_error(ide_drive_t *drive, struct request *rq,
                /* force an abort */
                hwif->tp_ops->exec_command(hwif, ATA_CMD_IDLEIMMEDIATE);
 
-       if (rq->errors >= ERROR_MAX) {
+       if (scsi_req(rq)->result >= ERROR_MAX) {
                ide_kill_rq(drive, rq);
        } else {
-               if ((rq->errors & ERROR_RESET) == ERROR_RESET) {
-                       ++rq->errors;
+               if ((scsi_req(rq)->result & ERROR_RESET) == ERROR_RESET) {
+                       ++scsi_req(rq)->result;
                        return ide_do_reset(drive);
                }
-               ++rq->errors;
+               ++scsi_req(rq)->result;
        }
 
        return ide_stopped;
@@ -130,11 +130,11 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat)
                        if (cmd)
                                ide_complete_cmd(drive, cmd, stat, err);
                } else if (ata_pm_request(rq)) {
-                       rq->errors = 1;
+                       scsi_req(rq)->result = 1;
                        ide_complete_pm_rq(drive, rq);
                        return ide_stopped;
                }
-               rq->errors = err;
+               scsi_req(rq)->result = err;
                ide_complete_rq(drive, err ? -EIO : 0, blk_rq_bytes(rq));
                return ide_stopped;
        }
@@ -149,8 +149,8 @@ static inline void ide_complete_drive_reset(ide_drive_t *drive, int err)
 
        if (rq && ata_misc_request(rq) &&
            scsi_req(rq)->cmd[0] == REQ_DRIVE_RESET) {
-               if (err <= 0 && rq->errors == 0)
-                       rq->errors = -EIO;
+               if (err <= 0 && scsi_req(rq)->result == 0)
+                       scsi_req(rq)->result = -EIO;
                ide_complete_rq(drive, err ? err : 0, blk_rq_bytes(rq));
        }
 }
index a69e801..8ac6048 100644 (file)
@@ -98,7 +98,7 @@ static int ide_floppy_callback(ide_drive_t *drive, int dsc)
        }
 
        if (ata_misc_request(rq))
-               rq->errors = uptodate ? 0 : IDE_DRV_ERROR_GENERAL;
+               scsi_req(rq)->result = uptodate ? 0 : IDE_DRV_ERROR_GENERAL;
 
        return uptodate;
 }
@@ -239,7 +239,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
                                        ? rq->rq_disk->disk_name
                                        : "dev?"));
 
-       if (rq->errors >= ERROR_MAX) {
+       if (scsi_req(rq)->result >= ERROR_MAX) {
                if (drive->failed_pc) {
                        ide_floppy_report_error(floppy, drive->failed_pc);
                        drive->failed_pc = NULL;
@@ -247,7 +247,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
                        printk(KERN_ERR PFX "%s: I/O error\n", drive->name);
 
                if (ata_misc_request(rq)) {
-                       rq->errors = 0;
+                       scsi_req(rq)->result = 0;
                        ide_complete_rq(drive, 0, blk_rq_bytes(rq));
                        return ide_stopped;
                } else
@@ -301,8 +301,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
        return ide_floppy_issue_pc(drive, &cmd, pc);
 out_end:
        drive->failed_pc = NULL;
-       if (blk_rq_is_passthrough(rq) && rq->errors == 0)
-               rq->errors = -EIO;
+       if (blk_rq_is_passthrough(rq) && scsi_req(rq)->result == 0)
+               scsi_req(rq)->result = -EIO;
        ide_complete_rq(drive, -EIO, blk_rq_bytes(rq));
        return ide_stopped;
 }
index 043b1fb..45b3f41 100644 (file)
@@ -141,12 +141,12 @@ void ide_kill_rq(ide_drive_t *drive, struct request *rq)
        drive->failed_pc = NULL;
 
        if ((media == ide_floppy || media == ide_tape) && drv_req) {
-               rq->errors = 0;
+               scsi_req(rq)->result = 0;
        } else {
                if (media == ide_tape)
-                       rq->errors = IDE_DRV_ERROR_GENERAL;
-               else if (blk_rq_is_passthrough(rq) && rq->errors == 0)
-                       rq->errors = -EIO;
+                       scsi_req(rq)->result = IDE_DRV_ERROR_GENERAL;
+               else if (blk_rq_is_passthrough(rq) && scsi_req(rq)->result == 0)
+                       scsi_req(rq)->result = -EIO;
        }
 
        ide_complete_rq(drive, -EIO, blk_rq_bytes(rq));
@@ -271,7 +271,7 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive,
 #ifdef DEBUG
        printk("%s: DRIVE_CMD (null)\n", drive->name);
 #endif
-       rq->errors = 0;
+       scsi_req(rq)->result = 0;
        ide_complete_rq(drive, 0, blk_rq_bytes(rq));
 
        return ide_stopped;
index 248a3e0..8c0d172 100644 (file)
@@ -128,7 +128,8 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg)
                rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
                scsi_req_init(rq);
                ide_req(rq)->type = ATA_PRIV_TASKFILE;
-               err = blk_execute_rq(drive->queue, NULL, rq, 0);
+               blk_execute_rq(drive->queue, NULL, rq, 0);
+               err = scsi_req(rq)->result ? -EIO : 0;
                blk_put_request(rq);
 
                return err;
@@ -227,8 +228,8 @@ static int generic_drive_reset(ide_drive_t *drive)
        ide_req(rq)->type = ATA_PRIV_MISC;
        scsi_req(rq)->cmd_len = 1;
        scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET;
-       if (blk_execute_rq(drive->queue, NULL, rq, 1))
-               ret = rq->errors;
+       blk_execute_rq(drive->queue, NULL, rq, 1);
+       ret = scsi_req(rq)->result;
        blk_put_request(rq);
        return ret;
 }
index 101aed9..94e3107 100644 (file)
@@ -37,7 +37,8 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
        scsi_req(rq)->cmd_len = 1;
        ide_req(rq)->type = ATA_PRIV_MISC;
        rq->special = &timeout;
-       rc = blk_execute_rq(q, NULL, rq, 1);
+       blk_execute_rq(q, NULL, rq, 1);
+       rc = scsi_req(rq)->result ? -EIO : 0;
        blk_put_request(rq);
        if (rc)
                goto out;
index ec951be..0977fc1 100644 (file)
@@ -27,7 +27,8 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
                mesg.event = PM_EVENT_FREEZE;
        rqpm.pm_state = mesg.event;
 
-       ret = blk_execute_rq(drive->queue, NULL, rq, 0);
+       blk_execute_rq(drive->queue, NULL, rq, 0);
+       ret = scsi_req(rq)->result ? -EIO : 0;
        blk_put_request(rq);
 
        if (ret == 0 && ide_port_acpi(hwif)) {
@@ -55,8 +56,8 @@ static int ide_pm_execute_rq(struct request *rq)
        spin_lock_irq(q->queue_lock);
        if (unlikely(blk_queue_dying(q))) {
                rq->rq_flags |= RQF_QUIET;
-               rq->errors = -ENXIO;
-               __blk_end_request_all(rq, rq->errors);
+               scsi_req(rq)->result = -ENXIO;
+               __blk_end_request_all(rq, 0);
                spin_unlock_irq(q->queue_lock);
                return -ENXIO;
        }
@@ -66,7 +67,7 @@ static int ide_pm_execute_rq(struct request *rq)
 
        wait_for_completion_io(&wait);
 
-       return rq->errors ? -EIO : 0;
+       return scsi_req(rq)->result ? -EIO : 0;
 }
 
 int generic_ide_resume(struct device *dev)
index d8a552b..a0651f9 100644 (file)
@@ -366,7 +366,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc)
                        err = pc->error;
                }
        }
-       rq->errors = err;
+       scsi_req(rq)->result = err;
 
        return uptodate;
 }
@@ -879,7 +879,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
                tape->valid = 0;
 
        ret = size;
-       if (rq->errors == IDE_DRV_ERROR_GENERAL)
+       if (scsi_req(rq)->result == IDE_DRV_ERROR_GENERAL)
                ret = -EIO;
 out_put:
        blk_put_request(rq);
index 4c0007c..d71199d 100644 (file)
@@ -287,7 +287,7 @@ static void ide_pio_datablock(ide_drive_t *drive, struct ide_cmd *cmd,
        u8 saved_io_32bit = drive->io_32bit;
 
        if (cmd->tf_flags & IDE_TFLAG_FS)
-               cmd->rq->errors = 0;
+               scsi_req(cmd->rq)->result = 0;
 
        if (cmd->tf_flags & IDE_TFLAG_IO_16BIT)
                drive->io_32bit = 0;
@@ -329,7 +329,7 @@ void ide_finish_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat)
        u8 set_xfer = !!(cmd->tf_flags & IDE_TFLAG_SET_XFER);
 
        ide_complete_cmd(drive, cmd, stat, err);
-       rq->errors = err;
+       scsi_req(rq)->result = err;
 
        if (err == 0 && set_xfer) {
                ide_set_xfer_rate(drive, nsect);
@@ -452,8 +452,8 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
        rq->special = cmd;
        cmd->rq = rq;
 
-       error = blk_execute_rq(drive->queue, NULL, rq, 0);
-
+       blk_execute_rq(drive->queue, NULL, rq, 0);
+       error = scsi_req(rq)->result ? -EIO : 0;
 put_req:
        blk_put_request(rq);
        return error;
index ca5759c..43a6cb0 100644 (file)
@@ -370,10 +370,12 @@ static int hid_accel_3d_probe(struct platform_device *pdev)
                name = "accel_3d";
                channel_spec = accel_3d_channels;
                channel_size = sizeof(accel_3d_channels);
+               indio_dev->num_channels = ARRAY_SIZE(accel_3d_channels);
        } else {
                name = "gravity";
                channel_spec = gravity_channels;
                channel_size = sizeof(gravity_channels);
+               indio_dev->num_channels = ARRAY_SIZE(gravity_channels);
        }
        ret = hid_sensor_parse_common_attributes(hsdev, hsdev->usage,
                                        &accel_state->common_attributes);
@@ -395,7 +397,6 @@ static int hid_accel_3d_probe(struct platform_device *pdev)
                goto error_free_dev_mem;
        }
 
-       indio_dev->num_channels = ARRAY_SIZE(accel_3d_channels);
        indio_dev->dev.parent = &pdev->dev;
        indio_dev->info = &accel_3d_info;
        indio_dev->name = name;
index d6c372b..c17596f 100644 (file)
@@ -61,7 +61,7 @@ static int cros_ec_sensors_read(struct iio_dev *indio_dev,
                ret = st->core.read_ec_sensors_data(indio_dev, 1 << idx, &data);
                if (ret < 0)
                        break;
-
+               ret = IIO_VAL_INT;
                *val = data;
                break;
        case IIO_CHAN_INFO_CALIBBIAS:
@@ -76,7 +76,7 @@ static int cros_ec_sensors_read(struct iio_dev *indio_dev,
                for (i = CROS_EC_SENSOR_X; i < CROS_EC_SENSOR_MAX_AXIS; i++)
                        st->core.calib[i] =
                                st->core.resp->sensor_offset.offset[i];
-
+               ret = IIO_VAL_INT;
                *val = st->core.calib[idx];
                break;
        case IIO_CHAN_INFO_SCALE:
index 7afdac4..01e02b9 100644 (file)
@@ -379,6 +379,8 @@ int hid_sensor_parse_common_attributes(struct hid_sensor_hub_device *hsdev,
 {
 
        struct hid_sensor_hub_attribute_info timestamp;
+       s32 value;
+       int ret;
 
        hid_sensor_get_reporting_interval(hsdev, usage_id, st);
 
@@ -417,6 +419,14 @@ int hid_sensor_parse_common_attributes(struct hid_sensor_hub_device *hsdev,
                st->sensitivity.index, st->sensitivity.report_id,
                timestamp.index, timestamp.report_id);
 
+       ret = sensor_hub_get_feature(hsdev,
+                               st->power_state.report_id,
+                               st->power_state.index, sizeof(value), &value);
+       if (ret < 0)
+               return ret;
+       if (value < 0)
+               return -EINVAL;
+
        return 0;
 }
 EXPORT_SYMBOL(hid_sensor_parse_common_attributes);
index f7fcfa8..821919d 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/iio/trigger_consumer.h>
 #include <linux/iio/triggered_buffer.h>
 #include <linux/regmap.h>
+#include <linux/delay.h>
 #include "bmg160.h"
 
 #define BMG160_IRQ_NAME                "bmg160_event"
@@ -52,6 +53,9 @@
 #define BMG160_DEF_BW                  100
 #define BMG160_REG_PMU_BW_RES          BIT(7)
 
+#define BMG160_GYRO_REG_RESET          0x14
+#define BMG160_GYRO_RESET_VAL          0xb6
+
 #define BMG160_REG_INT_MAP_0           0x17
 #define BMG160_INT_MAP_0_BIT_ANY       BIT(1)
 
@@ -236,6 +240,14 @@ static int bmg160_chip_init(struct bmg160_data *data)
        int ret;
        unsigned int val;
 
+       /*
+        * Reset chip to get it in a known good state. A delay of 30ms after
+        * reset is required according to the datasheet.
+        */
+       regmap_write(data->regmap, BMG160_GYRO_REG_RESET,
+                    BMG160_GYRO_RESET_VAL);
+       usleep_range(30000, 30700);
+
        ret = regmap_read(data->regmap, BMG160_REG_CHIP_ID, &val);
        if (ret < 0) {
                dev_err(dev, "Error reading reg_chip_id\n");
index d18ded4..3ff91e0 100644 (file)
@@ -610,10 +610,9 @@ static ssize_t __iio_format_value(char *buf, size_t len, unsigned int type,
                tmp0 = (int)div_s64_rem(tmp, 1000000000, &tmp1);
                return snprintf(buf, len, "%d.%09u", tmp0, abs(tmp1));
        case IIO_VAL_FRACTIONAL_LOG2:
-               tmp = (s64)vals[0] * 1000000000LL >> vals[1];
-               tmp1 = do_div(tmp, 1000000000LL);
-               tmp0 = tmp;
-               return snprintf(buf, len, "%d.%09u", tmp0, tmp1);
+               tmp = shift_right((s64)vals[0] * 1000000000LL, vals[1]);
+               tmp0 = (int)div_s64_rem(tmp, 1000000000LL, &tmp1);
+               return snprintf(buf, len, "%d.%09u", tmp0, abs(tmp1));
        case IIO_VAL_INT_MULTIPLE:
        {
                int i;
index 5f26808..fd0edca 100644 (file)
@@ -457,6 +457,7 @@ static const struct st_sensor_settings st_press_sensors_settings[] = {
                        .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR,
                },
                .multi_read_bit = true,
+               .bootime = 2,
        },
 };
 
index 91cbe86..fcbed35 100644 (file)
@@ -817,6 +817,7 @@ isert_post_recvm(struct isert_conn *isert_conn, u32 count)
                rx_wr->sg_list = &rx_desc->rx_sg;
                rx_wr->num_sge = 1;
                rx_wr->next = rx_wr + 1;
+               rx_desc->in_use = false;
        }
        rx_wr--;
        rx_wr->next = NULL; /* mark end of work requests list */
@@ -835,6 +836,15 @@ isert_post_recv(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc)
        struct ib_recv_wr *rx_wr_failed, rx_wr;
        int ret;
 
+       if (!rx_desc->in_use) {
+               /*
+                * if the descriptor is not in-use we already reposted it
+                * for recv, so just silently return
+                */
+               return 0;
+       }
+
+       rx_desc->in_use = false;
        rx_wr.wr_cqe = &rx_desc->rx_cqe;
        rx_wr.sg_list = &rx_desc->rx_sg;
        rx_wr.num_sge = 1;
@@ -1397,6 +1407,8 @@ isert_recv_done(struct ib_cq *cq, struct ib_wc *wc)
                return;
        }
 
+       rx_desc->in_use = true;
+
        ib_dma_sync_single_for_cpu(ib_dev, rx_desc->dma_addr,
                        ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
 
@@ -1659,10 +1671,23 @@ isert_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
        ret = isert_check_pi_status(cmd, isert_cmd->rw.sig->sig_mr);
        isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn);
 
-       if (ret)
-               transport_send_check_condition_and_sense(cmd, cmd->pi_err, 0);
-       else
-               isert_put_response(isert_conn->conn, isert_cmd->iscsi_cmd);
+       if (ret) {
+               /*
+                * transport_generic_request_failure() expects to have
+                * plus two references to handle queue-full, so re-add
+                * one here as target-core will have already dropped
+                * it after the first isert_put_datain() callback.
+                */
+               kref_get(&cmd->cmd_kref);
+               transport_generic_request_failure(cmd, cmd->pi_err);
+       } else {
+               /*
+                * XXX: isert_put_response() failure is not retried.
+                */
+               ret = isert_put_response(isert_conn->conn, isert_cmd->iscsi_cmd);
+               if (ret)
+                       pr_warn_ratelimited("isert_put_response() ret: %d\n", ret);
+       }
 }
 
 static void
@@ -1699,13 +1724,15 @@ isert_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
        cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT;
        spin_unlock_bh(&cmd->istate_lock);
 
-       if (ret) {
-               target_put_sess_cmd(se_cmd);
-               transport_send_check_condition_and_sense(se_cmd,
-                                                        se_cmd->pi_err, 0);
-       } else {
+       /*
+        * transport_generic_request_failure() will drop the extra
+        * se_cmd->cmd_kref reference after T10-PI error, and handle
+        * any non-zero ->queue_status() callback error retries.
+        */
+       if (ret)
+               transport_generic_request_failure(se_cmd, se_cmd->pi_err);
+       else
                target_execute_cmd(se_cmd);
-       }
 }
 
 static void
@@ -2171,26 +2198,28 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
                chain_wr = &isert_cmd->tx_desc.send_wr;
        }
 
-       isert_rdma_rw_ctx_post(isert_cmd, isert_conn, cqe, chain_wr);
-       isert_dbg("Cmd: %p posted RDMA_WRITE for iSER Data READ\n", isert_cmd);
-       return 1;
+       rc = isert_rdma_rw_ctx_post(isert_cmd, isert_conn, cqe, chain_wr);
+       isert_dbg("Cmd: %p posted RDMA_WRITE for iSER Data READ rc: %d\n",
+                 isert_cmd, rc);
+       return rc;
 }
 
 static int
 isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery)
 {
        struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+       int ret;
 
        isert_dbg("Cmd: %p RDMA_READ data_length: %u write_data_done: %u\n",
                 isert_cmd, cmd->se_cmd.data_length, cmd->write_data_done);
 
        isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done;
-       isert_rdma_rw_ctx_post(isert_cmd, conn->context,
-                       &isert_cmd->tx_desc.tx_cqe, NULL);
+       ret = isert_rdma_rw_ctx_post(isert_cmd, conn->context,
+                                    &isert_cmd->tx_desc.tx_cqe, NULL);
 
-       isert_dbg("Cmd: %p posted RDMA_READ memory for ISER Data WRITE\n",
-                isert_cmd);
-       return 0;
+       isert_dbg("Cmd: %p posted RDMA_READ memory for ISER Data WRITE rc: %d\n",
+                isert_cmd, ret);
+       return ret;
 }
 
 static int
index c02ada5..87d994d 100644 (file)
@@ -60,7 +60,7 @@
 
 #define ISER_RX_PAD_SIZE       (ISCSI_DEF_MAX_RECV_SEG_LEN + 4096 - \
                (ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge) + \
-                sizeof(struct ib_cqe)))
+                sizeof(struct ib_cqe) + sizeof(bool)))
 
 #define ISCSI_ISER_SG_TABLESIZE                256
 
@@ -85,6 +85,7 @@ struct iser_rx_desc {
        u64             dma_addr;
        struct ib_sge   rx_sg;
        struct ib_cqe   rx_cqe;
+       bool            in_use;
        char            pad[ISER_RX_PAD_SIZE];
 } __packed;
 
index 155fcb3..153b1ee 100644 (file)
@@ -202,6 +202,7 @@ static const struct xpad_device {
        { 0x1430, 0x8888, "TX6500+ Dance Pad (first generation)", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX },
        { 0x146b, 0x0601, "BigBen Interactive XBOX 360 Controller", 0, XTYPE_XBOX360 },
        { 0x1532, 0x0037, "Razer Sabertooth", 0, XTYPE_XBOX360 },
+       { 0x1532, 0x0a03, "Razer Wildcat", 0, XTYPE_XBOXONE },
        { 0x15e4, 0x3f00, "Power A Mini Pro Elite", 0, XTYPE_XBOX360 },
        { 0x15e4, 0x3f0a, "Xbox Airflo wired controller", 0, XTYPE_XBOX360 },
        { 0x15e4, 0x3f10, "Batarang Xbox 360 controller", 0, XTYPE_XBOX360 },
@@ -326,6 +327,7 @@ static struct usb_device_id xpad_table[] = {
        XPAD_XBOX360_VENDOR(0x1430),            /* RedOctane X-Box 360 controllers */
        XPAD_XBOX360_VENDOR(0x146b),            /* BigBen Interactive Controllers */
        XPAD_XBOX360_VENDOR(0x1532),            /* Razer Sabertooth */
+       XPAD_XBOXONE_VENDOR(0x1532),            /* Razer Wildcat */
        XPAD_XBOX360_VENDOR(0x15e4),            /* Numark X-Box 360 controllers */
        XPAD_XBOX360_VENDOR(0x162e),            /* Joytech X-Box 360 controllers */
        XPAD_XBOX360_VENDOR(0x1689),            /* Razer Onza */
index efc8ec3..e73d968 100644 (file)
@@ -1118,6 +1118,7 @@ static int elantech_get_resolution_v4(struct psmouse *psmouse,
  * Asus UX32VD             0x361f02        00, 15, 0e      clickpad
  * Avatar AVIU-145A2       0x361f00        ?               clickpad
  * Fujitsu LIFEBOOK E544   0x470f00        d0, 12, 09      2 hw buttons
+ * Fujitsu LIFEBOOK E547   0x470f00        50, 12, 09      2 hw buttons
  * Fujitsu LIFEBOOK E554   0x570f01        40, 14, 0c      2 hw buttons
  * Fujitsu T725            0x470f01        05, 12, 09      2 hw buttons
  * Fujitsu H730            0x570f00        c0, 14, 0c      3 hw buttons (**)
@@ -1524,6 +1525,13 @@ static const struct dmi_system_id elantech_dmi_force_crc_enabled[] = {
                },
        },
        {
+               /* Fujitsu LIFEBOOK E547 does not work with crc_enabled == 0 */
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "LIFEBOOK E547"),
+               },
+       },
+       {
                /* Fujitsu LIFEBOOK E554  does not work with crc_enabled == 0 */
                .matches = {
                        DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU"),
index 312bd6c..09720d9 100644 (file)
@@ -620,6 +620,13 @@ static const struct dmi_system_id __initconst i8042_dmi_reset_table[] = {
                        DMI_MATCH(DMI_PRODUCT_NAME, "20046"),
                },
        },
+       {
+               /* Clevo P650RS, 650RP6, Sager NP8152-S, and others */
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Notebook"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "P65xRP"),
+               },
+       },
        { }
 };
 
index b17536d..63cacf5 100644 (file)
@@ -1234,7 +1234,7 @@ static void __domain_flush_pages(struct protection_domain *domain,
 
        build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
 
-       for (i = 0; i < amd_iommus_present; ++i) {
+       for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
                if (!domain->dev_iommu[i])
                        continue;
 
@@ -1278,7 +1278,7 @@ static void domain_flush_complete(struct protection_domain *domain)
 {
        int i;
 
-       for (i = 0; i < amd_iommus_present; ++i) {
+       for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
                if (domain && !domain->dev_iommu[i])
                        continue;
 
@@ -3363,7 +3363,7 @@ static int __flush_pasid(struct protection_domain *domain, int pasid,
         * IOMMU TLB needs to be flushed before Device TLB to
         * prevent device TLB refill from IOMMU TLB
         */
-       for (i = 0; i < amd_iommus_present; ++i) {
+       for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
                if (domain->dev_iommu[i] == 0)
                        continue;
 
index 6130278..5a11328 100644 (file)
@@ -167,7 +167,9 @@ LIST_HEAD(amd_iommu_list);          /* list of all AMD IOMMUs in the
 
 /* Array to assign indices to IOMMUs*/
 struct amd_iommu *amd_iommus[MAX_IOMMUS];
-int amd_iommus_present;
+
+/* Number of IOMMUs present in the system */
+static int amd_iommus_present;
 
 /* IOMMUs have a non-present cache? */
 bool amd_iommu_np_cache __read_mostly;
@@ -254,10 +256,6 @@ static int amd_iommu_enable_interrupts(void);
 static int __init iommu_go_to_state(enum iommu_init_state state);
 static void init_device_table_dma(void);
 
-static int iommu_pc_get_set_reg_val(struct amd_iommu *iommu,
-                                   u8 bank, u8 cntr, u8 fxn,
-                                   u64 *value, bool is_write);
-
 static inline void update_last_devid(u16 devid)
 {
        if (devid > amd_iommu_last_bdf)
@@ -272,6 +270,11 @@ static inline unsigned long tbl_size(int entry_size)
        return 1UL << shift;
 }
 
+int amd_iommu_get_num_iommus(void)
+{
+       return amd_iommus_present;
+}
+
 /* Access to l1 and l2 indexed register spaces */
 
 static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
@@ -1336,7 +1339,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
 
        /* Add IOMMU to internal data structures */
        list_add_tail(&iommu->list, &amd_iommu_list);
-       iommu->index             = amd_iommus_present++;
+       iommu->index = amd_iommus_present++;
 
        if (unlikely(iommu->index >= MAX_IOMMUS)) {
                WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
@@ -1477,6 +1480,8 @@ static int __init init_iommu_all(struct acpi_table_header *table)
        return 0;
 }
 
+static int iommu_pc_get_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
+                               u8 fxn, u64 *value, bool is_write);
 
 static void init_iommu_perf_ctr(struct amd_iommu *iommu)
 {
@@ -1488,8 +1493,8 @@ static void init_iommu_perf_ctr(struct amd_iommu *iommu)
        amd_iommu_pc_present = true;
 
        /* Check if the performance counters can be written to */
-       if ((0 != iommu_pc_get_set_reg_val(iommu, 0, 0, 0, &val, true)) ||
-           (0 != iommu_pc_get_set_reg_val(iommu, 0, 0, 0, &val2, false)) ||
+       if ((iommu_pc_get_set_reg(iommu, 0, 0, 0, &val, true)) ||
+           (iommu_pc_get_set_reg(iommu, 0, 0, 0, &val2, false)) ||
            (val != val2)) {
                pr_err("AMD-Vi: Unable to write to IOMMU perf counter.\n");
                amd_iommu_pc_present = false;
@@ -2711,6 +2716,18 @@ bool amd_iommu_v2_supported(void)
 }
 EXPORT_SYMBOL(amd_iommu_v2_supported);
 
+struct amd_iommu *get_amd_iommu(unsigned int idx)
+{
+       unsigned int i = 0;
+       struct amd_iommu *iommu;
+
+       for_each_iommu(iommu)
+               if (i++ == idx)
+                       return iommu;
+       return NULL;
+}
+EXPORT_SYMBOL(get_amd_iommu);
+
 /****************************************************************************
  *
  * IOMMU EFR Performance Counter support functionality. This code allows
@@ -2718,17 +2735,14 @@ EXPORT_SYMBOL(amd_iommu_v2_supported);
  *
  ****************************************************************************/
 
-u8 amd_iommu_pc_get_max_banks(u16 devid)
+u8 amd_iommu_pc_get_max_banks(unsigned int idx)
 {
-       struct amd_iommu *iommu;
-       u8 ret = 0;
+       struct amd_iommu *iommu = get_amd_iommu(idx);
 
-       /* locate the iommu governing the devid */
-       iommu = amd_iommu_rlookup_table[devid];
        if (iommu)
-               ret = iommu->max_banks;
+               return iommu->max_banks;
 
-       return ret;
+       return 0;
 }
 EXPORT_SYMBOL(amd_iommu_pc_get_max_banks);
 
@@ -2738,62 +2752,69 @@ bool amd_iommu_pc_supported(void)
 }
 EXPORT_SYMBOL(amd_iommu_pc_supported);
 
-u8 amd_iommu_pc_get_max_counters(u16 devid)
+u8 amd_iommu_pc_get_max_counters(unsigned int idx)
 {
-       struct amd_iommu *iommu;
-       u8 ret = 0;
+       struct amd_iommu *iommu = get_amd_iommu(idx);
 
-       /* locate the iommu governing the devid */
-       iommu = amd_iommu_rlookup_table[devid];
        if (iommu)
-               ret = iommu->max_counters;
+               return iommu->max_counters;
 
-       return ret;
+       return 0;
 }
 EXPORT_SYMBOL(amd_iommu_pc_get_max_counters);
 
-static int iommu_pc_get_set_reg_val(struct amd_iommu *iommu,
-                                   u8 bank, u8 cntr, u8 fxn,
-                                   u64 *value, bool is_write)
+static int iommu_pc_get_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
+                               u8 fxn, u64 *value, bool is_write)
 {
        u32 offset;
        u32 max_offset_lim;
 
+       /* Make sure the IOMMU PC resource is available */
+       if (!amd_iommu_pc_present)
+               return -ENODEV;
+
        /* Check for valid iommu and pc register indexing */
-       if (WARN_ON((fxn > 0x28) || (fxn & 7)))
+       if (WARN_ON(!iommu || (fxn > 0x28) || (fxn & 7)))
                return -ENODEV;
 
-       offset = (u32)(((0x40|bank) << 12) | (cntr << 8) | fxn);
+       offset = (u32)(((0x40 | bank) << 12) | (cntr << 8) | fxn);
 
        /* Limit the offset to the hw defined mmio region aperture */
-       max_offset_lim = (u32)(((0x40|iommu->max_banks) << 12) |
+       max_offset_lim = (u32)(((0x40 | iommu->max_banks) << 12) |
                                (iommu->max_counters << 8) | 0x28);
        if ((offset < MMIO_CNTR_REG_OFFSET) ||
            (offset > max_offset_lim))
                return -EINVAL;
 
        if (is_write) {
-               writel((u32)*value, iommu->mmio_base + offset);
-               writel((*value >> 32), iommu->mmio_base + offset + 4);
+               u64 val = *value & GENMASK_ULL(47, 0);
+
+               writel((u32)val, iommu->mmio_base + offset);
+               writel((val >> 32), iommu->mmio_base + offset + 4);
        } else {
                *value = readl(iommu->mmio_base + offset + 4);
                *value <<= 32;
-               *value = readl(iommu->mmio_base + offset);
+               *value |= readl(iommu->mmio_base + offset);
+               *value &= GENMASK_ULL(47, 0);
        }
 
        return 0;
 }
-EXPORT_SYMBOL(amd_iommu_pc_get_set_reg_val);
 
-int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn,
-                                   u64 *value, bool is_write)
+int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64 *value)
 {
-       struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+       if (!iommu)
+               return -EINVAL;
 
-       /* Make sure the IOMMU PC resource is available */
-       if (!amd_iommu_pc_present || iommu == NULL)
-               return -ENODEV;
+       return iommu_pc_get_set_reg(iommu, bank, cntr, fxn, value, false);
+}
+EXPORT_SYMBOL(amd_iommu_pc_get_reg);
+
+int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64 *value)
+{
+       if (!iommu)
+               return -EINVAL;
 
-       return iommu_pc_get_set_reg_val(iommu, bank, cntr, fxn,
-                                       value, is_write);
+       return iommu_pc_get_set_reg(iommu, bank, cntr, fxn, value, true);
 }
+EXPORT_SYMBOL(amd_iommu_pc_set_reg);
index 7eb60c1..466260f 100644 (file)
@@ -21,6 +21,7 @@
 
 #include "amd_iommu_types.h"
 
+extern int amd_iommu_get_num_iommus(void);
 extern int amd_iommu_init_dma_ops(void);
 extern int amd_iommu_init_passthrough(void);
 extern irqreturn_t amd_iommu_int_thread(int irq, void *data);
@@ -56,13 +57,6 @@ extern int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid,
 extern int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid);
 extern struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev);
 
-/* IOMMU Performance Counter functions */
-extern bool amd_iommu_pc_supported(void);
-extern u8 amd_iommu_pc_get_max_banks(u16 devid);
-extern u8 amd_iommu_pc_get_max_counters(u16 devid);
-extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn,
-                                   u64 *value, bool is_write);
-
 #ifdef CONFIG_IRQ_REMAP
 extern int amd_iommu_create_irq_domain(struct amd_iommu *iommu);
 #else
index 003f3ce..4de8f41 100644 (file)
@@ -611,9 +611,6 @@ extern struct list_head amd_iommu_list;
  */
 extern struct amd_iommu *amd_iommus[MAX_IOMMUS];
 
-/* Number of IOMMUs present in the system */
-extern int amd_iommus_present;
-
 /*
  * Declarations for the global list of all protection domains
  */
index 8162121..595d0c9 100644 (file)
@@ -115,6 +115,12 @@ config DW_APB_ICTL
        select GENERIC_IRQ_CHIP
        select IRQ_DOMAIN
 
+config FARADAY_FTINTC010
+       bool
+       select IRQ_DOMAIN
+       select MULTI_IRQ_HANDLER
+       select SPARSE_IRQ
+
 config HISILICON_IRQ_MBIGEN
        bool
        select ARM_GIC_V3
index 152bc40..b64c59b 100644 (file)
@@ -6,7 +6,7 @@ obj-$(CONFIG_ATH79)                     += irq-ath79-misc.o
 obj-$(CONFIG_ARCH_BCM2835)             += irq-bcm2835.o
 obj-$(CONFIG_ARCH_BCM2835)             += irq-bcm2836.o
 obj-$(CONFIG_ARCH_EXYNOS)              += exynos-combiner.o
-obj-$(CONFIG_ARCH_GEMINI)              += irq-gemini.o
+obj-$(CONFIG_FARADAY_FTINTC010)                += irq-ftintc010.o
 obj-$(CONFIG_ARCH_HIP04)               += irq-hip04.o
 obj-$(CONFIG_ARCH_LPC32XX)             += irq-lpc32xx.o
 obj-$(CONFIG_ARCH_MMP)                 += irq-mmp.o
@@ -16,7 +16,6 @@ obj-$(CONFIG_ARCH_S3C24XX)            += irq-s3c24xx.o
 obj-$(CONFIG_DW_APB_ICTL)              += irq-dw-apb-ictl.o
 obj-$(CONFIG_METAG)                    += irq-metag-ext.o
 obj-$(CONFIG_METAG_PERFCOUNTER_IRQS)   += irq-metag.o
-obj-$(CONFIG_ARCH_MOXART)              += irq-moxart.o
 obj-$(CONFIG_CLPS711X_IRQCHIP)         += irq-clps711x.o
 obj-$(CONFIG_OR1K_PIC)                 += irq-or1k-pic.o
 obj-$(CONFIG_ORION_IRQCHIP)            += irq-orion.o
@@ -62,7 +61,7 @@ obj-$(CONFIG_BCM7120_L2_IRQ)          += irq-bcm7120-l2.o
 obj-$(CONFIG_BRCMSTB_L2_IRQ)           += irq-brcmstb-l2.o
 obj-$(CONFIG_KEYSTONE_IRQ)             += irq-keystone.o
 obj-$(CONFIG_MIPS_GIC)                 += irq-mips-gic.o
-obj-$(CONFIG_ARCH_MEDIATEK)            += irq-mtk-sysirq.o
+obj-$(CONFIG_ARCH_MEDIATEK)            += irq-mtk-sysirq.o irq-mtk-cirq.o
 obj-$(CONFIG_ARCH_DIGICOLOR)           += irq-digicolor.o
 obj-$(CONFIG_RENESAS_H8300H_INTC)      += irq-renesas-h8300h.o
 obj-$(CONFIG_RENESAS_H8S_INTC)         += irq-renesas-h8s.o
index 2a624d8..c04ee9a 100644 (file)
@@ -150,6 +150,8 @@ static int aic5_set_type(struct irq_data *d, unsigned type)
 }
 
 #ifdef CONFIG_PM
+static u32 *smr_cache;
+
 static void aic5_suspend(struct irq_data *d)
 {
        struct irq_domain *domain = d->domain;
@@ -159,6 +161,12 @@ static void aic5_suspend(struct irq_data *d)
        int i;
        u32 mask;
 
+       if (smr_cache)
+               for (i = 0; i < domain->revmap_size; i++) {
+                       irq_reg_writel(bgc, i, AT91_AIC5_SSR);
+                       smr_cache[i] = irq_reg_readl(bgc, AT91_AIC5_SMR);
+               }
+
        irq_gc_lock(bgc);
        for (i = 0; i < dgc->irqs_per_chip; i++) {
                mask = 1 << i;
@@ -184,9 +192,21 @@ static void aic5_resume(struct irq_data *d)
        u32 mask;
 
        irq_gc_lock(bgc);
+
+       if (smr_cache) {
+               irq_reg_writel(bgc, 0xffffffff, AT91_AIC5_SPU);
+               for (i = 0; i < domain->revmap_size; i++) {
+                       irq_reg_writel(bgc, i, AT91_AIC5_SSR);
+                       irq_reg_writel(bgc, i, AT91_AIC5_SVR);
+                       irq_reg_writel(bgc, smr_cache[i], AT91_AIC5_SMR);
+               }
+       }
+
        for (i = 0; i < dgc->irqs_per_chip; i++) {
                mask = 1 << i;
-               if ((mask & gc->mask_cache) == (mask & gc->wake_active))
+
+               if (!smr_cache &&
+                   ((mask & gc->mask_cache) == (mask & gc->wake_active)))
                        continue;
 
                irq_reg_writel(bgc, i + gc->irq_base, AT91_AIC5_SSR);
@@ -342,6 +362,13 @@ static int __init aic5_of_init(struct device_node *node,
+/*
+ * SAMA5D2 entry point: with CONFIG_PM, allocate smr_cache (one u32 per
+ * interrupt, NR_SAMA5D2_IRQS rounded up to a multiple of 32) before handing
+ * over to the common aic5_of_init(). aic5_suspend()/aic5_resume() index the
+ * cache up to domain->revmap_size. Returns -ENOMEM if the cache cannot be
+ * allocated, otherwise whatever aic5_of_init() returns.
+ */
 static int __init sama5d2_aic5_of_init(struct device_node *node,
                                        struct device_node *parent)
 {
+#ifdef CONFIG_PM
+       smr_cache = kcalloc(DIV_ROUND_UP(NR_SAMA5D2_IRQS, 32) * 32,
+                           sizeof(*smr_cache), GFP_KERNEL);
+       if (!smr_cache)
+               return -ENOMEM;
+#endif
+
         return aic5_of_init(node, parent, NR_SAMA5D2_IRQS);
 }
 IRQCHIP_DECLARE(sama5d2_aic5, "atmel,sama5d2-aic", sama5d2_aic5_of_init);
diff --git a/drivers/irqchip/irq-ftintc010.c b/drivers/irqchip/irq-ftintc010.c
new file mode 100644 (file)
index 0000000..cd2dc8b
--- /dev/null
@@ -0,0 +1,194 @@
+/*
+ * irqchip for the Faraday Technology FTINTC010
+ * Copyright (C) 2017 Linus Walleij <linus.walleij@linaro.org>
+ *
+ * Based on arch/arm/mach-gemini/irq.c
+ * Copyright (C) 2001-2006 Storlink, Corp.
+ * Copyright (C) 2008-2009 Paulius Zaleckas <paulius.zaleckas@gmail.com>
+ */
+#include <linux/bitops.h>
+#include <linux/irq.h>
+#include <linux/io.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/versatile-fpga.h>
+#include <linux/irqdomain.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/cpu.h>
+
+#include <asm/exception.h>
+#include <asm/mach/irq.h>
+
+#define FT010_NUM_IRQS 32
+
+/*
+ * Register addresses relative to the mapped controller base. The IRQ bank
+ * occupies 0x00-0x14 and the FIQ bank uses the same layout at 0x20-0x34.
+ * The argument is parenthesized so that any pointer expression, not just a
+ * plain identifier, expands correctly.
+ */
+#define FT010_IRQ_SOURCE(base_addr)    ((base_addr) + 0x00)
+#define FT010_IRQ_MASK(base_addr)      ((base_addr) + 0x04)
+#define FT010_IRQ_CLEAR(base_addr)     ((base_addr) + 0x08)
+/* Selects level- or edge-triggered */
+#define FT010_IRQ_MODE(base_addr)      ((base_addr) + 0x0C)
+/* Selects active low/high or falling/rising edge */
+#define FT010_IRQ_POLARITY(base_addr)  ((base_addr) + 0x10)
+#define FT010_IRQ_STATUS(base_addr)    ((base_addr) + 0x14)
+#define FT010_FIQ_SOURCE(base_addr)    ((base_addr) + 0x20)
+#define FT010_FIQ_MASK(base_addr)      ((base_addr) + 0x24)
+#define FT010_FIQ_CLEAR(base_addr)     ((base_addr) + 0x28)
+#define FT010_FIQ_MODE(base_addr)      ((base_addr) + 0x2C)
+#define FT010_FIQ_POLARITY(base_addr)  ((base_addr) + 0x30)
+#define FT010_FIQ_STATUS(base_addr)    ((base_addr) + 0x34)
+
+/**
+ * struct ft010_irq_data - irq data container for the Faraday IRQ controller
+ * @base: virtual address of the mapped controller registers (from of_iomap())
+ * @chip: chip container for this instance; not referenced by the code in
+ *        this file, which installs the shared ft010_irq_chip instead
+ * @domain: IRQ domain for this instance
+ */
+struct ft010_irq_data {
+       void __iomem *base;
+       struct irq_chip chip;
+       struct irq_domain *domain;
+};
+
+/* Mask one line: clear its bit in the IRQ mask register (read-modify-write). */
+static void ft010_irq_mask(struct irq_data *d)
+{
+       struct ft010_irq_data *f = irq_data_get_irq_chip_data(d);
+       void __iomem *reg = FT010_IRQ_MASK(f->base);
+       unsigned int enabled = readl(reg);
+
+       enabled &= ~BIT(irqd_to_hwirq(d));
+       writel(enabled, reg);
+}
+
+/* Unmask one line: set its bit in the IRQ mask register (read-modify-write). */
+static void ft010_irq_unmask(struct irq_data *d)
+{
+       struct ft010_irq_data *f = irq_data_get_irq_chip_data(d);
+       void __iomem *reg = FT010_IRQ_MASK(f->base);
+       unsigned int enabled = readl(reg);
+
+       enabled |= BIT(irqd_to_hwirq(d));
+       writel(enabled, reg);
+}
+
+/* Acknowledge one line by writing its bit to the IRQ clear register. */
+static void ft010_irq_ack(struct irq_data *d)
+{
+       struct ft010_irq_data *f = irq_data_get_irq_chip_data(d);
+       void __iomem *clear_reg = FT010_IRQ_CLEAR(f->base);
+
+       writel(BIT(irqd_to_hwirq(d)), clear_reg);
+}
+
+/*
+ * Program trigger mode and polarity for one line and install the matching
+ * flow handler. A set bit in the mode register selects edge triggering; a
+ * set bit in the polarity register selects active-low / falling edge.
+ * An unsupported trigger gets handle_bad_irq plus a warning; the registers
+ * are written back unchanged and the call still returns 0.
+ */
+static int ft010_irq_set_type(struct irq_data *d, unsigned int trigger)
+{
+       struct ft010_irq_data *f = irq_data_get_irq_chip_data(d);
+       int offset = irqd_to_hwirq(d);
+       u32 mode = readl(FT010_IRQ_MODE(f->base));
+       u32 polarity = readl(FT010_IRQ_POLARITY(f->base));
+       u32 bit = BIT(offset);
+
+       if (trigger & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) {
+               irq_set_handler_locked(d, handle_level_irq);
+               mode &= ~bit;
+               /* LEVEL_LOW wins when both level bits are requested */
+               if (trigger & IRQ_TYPE_LEVEL_LOW)
+                       polarity |= bit;
+               else
+                       polarity &= ~bit;
+       } else if (trigger & (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING)) {
+               irq_set_handler_locked(d, handle_edge_irq);
+               mode |= bit;
+               /* EDGE_FALLING wins when both edge bits are requested */
+               if (trigger & IRQ_TYPE_EDGE_FALLING)
+                       polarity |= bit;
+               else
+                       polarity &= ~bit;
+       } else {
+               irq_set_handler_locked(d, handle_bad_irq);
+               pr_warn("Faraday IRQ: no supported trigger selected for line %d\n",
+                       offset);
+       }
+
+       writel(mode, FT010_IRQ_MODE(f->base));
+       writel(polarity, FT010_IRQ_POLARITY(f->base));
+
+       return 0;
+}
+
+/* irq_chip callbacks installed on every mapped line by ft010_irqdomain_map() */
+static struct irq_chip ft010_irq_chip = {
+       .name           = "FTINTC010",
+       .irq_ack        = ft010_irq_ack,
+       .irq_mask       = ft010_irq_mask,
+       .irq_unmask     = ft010_irq_unmask,
+       .irq_set_type   = ft010_irq_set_type,
+};
+
+/*
+ * Single controller instance. It is file-static because the IRQ entry
+ * handler below receives only pt_regs and has no other way to reach it.
+ */
+static struct ft010_irq_data firq;
+
+/*
+ * Top-level IRQ entry: keep dispatching the lowest-numbered pending line
+ * until the status register reads back as zero.
+ */
+asmlinkage void __exception_irq_entry ft010_irqchip_handle_irq(struct pt_regs *regs)
+{
+       struct ft010_irq_data *f = &firq;
+
+       for (;;) {
+               u32 pending = readl(FT010_IRQ_STATUS(f->base));
+
+               if (!pending)
+                       break;
+
+               handle_domain_irq(f->domain, ffs(pending) - 1, regs);
+       }
+}
+
+/*
+ * Domain .map callback: attach the controller instance and the FTINTC010
+ * chip to a newly mapped Linux irq. Always returns 0.
+ */
+static int ft010_irqdomain_map(struct irq_domain *d, unsigned int irq,
+                               irq_hw_number_t hwirq)
+{
+       irq_set_chip_data(irq, d->host_data);
+
+       /*
+        * Start out with handle_bad_irq: every user is expected to set a
+        * trigger type, which installs the real flow handler.
+        */
+       irq_set_chip_and_handler(irq, &ft010_irq_chip, handle_bad_irq);
+       irq_set_probe(irq);
+
+       return 0;
+}
+
+/* Domain .unmap callback: detach chip, flow handler and chip data again. */
+static void ft010_irqdomain_unmap(struct irq_domain *d, unsigned int irq)
+{
+       irq_set_chip_and_handler(irq, NULL, NULL);
+       irq_set_chip_data(irq, NULL);
+}
+
+/* Domain operations; interrupt specifiers are translated as one or two cells. */
+static const struct irq_domain_ops ft010_irqdomain_ops = {
+       .map = ft010_irqdomain_map,
+       .unmap = ft010_irqdomain_unmap,
+       .xlate = irq_domain_xlate_onetwocell,
+};
+
+/**
+ * ft010_of_init_irq() - set up the interrupt controller from its DT node
+ * @node: device tree node of the interrupt controller
+ * @parent: parent interrupt controller node (not used here)
+ *
+ * Maps the registers, masks every IRQ and FIQ line, creates the IRQ domain
+ * and installs the top-level IRQ entry handler.
+ *
+ * Return: 0 on success, -ENOMEM if the registers cannot be mapped.
+ */
+int __init ft010_of_init_irq(struct device_node *node,
+                             struct device_node *parent)
+{
+       struct ft010_irq_data *f = &firq;
+
+       /*
+        * Disable the idle handler by default since it is buggy
+        * For more info see arch/arm/mach-gemini/idle.c
+        */
+       cpu_idle_poll_ctrl(true);
+
+       f->base = of_iomap(node, 0);
+       /* Bail out rather than write through a NULL register base below */
+       if (WARN(!f->base, "unable to map gemini irq registers\n"))
+               return -ENOMEM;
+
+       /* Disable all interrupts */
+       writel(0, FT010_IRQ_MASK(f->base));
+       writel(0, FT010_FIQ_MASK(f->base));
+
+       f->domain = irq_domain_add_simple(node, FT010_NUM_IRQS, 0,
+                                         &ft010_irqdomain_ops, f);
+       set_handle_irq(ft010_irqchip_handle_irq);
+
+       return 0;
+}
+/* All three compatible strings are served by the same init function. */
+IRQCHIP_DECLARE(faraday, "faraday,ftintc010",
+               ft010_of_init_irq);
+IRQCHIP_DECLARE(gemini, "cortina,gemini-interrupt-controller",
+               ft010_of_init_irq);
+IRQCHIP_DECLARE(moxa, "moxa,moxart-ic",
+               ft010_of_init_irq);
diff --git a/drivers/irqchip/irq-gemini.c b/drivers/irqchip/irq-gemini.c
deleted file mode 100644 (file)
index 495224c..0000000
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * irqchip for the Cortina Systems Gemini Copyright (C) 2017 Linus
- * Walleij <linus.walleij@linaro.org>
- *
- * Based on arch/arm/mach-gemini/irq.c
- * Copyright (C) 2001-2006 Storlink, Corp.
- * Copyright (C) 2008-2009 Paulius Zaleckas <paulius.zaleckas@teltonika.lt>
- */
-#include <linux/bitops.h>
-#include <linux/irq.h>
-#include <linux/io.h>
-#include <linux/irqchip.h>
-#include <linux/irqchip/versatile-fpga.h>
-#include <linux/irqdomain.h>
-#include <linux/module.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-#include <linux/of_irq.h>
-#include <linux/cpu.h>
-
-#include <asm/exception.h>
-#include <asm/mach/irq.h>
-
-#define GEMINI_NUM_IRQS 32
-
-#define GEMINI_IRQ_SOURCE(base_addr)   (base_addr + 0x00)
-#define GEMINI_IRQ_MASK(base_addr)     (base_addr + 0x04)
-#define GEMINI_IRQ_CLEAR(base_addr)    (base_addr + 0x08)
-#define GEMINI_IRQ_MODE(base_addr)     (base_addr + 0x0C)
-#define GEMINI_IRQ_POLARITY(base_addr) (base_addr + 0x10)
-#define GEMINI_IRQ_STATUS(base_addr)   (base_addr + 0x14)
-#define GEMINI_FIQ_SOURCE(base_addr)   (base_addr + 0x20)
-#define GEMINI_FIQ_MASK(base_addr)     (base_addr + 0x24)
-#define GEMINI_FIQ_CLEAR(base_addr)    (base_addr + 0x28)
-#define GEMINI_FIQ_MODE(base_addr)     (base_addr + 0x2C)
-#define GEMINI_FIQ_POLARITY(base_addr) (base_addr + 0x30)
-#define GEMINI_FIQ_STATUS(base_addr)   (base_addr + 0x34)
-
-/**
- * struct gemini_irq_data - irq data container for the Gemini IRQ controller
- * @base: memory offset in virtual memory
- * @chip: chip container for this instance
- * @domain: IRQ domain for this instance
- */
-struct gemini_irq_data {
-       void __iomem *base;
-       struct irq_chip chip;
-       struct irq_domain *domain;
-};
-
-static void gemini_irq_mask(struct irq_data *d)
-{
-       struct gemini_irq_data *g = irq_data_get_irq_chip_data(d);
-       unsigned int mask;
-
-       mask = readl(GEMINI_IRQ_MASK(g->base));
-       mask &= ~BIT(irqd_to_hwirq(d));
-       writel(mask, GEMINI_IRQ_MASK(g->base));
-}
-
-static void gemini_irq_unmask(struct irq_data *d)
-{
-       struct gemini_irq_data *g = irq_data_get_irq_chip_data(d);
-       unsigned int mask;
-
-       mask = readl(GEMINI_IRQ_MASK(g->base));
-       mask |= BIT(irqd_to_hwirq(d));
-       writel(mask, GEMINI_IRQ_MASK(g->base));
-}
-
-static void gemini_irq_ack(struct irq_data *d)
-{
-       struct gemini_irq_data *g = irq_data_get_irq_chip_data(d);
-
-       writel(BIT(irqd_to_hwirq(d)), GEMINI_IRQ_CLEAR(g->base));
-}
-
-static int gemini_irq_set_type(struct irq_data *d, unsigned int trigger)
-{
-       struct gemini_irq_data *g = irq_data_get_irq_chip_data(d);
-       int offset = irqd_to_hwirq(d);
-       u32 mode, polarity;
-
-       mode = readl(GEMINI_IRQ_MODE(g->base));
-       polarity = readl(GEMINI_IRQ_POLARITY(g->base));
-
-       if (trigger & (IRQ_TYPE_LEVEL_HIGH)) {
-               irq_set_handler_locked(d, handle_level_irq);
-               /* Disable edge detection */
-               mode &= ~BIT(offset);
-               polarity &= ~BIT(offset);
-       } else if (trigger & IRQ_TYPE_EDGE_RISING) {
-               irq_set_handler_locked(d, handle_edge_irq);
-               mode |= BIT(offset);
-               polarity |= BIT(offset);
-       } else if (trigger & IRQ_TYPE_EDGE_FALLING) {
-               irq_set_handler_locked(d, handle_edge_irq);
-               mode |= BIT(offset);
-               polarity &= ~BIT(offset);
-       } else {
-               irq_set_handler_locked(d, handle_bad_irq);
-               pr_warn("GEMINI IRQ: no supported trigger selected for line %d\n",
-                       offset);
-       }
-
-       writel(mode, GEMINI_IRQ_MODE(g->base));
-       writel(polarity, GEMINI_IRQ_POLARITY(g->base));
-
-       return 0;
-}
-
-static struct irq_chip gemini_irq_chip = {
-       .name           = "GEMINI",
-       .irq_ack        = gemini_irq_ack,
-       .irq_mask       = gemini_irq_mask,
-       .irq_unmask     = gemini_irq_unmask,
-       .irq_set_type   = gemini_irq_set_type,
-};
-
-/* Local static for the IRQ entry call */
-static struct gemini_irq_data girq;
-
-asmlinkage void __exception_irq_entry gemini_irqchip_handle_irq(struct pt_regs *regs)
-{
-       struct gemini_irq_data *g = &girq;
-       int irq;
-       u32 status;
-
-       while ((status = readl(GEMINI_IRQ_STATUS(g->base)))) {
-               irq = ffs(status) - 1;
-               handle_domain_irq(g->domain, irq, regs);
-       }
-}
-
-static int gemini_irqdomain_map(struct irq_domain *d, unsigned int irq,
-                               irq_hw_number_t hwirq)
-{
-       struct gemini_irq_data *g = d->host_data;
-
-       irq_set_chip_data(irq, g);
-       /* All IRQs should set up their type, flags as bad by default */
-       irq_set_chip_and_handler(irq, &gemini_irq_chip, handle_bad_irq);
-       irq_set_probe(irq);
-
-       return 0;
-}
-
-static void gemini_irqdomain_unmap(struct irq_domain *d, unsigned int irq)
-{
-       irq_set_chip_and_handler(irq, NULL, NULL);
-       irq_set_chip_data(irq, NULL);
-}
-
-static const struct irq_domain_ops gemini_irqdomain_ops = {
-       .map = gemini_irqdomain_map,
-       .unmap = gemini_irqdomain_unmap,
-       .xlate = irq_domain_xlate_onetwocell,
-};
-
-int __init gemini_of_init_irq(struct device_node *node,
-                             struct device_node *parent)
-{
-       struct gemini_irq_data *g = &girq;
-
-       /*
-        * Disable the idle handler by default since it is buggy
-        * For more info see arch/arm/mach-gemini/idle.c
-        */
-       cpu_idle_poll_ctrl(true);
-
-       g->base = of_iomap(node, 0);
-       WARN(!g->base, "unable to map gemini irq registers\n");
-
-       /* Disable all interrupts */
-       writel(0, GEMINI_IRQ_MASK(g->base));
-       writel(0, GEMINI_FIQ_MASK(g->base));
-
-       g->domain = irq_domain_add_simple(node, GEMINI_NUM_IRQS, 0,
-                                         &gemini_irqdomain_ops, g);
-       set_handle_irq(gemini_irqchip_handle_irq);
-
-       return 0;
-}
-IRQCHIP_DECLARE(gemini, "cortina,gemini-interrupt-controller",
-               gemini_of_init_irq);
index 470b4aa..9e9dda3 100644 (file)
@@ -15,6 +15,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/acpi_iort.h>
 #include <linux/device.h>
 #include <linux/msi.h>
 #include <linux/of.h>
@@ -24,15 +25,11 @@ static struct irq_chip its_pmsi_irq_chip = {
        .name                   = "ITS-pMSI",
 };
 
-static int its_pmsi_prepare(struct irq_domain *domain, struct device *dev,
-                           int nvec, msi_alloc_info_t *info)
+static int of_pmsi_get_dev_id(struct irq_domain *domain, struct device *dev,
+                                 u32 *dev_id)
 {
-       struct msi_domain_info *msi_info;
-       u32 dev_id;
        int ret, index = 0;
 
-       msi_info = msi_get_domain_info(domain->parent);
-
        /* Suck the DeviceID out of the msi-parent property */
        do {
                struct of_phandle_args args;
@@ -43,11 +40,32 @@ static int its_pmsi_prepare(struct irq_domain *domain, struct device *dev,
                if (args.np == irq_domain_get_of_node(domain)) {
                        if (WARN_ON(args.args_count != 1))
                                return -EINVAL;
-                       dev_id = args.args[0];
+                       *dev_id = args.args[0];
                        break;
                }
        } while (!ret);
 
+       return ret;
+}
+
+/*
+ * Weak fallback that always fails with -1, which makes its_pmsi_prepare()
+ * bail out for devices without an of_node.
+ * NOTE(review): presumably overridden by the ACPI IORT code when that is
+ * built in - confirm.
+ */
+int __weak iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id)
+{
+       return -1;
+}
+
+static int its_pmsi_prepare(struct irq_domain *domain, struct device *dev,
+                           int nvec, msi_alloc_info_t *info)
+{
+       struct msi_domain_info *msi_info;
+       u32 dev_id;
+       int ret;
+
+       msi_info = msi_get_domain_info(domain->parent);
+
+       if (dev->of_node)
+               ret = of_pmsi_get_dev_id(domain, dev, &dev_id);
+       else
+               ret = iort_pmsi_get_dev_id(dev, &dev_id);
        if (ret)
                return ret;
 
@@ -73,34 +91,79 @@ static struct of_device_id its_device_id[] = {
        {},
 };
 
-static int __init its_pmsi_init(void)
+static int __init its_pmsi_init_one(struct fwnode_handle *fwnode,
+                               const char *name)
 {
-       struct device_node *np;
        struct irq_domain *parent;
 
+       parent = irq_find_matching_fwnode(fwnode, DOMAIN_BUS_NEXUS);
+       if (!parent || !msi_get_domain_info(parent)) {
+               pr_err("%s: unable to locate ITS domain\n", name);
+               return -ENXIO;
+       }
+
+       if (!platform_msi_create_irq_domain(fwnode, &its_pmsi_domain_info,
+                                           parent)) {
+               pr_err("%s: unable to create platform domain\n", name);
+               return -ENXIO;
+       }
+
+       pr_info("Platform MSI: %s domain created\n", name);
+       return 0;
+}
+
+#ifdef CONFIG_ACPI
+/*
+ * MADT callback, invoked for each GIC ITS (generic translator) entry: look
+ * up the domain token registered for the entry's translation ID and create
+ * the platform MSI domain on top of it. The "ITS@0x..." name is only used
+ * in log messages and is freed before returning.
+ *
+ * Returns the result of its_pmsi_init_one(), or -ENXIO when no domain
+ * token is found.
+ */
+static int __init
+its_pmsi_parse_madt(struct acpi_subtable_header *header,
+                       const unsigned long end)
+{
+       struct acpi_madt_generic_translator *its_entry =
+               (struct acpi_madt_generic_translator *)header;
+       struct fwnode_handle *domain_handle;
+       const char *node_name;
+       int err;
+
+       node_name = kasprintf(GFP_KERNEL, "ITS@0x%lx",
+                             (long)its_entry->base_address);
+       domain_handle = iort_find_domain_token(its_entry->translation_id);
+
+       if (domain_handle) {
+               err = its_pmsi_init_one(domain_handle, node_name);
+       } else {
+               pr_err("%s: Unable to locate ITS domain handle\n", node_name);
+               err = -ENXIO;
+       }
+
+       kfree(node_name);
+       return err;
+}
+
+/* ACPI path: run its_pmsi_parse_madt() over the MADT generic translator entries. */
+static void __init its_pmsi_acpi_init(void)
+{
+       acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_TRANSLATOR,
+                             its_pmsi_parse_madt, 0);
+}
+#else
+static inline void its_pmsi_acpi_init(void) { }
+#endif
+
+static void __init its_pmsi_of_init(void)
+{
+       struct device_node *np;
+
        for (np = of_find_matching_node(NULL, its_device_id); np;
             np = of_find_matching_node(np, its_device_id)) {
                if (!of_property_read_bool(np, "msi-controller"))
                        continue;
 
-               parent = irq_find_matching_host(np, DOMAIN_BUS_NEXUS);
-               if (!parent || !msi_get_domain_info(parent)) {
-                       pr_err("%s: unable to locate ITS domain\n",
-                              np->full_name);
-                       continue;
-               }
-
-               if (!platform_msi_create_irq_domain(of_node_to_fwnode(np),
-                                                   &its_pmsi_domain_info,
-                                                   parent)) {
-                       pr_err("%s: unable to create platform domain\n",
-                              np->full_name);
-                       continue;
-               }
-
-               pr_info("Platform MSI: %s domain created\n", np->full_name);
+               its_pmsi_init_one(of_node_to_fwnode(np), np->full_name);
        }
+}
 
+/*
+ * Early initcall: try the device tree path first, then the ACPI path.
+ * Failures of individual ITS nodes are not propagated; this always
+ * returns 0.
+ */
+static int __init its_pmsi_init(void)
+{
+       its_pmsi_of_init();
+       its_pmsi_acpi_init();
         return 0;
 }
 early_initcall(its_pmsi_init);
index f77f840..45ea193 100644 (file)
  */
 
 #include <linux/acpi.h>
+#include <linux/acpi_iort.h>
 #include <linux/bitmap.h>
 #include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/dma-iommu.h>
 #include <linux/interrupt.h>
 #include <linux/irqdomain.h>
-#include <linux/acpi_iort.h>
 #include <linux/log2.h>
 #include <linux/mm.h>
 #include <linux/msi.h>
index 15af9a9..9463f35 100644 (file)
@@ -230,6 +230,8 @@ static int __init imx_gpcv2_irqchip_init(struct device_node *node,
                return -ENOMEM;
        }
 
+       raw_spin_lock_init(&cd->rlock);
+
        cd->gpc_base = of_iomap(node, 0);
        if (!cd->gpc_base) {
                pr_err("fsl-gpcv2: unable to map gpc registers\n");
@@ -266,6 +268,11 @@ static int __init imx_gpcv2_irqchip_init(struct device_node *node,
        imx_gpcv2_instance = cd;
        register_syscore_ops(&imx_gpcv2_syscore_ops);
 
+       /*
+        * Clear the OF_POPULATED flag set in of_irq_init so that
+        * later the GPC power domain driver will not be skipped.
+        */
+       of_node_clear_flag(node, OF_POPULATED);
        return 0;
 }
 
index 03b79b0..d2306c8 100644 (file)
@@ -16,6 +16,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/acpi.h>
 #include <linux/interrupt.h>
 #include <linux/irqchip.h>
 #include <linux/module.h>
@@ -180,7 +181,7 @@ static int mbigen_domain_translate(struct irq_domain *d,
                                    unsigned long *hwirq,
                                    unsigned int *type)
 {
-       if (is_of_node(fwspec->fwnode)) {
+       if (is_of_node(fwspec->fwnode) || is_acpi_device_node(fwspec->fwnode)) {
                if (fwspec->param_count != 2)
                        return -EINVAL;
 
@@ -236,27 +237,15 @@ static struct irq_domain_ops mbigen_domain_ops = {
        .free           = irq_domain_free_irqs_common,
 };
 
-static int mbigen_device_probe(struct platform_device *pdev)
+static int mbigen_of_create_domain(struct platform_device *pdev,
+                                  struct mbigen_device *mgn_chip)
 {
-       struct mbigen_device *mgn_chip;
+       struct device *parent;
        struct platform_device *child;
        struct irq_domain *domain;
        struct device_node *np;
-       struct device *parent;
-       struct resource *res;
        u32 num_pins;
 
-       mgn_chip = devm_kzalloc(&pdev->dev, sizeof(*mgn_chip), GFP_KERNEL);
-       if (!mgn_chip)
-               return -ENOMEM;
-
-       mgn_chip->pdev = pdev;
-
-       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       mgn_chip->base = devm_ioremap_resource(&pdev->dev, res);
-       if (IS_ERR(mgn_chip->base))
-               return PTR_ERR(mgn_chip->base);
-
        for_each_child_of_node(pdev->dev.of_node, np) {
                if (!of_property_read_bool(np, "interrupt-controller"))
                        continue;
@@ -280,6 +269,91 @@ static int mbigen_device_probe(struct platform_device *pdev)
                        return -ENOMEM;
        }
 
+       return 0;
+}
+
+#ifdef CONFIG_ACPI
+/*
+ * ACPI path: create one platform MSI device domain that covers every pin
+ * of this mbigen instance.
+ *
+ * Returns 0 on success, -EINVAL when "num-pins" is missing or zero, and
+ * -ENOMEM when the domain cannot be created.
+ */
+static int mbigen_acpi_create_domain(struct platform_device *pdev,
+                                    struct mbigen_device *mgn_chip)
+{
+       struct device *dev = &pdev->dev;
+       u32 nr_pins = 0;
+
+       /*
+        * "num-pins" is the total number of interrupt pins implemented in
+        * this mbigen instance. mbigen sits in front of the ITS and turns
+        * wired interrupts into MSIs, so this is also the number of MSI
+        * vectors to allocate for the client devices connected to it.
+        *
+        * Here is the DSDT device node used for mbigen in firmware:
+        *      Device(MBI0) {
+        *              Name(_HID, "HISI0152")
+        *              Name(_UID, Zero)
+        *              Name(_CRS, ResourceTemplate() {
+        *                      Memory32Fixed(ReadWrite, 0xa0080000, 0x10000)
+        *              })
+        *
+        *              Name(_DSD, Package () {
+        *                      ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
+        *                      Package () {
+        *                              Package () {"num-pins", 378}
+        *                      }
+        *              })
+        *      }
+        */
+       if (device_property_read_u32(dev, "num-pins", &nr_pins) || !nr_pins)
+               return -EINVAL;
+
+       if (!platform_msi_create_device_domain(dev, nr_pins,
+                                              mbigen_write_msg,
+                                              &mbigen_domain_ops,
+                                              mgn_chip))
+               return -ENOMEM;
+
+       return 0;
+}
+#else
+/* Stub for kernels built without CONFIG_ACPI: the ACPI path always fails. */
+static inline int mbigen_acpi_create_domain(struct platform_device *pdev,
+                                           struct mbigen_device *mgn_chip)
+{
+       return -ENODEV;
+}
+#endif
+
+/**
+ * mbigen_device_probe() - bind an mbigen platform device
+ * @pdev: platform device being probed
+ *
+ * Allocates the per-device state, maps memory resource 0 and creates the
+ * irqdomain(s) through the OF or the ACPI path, depending on how the
+ * device was described by firmware.
+ *
+ * Return: 0 on success, -ENOMEM or the ioremap error on setup failure,
+ * -EINVAL when the device has neither an OF node nor an ACPI companion,
+ * otherwise the error from the domain creation helper.
+ */
+static int mbigen_device_probe(struct platform_device *pdev)
+{
+       struct mbigen_device *mgn_chip;
+       struct resource *res;
+       int err;
+
+       mgn_chip = devm_kzalloc(&pdev->dev, sizeof(*mgn_chip), GFP_KERNEL);
+       if (!mgn_chip)
+               return -ENOMEM;
+
+       mgn_chip->pdev = pdev;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       mgn_chip->base = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(mgn_chip->base))
+               return PTR_ERR(mgn_chip->base);
+
+       if (IS_ENABLED(CONFIG_OF) && pdev->dev.of_node)
+               err = mbigen_of_create_domain(pdev, mgn_chip);
+       else if (ACPI_COMPANION(&pdev->dev))
+               err = mbigen_acpi_create_domain(pdev, mgn_chip);
+       else
+               err = -EINVAL;
+
+       if (err) {
+               /* Newline-terminate so the message is a complete log record */
+               dev_err(&pdev->dev, "Failed to create mbi-gen@%p irqdomain\n",
+                       mgn_chip->base);
+               return err;
+       }
+
         platform_set_drvdata(pdev, mgn_chip);
         return 0;
 }
@@ -290,11 +364,17 @@ static const struct of_device_id mbigen_of_match[] = {
 };
 MODULE_DEVICE_TABLE(of, mbigen_of_match);
 
+/* ACPI IDs this driver binds to; "HISI0152" is the _HID of the mbigen node. */
+static const struct acpi_device_id mbigen_acpi_match[] = {
+       { "HISI0152", 0 },
+       {}
+};
+MODULE_DEVICE_TABLE(acpi, mbigen_acpi_match);
+
 static struct platform_driver mbigen_platform_driver = {
        .driver = {
                .name           = "Hisilicon MBIGEN-V2",
-               .owner          = THIS_MODULE,
                .of_match_table = mbigen_of_match,
+               .acpi_match_table = ACPI_PTR(mbigen_acpi_match),
        },
        .probe                  = mbigen_device_probe,
 };
index cd20df1..eb7fbe1 100644 (file)
@@ -29,25 +29,12 @@ struct gic_pcpu_mask {
        DECLARE_BITMAP(pcpu_mask, GIC_MAX_INTRS);
 };
 
-struct gic_irq_spec {
-       enum {
-               GIC_DEVICE,
-               GIC_IPI
-       } type;
-
-       union {
-               struct cpumask *ipimask;
-               unsigned int hwirq;
-       };
-};
-
 static unsigned long __gic_base_addr;
 
 static void __iomem *gic_base;
 static struct gic_pcpu_mask pcpu_masks[NR_CPUS];
 static DEFINE_SPINLOCK(gic_lock);
 static struct irq_domain *gic_irq_domain;
-static struct irq_domain *gic_dev_domain;
 static struct irq_domain *gic_ipi_domain;
 static int gic_shared_intrs;
 static int gic_vpes;
@@ -55,6 +42,7 @@ static unsigned int gic_cpu_pin;
 static unsigned int timer_cpu_pin;
 static struct irq_chip gic_level_irq_controller, gic_edge_irq_controller;
 DECLARE_BITMAP(ipi_resrv, GIC_MAX_INTRS);
+DECLARE_BITMAP(ipi_available, GIC_MAX_INTRS);
 
 static void __gic_irq_dispatch(void);
 
@@ -693,132 +681,7 @@ static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq,
        return 0;
 }
 
-static int gic_setup_dev_chip(struct irq_domain *d, unsigned int virq,
-                             unsigned int hwirq)
-{
-       struct irq_chip *chip;
-       int err;
-
-       if (hwirq >= GIC_SHARED_HWIRQ_BASE) {
-               err = irq_domain_set_hwirq_and_chip(d, virq, hwirq,
-                                                   &gic_level_irq_controller,
-                                                   NULL);
-       } else {
-               switch (GIC_HWIRQ_TO_LOCAL(hwirq)) {
-               case GIC_LOCAL_INT_TIMER:
-               case GIC_LOCAL_INT_PERFCTR:
-               case GIC_LOCAL_INT_FDC:
-                       /*
-                        * HACK: These are all really percpu interrupts, but
-                        * the rest of the MIPS kernel code does not use the
-                        * percpu IRQ API for them.
-                        */
-                       chip = &gic_all_vpes_local_irq_controller;
-                       irq_set_handler(virq, handle_percpu_irq);
-                       break;
-
-               default:
-                       chip = &gic_local_irq_controller;
-                       irq_set_handler(virq, handle_percpu_devid_irq);
-                       irq_set_percpu_devid(virq);
-                       break;
-               }
-
-               err = irq_domain_set_hwirq_and_chip(d, virq, hwirq,
-                                                   chip, NULL);
-       }
-
-       return err;
-}
-
-static int gic_irq_domain_alloc(struct irq_domain *d, unsigned int virq,
-                               unsigned int nr_irqs, void *arg)
-{
-       struct gic_irq_spec *spec = arg;
-       irq_hw_number_t hwirq, base_hwirq;
-       int cpu, ret, i;
-
-       if (spec->type == GIC_DEVICE) {
-               /* verify that shared irqs don't conflict with an IPI irq */
-               if ((spec->hwirq >= GIC_SHARED_HWIRQ_BASE) &&
-                   test_bit(GIC_HWIRQ_TO_SHARED(spec->hwirq), ipi_resrv))
-                       return -EBUSY;
-
-               return gic_setup_dev_chip(d, virq, spec->hwirq);
-       } else {
-               base_hwirq = find_first_bit(ipi_resrv, gic_shared_intrs);
-               if (base_hwirq == gic_shared_intrs) {
-                       return -ENOMEM;
-               }
-
-               /* check that we have enough space */
-               for (i = base_hwirq; i < nr_irqs; i++) {
-                       if (!test_bit(i, ipi_resrv))
-                               return -EBUSY;
-               }
-               bitmap_clear(ipi_resrv, base_hwirq, nr_irqs);
-
-               /* map the hwirq for each cpu consecutively */
-               i = 0;
-               for_each_cpu(cpu, spec->ipimask) {
-                       hwirq = GIC_SHARED_TO_HWIRQ(base_hwirq + i);
-
-                       ret = irq_domain_set_hwirq_and_chip(d, virq + i, hwirq,
-                                                           &gic_level_irq_controller,
-                                                           NULL);
-                       if (ret)
-                               goto error;
-
-                       irq_set_handler(virq + i, handle_level_irq);
-
-                       ret = gic_shared_irq_domain_map(d, virq + i, hwirq, cpu);
-                       if (ret)
-                               goto error;
-
-                       i++;
-               }
-
-               /*
-                * tell the parent about the base hwirq we allocated so it can
-                * set its own domain data
-                */
-               spec->hwirq = base_hwirq;
-       }
-
-       return 0;
-error:
-       bitmap_set(ipi_resrv, base_hwirq, nr_irqs);
-       return ret;
-}
-
-void gic_irq_domain_free(struct irq_domain *d, unsigned int virq,
-                        unsigned int nr_irqs)
-{
-       irq_hw_number_t base_hwirq;
-       struct irq_data *data;
-
-       data = irq_get_irq_data(virq);
-       if (!data)
-               return;
-
-       base_hwirq = GIC_HWIRQ_TO_SHARED(irqd_to_hwirq(data));
-       bitmap_set(ipi_resrv, base_hwirq, nr_irqs);
-}
-
-int gic_irq_domain_match(struct irq_domain *d, struct device_node *node,
-                        enum irq_domain_bus_token bus_token)
-{
-       /* this domain should'nt be accessed directly */
-       return 0;
-}
-
-static const struct irq_domain_ops gic_irq_domain_ops = {
-       .alloc = gic_irq_domain_alloc,
-       .free = gic_irq_domain_free,
-       .match = gic_irq_domain_match,
-};
-
-static int gic_dev_domain_xlate(struct irq_domain *d, struct device_node *ctrlr,
+static int gic_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr,
                                const u32 *intspec, unsigned int intsize,
                                irq_hw_number_t *out_hwirq,
                                unsigned int *out_type)
@@ -837,58 +700,82 @@ static int gic_dev_domain_xlate(struct irq_domain *d, struct device_node *ctrlr,
        return 0;
 }
 
-static int gic_dev_domain_alloc(struct irq_domain *d, unsigned int virq,
-                               unsigned int nr_irqs, void *arg)
+static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq,
+                             irq_hw_number_t hwirq)
 {
-       struct irq_fwspec *fwspec = arg;
-       struct gic_irq_spec spec = {
-               .type = GIC_DEVICE,
-       };
-       int i, ret;
+       int err;
 
-       if (fwspec->param[0] == GIC_SHARED)
-               spec.hwirq = GIC_SHARED_TO_HWIRQ(fwspec->param[1]);
-       else
-               spec.hwirq = GIC_LOCAL_TO_HWIRQ(fwspec->param[1]);
+       if (hwirq >= GIC_SHARED_HWIRQ_BASE) {
+               /* verify that shared irqs don't conflict with an IPI irq */
+               if (test_bit(GIC_HWIRQ_TO_SHARED(hwirq), ipi_resrv))
+                       return -EBUSY;
 
-       ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, &spec);
-       if (ret)
-               return ret;
+               err = irq_domain_set_hwirq_and_chip(d, virq, hwirq,
+                                                   &gic_level_irq_controller,
+                                                   NULL);
+               if (err)
+                       return err;
 
-       for (i = 0; i < nr_irqs; i++) {
-               ret = gic_setup_dev_chip(d, virq + i, spec.hwirq + i);
-               if (ret)
-                       goto error;
+               return gic_shared_irq_domain_map(d, virq, hwirq, 0);
        }
 
-       return 0;
+       switch (GIC_HWIRQ_TO_LOCAL(hwirq)) {
+       case GIC_LOCAL_INT_TIMER:
+       case GIC_LOCAL_INT_PERFCTR:
+       case GIC_LOCAL_INT_FDC:
+               /*
+                * HACK: These are all really percpu interrupts, but
+                * the rest of the MIPS kernel code does not use the
+                * percpu IRQ API for them.
+                */
+               err = irq_domain_set_hwirq_and_chip(d, virq, hwirq,
+                                                   &gic_all_vpes_local_irq_controller,
+                                                   NULL);
+               if (err)
+                       return err;
 
-error:
-       irq_domain_free_irqs_parent(d, virq, nr_irqs);
-       return ret;
+               irq_set_handler(virq, handle_percpu_irq);
+               break;
+
+       default:
+               err = irq_domain_set_hwirq_and_chip(d, virq, hwirq,
+                                                   &gic_local_irq_controller,
+                                                   NULL);
+               if (err)
+                       return err;
+
+               irq_set_handler(virq, handle_percpu_devid_irq);
+               irq_set_percpu_devid(virq);
+               break;
+       }
+
+       return gic_local_irq_domain_map(d, virq, hwirq);
 }
 
-void gic_dev_domain_free(struct irq_domain *d, unsigned int virq,
-                        unsigned int nr_irqs)
+static int gic_irq_domain_alloc(struct irq_domain *d, unsigned int virq,
+                               unsigned int nr_irqs, void *arg)
 {
-       /* no real allocation is done for dev irqs, so no need to free anything */
-       return;
+       struct irq_fwspec *fwspec = arg;
+       irq_hw_number_t hwirq;
+
+       if (fwspec->param[0] == GIC_SHARED)
+               hwirq = GIC_SHARED_TO_HWIRQ(fwspec->param[1]);
+       else
+               hwirq = GIC_LOCAL_TO_HWIRQ(fwspec->param[1]);
+
+       return gic_irq_domain_map(d, virq, hwirq);
 }
 
-static void gic_dev_domain_activate(struct irq_domain *domain,
-                                   struct irq_data *d)
+void gic_irq_domain_free(struct irq_domain *d, unsigned int virq,
+                        unsigned int nr_irqs)
 {
-       if (GIC_HWIRQ_TO_LOCAL(d->hwirq) < GIC_NUM_LOCAL_INTRS)
-               gic_local_irq_domain_map(domain, d->irq, d->hwirq);
-       else
-               gic_shared_irq_domain_map(domain, d->irq, d->hwirq, 0);
 }
 
-static struct irq_domain_ops gic_dev_domain_ops = {
-       .xlate = gic_dev_domain_xlate,
-       .alloc = gic_dev_domain_alloc,
-       .free = gic_dev_domain_free,
-       .activate = gic_dev_domain_activate,
+static const struct irq_domain_ops gic_irq_domain_ops = {
+       .xlate = gic_irq_domain_xlate,
+       .alloc = gic_irq_domain_alloc,
+       .free = gic_irq_domain_free,
+       .map = gic_irq_domain_map,
 };
 
 static int gic_ipi_domain_xlate(struct irq_domain *d, struct device_node *ctrlr,
@@ -910,20 +797,32 @@ static int gic_ipi_domain_alloc(struct irq_domain *d, unsigned int virq,
                                unsigned int nr_irqs, void *arg)
 {
        struct cpumask *ipimask = arg;
-       struct gic_irq_spec spec = {
-               .type = GIC_IPI,
-               .ipimask = ipimask
-       };
-       int ret, i;
-
-       ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, &spec);
-       if (ret)
-               return ret;
-
-       /* the parent should have set spec.hwirq to the base_hwirq it allocated */
-       for (i = 0; i < nr_irqs; i++) {
-               ret = irq_domain_set_hwirq_and_chip(d, virq + i,
-                                                   GIC_SHARED_TO_HWIRQ(spec.hwirq + i),
+       irq_hw_number_t hwirq, base_hwirq;
+       int cpu, ret, i;
+
+       base_hwirq = find_first_bit(ipi_available, gic_shared_intrs);
+       if (base_hwirq == gic_shared_intrs)
+               return -ENOMEM;
+
+       /* check that we have enough space */
+       for (i = base_hwirq; i < nr_irqs; i++) {
+               if (!test_bit(i, ipi_available))
+                       return -EBUSY;
+       }
+       bitmap_clear(ipi_available, base_hwirq, nr_irqs);
+
+       /* map the hwirq for each cpu consecutively */
+       i = 0;
+       for_each_cpu(cpu, ipimask) {
+               hwirq = GIC_SHARED_TO_HWIRQ(base_hwirq + i);
+
+               ret = irq_domain_set_hwirq_and_chip(d, virq + i, hwirq,
+                                                   &gic_edge_irq_controller,
+                                                   NULL);
+               if (ret)
+                       goto error;
+
+               ret = irq_domain_set_hwirq_and_chip(d->parent, virq + i, hwirq,
                                                    &gic_edge_irq_controller,
                                                    NULL);
                if (ret)
@@ -932,18 +831,32 @@ static int gic_ipi_domain_alloc(struct irq_domain *d, unsigned int virq,
                ret = irq_set_irq_type(virq + i, IRQ_TYPE_EDGE_RISING);
                if (ret)
                        goto error;
+
+               ret = gic_shared_irq_domain_map(d, virq + i, hwirq, cpu);
+               if (ret)
+                       goto error;
+
+               i++;
        }
 
        return 0;
 error:
-       irq_domain_free_irqs_parent(d, virq, nr_irqs);
+       bitmap_set(ipi_available, base_hwirq, nr_irqs);
        return ret;
 }
 
 void gic_ipi_domain_free(struct irq_domain *d, unsigned int virq,
                         unsigned int nr_irqs)
 {
-       irq_domain_free_irqs_parent(d, virq, nr_irqs);
+       irq_hw_number_t base_hwirq;
+       struct irq_data *data;
+
+       data = irq_get_irq_data(virq);
+       if (!data)
+               return;
+
+       base_hwirq = GIC_HWIRQ_TO_SHARED(irqd_to_hwirq(data));
+       bitmap_set(ipi_available, base_hwirq, nr_irqs);
 }
 
 int gic_ipi_domain_match(struct irq_domain *d, struct device_node *node,
@@ -968,38 +881,6 @@ static struct irq_domain_ops gic_ipi_domain_ops = {
        .match = gic_ipi_domain_match,
 };
 
-static void __init gic_map_single_int(struct device_node *node,
-                                     unsigned int irq)
-{
-       unsigned int linux_irq;
-       struct irq_fwspec local_int_fwspec = {
-               .fwnode         = &node->fwnode,
-               .param_count    = 3,
-               .param          = {
-                       [0]     = GIC_LOCAL,
-                       [1]     = irq,
-                       [2]     = IRQ_TYPE_NONE,
-               },
-       };
-
-       if (!gic_local_irq_is_routable(irq))
-               return;
-
-       linux_irq = irq_create_fwspec_mapping(&local_int_fwspec);
-       WARN_ON(!linux_irq);
-}
-
-static void __init gic_map_interrupts(struct device_node *node)
-{
-       gic_map_single_int(node, GIC_LOCAL_INT_WD);
-       gic_map_single_int(node, GIC_LOCAL_INT_COMPARE);
-       gic_map_single_int(node, GIC_LOCAL_INT_TIMER);
-       gic_map_single_int(node, GIC_LOCAL_INT_PERFCTR);
-       gic_map_single_int(node, GIC_LOCAL_INT_SWINT0);
-       gic_map_single_int(node, GIC_LOCAL_INT_SWINT1);
-       gic_map_single_int(node, GIC_LOCAL_INT_FDC);
-}
-
 static void __init __gic_init(unsigned long gic_base_addr,
                              unsigned long gic_addrspace_size,
                              unsigned int cpu_vec, unsigned int irqbase,
@@ -1071,13 +952,6 @@ static void __init __gic_init(unsigned long gic_base_addr,
                panic("Failed to add GIC IRQ domain");
        gic_irq_domain->name = "mips-gic-irq";
 
-       gic_dev_domain = irq_domain_add_hierarchy(gic_irq_domain, 0,
-                                                 GIC_NUM_LOCAL_INTRS + gic_shared_intrs,
-                                                 node, &gic_dev_domain_ops, NULL);
-       if (!gic_dev_domain)
-               panic("Failed to add GIC DEV domain");
-       gic_dev_domain->name = "mips-gic-dev";
-
        gic_ipi_domain = irq_domain_add_hierarchy(gic_irq_domain,
                                                  IRQ_DOMAIN_FLAG_IPI_PER_CPU,
                                                  GIC_NUM_LOCAL_INTRS + gic_shared_intrs,
@@ -1098,8 +972,8 @@ static void __init __gic_init(unsigned long gic_base_addr,
                           2 * gic_vpes);
        }
 
+       bitmap_copy(ipi_available, ipi_resrv, GIC_MAX_INTRS);
        gic_basic_init();
-       gic_map_interrupts(node);
 }
 
 void __init gic_init(unsigned long gic_base_addr,
diff --git a/drivers/irqchip/irq-moxart.c b/drivers/irqchip/irq-moxart.c
deleted file mode 100644 (file)
index a24b06a..0000000
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * MOXA ART SoCs IRQ chip driver.
- *
- * Copyright (C) 2013 Jonas Jensen
- *
- * Jonas Jensen <jonas.jensen@gmail.com>
- *
- * This file is licensed under the terms of the GNU General Public
- * License version 2.  This program is licensed "as is" without any
- * warranty of any kind, whether express or implied.
- */
-
-#include <linux/io.h>
-#include <linux/irq.h>
-#include <linux/irqchip.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-#include <linux/of_irq.h>
-#include <linux/irqdomain.h>
-
-#include <asm/exception.h>
-
-#define IRQ_SOURCE_REG         0
-#define IRQ_MASK_REG           0x04
-#define IRQ_CLEAR_REG          0x08
-#define IRQ_MODE_REG           0x0c
-#define IRQ_LEVEL_REG          0x10
-#define IRQ_STATUS_REG         0x14
-
-#define FIQ_SOURCE_REG         0x20
-#define FIQ_MASK_REG           0x24
-#define FIQ_CLEAR_REG          0x28
-#define FIQ_MODE_REG           0x2c
-#define FIQ_LEVEL_REG          0x30
-#define FIQ_STATUS_REG         0x34
-
-
-struct moxart_irq_data {
-       void __iomem *base;
-       struct irq_domain *domain;
-       unsigned int interrupt_mask;
-};
-
-static struct moxart_irq_data intc;
-
-static void __exception_irq_entry handle_irq(struct pt_regs *regs)
-{
-       u32 irqstat;
-       int hwirq;
-
-       irqstat = readl(intc.base + IRQ_STATUS_REG);
-
-       while (irqstat) {
-               hwirq = ffs(irqstat) - 1;
-               handle_IRQ(irq_linear_revmap(intc.domain, hwirq), regs);
-               irqstat &= ~(1 << hwirq);
-       }
-}
-
-static int __init moxart_of_intc_init(struct device_node *node,
-                                     struct device_node *parent)
-{
-       unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN;
-       int ret;
-       struct irq_chip_generic *gc;
-
-       intc.base = of_iomap(node, 0);
-       if (!intc.base) {
-               pr_err("%s: unable to map IC registers\n",
-                      node->full_name);
-               return -EINVAL;
-       }
-
-       intc.domain = irq_domain_add_linear(node, 32, &irq_generic_chip_ops,
-                                           intc.base);
-       if (!intc.domain) {
-               pr_err("%s: unable to create IRQ domain\n", node->full_name);
-               return -EINVAL;
-       }
-
-       ret = irq_alloc_domain_generic_chips(intc.domain, 32, 1,
-                                            "MOXARTINTC", handle_edge_irq,
-                                            clr, 0, IRQ_GC_INIT_MASK_CACHE);
-       if (ret) {
-               pr_err("%s: could not allocate generic chip\n",
-                      node->full_name);
-               irq_domain_remove(intc.domain);
-               return -EINVAL;
-       }
-
-       ret = of_property_read_u32(node, "interrupt-mask",
-                                  &intc.interrupt_mask);
-       if (ret)
-               pr_err("%s: could not read interrupt-mask DT property\n",
-                      node->full_name);
-
-       gc = irq_get_domain_generic_chip(intc.domain, 0);
-
-       gc->reg_base = intc.base;
-       gc->chip_types[0].regs.mask = IRQ_MASK_REG;
-       gc->chip_types[0].regs.ack = IRQ_CLEAR_REG;
-       gc->chip_types[0].chip.irq_ack = irq_gc_ack_set_bit;
-       gc->chip_types[0].chip.irq_mask = irq_gc_mask_clr_bit;
-       gc->chip_types[0].chip.irq_unmask = irq_gc_mask_set_bit;
-
-       writel(0, intc.base + IRQ_MASK_REG);
-       writel(0xffffffff, intc.base + IRQ_CLEAR_REG);
-
-       writel(intc.interrupt_mask, intc.base + IRQ_MODE_REG);
-       writel(intc.interrupt_mask, intc.base + IRQ_LEVEL_REG);
-
-       set_handle_irq(handle_irq);
-
-       return 0;
-}
-IRQCHIP_DECLARE(moxa_moxart_ic, "moxa,moxart-ic", moxart_of_intc_init);
diff --git a/drivers/irqchip/irq-mtk-cirq.c b/drivers/irqchip/irq-mtk-cirq.c
new file mode 100644 (file)
index 0000000..18c65c1
--- /dev/null
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2016 MediaTek Inc.
+ * Author: Youlin.Pei <youlin.pei@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqdomain.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
+#include <linux/of_address.h>
+#include <linux/slab.h>
+#include <linux/syscore_ops.h>
+
+#define CIRQ_ACK       0x40
+#define CIRQ_MASK_SET  0xc0
+#define CIRQ_MASK_CLR  0x100
+#define CIRQ_SENS_SET  0x180
+#define CIRQ_SENS_CLR  0x1c0
+#define CIRQ_POL_SET   0x240
+#define CIRQ_POL_CLR   0x280
+#define CIRQ_CONTROL   0x300
+
+#define CIRQ_EN        0x1
+#define CIRQ_EDGE      0x2
+#define CIRQ_FLUSH     0x4
+
+struct mtk_cirq_chip_data {
+       void __iomem *base;             /* register window mapped with of_iomap(node, 0) */
+       unsigned int ext_irq_start;     /* cell 0 of "mediatek,ext-irq-range" */
+       unsigned int ext_irq_end;       /* cell 1 of "mediatek,ext-irq-range" (inclusive) */
+       struct irq_domain *domain;      /* hierarchy domain created in mtk_cirq_of_init() */
+};
+
+static struct mtk_cirq_chip_data *cirq_data;   /* allocated and filled in by mtk_cirq_of_init() */
+
+/*
+ * Write the bit belonging to @data's hwirq into one register group.
+ *
+ * @offset is the base of the group (CIRQ_MASK_SET, CIRQ_POL_CLR, ...).
+ * Each 32-bit register of a group covers 32 consecutive hwirqs, so the
+ * register is selected by hwirq / 32 and the bit by hwirq % 32. Only that
+ * single bit is written; nothing is read back or merged.
+ */
+static void mtk_cirq_write_mask(struct irq_data *data, unsigned int offset)
+{
+       struct mtk_cirq_chip_data *chip_data = data->chip_data;
+       unsigned int cirq_num = data->hwirq;
+       /* Unsigned constant: shifting a signed 1 into bit 31 is undefined. */
+       u32 mask = 1U << (cirq_num % 32);
+
+       writel_relaxed(mask, chip_data->base + offset + (cirq_num / 32) * 4);
+}
+
+static void mtk_cirq_mask(struct irq_data *data)
+{
+       mtk_cirq_write_mask(data, CIRQ_MASK_SET);       /* write this hwirq's bit to the MASK_SET group */
+       irq_chip_mask_parent(data);                     /* then mask at the parent chip as well */
+}
+
+static void mtk_cirq_unmask(struct irq_data *data)
+{
+       mtk_cirq_write_mask(data, CIRQ_MASK_CLR);       /* write this hwirq's bit to the MASK_CLR group */
+       irq_chip_unmask_parent(data);                   /* then unmask at the parent chip as well */
+}
+
+/*
+ * Program the CIRQ polarity/sensitivity bits for @data, then pass the
+ * trigger type on to the parent interrupt chip.
+ *
+ * POL_SET is written for rising-edge and high-level, POL_CLR for
+ * falling-edge and low-level; SENS_SET is written for level triggers and
+ * SENS_CLR for edge triggers. Any other sense value leaves the CIRQ
+ * registers untouched but is still forwarded to the parent.
+ *
+ * Returns the result of the parent's irq_set_type callback, or the error
+ * reported by irq_chip_set_type_parent() when the parent has no callback.
+ */
+static int mtk_cirq_set_type(struct irq_data *data, unsigned int type)
+{
+       switch (type & IRQ_TYPE_SENSE_MASK) {
+       case IRQ_TYPE_EDGE_FALLING:
+               mtk_cirq_write_mask(data, CIRQ_POL_CLR);
+               mtk_cirq_write_mask(data, CIRQ_SENS_CLR);
+               break;
+       case IRQ_TYPE_EDGE_RISING:
+               mtk_cirq_write_mask(data, CIRQ_POL_SET);
+               mtk_cirq_write_mask(data, CIRQ_SENS_CLR);
+               break;
+       case IRQ_TYPE_LEVEL_LOW:
+               mtk_cirq_write_mask(data, CIRQ_POL_CLR);
+               mtk_cirq_write_mask(data, CIRQ_SENS_SET);
+               break;
+       case IRQ_TYPE_LEVEL_HIGH:
+               mtk_cirq_write_mask(data, CIRQ_POL_SET);
+               mtk_cirq_write_mask(data, CIRQ_SENS_SET);
+               break;
+       default:
+               break;
+       }
+
+       /*
+        * Use the genirq helper, like the other parent forwards in this
+        * driver, rather than calling through parent_data->chip by hand:
+        * the helper copes with a parent chip that has no irq_set_type.
+        */
+       return irq_chip_set_type_parent(data, type);
+}
+
+static struct irq_chip mtk_cirq_chip = {
+       .name                   = "MT_CIRQ",
+       .irq_mask               = mtk_cirq_mask,        /* CIRQ MASK_SET write + parent mask */
+       .irq_unmask             = mtk_cirq_unmask,      /* CIRQ MASK_CLR write + parent unmask */
+       .irq_eoi                = irq_chip_eoi_parent,  /* genirq parent-forwarding helper */
+       .irq_set_type           = mtk_cirq_set_type,    /* CIRQ POL/SENS writes + parent set_type */
+       .irq_retrigger          = irq_chip_retrigger_hierarchy,
+#ifdef CONFIG_SMP
+       .irq_set_affinity       = irq_chip_set_affinity_parent, /* genirq parent-forwarding helper */
+#endif
+};
+
+/*
+ * Turn a three-cell device-tree interrupt specifier into a CIRQ hwirq and
+ * trigger type.
+ *
+ * The specifier is accepted only when it comes from an OF node, has exactly
+ * three cells, its first cell is 0, and its second cell lies inside
+ * [ext_irq_start, ext_irq_end]. On success *hwirq is the second cell rebased
+ * to ext_irq_start and *type is the third cell masked with
+ * IRQ_TYPE_SENSE_MASK.
+ *
+ * Returns 0 on success, -EINVAL for anything else.
+ */
+static int mtk_cirq_domain_translate(struct irq_domain *d,
+                                    struct irq_fwspec *fwspec,
+                                    unsigned long *hwirq,
+                                    unsigned int *type)
+{
+       u32 line;
+
+       /* Only three-cell OF specifiers are understood */
+       if (!is_of_node(fwspec->fwnode) || fwspec->param_count != 3)
+               return -EINVAL;
+
+       /* No PPI should point to this domain */
+       if (fwspec->param[0] != 0)
+               return -EINVAL;
+
+       /* The line must be one of those routed through CIRQ */
+       line = fwspec->param[1];
+       if (line < cirq_data->ext_irq_start || line > cirq_data->ext_irq_end)
+               return -EINVAL;
+
+       *hwirq = line - cirq_data->ext_irq_start;
+       *type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK;
+
+       return 0;
+}
+
+/*
+ * Allocate one interrupt in the CIRQ domain and in its parent domain.
+ *
+ * @arg is the irq_fwspec of the request. It is translated to a CIRQ hwirq,
+ * mtk_cirq_chip is installed for @virq with domain->host_data as chip data,
+ * and a copy of the specifier (fwnode rewritten to the parent's) is handed
+ * to the parent domain. Only nr_irqs == 1 is supported.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int mtk_cirq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+                                unsigned int nr_irqs, void *arg)
+{
+       int ret;
+       irq_hw_number_t hwirq;
+       unsigned int type;
+       struct irq_fwspec *fwspec = arg;
+       struct irq_fwspec parent_fwspec = *fwspec;
+
+       ret = mtk_cirq_domain_translate(domain, fwspec, &hwirq, &type);
+       if (ret)
+               return ret;
+
+       if (WARN_ON(nr_irqs != 1))
+               return -EINVAL;
+
+       /* Do not go on to the parent if the chip could not be attached. */
+       ret = irq_domain_set_hwirq_and_chip(domain, virq, hwirq,
+                                           &mtk_cirq_chip,
+                                           domain->host_data);
+       if (ret)
+               return ret;
+
+       parent_fwspec.fwnode = domain->parent->fwnode;
+       return irq_domain_alloc_irqs_parent(domain, virq, nr_irqs,
+                                           &parent_fwspec);
+}
+
+static const struct irq_domain_ops cirq_domain_ops = {
+       .translate      = mtk_cirq_domain_translate,    /* 3-cell OF specifier -> hwirq/type */
+       .alloc          = mtk_cirq_domain_alloc,        /* single-irq allocation, chained to parent */
+       .free           = irq_domain_free_irqs_common,  /* generic helper; no driver-specific free */
+};
+
+#ifdef CONFIG_PM_SLEEP
+/*
+ * Syscore suspend callback: clear stale CIRQ status, then enable CIRQ in
+ * edge-only mode so that it records edge interrupts while suspended.
+ *
+ * Always returns 0.
+ */
+static int mtk_cirq_suspend(void)
+{
+       u32 value, mask;
+       unsigned int irq, hwirq_num;
+       /* Unsigned to match hwirq_num and keep the shift below well defined */
+       unsigned int i;
+       bool pending, masked;
+       int pendret, maskret;
+
+       /*
+        * When external interrupts happen, CIRQ records their status even
+        * if CIRQ is not enabled. When the flush command is executed, CIRQ
+        * resends the signals according to that status. So if the status is
+        * not cleared, CIRQ will resend wrong signals.
+        *
+        * arch_suspend_disable_irqs() is called before the CIRQ suspend
+        * callback. If all status were simply cleared, external interrupts
+        * that happened between arch_suspend_disable_irqs() and the CIRQ
+        * suspend callback would be lost. The following steps avoid this:
+        *
+        * - Iterate over all the CIRQ supported interrupts;
+        * - For each interrupt, inspect its pending and masked status at GIC
+        *   level;
+        * - If pending and unmasked, it happened between
+        *   arch_suspend_disable_irqs() and the CIRQ suspend callback: do
+        *   not ACK it. Otherwise, ACK it.
+        */
+       hwirq_num = cirq_data->ext_irq_end - cirq_data->ext_irq_start + 1;
+       for (i = 0; i < hwirq_num; i++) {
+               irq = irq_find_mapping(cirq_data->domain, i);
+               if (irq) {
+                       pendret = irq_get_irqchip_state(irq,
+                                                       IRQCHIP_STATE_PENDING,
+                                                       &pending);
+
+                       maskret = irq_get_irqchip_state(irq,
+                                                       IRQCHIP_STATE_MASKED,
+                                                       &masked);
+
+                       /* Both queries must succeed before their results are read */
+                       if (pendret == 0 && maskret == 0 &&
+                           (pending && !masked))
+                               continue;
+               }
+
+               mask = 1U << (i % 32);
+               writel_relaxed(mask, cirq_data->base + CIRQ_ACK + (i / 32) * 4);
+       }
+
+       /* set edge_only mode, record edge-triggered interrupts */
+       /* enable cirq */
+       value = readl_relaxed(cirq_data->base + CIRQ_CONTROL);
+       value |= (CIRQ_EDGE | CIRQ_EN);
+       writel_relaxed(value, cirq_data->base + CIRQ_CONTROL);
+
+       return 0;
+}
+
+static void mtk_cirq_resume(void)
+{
+       u32 value;
+
+       /* flush recorded interrupts; this sends the signals to the parent controller */
+       value = readl_relaxed(cirq_data->base + CIRQ_CONTROL);
+       writel_relaxed(value | CIRQ_FLUSH, cirq_data->base + CIRQ_CONTROL);
+
+       /* disable cirq: clear the edge-only and enable bits set by suspend */
+       value = readl_relaxed(cirq_data->base + CIRQ_CONTROL);
+       value &= ~(CIRQ_EDGE | CIRQ_EN);
+       writel_relaxed(value, cirq_data->base + CIRQ_CONTROL);
+}
+
+static struct syscore_ops mtk_cirq_syscore_ops = {
+       .suspend        = mtk_cirq_suspend,     /* ACK stale status, enable CIRQ in edge-only mode */
+       .resume         = mtk_cirq_resume,      /* flush recorded interrupts, then disable CIRQ */
+};
+
+static void mtk_cirq_syscore_init(void)
+{
+       register_syscore_ops(&mtk_cirq_syscore_ops);    /* hook the callbacks above into syscore */
+}
+#else
+static inline void mtk_cirq_syscore_init(void) {}      /* no-op when CONFIG_PM_SLEEP is not set */
+#endif
+
+/*
+ * Set up the CIRQ controller described by device-tree node @node.
+ *
+ * Looks up the irq domain of @parent, maps register region 0 of @node,
+ * reads the two cells of "mediatek,ext-irq-range" (first and last
+ * supported line, inclusive), creates a hierarchy domain of that size
+ * below the parent domain and registers the syscore suspend/resume hooks.
+ *
+ * Returns 0 on success or a negative errno; on failure everything acquired
+ * here is released and the file-scope cirq_data pointer is reset to NULL.
+ */
+static int __init mtk_cirq_of_init(struct device_node *node,
+                                  struct device_node *parent)
+{
+       struct irq_domain *domain, *domain_parent;
+       unsigned int irq_num;
+       int ret;
+
+       domain_parent = irq_find_host(parent);
+       if (!domain_parent) {
+               pr_err("mtk_cirq: interrupt-parent not found\n");
+               return -EINVAL;
+       }
+
+       cirq_data = kzalloc(sizeof(*cirq_data), GFP_KERNEL);
+       if (!cirq_data)
+               return -ENOMEM;
+
+       cirq_data->base = of_iomap(node, 0);
+       if (!cirq_data->base) {
+               pr_err("mtk_cirq: unable to map cirq register\n");
+               ret = -ENXIO;
+               goto out_free;
+       }
+
+       ret = of_property_read_u32_index(node, "mediatek,ext-irq-range", 0,
+                                        &cirq_data->ext_irq_start);
+       if (ret)
+               goto out_unmap;
+
+       ret = of_property_read_u32_index(node, "mediatek,ext-irq-range", 1,
+                                        &cirq_data->ext_irq_end);
+       if (ret)
+               goto out_unmap;
+
+       /* An inverted range would make the unsigned count below wrap. */
+       if (cirq_data->ext_irq_end < cirq_data->ext_irq_start) {
+               pr_err("mtk_cirq: invalid mediatek,ext-irq-range\n");
+               ret = -EINVAL;
+               goto out_unmap;
+       }
+
+       irq_num = cirq_data->ext_irq_end - cirq_data->ext_irq_start + 1;
+       domain = irq_domain_add_hierarchy(domain_parent, 0,
+                                         irq_num, node,
+                                         &cirq_domain_ops, cirq_data);
+       if (!domain) {
+               ret = -ENOMEM;
+               goto out_unmap;
+       }
+       cirq_data->domain = domain;
+
+       mtk_cirq_syscore_init();
+
+       return 0;
+
+out_unmap:
+       iounmap(cirq_data->base);
+out_free:
+       kfree(cirq_data);
+       /* Do not leave the file-scope pointer dangling after the free. */
+       cirq_data = NULL;
+       return ret;
+}
+
+IRQCHIP_DECLARE(mtk_cirq, "mediatek,mtk-cirq", mtk_cirq_of_init);
index 63ac73b..eeac512 100644 (file)
 
 struct mtk_sysirq_chip_data {
        spinlock_t lock;
-       void __iomem *intpol_base;
+       u32 nr_intpol_bases;
+       void __iomem **intpol_bases;
+       u32 *intpol_words;
+       u8 *intpol_idx;
+       u16 *which_word;
 };
 
 static int mtk_sysirq_set_type(struct irq_data *data, unsigned int type)
 {
        irq_hw_number_t hwirq = data->hwirq;
        struct mtk_sysirq_chip_data *chip_data = data->chip_data;
+       u8 intpol_idx = chip_data->intpol_idx[hwirq];
+       void __iomem *base;
        u32 offset, reg_index, value;
        unsigned long flags;
        int ret;
 
+       base = chip_data->intpol_bases[intpol_idx];
+       reg_index = chip_data->which_word[hwirq];
        offset = hwirq & 0x1f;
-       reg_index = hwirq >> 5;
 
        spin_lock_irqsave(&chip_data->lock, flags);
-       value = readl_relaxed(chip_data->intpol_base + reg_index * 4);
+       value = readl_relaxed(base + reg_index * 4);
        if (type == IRQ_TYPE_LEVEL_LOW || type == IRQ_TYPE_EDGE_FALLING) {
                if (type == IRQ_TYPE_LEVEL_LOW)
                        type = IRQ_TYPE_LEVEL_HIGH;
@@ -49,7 +56,8 @@ static int mtk_sysirq_set_type(struct irq_data *data, unsigned int type)
        } else {
                value &= ~(1 << offset);
        }
-       writel(value, chip_data->intpol_base + reg_index * 4);
+
+       writel_relaxed(value, base + reg_index * 4);
 
        data = data->parent_data;
        ret = data->chip->irq_set_type(data, type);
@@ -124,8 +132,7 @@ static int __init mtk_sysirq_of_init(struct device_node *node,
 {
        struct irq_domain *domain, *domain_parent;
        struct mtk_sysirq_chip_data *chip_data;
-       int ret, size, intpol_num;
-       struct resource res;
+       int ret, size, intpol_num = 0, nr_intpol_bases = 0, i = 0;
 
        domain_parent = irq_find_host(parent);
        if (!domain_parent) {
@@ -133,36 +140,103 @@ static int __init mtk_sysirq_of_init(struct device_node *node,
                return -EINVAL;
        }
 
-       ret = of_address_to_resource(node, 0, &res);
-       if (ret)
-               return ret;
-
        chip_data = kzalloc(sizeof(*chip_data), GFP_KERNEL);
        if (!chip_data)
                return -ENOMEM;
 
-       size = resource_size(&res);
-       intpol_num = size * 8;
-       chip_data->intpol_base = ioremap(res.start, size);
-       if (!chip_data->intpol_base) {
-               pr_err("mtk_sysirq: unable to map sysirq register\n");
-               ret = -ENXIO;
-               goto out_free;
+       while (of_get_address(node, i++, NULL, NULL))
+               nr_intpol_bases++;
+
+       if (nr_intpol_bases == 0) {
+               pr_err("mtk_sysirq: base address not specified\n");
+               ret = -EINVAL;
+               goto out_free_chip;
+       }
+
+       chip_data->intpol_words = kcalloc(nr_intpol_bases,
+                                         sizeof(*chip_data->intpol_words),
+                                         GFP_KERNEL);
+       if (!chip_data->intpol_words) {
+               ret = -ENOMEM;
+               goto out_free_chip;
+       }
+
+       chip_data->intpol_bases = kcalloc(nr_intpol_bases,
+                                         sizeof(*chip_data->intpol_bases),
+                                         GFP_KERNEL);
+       if (!chip_data->intpol_bases) {
+               ret = -ENOMEM;
+               goto out_free_intpol_words;
+       }
+
+       for (i = 0; i < nr_intpol_bases; i++) {
+               struct resource res;
+
+               ret = of_address_to_resource(node, i, &res);
+               size = resource_size(&res);
+               intpol_num += size * 8;
+               chip_data->intpol_words[i] = size / 4;
+               chip_data->intpol_bases[i] = of_iomap(node, i);
+               if (ret || !chip_data->intpol_bases[i]) {
+                       pr_err("%s: couldn't map region %d\n",
+                              node->full_name, i);
+                       ret = -ENODEV;
+                       goto out_free_intpol;
+               }
+       }
+
+       chip_data->intpol_idx = kcalloc(intpol_num,
+                                       sizeof(*chip_data->intpol_idx),
+                                       GFP_KERNEL);
+       if (!chip_data->intpol_idx) {
+               ret = -ENOMEM;
+               goto out_free_intpol;
+       }
+
+       chip_data->which_word = kcalloc(intpol_num,
+                                       sizeof(*chip_data->which_word),
+                                       GFP_KERNEL);
+       if (!chip_data->which_word) {
+               ret = -ENOMEM;
+               goto out_free_intpol_idx;
+       }
+
+       /*
+        * assign an index of the intpol_bases for each irq
+        * to set it fast later
+        */
+       for (i = 0; i < intpol_num ; i++) {
+               u32 word = i / 32, j;
+
+               for (j = 0; word >= chip_data->intpol_words[j] ; j++)
+                       word -= chip_data->intpol_words[j];
+
+               chip_data->intpol_idx[i] = j;
+               chip_data->which_word[i] = word;
        }
 
        domain = irq_domain_add_hierarchy(domain_parent, 0, intpol_num, node,
                                          &sysirq_domain_ops, chip_data);
        if (!domain) {
                ret = -ENOMEM;
-               goto out_unmap;
+               goto out_free_which_word;
        }
        spin_lock_init(&chip_data->lock);
 
        return 0;
 
-out_unmap:
-       iounmap(chip_data->intpol_base);
-out_free:
+out_free_which_word:
+       kfree(chip_data->which_word);
+out_free_intpol_idx:
+       kfree(chip_data->intpol_idx);
+out_free_intpol:
+       for (i = 0; i < nr_intpol_bases; i++)
+               if (chip_data->intpol_bases[i])
+                       iounmap(chip_data->intpol_bases[i]);
+       kfree(chip_data->intpol_bases);
+out_free_intpol_words:
+       kfree(chip_data->intpol_words);
+out_free_chip:
        kfree(chip_data);
        return ret;
 }
index 1dfd108..9ca691d 100644 (file)
@@ -1032,6 +1032,7 @@ static int old_capi_manufacturer(unsigned int cmd, void __user *data)
                                                     sizeof(avmb1_carddef))))
                                return -EFAULT;
                        cdef.cardtype = AVM_CARDTYPE_B1;
+                       cdef.cardnr = 0;
                } else {
                        if ((retval = copy_from_user(&cdef, data,
                                                     sizeof(avmb1_extcarddef))))
index 275f467..6c29998 100644 (file)
@@ -76,6 +76,15 @@ config LEDS_BCM6358
          This option enables support for LEDs connected to the BCM6358
          LED HW controller accessed via MMIO registers.
 
+config LEDS_CPCAP
+       tristate "LED Support for Motorola CPCAP"
+       depends on LEDS_CLASS
+       depends on MFD_CPCAP
+       depends on OF
+       help
+         This option enables support for LEDs offered by Motorola's
+         CPCAP PMIC.
+
 config LEDS_LM3530
        tristate "LCD Backlight driver for LM3530"
        depends on LEDS_CLASS
@@ -126,6 +135,14 @@ config LEDS_MIKROTIK_RB532
          This option enables support for the so called "User LED" of
          Mikrotik's Routerboard 532.
 
+config LEDS_MT6323
+       tristate "LED Support for Mediatek MT6323 PMIC"
+       depends on LEDS_CLASS
+       depends on MFD_MT6397
+       help
+         This option enables support for on-chip LED drivers found on
+         Mediatek MT6323 PMIC.
+
 config LEDS_S3C24XX
        tristate "LED Support for Samsung S3C24XX GPIO LEDs"
        depends on LEDS_CLASS
@@ -241,7 +258,6 @@ config LEDS_LP3952
        tristate "LED Support for TI LP3952 2 channel LED driver"
        depends on LEDS_CLASS
        depends on I2C
-       depends on ACPI
        depends on GPIOLIB
        select REGMAP_I2C
        help
@@ -463,15 +479,6 @@ config LEDS_ADP5520
          To compile this driver as a module, choose M here: the module will
          be called leds-adp5520.
 
-config LEDS_DELL_NETBOOKS
-       tristate "External LED on Dell Business Netbooks"
-       depends on LEDS_CLASS
-       depends on X86 && ACPI_WMI
-       depends on DELL_SMBIOS
-       help
-         This adds support for the Latitude 2100 and similar
-         notebooks that have an external LED.
-
 config LEDS_MC13783
        tristate "LED Support for MC13XXX PMIC"
        depends on LEDS_CLASS
index 6b82737..45f1339 100644 (file)
@@ -11,6 +11,7 @@ obj-$(CONFIG_LEDS_AAT1290)            += leds-aat1290.o
 obj-$(CONFIG_LEDS_BCM6328)             += leds-bcm6328.o
 obj-$(CONFIG_LEDS_BCM6358)             += leds-bcm6358.o
 obj-$(CONFIG_LEDS_BD2802)              += leds-bd2802.o
+obj-$(CONFIG_LEDS_CPCAP)               += leds-cpcap.o
 obj-$(CONFIG_LEDS_LOCOMO)              += leds-locomo.o
 obj-$(CONFIG_LEDS_LM3530)              += leds-lm3530.o
 obj-$(CONFIG_LEDS_LM3533)              += leds-lm3533.o
@@ -52,7 +53,6 @@ obj-$(CONFIG_LEDS_REGULATOR)          += leds-regulator.o
 obj-$(CONFIG_LEDS_INTEL_SS4200)                += leds-ss4200.o
 obj-$(CONFIG_LEDS_LT3593)              += leds-lt3593.o
 obj-$(CONFIG_LEDS_ADP5520)             += leds-adp5520.o
-obj-$(CONFIG_LEDS_DELL_NETBOOKS)       += dell-led.o
 obj-$(CONFIG_LEDS_MC13783)             += leds-mc13783.o
 obj-$(CONFIG_LEDS_NS2)                 += leds-ns2.o
 obj-$(CONFIG_LEDS_NETXBIG)             += leds-netxbig.o
@@ -72,6 +72,7 @@ obj-$(CONFIG_LEDS_IS31FL32XX)         += leds-is31fl32xx.o
 obj-$(CONFIG_LEDS_PM8058)              += leds-pm8058.o
 obj-$(CONFIG_LEDS_MLXCPLD)             += leds-mlxcpld.o
 obj-$(CONFIG_LEDS_NIC78BX)             += leds-nic78bx.o
+obj-$(CONFIG_LEDS_MT6323)              += leds-mt6323.o
 
 # LED SPI Drivers
 obj-$(CONFIG_LEDS_DAC124S085)          += leds-dac124s085.o
index f2b0a80..b0e2d55 100644 (file)
@@ -244,11 +244,14 @@ static int led_classdev_next_name(const char *init_name, char *name,
 }
 
 /**
- * led_classdev_register - register a new object of led_classdev class.
- * @parent: The device to register.
+ * of_led_classdev_register - register a new object of led_classdev class.
+ *
+ * @parent: parent of LED device
  * @led_cdev: the led_classdev structure for this device.
+ * @np: DT node describing this LED
  */
-int led_classdev_register(struct device *parent, struct led_classdev *led_cdev)
+int of_led_classdev_register(struct device *parent, struct device_node *np,
+                           struct led_classdev *led_cdev)
 {
        char name[LED_MAX_NAME_SIZE];
        int ret;
@@ -261,6 +264,7 @@ int led_classdev_register(struct device *parent, struct led_classdev *led_cdev)
                                led_cdev, led_cdev->groups, "%s", name);
        if (IS_ERR(led_cdev->dev))
                return PTR_ERR(led_cdev->dev);
+       led_cdev->dev->of_node = np;
 
        if (ret)
                dev_warn(parent, "Led %s renamed to %s due to name collision",
@@ -303,7 +307,7 @@ int led_classdev_register(struct device *parent, struct led_classdev *led_cdev)
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(led_classdev_register);
+EXPORT_SYMBOL_GPL(of_led_classdev_register);
 
 /**
  * led_classdev_unregister - unregisters a object of led_properties class.
@@ -348,12 +352,14 @@ static void devm_led_classdev_release(struct device *dev, void *res)
 }
 
 /**
- * devm_led_classdev_register - resource managed led_classdev_register()
- * @parent: The device to register.
+ * devm_of_led_classdev_register - resource managed of_led_classdev_register()
+ *
+ * @parent: parent of LED device
  * @led_cdev: the led_classdev structure for this device.
  */
-int devm_led_classdev_register(struct device *parent,
-                              struct led_classdev *led_cdev)
+int devm_of_led_classdev_register(struct device *parent,
+                                 struct device_node *np,
+                                 struct led_classdev *led_cdev)
 {
        struct led_classdev **dr;
        int rc;
@@ -362,7 +368,7 @@ int devm_led_classdev_register(struct device *parent,
        if (!dr)
                return -ENOMEM;
 
-       rc = led_classdev_register(parent, led_cdev);
+       rc = of_led_classdev_register(parent, np, led_cdev);
        if (rc) {
                devres_free(dr);
                return rc;
@@ -373,7 +379,7 @@ int devm_led_classdev_register(struct device *parent,
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(devm_led_classdev_register);
+EXPORT_SYMBOL_GPL(devm_of_led_classdev_register);
 
 static int devm_led_classdev_match(struct device *dev, void *res, void *data)
 {
diff --git a/drivers/leds/leds-cpcap.c b/drivers/leds/leds-cpcap.c
new file mode 100644 (file)
index 0000000..f0f28c4
--- /dev/null
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2017 Sebastian Reichel <sre@kernel.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 or
+ * later as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/leds.h>
+#include <linux/mfd/motorola-cpcap.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/regulator/consumer.h>
+
+#define CPCAP_LED_NO_CURRENT 0x0001
+
+struct cpcap_led_info {
+       u16 reg;        /* regmap register that controls this LED */
+       u16 mask;       /* bits of reg rewritten on each brightness change */
+       u16 limit;      /* handed to the LED core as max_brightness */
+       u16 init_mask;  /* bits of reg written once at probe; 0 = none */
+       u16 init_val;   /* value written under init_mask at probe */
+};
+
+static const struct cpcap_led_info cpcap_led_red = {
+       .reg    = CPCAP_REG_REDC,
+       .mask   = 0x03FF,
+       .limit  = 31, /* largest 5-bit current value, see cpcap_led_val() */
+};
+
+static const struct cpcap_led_info cpcap_led_green = {
+       .reg    = CPCAP_REG_GREENC,
+       .mask   = 0x03FF,
+       .limit  = 31, /* largest 5-bit current value, see cpcap_led_val() */
+};
+
+static const struct cpcap_led_info cpcap_led_blue = {
+       .reg    = CPCAP_REG_BLUEC,
+       .mask   = 0x03FF,
+       .limit  = 31, /* largest 5-bit current value, see cpcap_led_val() */
+};
+
+/* aux display light */
+static const struct cpcap_led_info cpcap_led_adl = {
+       .reg            = CPCAP_REG_ADLC,
+       .mask           = 0x000F,
+       .limit          = 1, /* on/off only */
+       .init_mask      = 0x7FFF, /* non-zero: probe writes init_val once */
+       .init_val       = 0x5FF0,
+};
+
+/* camera privacy led */
+static const struct cpcap_led_info cpcap_led_cp = {
+       .reg            = CPCAP_REG_CLEDC,
+       .mask           = 0x0007,
+       .limit          = 1, /* on/off only */
+       .init_mask      = 0x03FF, /* non-zero: probe writes init_val once */
+       .init_val       = 0x0008,
+};
+
+struct cpcap_led {
+       struct led_classdev led;           /* class device registered in probe */
+       const struct cpcap_led_info *info; /* register layout from OF match data */
+       struct device *dev;                /* platform device, used for logging */
+       struct regmap *regmap;             /* regmap of the parent device */
+       struct mutex update_lock;          /* held across cpcap_led_set() */
+       struct regulator *vdd;             /* "vdd" supply of the LED */
+       bool powered;                      /* true while vdd is enabled by us */
+
+       u32 current_limit; /* NOTE(review): not referenced in the code shown */
+};
+
+static u16 cpcap_led_val(u8 current_limit, u8 duty_cycle)
+{
+       /* 5-bit current limit in bits 8:4, 4-bit duty cycle in bits 3:0 */
+       u16 current_bits = (current_limit & 0x1f) << 4;
+
+       return current_bits | (duty_cycle & 0x0f);
+}
+
+/* Switch the LED supply on or off and track the state in led->powered. */
+static int cpcap_led_set_power(struct cpcap_led *led, bool status)
+{
+       int ret;
+
+       /* Nothing to do when the supply is already in the wanted state. */
+       if (led->powered == status)
+               return 0;
+
+       ret = status ? regulator_enable(led->vdd) :
+                      regulator_disable(led->vdd);
+       if (ret) {
+               dev_err(led->dev, "regulator failure: %d", ret);
+               return ret;
+       }
+
+       /* Record the new state only once the regulator call succeeded. */
+       led->powered = status;
+
+       return 0;
+}
+
+/* brightness_set_blocking handler: program the LED and manage its supply. */
+static int cpcap_led_set(struct led_classdev *ledc, enum led_brightness value)
+{
+       struct cpcap_led *led = container_of(ledc, struct cpcap_led, led);
+       const struct cpcap_led_info *info = led->info;
+       int regval;
+       int ret;
+
+       mutex_lock(&led->update_lock);
+
+       /* The supply has to be up before the LED is driven. */
+       if (value > LED_OFF) {
+               ret = cpcap_led_set_power(led, true);
+               if (ret)
+                       goto unlock;
+       }
+
+       if (value != LED_OFF) {
+               regval = cpcap_led_val(value, LED_ON);
+       } else {
+               /* Avoid HW issue by turning off current before duty cycle */
+               ret = regmap_update_bits(led->regmap, info->reg, info->mask,
+                                        CPCAP_LED_NO_CURRENT);
+               if (ret) {
+                       dev_err(led->dev, "regmap failed: %d", ret);
+                       goto unlock;
+               }
+
+               regval = cpcap_led_val(value, LED_OFF);
+       }
+
+       ret = regmap_update_bits(led->regmap, info->reg, info->mask, regval);
+       if (ret) {
+               dev_err(led->dev, "regmap failed: %d", ret);
+               goto unlock;
+       }
+
+       /* Drop the supply only after the LED has been switched off. */
+       if (value == LED_OFF)
+               ret = cpcap_led_set_power(led, false);
+
+unlock:
+       mutex_unlock(&led->update_lock);
+       return ret;
+}
+
+static const struct of_device_id cpcap_led_of_match[] = {
+       { .compatible = "motorola,cpcap-led-red", .data = &cpcap_led_red },
+       { .compatible = "motorola,cpcap-led-green", .data = &cpcap_led_green },
+       { .compatible = "motorola,cpcap-led-blue",  .data = &cpcap_led_blue },
+       { .compatible = "motorola,cpcap-led-adl", .data = &cpcap_led_adl },
+       { .compatible = "motorola,cpcap-led-cp", .data = &cpcap_led_cp },
+       {}, /* sentinel; .data above becomes led->info in probe */
+};
+MODULE_DEVICE_TABLE(of, cpcap_led_of_match);
+
+static int cpcap_led_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       const struct of_device_id *match;
+       struct cpcap_led *led;
+       int ret;
+
+       match = of_match_device(of_match_ptr(cpcap_led_of_match), dev);
+       if (!match || !match->data)
+               return -EINVAL;
+
+       led = devm_kzalloc(dev, sizeof(*led), GFP_KERNEL);
+       if (!led)
+               return -ENOMEM;
+       platform_set_drvdata(pdev, led);
+       led->info = match->data;
+       led->dev = dev;
+
+       if (led->info->reg == 0x0000) {
+               dev_err(dev, "Unsupported LED");
+               return -ENODEV;
+       }
+
+       led->regmap = dev_get_regmap(dev->parent, NULL);
+       if (!led->regmap)
+               return -ENODEV;
+
+       led->vdd = devm_regulator_get(dev, "vdd");
+       if (IS_ERR(led->vdd)) {
+               ret = PTR_ERR(led->vdd);
+               dev_err(dev, "Couldn't get regulator: %d", ret);
+               return ret;
+       }
+
+       ret = device_property_read_string(dev, "label", &led->led.name);
+       if (ret) {
+               dev_err(dev, "Couldn't read LED label: %d", ret);
+               return ret;
+       }
+
+       /* Optional one-time register setup, only for LEDs that define it. */
+       if (led->info->init_mask) {
+               ret = regmap_update_bits(led->regmap, led->info->reg,
+                                        led->info->init_mask,
+                                        led->info->init_val);
+               if (ret) {
+                       dev_err(dev, "regmap failed: %d", ret);
+                       return ret;
+               }
+       }
+
+       mutex_init(&led->update_lock);
+       led->led.max_brightness = led->info->limit;
+       led->led.brightness_set_blocking = cpcap_led_set;
+
+       ret = devm_led_classdev_register(dev, &led->led);
+       if (ret)
+               dev_err(dev, "Couldn't register LED: %d", ret);
+       return ret;
+}
+
+static struct platform_driver cpcap_led_driver = {
+       .probe = cpcap_led_probe, /* no .remove: probe allocates via devm_* */
+       .driver = {
+               .name = "cpcap-led",
+               .of_match_table = cpcap_led_of_match,
+       },
+};
+module_platform_driver(cpcap_led_driver);
+
+MODULE_DESCRIPTION("CPCAP LED driver");
+MODULE_AUTHOR("Sebastian Reichel <sre@kernel.org>");
+MODULE_LICENSE("GPL");
index 066fc75..e753ba9 100644 (file)
@@ -77,7 +77,7 @@ static int gpio_blink_set(struct led_classdev *led_cdev,
 
 static int create_gpio_led(const struct gpio_led *template,
        struct gpio_led_data *led_dat, struct device *parent,
-       gpio_blink_set_t blink_set)
+       struct device_node *np, gpio_blink_set_t blink_set)
 {
        int ret, state;
 
@@ -139,7 +139,7 @@ static int create_gpio_led(const struct gpio_led *template,
        if (ret < 0)
                return ret;
 
-       return devm_led_classdev_register(parent, &led_dat->cdev);
+       return devm_of_led_classdev_register(parent, np, &led_dat->cdev);
 }
 
 struct gpio_leds_priv {
@@ -208,7 +208,7 @@ static struct gpio_leds_priv *gpio_leds_create(struct platform_device *pdev)
                if (fwnode_property_present(child, "panic-indicator"))
                        led.panic_indicator = 1;
 
-               ret = create_gpio_led(&led, led_dat, dev, NULL);
+               ret = create_gpio_led(&led, led_dat, dev, np, NULL);
                if (ret < 0) {
                        fwnode_handle_put(child);
                        return ERR_PTR(ret);
@@ -242,9 +242,9 @@ static int gpio_led_probe(struct platform_device *pdev)
 
                priv->num_leds = pdata->num_leds;
                for (i = 0; i < priv->num_leds; i++) {
-                       ret = create_gpio_led(&pdata->leds[i],
-                                             &priv->leds[i],
-                                             &pdev->dev, pdata->gpio_blink_set);
+                       ret = create_gpio_led(&pdata->leds[i], &priv->leds[i],
+                                             &pdev->dev, NULL,
+                                             pdata->gpio_blink_set);
                        if (ret < 0)
                                return ret;
                }
index 4847e89..847f7f2 100644 (file)
@@ -10,7 +10,6 @@
  *
  */
 
-#include <linux/acpi.h>
 #include <linux/delay.h>
 #include <linux/gpio.h>
 #include <linux/i2c.h>
@@ -103,10 +102,11 @@ static int lp3952_get_label(struct device *dev, const char *label, char *dest)
        const char *str;
 
        ret = device_property_read_string(dev, label, &str);
-       if (!ret)
-               strncpy(dest, str, LP3952_LABEL_MAX_LEN);
+       if (ret)
+               return ret;
 
-       return ret;
+       strncpy(dest, str, LP3952_LABEL_MAX_LEN);
+       return 0;
 }
 
 static int lp3952_register_led_classdev(struct lp3952_led_array *priv)
@@ -276,19 +276,9 @@ static const struct i2c_device_id lp3952_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, lp3952_id);
 
-#ifdef CONFIG_ACPI
-static const struct acpi_device_id lp3952_acpi_match[] = {
-       {"TXNW3952", 0},
-       {}
-};
-
-MODULE_DEVICE_TABLE(acpi, lp3952_acpi_match);
-#endif
-
 static struct i2c_driver lp3952_i2c_driver = {
        .driver = {
                        .name = LP3952_NAME,
-                       .acpi_match_table = ACPI_PTR(lp3952_acpi_match),
        },
        .probe = lp3952_probe,
        .remove = lp3952_remove,
diff --git a/drivers/leds/leds-mt6323.c b/drivers/leds/leds-mt6323.c
new file mode 100644 (file)
index 0000000..8893c74
--- /dev/null
@@ -0,0 +1,502 @@
+/*
+ * LED driver for Mediatek MT6323 PMIC
+ *
+ * Copyright (C) 2017 Sean Wang <sean.wang@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/leds.h>
+#include <linux/mfd/mt6323/registers.h>
+#include <linux/mfd/mt6397/core.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+
+/*
+ * Register field for MT6323_TOP_CKPDN0 to enable
+ * 32K clock common for LED device.
+ */
+#define MT6323_RG_DRV_32K_CK_PDN       BIT(11)
+#define MT6323_RG_DRV_32K_CK_PDN_MASK  BIT(11)
+
+/*
+ * Register field for MT6323_TOP_CKPDN2 to enable
+ * individual clock for LED device.
+ */
+#define MT6323_RG_ISINK_CK_PDN(i)      BIT(i)
+#define MT6323_RG_ISINK_CK_PDN_MASK(i) BIT(i)
+
+/*
+ * Register field for MT6323_TOP_CKCON1 to select
+ * clock source.
+ */
+#define MT6323_RG_ISINK_CK_SEL_MASK(i) (BIT(10) << (i))
+
+/*
+ * Register for MT6323_ISINK_CON0 to setup the
+ * duty cycle of the blink.
+ */
+#define MT6323_ISINK_CON0(i)           (MT6323_ISINK0_CON0 + 0x8 * (i))
+#define MT6323_ISINK_DIM_DUTY_MASK     (0x1f << 8)
+#define MT6323_ISINK_DIM_DUTY(i)       (((i) << 8) & \
+                                       MT6323_ISINK_DIM_DUTY_MASK)
+
+/* Register to setup the period of the blink. */
+#define MT6323_ISINK_CON1(i)           (MT6323_ISINK0_CON1 + 0x8 * (i))
+#define MT6323_ISINK_DIM_FSEL_MASK     (0xffff)
+#define MT6323_ISINK_DIM_FSEL(i)       ((i) & MT6323_ISINK_DIM_FSEL_MASK)
+
+/* Register to control the brightness. */
+#define MT6323_ISINK_CON2(i)           (MT6323_ISINK0_CON2 + 0x8 * (i))
+#define MT6323_ISINK_CH_STEP_SHIFT     12
+#define MT6323_ISINK_CH_STEP_MASK      (0x7 << 12)
+#define MT6323_ISINK_CH_STEP(i)                (((i) << 12) & \
+                                       MT6323_ISINK_CH_STEP_MASK)
+#define MT6323_ISINK_SFSTR0_TC_MASK    (0x3 << 1)
+#define MT6323_ISINK_SFSTR0_TC(i)      (((i) << 1) & \
+                                       MT6323_ISINK_SFSTR0_TC_MASK)
+#define MT6323_ISINK_SFSTR0_EN_MASK    BIT(0)
+#define MT6323_ISINK_SFSTR0_EN         BIT(0)
+
+/* Register fields for LED channel enablement. */
+#define MT6323_ISINK_CH_EN_MASK(i)     BIT(i)
+#define MT6323_ISINK_CH_EN(i)          BIT(i)
+
+#define MT6323_MAX_PERIOD              10000 /* longest hardware blink period, in ms */
+#define MT6323_MAX_LEDS                        4 /* size of mt6323_leds.led[] */
+#define MT6323_MAX_BRIGHTNESS          6
+#define MT6323_UNIT_DUTY               3125 /* one duty step is 3125/100000 of the period */
+#define MT6323_CAL_HW_DUTY(o, p)       DIV_ROUND_CLOSEST((o) * 100000ul,\
+                                       (p) * MT6323_UNIT_DUTY)
+
+struct mt6323_leds;
+
+/**
+ * struct mt6323_led - state container for the LED device
+ * @id:                        the identifier in MT6323 LED device
+ * @parent:            the pointer to MT6323 LED controller
+ * @cdev:              LED class device for this LED device
+ * @current_brightness: brightness cached by the driver, 0 while the LED is off
+ */
+struct mt6323_led {
+       int                     id;
+       struct mt6323_leds      *parent;
+       struct led_classdev     cdev;
+       enum led_brightness     current_brightness;
+};
+
+/**
+ * struct mt6323_leds -        state container for holding LED controller
+ *                     of the driver
+ * @dev:               the device pointer
+ * @hw:                        the underlying hardware providing shared
+ *                     bus for the register operations
+ * @lock:              mutex serializing brightness and blink updates
+ * @led:               the array that contains the state of individual
+ *                     LED device
+ */
+struct mt6323_leds {
+       struct device           *dev;
+       struct mt6397_chip      *hw;
+       /* held around the register sequences of set_brightness/set_blink */
+       struct mutex            lock;
+       struct mt6323_led       *led[MT6323_MAX_LEDS];
+};
+
+/*
+ * Write the brightness level into the channel's ISINK CON2 register.
+ */
+static int mt6323_led_hw_brightness(struct led_classdev *cdev,
+                                   enum led_brightness brightness)
+{
+       struct mt6323_led *led = container_of(cdev, struct mt6323_led, cdev);
+       struct regmap *regmap = led->parent->hw->regmap;
+       u32 mask, val;
+
+       /*
+        * The current step field holds brightness - 1; the SFSTR0
+        * fields are programmed with fixed values alongside it.
+        */
+       mask = MT6323_ISINK_CH_STEP_MASK |
+              MT6323_ISINK_SFSTR0_TC_MASK |
+              MT6323_ISINK_SFSTR0_EN_MASK;
+       val = MT6323_ISINK_CH_STEP(brightness - 1) |
+             MT6323_ISINK_SFSTR0_TC(2) |
+             MT6323_ISINK_SFSTR0_EN;
+
+       return regmap_update_bits(regmap, MT6323_ISINK_CON2(led->id),
+                                 mask, val);
+}
+
+/* Disable the LED channel, then power down its clock. */
+static int mt6323_led_hw_off(struct led_classdev *cdev)
+{
+       struct mt6323_led *led = container_of(cdev, struct mt6323_led, cdev);
+       struct regmap *regmap = led->parent->hw->regmap;
+       unsigned int ch_en = MT6323_ISINK_CH_EN(led->id);
+       int err;
+
+       /* Clear the channel enable bit first. */
+       err = regmap_update_bits(regmap, MT6323_ISINK_EN_CTRL,
+                                MT6323_ISINK_CH_EN_MASK(led->id), ~ch_en);
+       if (err < 0)
+               return err;
+
+       usleep_range(100, 300);
+
+       /* Then set the power-down bit of the channel clock. */
+       err = regmap_update_bits(regmap, MT6323_TOP_CKPDN2,
+                                MT6323_RG_ISINK_CK_PDN_MASK(led->id),
+                                MT6323_RG_ISINK_CK_PDN(led->id));
+
+       return err < 0 ? err : 0;
+}
+
+static enum led_brightness
+mt6323_get_led_hw_brightness(struct led_classdev *cdev)
+{
+       struct mt6323_led *led = container_of(cdev, struct mt6323_led, cdev);
+       struct regmap *regmap = led->parent->hw->regmap;
+       unsigned int regval;
+       int err;
+
+       /* A powered-down channel clock means the LED is off. */
+       err = regmap_read(regmap, MT6323_TOP_CKPDN2, &regval);
+       if (err < 0)
+               return err;
+       if (regval & MT6323_RG_ISINK_CK_PDN_MASK(led->id))
+               return 0;
+
+       /* So does a disabled channel. */
+       err = regmap_read(regmap, MT6323_ISINK_EN_CTRL, &regval);
+       if (err < 0)
+               return err;
+       if (!(regval & MT6323_ISINK_CH_EN(led->id)))
+               return 0;
+
+       /* Otherwise the step field stores brightness - 1. */
+       err = regmap_read(regmap, MT6323_ISINK_CON2(led->id), &regval);
+       if (err < 0)
+               return err;
+
+       regval &= MT6323_ISINK_CH_STEP_MASK;
+       return (regval >> MT6323_ISINK_CH_STEP_SHIFT) + 1;
+}
+
+static int mt6323_led_hw_on(struct led_classdev *cdev,
+                           enum led_brightness brightness)
+{
+       struct mt6323_led *led = container_of(cdev, struct mt6323_led, cdev);
+       struct regmap *regmap = led->parent->hw->regmap;
+       unsigned int ck_pdn = MT6323_RG_ISINK_CK_PDN(led->id);
+       int err;
+
+       /*
+        * Bring-up order: select the clock source, clear the clock
+        * power-down bit, enable the channel, set the brightness and
+        * finally program continuous blink as the default pattern.
+        */
+       err = regmap_update_bits(regmap, MT6323_TOP_CKCON1,
+                                MT6323_RG_ISINK_CK_SEL_MASK(led->id), 0);
+       if (err < 0)
+               return err;
+
+       err = regmap_update_bits(regmap, MT6323_TOP_CKPDN2,
+                                MT6323_RG_ISINK_CK_PDN_MASK(led->id),
+                                ~ck_pdn);
+       if (err < 0)
+               return err;
+
+       usleep_range(100, 300);
+
+       err = regmap_update_bits(regmap, MT6323_ISINK_EN_CTRL,
+                                MT6323_ISINK_CH_EN_MASK(led->id),
+                                MT6323_ISINK_CH_EN(led->id));
+       if (err < 0)
+               return err;
+
+       err = mt6323_led_hw_brightness(cdev, brightness);
+       if (err < 0)
+               return err;
+
+       /* Largest value of the 5-bit duty field. */
+       err = regmap_update_bits(regmap, MT6323_ISINK_CON0(led->id),
+                                MT6323_ISINK_DIM_DUTY_MASK,
+                                MT6323_ISINK_DIM_DUTY(31));
+       if (err < 0)
+               return err;
+
+       /* Default value for the blink period register. */
+       err = regmap_update_bits(regmap, MT6323_ISINK_CON1(led->id),
+                                MT6323_ISINK_DIM_FSEL_MASK,
+                                MT6323_ISINK_DIM_FSEL(1000));
+       if (err < 0)
+               return err;
+
+       return 0;
+}
+
+static int mt6323_led_set_blink(struct led_classdev *cdev,
+                               unsigned long *delay_on,
+                               unsigned long *delay_off)
+{
+       struct mt6323_led *led = container_of(cdev, struct mt6323_led, cdev);
+       struct mt6323_leds *leds = led->parent;
+       struct regmap *regmap = leds->hw->regmap;
+       unsigned long period;
+       u8 duty_hw;
+       int ret;
+
+       /*
+        * The LED subsystem expects a user friendly default pattern
+        * when neither delay is given: 1Hz at 50% duty cycle. Apply
+        * it before deriving the period, otherwise the period is 0
+        * and the duty calculation below divides by zero.
+        */
+       if (!*delay_on && !*delay_off) {
+               *delay_on = 500;
+               *delay_off = 500;
+       }
+
+       /*
+        * Units are in ms. Anything beyond what the hardware supports
+        * is rejected so the core falls back to software blink. Each
+        * delay is checked on its own so the sum cannot wrap around.
+        */
+       if (*delay_on > MT6323_MAX_PERIOD || *delay_off > MT6323_MAX_PERIOD)
+               return -EINVAL;
+
+       period = *delay_on + *delay_off;
+       if (period > MT6323_MAX_PERIOD)
+               return -EINVAL;
+
+       /* Share of the period during which the LED is on, in HW units. */
+       duty_hw = MT6323_CAL_HW_DUTY(*delay_on, period);
+
+       /* hardware doesn't support zero duty cycle. */
+       if (!duty_hw)
+               return -EINVAL;
+
+       mutex_lock(&leds->lock);
+       /*
+        * Set max_brightness as the software blink behavior
+        * when no blink brightness.
+        */
+       if (!led->current_brightness) {
+               ret = mt6323_led_hw_on(cdev, cdev->max_brightness);
+               if (ret < 0)
+                       goto out;
+               led->current_brightness = cdev->max_brightness;
+       }
+
+       ret = regmap_update_bits(regmap, MT6323_ISINK_CON0(led->id),
+                                MT6323_ISINK_DIM_DUTY_MASK,
+                                MT6323_ISINK_DIM_DUTY(duty_hw - 1));
+       if (ret < 0)
+               goto out;
+
+       ret = regmap_update_bits(regmap, MT6323_ISINK_CON1(led->id),
+                                MT6323_ISINK_DIM_FSEL_MASK,
+                                MT6323_ISINK_DIM_FSEL(period - 1));
+out:
+       mutex_unlock(&leds->lock);
+
+       return ret;
+}
+
+/*
+ * Blocking brightness setter (installed as cdev.brightness_set_blocking).
+ *
+ * With the parent's lock held, exactly one hardware helper is called:
+ *   - brightness is zero:          mt6323_led_hw_off()
+ *   - cached brightness is zero:   mt6323_led_hw_on() at @brightness
+ *   - otherwise:                   mt6323_led_hw_brightness()
+ * The cached level is only updated when that helper did not fail.
+ *
+ * Returns the helper's result; negative values indicate failure.
+ */
+static int mt6323_led_set_brightness(struct led_classdev *cdev,
+                                    enum led_brightness brightness)
+{
+       struct mt6323_led *led = container_of(cdev, struct mt6323_led, cdev);
+       struct mt6323_leds *ctrl = led->parent;
+       int err;
+
+       mutex_lock(&ctrl->lock);
+
+       if (!brightness)
+               err = mt6323_led_hw_off(cdev);
+       else if (led->current_brightness)
+               err = mt6323_led_hw_brightness(cdev, brightness);
+       else
+               err = mt6323_led_hw_on(cdev, brightness);
+
+       /* Keep the cache in step with what the hardware actually took. */
+       if (err >= 0)
+               led->current_brightness = brightness;
+
+       mutex_unlock(&ctrl->lock);
+
+       return err;
+}
+
+/*
+ * Apply the devicetree defaults of one LED node.
+ *
+ * Takes the class device name from "label" (falling back to the node
+ * name) and the default trigger from "linux,default-trigger". When a
+ * "default-state" property exists it is honoured as follows:
+ *   "keep" - read the level back from the hardware and cache it
+ *   "on"   - drive the LED at cdev->max_brightness
+ *   other  - switch the LED off
+ *
+ * Returns 0 when there is no "default-state" or on success of "keep",
+ * otherwise the result of the brightness call; negative on failure.
+ */
+static int mt6323_led_set_dt_default(struct led_classdev *cdev,
+                                    struct device_node *np)
+{
+       struct mt6323_led *led = container_of(cdev, struct mt6323_led, cdev);
+       const char *label = of_get_property(np, "label", NULL);
+       const char *state;
+       int level;
+
+       led->cdev.name = label ? label : np->name;
+       led->cdev.default_trigger =
+               of_get_property(np, "linux,default-trigger", NULL);
+
+       state = of_get_property(np, "default-state", NULL);
+       if (!state)
+               return 0;
+
+       if (strcmp(state, "keep") == 0) {
+               level = mt6323_get_led_hw_brightness(cdev);
+               if (level < 0)
+                       return level;
+
+               led->current_brightness = level;
+               return 0;
+       }
+
+       if (strcmp(state, "on") == 0)
+               return mt6323_led_set_brightness(cdev, cdev->max_brightness);
+
+       return mt6323_led_set_brightness(cdev, LED_OFF);
+}
+
+/*
+ * Bind the MT6323 LED block.
+ *
+ * Allocates the per-device state, clears MT6323_RG_DRV_32K_CK_PDN in
+ * MT6323_TOP_CKPDN0 (the remove path sets it again) and then walks the
+ * available devicetree children. Each child must carry a unique "reg"
+ * below MT6323_MAX_LEDS; it gets its own struct mt6323_led, has its
+ * devicetree defaults applied and is registered as an LED class device.
+ *
+ * Returns 0 on success or a negative error code. When the child loop is
+ * left early, the reference on the current child node is dropped.
+ */
+static int mt6323_led_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct device_node *parent_np = dev->of_node;
+       struct device_node *child;
+       struct mt6323_leds *leds;
+       struct mt6323_led *led;
+       unsigned int clk_pdn = MT6323_RG_DRV_32K_CK_PDN;
+       u32 reg;
+       int ret;
+
+       leds = devm_kzalloc(dev, sizeof(*leds), GFP_KERNEL);
+       if (!leds)
+               return -ENOMEM;
+
+       platform_set_drvdata(pdev, leds);
+       leds->dev = dev;
+       /* Registers are reached through the parent device's regmap. */
+       leds->hw = dev_get_drvdata(dev->parent);
+       mutex_init(&leds->lock);
+
+       ret = regmap_update_bits(leds->hw->regmap, MT6323_TOP_CKPDN0,
+                                MT6323_RG_DRV_32K_CK_PDN_MASK, ~clk_pdn);
+       if (ret < 0) {
+               dev_err(dev,
+                       "Failed to update MT6323_TOP_CKPDN0 Register\n");
+               return ret;
+       }
+
+       for_each_available_child_of_node(parent_np, child) {
+               ret = of_property_read_u32(child, "reg", &reg);
+               if (ret) {
+                       dev_err(dev, "Failed to read led 'reg' property\n");
+                       goto err_put_child;
+               }
+
+               /* Reject out-of-range indexes and duplicates alike. */
+               if (reg >= MT6323_MAX_LEDS || leds->led[reg]) {
+                       dev_err(dev, "Invalid led reg %u\n", reg);
+                       ret = -EINVAL;
+                       goto err_put_child;
+               }
+
+               led = devm_kzalloc(dev, sizeof(*led), GFP_KERNEL);
+               if (!led) {
+                       ret = -ENOMEM;
+                       goto err_put_child;
+               }
+
+               leds->led[reg] = led;
+               led->id = reg;
+               led->parent = leds;
+               led->cdev.max_brightness = MT6323_MAX_BRIGHTNESS;
+               led->cdev.brightness_set_blocking = mt6323_led_set_brightness;
+               led->cdev.blink_set = mt6323_led_set_blink;
+               led->cdev.brightness_get = mt6323_get_led_hw_brightness;
+
+               ret = mt6323_led_set_dt_default(&led->cdev, child);
+               if (ret < 0) {
+                       dev_err(dev,
+                               "Failed to LED set default from devicetree\n");
+                       goto err_put_child;
+               }
+
+               ret = devm_led_classdev_register(dev, &led->cdev);
+               if (ret) {
+                       dev_err(dev, "Failed to register LED: %d\n", ret);
+                       goto err_put_child;
+               }
+               led->cdev.dev->of_node = child;
+       }
+
+       return 0;
+
+err_put_child:
+       of_node_put(child);
+       return ret;
+}
+
+/*
+ * Unbind the MT6323 LED block: switch every registered LED off, set
+ * MT6323_RG_DRV_32K_CK_PDN in MT6323_TOP_CKPDN0 again and destroy the
+ * lock. Always returns 0.
+ */
+static int mt6323_led_remove(struct platform_device *pdev)
+{
+       struct mt6323_leds *leds = platform_get_drvdata(pdev);
+       int i;
+
+       /*
+        * Turn the LEDs off on driver removal.
+        *
+        * probe() fills leds->led[] by devicetree 'reg' index, so used
+        * slots need not be contiguous and nothing terminates the array.
+        * Visit every slot up to MT6323_MAX_LEDS and skip the empty ones
+        * instead of walking until the first NULL pointer.
+        */
+       for (i = 0; i < MT6323_MAX_LEDS; i++) {
+               if (leds->led[i])
+                       mt6323_led_hw_off(&leds->led[i]->cdev);
+       }
+
+       regmap_update_bits(leds->hw->regmap, MT6323_TOP_CKPDN0,
+                          MT6323_RG_DRV_32K_CK_PDN_MASK,
+                          MT6323_RG_DRV_32K_CK_PDN);
+
+       mutex_destroy(&leds->lock);
+
+       return 0;
+}
+
+static const struct of_device_id mt6323_led_dt_match[] = {     /* devicetree compatibles for this driver */
+       { .compatible = "mediatek,mt6323-led" },
+       {},     /* empty entry closes the list */
+};
+MODULE_DEVICE_TABLE(of, mt6323_led_dt_match);
+
+static struct platform_driver mt6323_led_driver = {    /* platform driver glue */
+       .probe          = mt6323_led_probe,     /* sets up and registers the LEDs */
+       .remove         = mt6323_led_remove,    /* switches the LEDs off again */
+       .driver         = {
+               .name   = "mt6323-led",
+               .of_match_table = mt6323_led_dt_match,  /* table defined above */
+       },
+};
+
+module_platform_driver(mt6323_led_driver);
+
+MODULE_DESCRIPTION("LED driver for Mediatek MT6323 PMIC");
+MODULE_AUTHOR("Sean Wang <sean.wang@mediatek.com>");
+MODULE_LICENSE("GPL");
index 06e6310..7fea18b 100644 (file)
@@ -254,6 +254,21 @@ static void pca9532_input_work(struct work_struct *work)
        mutex_unlock(&data->update_lock);
 }
 
+/*
+ * Read back the state currently programmed for one LED.
+ *
+ * The selector register for @led is read under data->update_lock and
+ * the LED's own field is extracted from it. Each selector register
+ * packs four LEDs at two bits apiece, so the field sits at bit offset
+ * LED_NUM(id) * 2 and must be masked to two bits; shifting by
+ * LED_NUM(id) / 2 without a mask returned neighbouring LEDs' bits too.
+ *
+ * NOTE(review): a failed i2c read is not detected here (the result is
+ * narrowed to char before use) - confirm whether callers need that.
+ */
+static enum pca9532_state pca9532_getled(struct pca9532_led *led)
+{
+       struct i2c_client *client = led->client;
+       struct pca9532_data *data = i2c_get_clientdata(client);
+       u8 maxleds = data->chip_info->num_leds;
+       char reg;
+       enum pca9532_state ret;
+
+       mutex_lock(&data->update_lock);
+       reg = i2c_smbus_read_byte_data(client, LED_REG(maxleds, led->id));
+       ret = (reg >> (LED_NUM(led->id) * 2)) & 0x3;
+       mutex_unlock(&data->update_lock);
+       return ret;
+}
+
 #ifdef CONFIG_LEDS_PCA9532_GPIO
 static int pca9532_gpio_request_pin(struct gpio_chip *gc, unsigned offset)
 {
@@ -366,7 +381,10 @@ static int pca9532_configure(struct i2c_client *client,
                        gpios++;
                        break;
                case PCA9532_TYPE_LED:
-                       led->state = pled->state;
+                       if (pled->state == PCA9532_KEEP)
+                               led->state = pca9532_getled(led);
+                       else
+                               led->state = pled->state;
                        led->name = pled->name;
                        led->ldev.name = led->name;
                        led->ldev.default_trigger = pled->default_trigger;
@@ -456,6 +474,7 @@ pca9532_of_populate_pdata(struct device *dev, struct device_node *np)
        const struct of_device_id *match;
        int devid, maxleds;
        int i = 0;
+       const char *state;
 
        match = of_match_device(of_pca9532_leds_match, dev);
        if (!match)
@@ -475,6 +494,12 @@ pca9532_of_populate_pdata(struct device *dev, struct device_node *np)
                of_property_read_u32(child, "type", &pdata->leds[i].type);
                of_property_read_string(child, "linux,default-trigger",
                                        &pdata->leds[i].default_trigger);
+               if (!of_property_read_string(child, "default-state", &state)) {
+                       if (!strcmp(state, "on"))
+                               pdata->leds[i].state = PCA9532_ON;
+                       else if (!strcmp(state, "keep"))
+                               pdata->leds[i].state = PCA9532_KEEP;
+               }
                if (++i >= maxleds) {
                        of_node_put(child);
                        break;
index a418964..66a6260 100644 (file)
 #define MAX_NAME_LEN   8
 
 struct led_trigger_cpu {
+       bool is_active;
        char name[MAX_NAME_LEN];
        struct led_trigger *_trig;
 };
 
 static DEFINE_PER_CPU(struct led_trigger_cpu, cpu_trig);
 
+static struct led_trigger *trig_cpu_all;
+static atomic_t num_active_cpus = ATOMIC_INIT(0);
+
 /**
  * ledtrig_cpu - emit a CPU event as a trigger
  * @evt: CPU event to be emitted
@@ -47,26 +51,46 @@ static DEFINE_PER_CPU(struct led_trigger_cpu, cpu_trig);
 void ledtrig_cpu(enum cpu_led_event ledevt)
 {
        struct led_trigger_cpu *trig = this_cpu_ptr(&cpu_trig);
+       bool is_active = trig->is_active;
 
        /* Locate the correct CPU LED */
        switch (ledevt) {
        case CPU_LED_IDLE_END:
        case CPU_LED_START:
                /* Will turn the LED on, max brightness */
-               led_trigger_event(trig->_trig, LED_FULL);
+               is_active = true;
                break;
 
        case CPU_LED_IDLE_START:
        case CPU_LED_STOP:
        case CPU_LED_HALTED:
                /* Will turn the LED off */
-               led_trigger_event(trig->_trig, LED_OFF);
+               is_active = false;
                break;
 
        default:
                /* Will leave the LED as it is */
                break;
        }
+
+       if (is_active != trig->is_active) {
+               unsigned int active_cpus;
+               unsigned int total_cpus;
+
+               /* Update trigger state */
+               trig->is_active = is_active;
+               atomic_add(is_active ? 1 : -1, &num_active_cpus);
+               active_cpus = atomic_read(&num_active_cpus);
+               total_cpus = num_present_cpus();
+
+               led_trigger_event(trig->_trig,
+                       is_active ? LED_FULL : LED_OFF);
+
+
+               led_trigger_event(trig_cpu_all,
+                       DIV_ROUND_UP(LED_FULL * active_cpus, total_cpus));
+
+       }
 }
 EXPORT_SYMBOL(ledtrig_cpu);
 
@@ -113,6 +137,11 @@ static int __init ledtrig_cpu_init(void)
        BUILD_BUG_ON(CONFIG_NR_CPUS > 9999);
 
        /*
+        * Registering a trigger for all CPUs.
+        */
+       led_trigger_register_simple("cpu", &trig_cpu_all);
+
+       /*
         * Registering CPU led trigger for each CPU core here
         * ignores CPU hotplug, but after this CPU hotplug works
         * fine with this trigger.
index 0527141..ead61a9 100644 (file)
@@ -33,4 +33,13 @@ config NVM_RRPC
        host. The target is implemented using a linear mapping table and
        cost-based garbage collection. It is optimized for 4K IO sizes.
 
+config NVM_PBLK
+       tristate "Physical Block Device Open-Channel SSD target"
+       ---help---
+       Allows an open-channel SSD to be exposed as a block device to the
+       host. The target assumes the device exposes raw flash and must be
+       explicitly managed by the host.
+
+       Please note the disk format is considered EXPERIMENTAL for now.
+
 endif # NVM
index b2a39e2..82d1a11 100644 (file)
@@ -4,3 +4,8 @@
 
 obj-$(CONFIG_NVM)              := core.o
 obj-$(CONFIG_NVM_RRPC)         += rrpc.o
+obj-$(CONFIG_NVM_PBLK)         += pblk.o
+pblk-y                         := pblk-init.o pblk-core.o pblk-rb.o \
+                                  pblk-write.o pblk-cache.o pblk-read.o \
+                                  pblk-gc.o pblk-recovery.o pblk-map.o \
+                                  pblk-rl.o pblk-sysfs.o
index 5262ba6..54a06c3 100644 (file)
@@ -89,7 +89,7 @@ static void nvm_release_luns_err(struct nvm_dev *dev, int lun_begin,
                WARN_ON(!test_and_clear_bit(i, dev->lun_map));
 }
 
-static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev)
+static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear)
 {
        struct nvm_dev *dev = tgt_dev->parent;
        struct nvm_dev_map *dev_map = tgt_dev->map;
@@ -100,11 +100,14 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev)
                int *lun_offs = ch_map->lun_offs;
                int ch = i + ch_map->ch_off;
 
-               for (j = 0; j < ch_map->nr_luns; j++) {
-                       int lun = j + lun_offs[j];
-                       int lunid = (ch * dev->geo.luns_per_chnl) + lun;
+               if (clear) {
+                       for (j = 0; j < ch_map->nr_luns; j++) {
+                               int lun = j + lun_offs[j];
+                               int lunid = (ch * dev->geo.luns_per_chnl) + lun;
 
-                       WARN_ON(!test_and_clear_bit(lunid, dev->lun_map));
+                               WARN_ON(!test_and_clear_bit(lunid,
+                                                       dev->lun_map));
+                       }
                }
 
                kfree(ch_map->lun_offs);
@@ -232,6 +235,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
        struct nvm_target *t;
        struct nvm_tgt_dev *tgt_dev;
        void *targetdata;
+       int ret;
 
        tt = nvm_find_target_type(create->tgttype, 1);
        if (!tt) {
@@ -252,34 +256,43 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
                return -ENOMEM;
 
        t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL);
-       if (!t)
+       if (!t) {
+               ret = -ENOMEM;
                goto err_reserve;
+       }
 
        tgt_dev = nvm_create_tgt_dev(dev, s->lun_begin, s->lun_end);
        if (!tgt_dev) {
                pr_err("nvm: could not create target device\n");
+               ret = -ENOMEM;
                goto err_t;
        }
 
-       tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
-       if (!tqueue)
+       tdisk = alloc_disk(0);
+       if (!tdisk) {
+               ret = -ENOMEM;
                goto err_dev;
-       blk_queue_make_request(tqueue, tt->make_rq);
+       }
 
-       tdisk = alloc_disk(0);
-       if (!tdisk)
-               goto err_queue;
+       tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
+       if (!tqueue) {
+               ret = -ENOMEM;
+               goto err_disk;
+       }
+       blk_queue_make_request(tqueue, tt->make_rq);
 
-       sprintf(tdisk->disk_name, "%s", create->tgtname);
+       strlcpy(tdisk->disk_name, create->tgtname, sizeof(tdisk->disk_name));
        tdisk->flags = GENHD_FL_EXT_DEVT;
        tdisk->major = 0;
        tdisk->first_minor = 0;
        tdisk->fops = &nvm_fops;
        tdisk->queue = tqueue;
 
-       targetdata = tt->init(tgt_dev, tdisk);
-       if (IS_ERR(targetdata))
+       targetdata = tt->init(tgt_dev, tdisk, create->flags);
+       if (IS_ERR(targetdata)) {
+               ret = PTR_ERR(targetdata);
                goto err_init;
+       }
 
        tdisk->private_data = targetdata;
        tqueue->queuedata = targetdata;
@@ -289,8 +302,10 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
        set_capacity(tdisk, tt->capacity(targetdata));
        add_disk(tdisk);
 
-       if (tt->sysfs_init && tt->sysfs_init(tdisk))
+       if (tt->sysfs_init && tt->sysfs_init(tdisk)) {
+               ret = -ENOMEM;
                goto err_sysfs;
+       }
 
        t->type = tt;
        t->disk = tdisk;
@@ -305,16 +320,17 @@ err_sysfs:
        if (tt->exit)
                tt->exit(targetdata);
 err_init:
-       put_disk(tdisk);
-err_queue:
        blk_cleanup_queue(tqueue);
+       tdisk->queue = NULL;
+err_disk:
+       put_disk(tdisk);
 err_dev:
-       nvm_remove_tgt_dev(tgt_dev);
+       nvm_remove_tgt_dev(tgt_dev, 0);
 err_t:
        kfree(t);
 err_reserve:
        nvm_release_luns_err(dev, s->lun_begin, s->lun_end);
-       return -ENOMEM;
+       return ret;
 }
 
 static void __nvm_remove_target(struct nvm_target *t)
@@ -332,7 +348,7 @@ static void __nvm_remove_target(struct nvm_target *t)
        if (tt->exit)
                tt->exit(tdisk->private_data);
 
-       nvm_remove_tgt_dev(t->dev);
+       nvm_remove_tgt_dev(t->dev, 1);
        put_disk(tdisk);
 
        list_del(&t->list);
@@ -411,6 +427,18 @@ err_rmap:
        return -ENOMEM;
 }
 
+/*
+ * Free dev->rmap: the lun_offs array of each of the dev->geo.nr_chnls
+ * channels, then the channel array, then the map itself.
+ *
+ * A NULL dev->rmap is accepted, as with kfree(NULL), so nvm_free() can
+ * still be used on a device whose map was never set up.
+ */
+static void nvm_unregister_map(struct nvm_dev *dev)
+{
+       struct nvm_dev_map *rmap = dev->rmap;
+       int i;
+
+       if (!rmap)
+               return;
+
+       for (i = 0; i < dev->geo.nr_chnls; i++)
+               kfree(rmap->chnls[i].lun_offs);
+
+       kfree(rmap->chnls);
+       kfree(rmap);
+}
+
 static void nvm_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
 {
        struct nvm_dev_map *dev_map = tgt_dev->map;
@@ -486,7 +514,6 @@ void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
                int *lun_roffs;
                struct ppa_addr gaddr;
                u64 pba = le64_to_cpu(entries[i]);
-               int off;
                u64 diff;
 
                if (!pba)
@@ -496,8 +523,6 @@ void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
                ch_rmap = &dev_rmap->chnls[gaddr.g.ch];
                lun_roffs = ch_rmap->lun_offs;
 
-               off = gaddr.g.ch * geo->luns_per_chnl + gaddr.g.lun;
-
                diff = ((ch_rmap->ch_off * geo->luns_per_chnl) +
                                (lun_roffs[gaddr.g.lun])) * geo->sec_per_lun;
 
@@ -590,11 +615,11 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
 
        memset(&rqd, 0, sizeof(struct nvm_rq));
 
-       nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1);
+       nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1);
        nvm_rq_tgt_to_dev(tgt_dev, &rqd);
 
        ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type);
-       nvm_free_rqd_ppalist(dev, &rqd);
+       nvm_free_rqd_ppalist(tgt_dev, &rqd);
        if (ret) {
                pr_err("nvm: failed bb mark\n");
                return -EINVAL;
@@ -626,34 +651,45 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
 }
 EXPORT_SYMBOL(nvm_submit_io);
 
-int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int flags)
+static void nvm_end_io_sync(struct nvm_rq *rqd)
 {
-       struct nvm_dev *dev = tgt_dev->parent;
-       struct nvm_rq rqd;
-       int ret;
+       struct completion *waiting = rqd->private;
 
-       if (!dev->ops->erase_block)
-               return 0;
+       complete(waiting);
+}
 
-       nvm_map_to_dev(tgt_dev, ppas);
+int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
+                                                               int nr_ppas)
+{
+       struct nvm_geo *geo = &tgt_dev->geo;
+       struct nvm_rq rqd;
+       int ret;
+       DECLARE_COMPLETION_ONSTACK(wait);
 
        memset(&rqd, 0, sizeof(struct nvm_rq));
 
-       ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, 1, 1);
+       rqd.opcode = NVM_OP_ERASE;
+       rqd.end_io = nvm_end_io_sync;
+       rqd.private = &wait;
+       rqd.flags = geo->plane_mode >> 1;
+
+       ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1);
        if (ret)
                return ret;
 
-       nvm_rq_tgt_to_dev(tgt_dev, &rqd);
-
-       rqd.flags = flags;
-
-       ret = dev->ops->erase_block(dev, &rqd);
+       ret = nvm_submit_io(tgt_dev, &rqd);
+       if (ret) {
+               pr_err("rrpr: erase I/O submission failed: %d\n", ret);
+               goto free_ppa_list;
+       }
+       wait_for_completion_io(&wait);
 
-       nvm_free_rqd_ppalist(dev, &rqd);
+free_ppa_list:
+       nvm_free_rqd_ppalist(tgt_dev, &rqd);
 
        return ret;
 }
-EXPORT_SYMBOL(nvm_erase_blk);
+EXPORT_SYMBOL(nvm_erase_sync);
 
 int nvm_get_l2p_tbl(struct nvm_tgt_dev *tgt_dev, u64 slba, u32 nlb,
                    nvm_l2p_update_fn *update_l2p, void *priv)
@@ -732,10 +768,11 @@ void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin)
 }
 EXPORT_SYMBOL(nvm_put_area);
 
-int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
+int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
                        const struct ppa_addr *ppas, int nr_ppas, int vblk)
 {
-       struct nvm_geo *geo = &dev->geo;
+       struct nvm_dev *dev = tgt_dev->parent;
+       struct nvm_geo *geo = &tgt_dev->geo;
        int i, plane_cnt, pl_idx;
        struct ppa_addr ppa;
 
@@ -773,12 +810,12 @@ int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
 }
 EXPORT_SYMBOL(nvm_set_rqd_ppalist);
 
-void nvm_free_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd)
+void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
 {
        if (!rqd->ppa_list)
                return;
 
-       nvm_dev_dma_free(dev, rqd->ppa_list, rqd->dma_ppa_list);
+       nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
 }
 EXPORT_SYMBOL(nvm_free_rqd_ppalist);
 
@@ -972,7 +1009,7 @@ err_fmtype:
        return ret;
 }
 
-void nvm_free(struct nvm_dev *dev)
+static void nvm_free(struct nvm_dev *dev)
 {
        if (!dev)
                return;
@@ -980,7 +1017,7 @@ void nvm_free(struct nvm_dev *dev)
        if (dev->dma_pool)
                dev->ops->destroy_dma_pool(dev->dma_pool);
 
-       kfree(dev->rmap);
+       nvm_unregister_map(dev);
        kfree(dev->lptbl);
        kfree(dev->lun_map);
        kfree(dev);
@@ -1174,13 +1211,13 @@ static long nvm_ioctl_get_devices(struct file *file, void __user *arg)
        list_for_each_entry(dev, &nvm_devices, devices) {
                struct nvm_ioctl_device_info *info = &devices->info[i];
 
-               sprintf(info->devname, "%s", dev->name);
+               strlcpy(info->devname, dev->name, sizeof(info->devname));
 
                /* kept for compatibility */
                info->bmversion[0] = 1;
                info->bmversion[1] = 0;
                info->bmversion[2] = 0;
-               sprintf(info->bmname, "%s", "gennvm");
+               strlcpy(info->bmname, "gennvm", sizeof(info->bmname));
                i++;
 
                if (i > 31) {
@@ -1217,8 +1254,16 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg)
        create.tgtname[DISK_NAME_LEN - 1] = '\0';
 
        if (create.flags != 0) {
-               pr_err("nvm: no flags supported\n");
-               return -EINVAL;
+               __u32 flags = create.flags;
+
+               /* Check for valid flags */
+               if (flags & NVM_TARGET_FACTORY)
+                       flags &= ~NVM_TARGET_FACTORY;
+
+               if (flags) {
+                       pr_err("nvm: flag not supported\n");
+                       return -EINVAL;
+               }
        }
 
        return __nvm_configure_create(&create);
diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
new file mode 100644 (file)
index 0000000..59bcea8
--- /dev/null
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *                  Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * pblk-cache.c - pblk's write cache
+ */
+
+#include "pblk.h"
+
+int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
+{
+       struct pblk_w_ctx w_ctx;
+       sector_t lba = pblk_get_lba(bio);
+       unsigned int bpos, pos;
+       int nr_entries = pblk_get_secs(bio);
+       int i, ret;
+
+       /* Update the write buffer head (mem) with the entries that we can
+        * write. The write in itself cannot fail, so there is no need to
+        * rollback from here on.
+        */
+retry:
+       ret = pblk_rb_may_write_user(&pblk->rwb, bio, nr_entries, &bpos);
+       if (ret == NVM_IO_REQUEUE) {
+               io_schedule();
+               goto retry;
+       }
+
+       if (unlikely(!bio_has_data(bio)))
+               goto out;
+
+       w_ctx.flags = flags;
+       pblk_ppa_set_empty(&w_ctx.ppa);
+
+       for (i = 0; i < nr_entries; i++) {
+               void *data = bio_data(bio);
+
+               w_ctx.lba = lba + i;
+
+               pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + i);
+               pblk_rb_write_entry_user(&pblk->rwb, data, w_ctx, pos);
+
+               bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
+       }
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_add(nr_entries, &pblk->inflight_writes);
+       atomic_long_add(nr_entries, &pblk->req_writes);
+#endif
+
+out:
+       pblk_write_should_kick(pblk);
+       return ret;
+}
+
+/*
+ * On GC the incoming lbas are not necessarily sequential. Also, some of the
+ * lbas might not be valid entries, which are marked as empty by the GC thread
+ */
+int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list,
+                          unsigned int nr_entries, unsigned int nr_rec_entries,
+                          struct pblk_line *gc_line, unsigned long flags)
+{
+       struct pblk_w_ctx w_ctx;
+       unsigned int bpos, pos;
+       int i, valid_entries;
+
+       /* Update the write buffer head (mem) with the entries that we can
+        * write. The write in itself cannot fail, so there is no need to
+        * rollback from here on.
+        */
+retry:
+       if (!pblk_rb_may_write_gc(&pblk->rwb, nr_rec_entries, &bpos)) {
+               io_schedule();
+               goto retry;
+       }
+
+       w_ctx.flags = flags;
+       pblk_ppa_set_empty(&w_ctx.ppa);
+
+       for (i = 0, valid_entries = 0; i < nr_entries; i++) {
+               if (lba_list[i] == ADDR_EMPTY)
+                       continue;
+
+               w_ctx.lba = lba_list[i];
+
+               pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries);
+               pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_line, pos);
+
+               data += PBLK_EXPOSED_PAGE_SIZE;
+               valid_entries++;
+       }
+
+       WARN_ONCE(nr_rec_entries != valid_entries,
+                                       "pblk: inconsistent GC write\n");
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_add(valid_entries, &pblk->inflight_writes);
+       atomic_long_add(valid_entries, &pblk->recov_gc_writes);
+#endif
+
+       pblk_write_should_kick(pblk);
+       return NVM_IO_OK;
+}
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
new file mode 100644 (file)
index 0000000..5e44768
--- /dev/null
@@ -0,0 +1,1667 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *                  Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * pblk-core.c - pblk's core functionality
+ *
+ */
+
+#include "pblk.h"
+#include <linux/time.h>
+
+/*
+ * Account for a failed block erase: bump the erase_failed counter, drop the
+ * line's block count, set the block position in the line's blk_bitmap and
+ * hand @ppa to pblk_line_mark_bb through pblk_line_run_ws().
+ */
+static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
+                        struct ppa_addr *ppa)
+{
+       int blk_pos = pblk_dev_ppa_to_pos(&pblk->dev->geo, *ppa);
+       int already_bad;
+
+       pr_debug("pblk: erase failed: line:%d, pos:%d\n",
+                                                       line->id, blk_pos);
+       atomic_long_inc(&pblk->erase_failed);
+
+       atomic_dec(&line->blk_in_line);
+
+       /* A bit that is already set means the block was known to be bad */
+       already_bad = test_and_set_bit(blk_pos, line->blk_bitmap);
+       if (already_bad)
+               pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n",
+                                                       line->id, blk_pos);
+
+       pblk_line_run_ws(pblk, NULL, ppa, pblk_line_mark_bb);
+}
+
+/*
+ * Common erase completion: decrement left_seblks on the line the request
+ * targeted. On error, a heap copy of the failed address is passed on to
+ * pblk_mark_bb(); if that allocation fails the block is not marked.
+ */
+static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd)
+{
+       struct pblk_line *line;
+       struct ppa_addr *failed;
+
+       line = &pblk->lines[pblk_dev_ppa_to_line(rqd->ppa_addr)];
+       atomic_dec(&line->left_seblks);
+
+       if (!rqd->error)
+               return;
+
+       /* The copy is handed to pblk_mark_bb(); it is not freed here */
+       failed = kmalloc(sizeof(*failed), GFP_ATOMIC);
+       if (!failed)
+               return;
+
+       *failed = rqd->ppa_addr;
+       pblk_mark_bb(pblk, line, failed);
+}
+
+/*
+ * Completion callback for erase requests whose rqd->private points to the
+ * pblk instance. Erase completion assumes that only one block is erased at
+ * the time.
+ */
+static void pblk_end_io_erase(struct nvm_rq *rqd)
+{
+       struct pblk *pblk = rqd->private;
+
+       /* Release the erase semaphore before doing the line accounting */
+       up(&pblk->erase_sem);
+       __pblk_end_io_erase(pblk, rqd);
+       /* The request is returned to the read request pool */
+       mempool_free(rqd, pblk->r_rq_pool);
+}
+
+/*
+ * Mark sector @paddr of @line as invalid, decrement the line's vsc and, for
+ * closed lines, move the line to the GC list chosen by pblk_line_gc_list().
+ *
+ * Nothing is done for lines in GC or FREE state. A sector whose bit is already
+ * set in invalid_bitmap triggers a one-time warning and is left untouched.
+ */
+static void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line,
+                                 u64 paddr)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct list_head *move_list = NULL;
+
+       /* Lines being reclaimed (GC'ed) cannot be invalidated. Before the L2P
+        * table is modified with reclaimed sectors, a check is done to ensure
+        * that newer updates are not overwritten.
+        */
+       spin_lock(&line->lock);
+       if (line->state == PBLK_LINESTATE_GC ||
+                                       line->state == PBLK_LINESTATE_FREE) {
+               spin_unlock(&line->lock);
+               return;
+       }
+
+       if (test_and_set_bit(paddr, line->invalid_bitmap)) {
+               WARN_ONCE(1, "pblk: double invalidate\n");
+               spin_unlock(&line->lock);
+               return;
+       }
+       line->vsc--;
+
+       /* Only closed lines are re-bucketed; move_list stays NULL otherwise */
+       if (line->state == PBLK_LINESTATE_CLOSED)
+               move_list = pblk_line_gc_list(pblk, line);
+       spin_unlock(&line->lock);
+
+       if (move_list) {
+               /* line->lock was dropped above, so the state is checked again
+                * after taking gc_lock and line->lock (in that order).
+                */
+               spin_lock(&l_mg->gc_lock);
+               spin_lock(&line->lock);
+               /* Prevent moving a line that has just been chosen for GC */
+               if (line->state == PBLK_LINESTATE_GC ||
+                                       line->state == PBLK_LINESTATE_FREE) {
+                       spin_unlock(&line->lock);
+                       spin_unlock(&l_mg->gc_lock);
+                       return;
+               }
+               spin_unlock(&line->lock);
+
+               /* The list move itself is done under gc_lock only */
+               list_move_tail(&line->list, move_list);
+               spin_unlock(&l_mg->gc_lock);
+       }
+}
+
+/*
+ * Invalidate the sector addressed by @ppa: resolve the line it belongs to and
+ * its in-line sector address, then defer to __pblk_map_invalidate().
+ */
+void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa)
+{
+       int id;
+       u64 sec_off;
+
+#ifdef CONFIG_NVM_DEBUG
+       /* Callers must ensure that the ppa points to a device address */
+       BUG_ON(pblk_addr_in_cache(ppa));
+       BUG_ON(pblk_ppa_empty(ppa));
+#endif
+
+       id = pblk_tgt_ppa_to_line(ppa);
+       sec_off = pblk_dev_ppa_to_line_addr(pblk, ppa);
+
+       __pblk_map_invalidate(pblk, &pblk->lines[id], sec_off);
+}
+
+/*
+ * Invalidate sector @paddr of @line and decrement the line's left_ssecs
+ * counter. The counter update is bracketed by pblk_rb_sync_init() and
+ * pblk_rb_sync_end(); when it reaches zero, pblk_line_close_ws is queued for
+ * the line through pblk_line_run_ws().
+ */
+void pblk_map_pad_invalidate(struct pblk *pblk, struct pblk_line *line,
+                            u64 paddr)
+{
+       __pblk_map_invalidate(pblk, line, paddr);
+
+       pblk_rb_sync_init(&pblk->rwb, NULL);
+       line->left_ssecs--;
+       if (!line->left_ssecs)
+               pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws);
+       pblk_rb_sync_end(&pblk->rwb, NULL);
+}
+
+/*
+ * Drop the mapping of @nr_secs lbas starting at @slba. Under trans_lock, each
+ * lba whose current mapping is a non-empty address outside the cache has that
+ * address invalidated, and every lba in the range is then mapped to the empty
+ * address.
+ */
+static void pblk_invalidate_range(struct pblk *pblk, sector_t slba,
+                                 unsigned int nr_secs)
+{
+       sector_t lba;
+
+       spin_lock(&pblk->trans_lock);
+       for (lba = slba; lba < slba + nr_secs; lba++) {
+               struct ppa_addr ppa;
+
+               ppa = pblk_trans_map_get(pblk, lba);
+
+               /* Only device addresses are invalidated on a line */
+               if (!pblk_addr_in_cache(ppa) && !pblk_ppa_empty(ppa))
+                       pblk_map_invalidate(pblk, ppa);
+
+               /* The lba is unmapped regardless of where it pointed */
+               pblk_ppa_set_empty(&ppa);
+               pblk_trans_map_set(pblk, lba, ppa);
+       }
+       spin_unlock(&pblk->trans_lock);
+}
+
+/*
+ * Allocate a zeroed request from the write pool when @rw is WRITE, or from
+ * the read pool for any other value. The number of bytes cleared is the
+ * request size that goes with the selected pool.
+ */
+struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw)
+{
+       int is_write = (rw == WRITE);
+       mempool_t *src = is_write ? pblk->w_rq_pool : pblk->r_rq_pool;
+       int bytes = is_write ? pblk_w_rq_size : pblk_r_rq_size;
+       struct nvm_rq *rqd = mempool_alloc(src, GFP_KERNEL);
+
+       memset(rqd, 0, bytes);
+
+       return rqd;
+}
+
+/*
+ * Return @rqd to the write request pool when @rw is WRITE, and to the read
+ * request pool otherwise.
+ */
+void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw)
+{
+       mempool_free(rqd, (rw == WRITE) ? pblk->w_rq_pool : pblk->r_rq_pool);
+}
+
+/*
+ * Return @nr_pages pages of @bio, starting at bvec index @off, to
+ * pblk->page_pool. The range is expected to end at the bio's last bvec
+ * (off + nr_pages == bi_vcnt); a warning is raised otherwise. The bio is
+ * advanced by @off exposed pages before the pages are released.
+ */
+void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
+                        int nr_pages)
+{
+       struct bio_vec bv;
+       int i;
+
+       WARN_ON(off + nr_pages != bio->bi_vcnt);
+
+       bio_advance(bio, off * PBLK_EXPOSED_PAGE_SIZE);
+       for (i = off; i < nr_pages + off; i++) {
+               bv = bio->bi_io_vec[i];
+               mempool_free(bv.bv_page, pblk->page_pool);
+       }
+}
+
+/*
+ * Append @nr_pages pages taken from pblk->page_pool to @bio, one exposed page
+ * each.
+ *
+ * @flags is the allocation mask passed to mempool_alloc().
+ *
+ * Returns 0 on success and -1 on failure. On failure every page added by this
+ * call is returned to the pool, so the caller only has to dispose of the bio.
+ */
+int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
+                      int nr_pages)
+{
+       struct request_queue *q = pblk->dev->q;
+       struct page *page;
+       int i, ret;
+
+       for (i = 0; i < nr_pages; i++) {
+               page = mempool_alloc(pblk->page_pool, flags);
+               if (!page)
+                       goto err;
+
+               ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0);
+               if (ret != PBLK_EXPOSED_PAGE_SIZE) {
+                       pr_err("pblk: could not add page to bio\n");
+                       /* This page never made it into the bio */
+                       mempool_free(page, pblk->page_pool);
+                       goto err;
+               }
+       }
+
+       return 0;
+err:
+       /* Exactly i pages were added before the failure and they are the
+        * last i bvecs of the bio: release all of them, which is a no-op
+        * when the very first page could not be added.
+        */
+       pblk_bio_free_pages(pblk, bio, bio->bi_vcnt - i, i);
+       return -1;
+}
+
+/*
+ * Wake up the writer thread and re-arm the write timer to fire one second
+ * (1000 ms) from now.
+ */
+static void pblk_write_kick(struct pblk *pblk)
+{
+       wake_up_process(pblk->writer_ts);
+       mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(1000));
+}
+
+/*
+ * Write timer callback. @data carries the pblk instance pointer cast to an
+ * unsigned long.
+ */
+void pblk_write_timer_fn(unsigned long data)
+{
+       struct pblk *pblk = (struct pblk *)data;
+
+       /* kick the write thread every tick to flush outstanding data */
+       pblk_write_kick(pblk);
+}
+
+/*
+ * Kick the writer thread if the write buffer's read count has reached
+ * pblk->min_write_pgs; do nothing otherwise.
+ */
+void pblk_write_should_kick(struct pblk *pblk)
+{
+       unsigned int secs_avail = pblk_rb_read_count(&pblk->rwb);
+
+       if (secs_avail >= pblk->min_write_pgs)
+               pblk_write_kick(pblk);
+}
+
+/*
+ * bio completion callback for synchronous internal bios: bio->bi_private must
+ * point to the struct completion the submitter is waiting on.
+ */
+void pblk_end_bio_sync(struct bio *bio)
+{
+       struct completion *waiting = bio->bi_private;
+
+       complete(waiting);
+}
+
+/*
+ * Request completion callback for synchronous requests: rqd->private must
+ * point to the struct completion the submitter is waiting on.
+ */
+void pblk_end_io_sync(struct nvm_rq *rqd)
+{
+       struct completion *waiting = rqd->private;
+
+       complete(waiting);
+}
+
+/*
+ * Flush the write buffer synchronously by sending an internal, data-less
+ * write bio flagged as a pre-flush through pblk_write_to_cache() and waiting
+ * for it to complete (up to PBLK_COMMAND_TIMEOUT_MS).
+ *
+ * Failures are only logged; the function returns nothing. If the bio cannot
+ * be allocated the flush is skipped.
+ *
+ * NOTE(review): on timeout the function returns while the bio still points
+ * to the on-stack completion - confirm the bio cannot complete afterwards.
+ */
+void pblk_flush_writer(struct pblk *pblk)
+{
+       struct bio *bio;
+       int ret;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       bio = bio_alloc(GFP_KERNEL, 1);
+       if (!bio)
+               return;
+
+       bio->bi_iter.bi_sector = 0; /* internal bio */
+       /* The operation is a write; the flush request is a flag on top of
+        * it. REQ_OP_FLUSH is an opcode and must not be OR-ed in as a flag.
+        */
+       bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_PREFLUSH);
+       bio->bi_private = &wait;
+       bio->bi_end_io = pblk_end_bio_sync;
+
+       ret = pblk_write_to_cache(pblk, bio, 0);
+       if (ret == NVM_IO_OK) {
+               if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+                       pr_err("pblk: flush cache timed out\n");
+               }
+       } else if (ret != NVM_IO_DONE) {
+               /* NVM_IO_DONE needs no waiting; anything else is a failure */
+               pr_err("pblk: tear down bio failed\n");
+       }
+
+       /* bi_error is a signed errno value */
+       if (bio->bi_error)
+               pr_err("pblk: flush sync write failed (%d)\n", bio->bi_error);
+
+       bio_put(bio);
+}
+
+/*
+ * Pick the GC list that matches @line's vsc (presumably its valid sector
+ * count):
+ *
+ *   vsc == 0             -> gc_full_list
+ *   vsc <  mid_thrs      -> gc_high_list
+ *   vsc <  high_thrs     -> gc_mid_list
+ *   vsc <  sec_in_line   -> gc_low_list
+ *   vsc == sec_in_line   -> gc_empty_list
+ *   vsc >  sec_in_line   -> corrupt_list (line state set to CORRUPT)
+ *
+ * line->gc_group is updated as a side effect. Returns the list the line has
+ * to be moved to, or NULL when the line already is in the matching group.
+ * No lock is taken in this function.
+ */
+struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line)
+{
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct list_head *move_list = NULL;
+
+       if (!line->vsc) {
+               if (line->gc_group != PBLK_LINEGC_FULL) {
+                       line->gc_group = PBLK_LINEGC_FULL;
+                       move_list = &l_mg->gc_full_list;
+               }
+       } else if (line->vsc < lm->mid_thrs) {
+               if (line->gc_group != PBLK_LINEGC_HIGH) {
+                       line->gc_group = PBLK_LINEGC_HIGH;
+                       move_list = &l_mg->gc_high_list;
+               }
+       } else if (line->vsc < lm->high_thrs) {
+               if (line->gc_group != PBLK_LINEGC_MID) {
+                       line->gc_group = PBLK_LINEGC_MID;
+                       move_list = &l_mg->gc_mid_list;
+               }
+       } else if (line->vsc < line->sec_in_line) {
+               if (line->gc_group != PBLK_LINEGC_LOW) {
+                       line->gc_group = PBLK_LINEGC_LOW;
+                       move_list = &l_mg->gc_low_list;
+               }
+       } else if (line->vsc == line->sec_in_line) {
+               if (line->gc_group != PBLK_LINEGC_EMPTY) {
+                       line->gc_group = PBLK_LINEGC_EMPTY;
+                       move_list = &l_mg->gc_empty_list;
+               }
+       } else {
+               /* vsc exceeds the number of sectors in the line */
+               line->state = PBLK_LINESTATE_CORRUPT;
+               line->gc_group = PBLK_LINEGC_NONE;
+               move_list =  &l_mg->corrupt_list;
+               pr_err("pblk: corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n",
+                                               line->id, line->vsc,
+                                               line->sec_in_line,
+                                               lm->high_thrs, lm->mid_thrs);
+       }
+
+       return move_list;
+}
+
+/*
+ * Handle a discard bio: invalidate the lba range described by @bio, as
+ * reported by pblk_get_lba() and pblk_get_secs().
+ */
+void pblk_discard(struct pblk *pblk, struct bio *bio)
+{
+       sector_t slba = pblk_get_lba(bio);
+       sector_t nr_secs = pblk_get_secs(bio);
+
+       pblk_invalidate_range(pblk, slba, nr_secs);
+}
+
+/*
+ * Look up the address currently mapped to @lba. The lookup is done under
+ * trans_lock; the returned value is a snapshot taken at that moment.
+ */
+struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba)
+{
+       struct ppa_addr ppa;
+
+       spin_lock(&pblk->trans_lock);
+       ppa = pblk_trans_map_get(pblk, lba);
+       spin_unlock(&pblk->trans_lock);
+
+       return ppa;
+}
+
+/*
+ * Account for a failed write request: bump the write_failed counter and, with
+ * CONFIG_NVM_DEBUG, print the failed request together with rqd->error.
+ */
+void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd)
+{
+       atomic_long_inc(&pblk->write_failed);
+#ifdef CONFIG_NVM_DEBUG
+       pblk_print_failed_rqd(pblk, rqd, rqd->error);
+#endif
+}
+
+/*
+ * Account for a read request that completed with rqd->error set, bumping the
+ * counter that matches the error code. Unknown codes are logged. Except for
+ * empty-page reads, the failed request is also printed when CONFIG_NVM_DEBUG
+ * is enabled.
+ */
+void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd)
+{
+       switch (rqd->error) {
+       case NVM_RSP_ERR_EMPTYPAGE:
+               /* Empty page read is not necessarily an error (e.g., L2P
+                * recovery): count it and skip the debug print below.
+                */
+               atomic_long_inc(&pblk->read_empty);
+               return;
+       case NVM_RSP_WARN_HIGHECC:
+               atomic_long_inc(&pblk->read_high_ecc);
+               break;
+       case NVM_RSP_ERR_FAILECC:
+       case NVM_RSP_ERR_FAILCRC:
+               atomic_long_inc(&pblk->read_failed);
+               break;
+       default:
+               pr_err("pblk: unknown read error:%d\n", rqd->error);
+       }
+#ifdef CONFIG_NVM_DEBUG
+       pblk_print_failed_rqd(pblk, rqd, rqd->error);
+#endif
+}
+
+/*
+ * Submit @rqd to the underlying device through nvm_submit_io() and return its
+ * result.
+ *
+ * With CONFIG_NVM_DEBUG the request is validated first: -EINVAL is returned
+ * (with a warning) when pblk_boundary_ppa_checks() rejects the address list,
+ * or when a write targets a line that is not in the OPEN state.
+ */
+int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+
+#ifdef CONFIG_NVM_DEBUG
+       struct ppa_addr *ppa_list;
+
+       /* Single-address requests carry the address inline in ppa_addr */
+       ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;
+       if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) {
+               WARN_ON(1);
+               return -EINVAL;
+       }
+
+       if (rqd->opcode == NVM_OP_PWRITE) {
+               struct pblk_line *line;
+               struct ppa_addr ppa;
+               int i;
+
+               /* Every written address must belong to an open line */
+               for (i = 0; i < rqd->nr_ppas; i++) {
+                       ppa = ppa_list[i];
+                       line = &pblk->lines[pblk_dev_ppa_to_line(ppa)];
+
+                       spin_lock(&line->lock);
+                       if (line->state != PBLK_LINESTATE_OPEN) {
+                               pr_err("pblk: bad ppa: line:%d,state:%d\n",
+                                                       line->id, line->state);
+                               WARN_ON(1);
+                               spin_unlock(&line->lock);
+                               return -EINVAL;
+                       }
+                       spin_unlock(&line->lock);
+               }
+       }
+#endif
+       return nvm_submit_io(dev, rqd);
+}
+
+/*
+ * Build a bio that covers the metadata buffer at @data.
+ *
+ * When emeta is kmalloc'ed (PBLK_KMALLOC_META) the buffer is mapped with
+ * bio_map_kern() over @len bytes. Otherwise the buffer is treated as vmalloc
+ * memory and @nr_secs pages are looked up with vmalloc_to_page() and added
+ * one PAGE_SIZE page at a time.
+ *
+ * Returns the bio; in the vmalloc case failures are reported as
+ * ERR_PTR(-ENOMEM).
+ */
+struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
+                             unsigned int nr_secs, unsigned int len,
+                             gfp_t gfp_mask)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       void *cursor = data;
+       struct page *pg;
+       struct bio *bio;
+       int idx, added;
+
+       if (l_mg->emeta_alloc_type == PBLK_KMALLOC_META)
+               return bio_map_kern(dev->q, cursor, len, gfp_mask);
+
+       bio = bio_kmalloc(gfp_mask, nr_secs);
+       if (!bio)
+               return ERR_PTR(-ENOMEM);
+
+       for (idx = 0; idx < nr_secs; idx++, cursor += PAGE_SIZE) {
+               pg = vmalloc_to_page(cursor);
+               if (!pg) {
+                       pr_err("pblk: could not map vmalloc bio\n");
+                       bio_put(bio);
+                       return ERR_PTR(-ENOMEM);
+               }
+
+               added = bio_add_pc_page(dev->q, bio, pg, PAGE_SIZE, 0);
+               if (added != PAGE_SIZE) {
+                       pr_err("pblk: could not add page to bio\n");
+                       bio_put(bio);
+                       return ERR_PTR(-ENOMEM);
+               }
+       }
+
+       return bio;
+}
+
+/*
+ * Decide how many sectors to put in the next request.
+ *
+ * With at least max_write_pgs sectors available, max_write_pgs is used. With
+ * at least min_write_pgs available, the largest multiple of min_write_pgs
+ * that fits in @secs_avail is used. Below that, min_write_pgs is returned
+ * only if @secs_to_flush is non-zero, and 0 otherwise.
+ */
+int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
+                  unsigned long secs_to_flush)
+{
+       int hi = pblk->max_write_pgs;
+       int lo = pblk->min_write_pgs;
+
+       if (secs_avail >= hi)
+               return hi;
+
+       if (secs_avail >= lo)
+               return lo * (secs_avail / lo);
+
+       return secs_to_flush ? lo : 0;
+}
+
+/*
+ * Allocate @nr_secs consecutive sectors in @line. line->cur_sec is first
+ * moved to the next zero bit of map_bitmap, then @nr_secs bits are set from
+ * there. Returns the in-line address of the first allocated sector.
+ *
+ * Callers in this file take line->lock around this function.
+ */
+static u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line,
+                            int nr_secs)
+{
+       u64 addr;
+       int i;
+
+       /* logic error: ppa out-of-bounds. Prevent generating bad address */
+       if (line->cur_sec + nr_secs > pblk->lm.sec_per_line) {
+               WARN(1, "pblk: page allocation out of bounds\n");
+               nr_secs = pblk->lm.sec_per_line - line->cur_sec;
+       }
+
+       line->cur_sec = addr = find_next_zero_bit(line->map_bitmap,
+                                       pblk->lm.sec_per_line, line->cur_sec);
+       /* Warn if any sector in the run turns out to be already mapped */
+       for (i = 0; i < nr_secs; i++, line->cur_sec++)
+               WARN_ON(test_and_set_bit(line->cur_sec, line->map_bitmap));
+
+       return addr;
+}
+
+/*
+ * Locked wrapper around __pblk_alloc_page() that also subtracts @nr_secs from
+ * line->left_msecs, warning if that counter drops below zero. Returns the
+ * in-line address of the first allocated sector.
+ */
+u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs)
+{
+       u64 addr;
+
+       /* Lock needed in case a write fails and a recovery needs to remap
+        * failed write buffer entries
+        */
+       spin_lock(&line->lock);
+       addr = __pblk_alloc_page(pblk, line, nr_secs);
+       line->left_msecs -= nr_secs;
+       WARN(line->left_msecs < 0, "pblk: page allocation out of bounds\n");
+       spin_unlock(&line->lock);
+
+       return addr;
+}
+
+/*
+ * Submit emeta to one LUN in the raid line at the time to avoid a deadlock when
+ * taking the per LUN semaphore.
+ *
+ * Reads (@dir == READ) or writes (@dir == WRITE) the lm->emeta_sec sectors of
+ * line->emeta, split into synchronous requests sized by pblk_calc_secs().
+ * For writes the target sectors are allocated from the line; for reads they
+ * start at @paddr and positions flagged in blk_bitmap are skipped.
+ *
+ * Returns -EINVAL for any other @dir, -ENOMEM if the ppa list cannot be
+ * allocated, the bio mapping or submission error if one occurs, and -EINTR if
+ * a read runs past the line boundary. A request timeout or a device error in
+ * rqd.error is only logged.
+ *
+ * NOTE(review): the loop relies on pblk_calc_secs() returning a non-zero
+ * count, i.e. on emeta_sec being a multiple of min_write_pgs - confirm.
+ */
+static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line,
+                                    u64 paddr, int dir)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct bio *bio;
+       struct nvm_rq rqd;
+       struct ppa_addr *ppa_list;
+       dma_addr_t dma_ppa_list;
+       void *emeta = line->emeta;
+       int min = pblk->min_write_pgs;
+       int left_ppas = lm->emeta_sec;
+       int id = line->id;
+       int rq_ppas, rq_len;
+       int cmd_op, bio_op;
+       int flags;
+       int i, j;
+       int ret;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       if (dir == WRITE) {
+               bio_op = REQ_OP_WRITE;
+               cmd_op = NVM_OP_PWRITE;
+               flags = pblk_set_progr_mode(pblk, WRITE);
+       } else if (dir == READ) {
+               bio_op = REQ_OP_READ;
+               cmd_op = NVM_OP_PREAD;
+               flags = pblk_set_read_mode(pblk);
+       } else
+               return -EINVAL;
+
+       /* One ppa list is allocated up front and reused by every request */
+       ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_ppa_list);
+       if (!ppa_list)
+               return -ENOMEM;
+
+next_rq:
+       memset(&rqd, 0, sizeof(struct nvm_rq));
+
+       rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+       rq_len = rq_ppas * geo->sec_size;
+
+       bio = pblk_bio_map_addr(pblk, emeta, rq_ppas, rq_len, GFP_KERNEL);
+       if (IS_ERR(bio)) {
+               ret = PTR_ERR(bio);
+               goto free_rqd_dma;
+       }
+
+       bio->bi_iter.bi_sector = 0; /* internal bio */
+       bio_set_op_attrs(bio, bio_op, 0);
+
+       rqd.bio = bio;
+       rqd.opcode = cmd_op;
+       rqd.flags = flags;
+       rqd.nr_ppas = rq_ppas;
+       rqd.ppa_list = ppa_list;
+       rqd.dma_ppa_list = dma_ppa_list;
+       rqd.end_io = pblk_end_io_sync;
+       rqd.private = &wait;
+
+       if (dir == WRITE) {
+               /* Allocate the target sectors min at a time under line->lock */
+               for (i = 0; i < rqd.nr_ppas; ) {
+                       spin_lock(&line->lock);
+                       paddr = __pblk_alloc_page(pblk, line, min);
+                       spin_unlock(&line->lock);
+                       for (j = 0; j < min; j++, i++, paddr++)
+                               rqd.ppa_list[i] =
+                                       addr_to_gen_ppa(pblk, paddr, id);
+               }
+       } else {
+               for (i = 0; i < rqd.nr_ppas; ) {
+                       struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id);
+                       int pos = pblk_dev_ppa_to_pos(geo, ppa);
+
+                       /* Step over positions flagged in blk_bitmap, min
+                        * sectors at a time.
+                        */
+                       while (test_bit(pos, line->blk_bitmap)) {
+                               paddr += min;
+                               if (pblk_boundary_paddr_checks(pblk, paddr)) {
+                                       pr_err("pblk: corrupt emeta line:%d\n",
+                                                               line->id);
+                                       bio_put(bio);
+                                       ret = -EINTR;
+                                       goto free_rqd_dma;
+                               }
+
+                               ppa = addr_to_gen_ppa(pblk, paddr, id);
+                               pos = pblk_dev_ppa_to_pos(geo, ppa);
+                       }
+
+                       if (pblk_boundary_paddr_checks(pblk, paddr + min)) {
+                               pr_err("pblk: corrupt emeta line:%d\n",
+                                                               line->id);
+                               bio_put(bio);
+                               ret = -EINTR;
+                               goto free_rqd_dma;
+                       }
+
+                       for (j = 0; j < min; j++, i++, paddr++)
+                               rqd.ppa_list[i] =
+                                       addr_to_gen_ppa(pblk, paddr, line->id);
+               }
+       }
+
+       ret = pblk_submit_io(pblk, &rqd);
+       if (ret) {
+               pr_err("pblk: emeta I/O submission failed: %d\n", ret);
+               bio_put(bio);
+               goto free_rqd_dma;
+       }
+
+       /* A timeout is only logged; processing continues below */
+       if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+               pr_err("pblk: emeta I/O timed out\n");
+       }
+       /* The same on-stack completion is reused by the next request */
+       reinit_completion(&wait);
+
+       bio_put(bio);
+
+       if (rqd.error) {
+               if (dir == WRITE)
+                       pblk_log_write_err(pblk, &rqd);
+               else
+                       pblk_log_read_err(pblk, &rqd);
+       }
+
+       /* Advance in the emeta buffer and loop until all sectors are done */
+       emeta += rq_len;
+       left_ppas -= rq_ppas;
+       if (left_ppas)
+               goto next_rq;
+free_rqd_dma:
+       nvm_dev_dma_free(dev->parent, ppa_list, dma_ppa_list);
+       return ret;
+}
+
+/*
+ * Return the in-line sector address at which smeta starts: the first block
+ * position not set in blk_bitmap, multiplied by geo->sec_per_pl. If every
+ * position is set, -1 is returned (as a u64).
+ */
+u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line_meta *lm = &pblk->lm;
+       int bit;
+
+       /* This usually only happens on bad lines */
+       bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
+       if (bit >= lm->blk_per_line)
+               return -1;
+
+       return bit * geo->sec_per_pl;
+}
+
+/*
+ * Synchronously read (@dir == READ) or write (@dir == WRITE) the
+ * lm->smeta_sec sectors of line->smeta, starting at in-line address @paddr.
+ * On writes, the emeta lba list entries of the smeta sectors are set to
+ * ADDR_EMPTY.
+ *
+ * Returns -EINVAL for any other @dir, -ENOMEM if the ppa list cannot be
+ * allocated, or the bio mapping / submission error. A request timeout or a
+ * device error in rqd.error is only logged, and the submission result (0) is
+ * returned in that case.
+ *
+ * NOTE(review): unlike the submission-failure path, the success path does not
+ * call bio_put() here - confirm the bio is released on completion.
+ */
+static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
+                                    u64 paddr, int dir)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct bio *bio;
+       struct nvm_rq rqd;
+       __le64 *lba_list = NULL;
+       int i, ret;
+       int cmd_op, bio_op;
+       int flags;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       if (dir == WRITE) {
+               bio_op = REQ_OP_WRITE;
+               cmd_op = NVM_OP_PWRITE;
+               flags = pblk_set_progr_mode(pblk, WRITE);
+               lba_list = pblk_line_emeta_to_lbas(line->emeta);
+       } else if (dir == READ) {
+               bio_op = REQ_OP_READ;
+               cmd_op = NVM_OP_PREAD;
+               flags = pblk_set_read_mode(pblk);
+       } else
+               return -EINVAL;
+
+       memset(&rqd, 0, sizeof(struct nvm_rq));
+
+       rqd.ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+                                                       &rqd.dma_ppa_list);
+       if (!rqd.ppa_list)
+               return -ENOMEM;
+
+       bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL);
+       if (IS_ERR(bio)) {
+               ret = PTR_ERR(bio);
+               goto free_ppa_list;
+       }
+
+       bio->bi_iter.bi_sector = 0; /* internal bio */
+       bio_set_op_attrs(bio, bio_op, 0);
+
+       rqd.bio = bio;
+       rqd.opcode = cmd_op;
+       rqd.flags = flags;
+       rqd.nr_ppas = lm->smeta_sec;
+       rqd.end_io = pblk_end_io_sync;
+       rqd.private = &wait;
+
+       /* lba_list is only set (and only touched) on the write path */
+       for (i = 0; i < lm->smeta_sec; i++, paddr++) {
+               rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
+               if (dir == WRITE)
+                       lba_list[paddr] = cpu_to_le64(ADDR_EMPTY);
+       }
+
+       /*
+        * This I/O is sent by the write thread when a line is replaced. Since
+        * the write thread is the only one sending write and erase commands,
+        * there is no need to take the LUN semaphore.
+        */
+       ret = pblk_submit_io(pblk, &rqd);
+       if (ret) {
+               pr_err("pblk: smeta I/O submission failed: %d\n", ret);
+               bio_put(bio);
+               goto free_ppa_list;
+       }
+
+       if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+               pr_err("pblk: smeta I/O timed out\n");
+       }
+
+       if (rqd.error) {
+               if (dir == WRITE)
+                       pblk_log_write_err(pblk, &rqd);
+               else
+                       pblk_log_read_err(pblk, &rqd);
+       }
+
+free_ppa_list:
+       nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list);
+
+       return ret;
+}
+
+/*
+ * Read the start metadata (smeta) of @line from the address reported by
+ * pblk_line_smeta_start(). Returns the result of the smeta read I/O.
+ *
+ * NOTE(review): pblk_line_smeta_start() returns -1 when the line has no good
+ * block; that value is passed on unchecked as the start address.
+ */
+int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line)
+{
+       u64 bpaddr = pblk_line_smeta_start(pblk, line);
+
+       return pblk_line_submit_smeta_io(pblk, line, bpaddr, READ);
+}
+
+/*
+ * Read the end metadata (emeta) of @line, starting at line->emeta_ssec.
+ * Returns the result of the emeta read I/O.
+ */
+int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line)
+{
+       return pblk_line_submit_emeta_io(pblk, line, line->emeta_ssec, READ);
+}
+
+/*
+ * Fill @rqd as a single-address erase request for @ppa. The request carries
+ * no bio; completion fields (end_io, private) are left to the caller.
+ */
+static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd,
+                           struct ppa_addr ppa)
+{
+       rqd->opcode = NVM_OP_ERASE;
+       rqd->ppa_addr = ppa;
+       rqd->nr_ppas = 1;
+       rqd->flags = pblk_set_progr_mode(pblk, ERASE);
+       rqd->bio = NULL;
+}
+
+/*
+ * Erase the block at @ppa synchronously, using an on-stack request.
+ *
+ * A submission failure is logged and stored in rqd.error so that the common
+ * erase completion treats it like a failed erase. A timeout is only logged.
+ * The function always returns 0.
+ */
+static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
+{
+       struct nvm_rq rqd;
+       int ret;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       memset(&rqd, 0, sizeof(struct nvm_rq));
+
+       pblk_setup_e_rq(pblk, &rqd, ppa);
+
+       rqd.end_io = pblk_end_io_sync;
+       rqd.private = &wait;
+
+       /* The write thread schedules erases so that it minimizes disturbances
+        * with writes. Thus, there is no need to take the LUN semaphore.
+        */
+       ret = pblk_submit_io(pblk, &rqd);
+       if (ret) {
+               struct nvm_tgt_dev *dev = pblk->dev;
+               struct nvm_geo *geo = &dev->geo;
+
+               pr_err("pblk: could not sync erase line:%d,blk:%d\n",
+                                       pblk_dev_ppa_to_line(ppa),
+                                       pblk_dev_ppa_to_pos(geo, ppa));
+
+               rqd.error = ret;
+               goto out;
+       }
+
+       if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+               pr_err("pblk: sync erase timed out\n");
+       }
+
+out:
+       /* Run the common erase completion by hand on the on-stack request */
+       rqd.private = pblk;
+       __pblk_end_io_erase(pblk, &rqd);
+
+       return 0;
+}
+
+/*
+ * Synchronously erase every block of @line whose bit is still clear in
+ * erase_bitmap. For each such block the bit is set and left_eblks is
+ * decremented under line->lock before the erase is issued.
+ *
+ * Returns 0 once no clear bit is left, or -ENOMEM if pblk_blk_erase_sync()
+ * reports a failure.
+ */
+int pblk_line_erase(struct pblk *pblk, struct pblk_line *line)
+{
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct ppa_addr ppa;
+       int bit = -1;
+
+       /* Erase only good blocks, one at a time */
+       do {
+               spin_lock(&line->lock);
+               bit = find_next_zero_bit(line->erase_bitmap, lm->blk_per_line,
+                                                               bit + 1);
+               if (bit >= lm->blk_per_line) {
+                       spin_unlock(&line->lock);
+                       break;
+               }
+
+               ppa = pblk->luns[bit].bppa; /* set ch and lun */
+               ppa.g.blk = line->id;
+
+               atomic_dec(&line->left_eblks);
+               WARN_ON(test_and_set_bit(bit, line->erase_bitmap));
+               spin_unlock(&line->lock);
+
+               /* The erase itself is issued with line->lock released */
+               if (pblk_blk_erase_sync(pblk, ppa)) {
+                       pr_err("pblk: failed to erase line %d\n", line->id);
+                       return -ENOMEM;
+               }
+       } while (1);
+
+       return 0;
+}
+
+/*
+ * Initialize the in-memory start (smeta) and end (emeta) metadata of @line.
+ * @cur, when not NULL, is the line this one follows: its lun bitmap is
+ * copied, and the prev_id/next_id links of both lines are connected.
+ *
+ * For now lines are always assumed full lines. Thus, smeta former and current
+ * lun bitmaps are omitted.
+ *
+ * Returns 1 on success. Returns 0 if the line has fewer than
+ * lm->min_blk_line good blocks, in which case it is marked BAD and added to
+ * the bad list.
+ */
+static int pblk_line_set_metadata(struct pblk *pblk, struct pblk_line *line,
+                                 struct pblk_line *cur)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct line_smeta *smeta = line->smeta;
+       struct line_emeta *emeta = line->emeta;
+       int nr_blk_line;
+
+       /* After erasing the line, new bad blocks might appear and we risk
+        * having an invalid line
+        */
+       nr_blk_line = lm->blk_per_line -
+                       bitmap_weight(line->blk_bitmap, lm->blk_per_line);
+       if (nr_blk_line < lm->min_blk_line) {
+               spin_lock(&l_mg->free_lock);
+               spin_lock(&line->lock);
+               line->state = PBLK_LINESTATE_BAD;
+               spin_unlock(&line->lock);
+
+               list_add_tail(&line->list, &l_mg->bad_list);
+               spin_unlock(&l_mg->free_lock);
+
+               pr_debug("pblk: line %d is bad\n", line->id);
+
+               return 0;
+       }
+
+       /* Run-time metadata */
+       /* The lun bitmap is placed right after the smeta structure */
+       line->lun_bitmap = ((void *)(smeta)) + sizeof(struct line_smeta);
+
+       /* Mark LUNs allocated in this line (all for now) */
+       bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len);
+
+       smeta->header.identifier = cpu_to_le32(PBLK_MAGIC);
+       memcpy(smeta->header.uuid, pblk->instance_uuid, 16);
+       smeta->header.id = cpu_to_le32(line->id);
+       smeta->header.type = cpu_to_le16(line->type);
+       smeta->header.version = cpu_to_le16(1);
+
+       /* Start metadata */
+       smeta->seq_nr = cpu_to_le64(line->seq_nr);
+       smeta->window_wr_lun = cpu_to_le32(geo->nr_luns);
+
+       /* Fill metadata among lines */
+       if (cur) {
+               memcpy(line->lun_bitmap, cur->lun_bitmap, lm->lun_bitmap_len);
+               smeta->prev_id = cpu_to_le32(cur->id);
+               cur->emeta->next_id = cpu_to_le32(line->id);
+       } else {
+               smeta->prev_id = cpu_to_le32(PBLK_LINE_EMPTY);
+       }
+
+       /* All smeta must be set at this point */
+       smeta->header.crc = cpu_to_le32(pblk_calc_meta_header_crc(pblk, smeta));
+       smeta->crc = cpu_to_le32(pblk_calc_smeta_crc(pblk, smeta));
+
+       /* End metadata */
+       /* emeta shares the header with smeta; its crc is left at 0 here */
+       memcpy(&emeta->header, &smeta->header, sizeof(struct line_header));
+       emeta->seq_nr = cpu_to_le64(line->seq_nr);
+       emeta->nr_lbas = cpu_to_le64(line->sec_in_line);
+       emeta->nr_valid_lbas = cpu_to_le64(0);
+       emeta->next_id = cpu_to_le32(PBLK_LINE_EMPTY);
+       emeta->crc = cpu_to_le32(0);
+       emeta->prev_id = smeta->prev_id;
+
+       return 1;
+}
+
+/*
+ * Set up the sector bitmaps and counters of @line: sectors of bad blocks and
+ * the smeta sectors are set in map_bitmap, invalid_bitmap is derived from it
+ * with the emeta sectors added, and sec_in_line/vsc/left_ssecs/left_msecs are
+ * initialized to the number of remaining sectors. When @init is non-zero,
+ * smeta is also written to the device.
+ *
+ * For now lines are always assumed full lines. Thus, smeta former and current
+ * lun bitmaps are omitted.
+ *
+ * Returns 1 on success. Returns 0 if the resulting sector accounting does not
+ * match invalid_bitmap, in which case the line is marked BAD and added to the
+ * bad list.
+ *
+ * NOTE(review): each smeta write retry repeats the bitmap_set() and the
+ * sec_in_line decrement, and the retry loop has no visible bound - confirm
+ * this is intended.
+ */
+static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
+                            int init)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       int nr_bb = 0;
+       u64 off;
+       int bit = -1;
+
+       line->sec_in_line = lm->sec_per_line;
+
+       /* Capture bad block information on line mapping bitmaps */
+       while ((bit = find_next_bit(line->blk_bitmap, lm->blk_per_line,
+                                       bit + 1)) < lm->blk_per_line) {
+               off = bit * geo->sec_per_pl;
+               bitmap_shift_left(l_mg->bb_aux, l_mg->bb_template, off,
+                                                       lm->sec_per_line);
+               bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux,
+                                                       lm->sec_per_line);
+               line->sec_in_line -= geo->sec_per_blk;
+               /* Count bad blocks at or beyond position lm->emeta_bb */
+               if (bit >= lm->emeta_bb)
+                       nr_bb++;
+       }
+
+       /* Mark smeta metadata sectors as bad sectors */
+       bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
+       off = bit * geo->sec_per_pl;
+retry_smeta:
+       bitmap_set(line->map_bitmap, off, lm->smeta_sec);
+       line->sec_in_line -= lm->smeta_sec;
+       line->smeta_ssec = off;
+       line->cur_sec = off + lm->smeta_sec;
+
+       /* On a failed smeta write, try again sec_per_pl sectors further */
+       if (init && pblk_line_submit_smeta_io(pblk, line, off, WRITE)) {
+               pr_debug("pblk: line smeta I/O failed. Retry\n");
+               off += geo->sec_per_pl;
+               goto retry_smeta;
+       }
+
+       bitmap_copy(line->invalid_bitmap, line->map_bitmap, lm->sec_per_line);
+
+       /* Mark emeta metadata sectors as bad sectors. We need to consider bad
+        * blocks to make sure that there are enough sectors to store emeta
+        */
+       bit = lm->sec_per_line;
+       off = lm->sec_per_line - lm->emeta_sec;
+       bitmap_set(line->invalid_bitmap, off, lm->emeta_sec);
+       /* Walk backwards, claiming sec_per_pl sectors per counted bad block */
+       while (nr_bb) {
+               off -= geo->sec_per_pl;
+               if (!test_bit(off, line->invalid_bitmap)) {
+                       bitmap_set(line->invalid_bitmap, off, geo->sec_per_pl);
+                       nr_bb--;
+               }
+       }
+
+       line->sec_in_line -= lm->emeta_sec;
+       line->emeta_ssec = off;
+       line->vsc = line->left_ssecs = line->left_msecs = line->sec_in_line;
+
+       /* Sanity check: sectors removed from the line must match the bitmap */
+       if (lm->sec_per_line - line->sec_in_line !=
+               bitmap_weight(line->invalid_bitmap, lm->sec_per_line)) {
+               spin_lock(&line->lock);
+               line->state = PBLK_LINESTATE_BAD;
+               spin_unlock(&line->lock);
+
+               list_add_tail(&line->list, &l_mg->bad_list);
+               pr_err("pblk: unexpected line %d is bad\n", line->id);
+
+               return 0;
+       }
+
+       return 1;
+}
+
+/* Allocate the per-line bitmaps of @line and move it from the free to the
+ * open state.
+ *
+ * Returns 0 on success, -ENOMEM if a bitmap could not be allocated and -EINTR
+ * if the line was not in PBLK_LINESTATE_FREE. On any failure both bitmaps are
+ * returned to the pool and the line's bitmap pointers are cleared, so the
+ * caller never sees a dangling pointer.
+ */
+static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
+{
+       struct pblk_line_meta *lm = &pblk->lm;
+       int blk_in_line = atomic_read(&line->blk_in_line);
+       int ret;
+
+       line->map_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC);
+       if (!line->map_bitmap)
+               return -ENOMEM;
+       memset(line->map_bitmap, 0, lm->sec_bitmap_len);
+
+       /* invalid_bitmap is special since it is used when the line is closed.
+        * No need to zero it; it will be initialized using bb info from
+        * map_bitmap
+        */
+       line->invalid_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC);
+       if (!line->invalid_bitmap) {
+               ret = -ENOMEM;
+               goto fail_free_map;
+       }
+
+       spin_lock(&line->lock);
+       if (line->state != PBLK_LINESTATE_FREE) {
+               spin_unlock(&line->lock);
+               WARN(1, "pblk: corrupted line state\n");
+               ret = -EINTR;
+               /* Do not leak the bitmaps allocated above */
+               goto fail_free_invalid;
+       }
+       line->state = PBLK_LINESTATE_OPEN;
+
+       atomic_set(&line->left_eblks, blk_in_line);
+       atomic_set(&line->left_seblks, blk_in_line);
+       spin_unlock(&line->lock);
+
+       /* Bad blocks do not need to be erased */
+       bitmap_copy(line->erase_bitmap, line->blk_bitmap, lm->blk_per_line);
+
+       kref_init(&line->ref);
+
+       return 0;
+
+fail_free_invalid:
+       mempool_free(line->invalid_bitmap, pblk->line_meta_pool);
+       line->invalid_bitmap = NULL;
+fail_free_map:
+       mempool_free(line->map_bitmap, pblk->line_meta_pool);
+       line->map_bitmap = NULL;
+
+       return ret;
+}
+
+/* Unlink @line from the list it is on and prepare it as the active data line
+ * for recovery. smeta is not written (pblk_line_init_bb() is called with
+ * init == 0).
+ *
+ * Returns 0 on success, the pblk_line_prepare() error code if the line could
+ * not be prepared, or -EINTR if the bad block setup failed.
+ *
+ * NOTE(review): the last list_add() runs without l_mg->free_lock, and
+ * pblk_line_init_bb() has already queued the line on l_mg->bad_list when it
+ * returns 0 - confirm that this second insertion is intended.
+ */
+int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
+{
+       struct pblk_line_mgmt *mgmt = &pblk->l_mg;
+       int err;
+
+       spin_lock(&mgmt->free_lock);
+       mgmt->data_line = line;
+       list_del(&line->list);
+
+       err = pblk_line_prepare(pblk, line);
+       if (err)
+               list_add(&line->list, &mgmt->free_list);
+       spin_unlock(&mgmt->free_lock);
+
+       if (err)
+               return err;
+
+       pblk_rl_free_lines_dec(&pblk->rl, line);
+
+       if (pblk_line_init_bb(pblk, line, 0))
+               return 0;
+
+       /* Bad block setup failed: hand the line back to the free list */
+       list_add(&line->list, &mgmt->free_list);
+
+       return -EINTR;
+}
+
+/* Return the mapping bitmap of a recovered line to the pool and drop the
+ * line's references to its smeta/emeta buffers. invalid_bitmap is left
+ * untouched.
+ */
+void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line)
+{
+       void *map = line->map_bitmap;
+
+       line->map_bitmap = NULL;
+       line->emeta = NULL;
+       line->smeta = NULL;
+
+       mempool_free(map, pblk->line_meta_pool);
+}
+
+/* Take the first usable line from the free list and prepare it for writing.
+ *
+ * Lines whose blocks are all marked in blk_bitmap are moved to the bad list
+ * and skipped. Must be called with l_mg->free_lock held.
+ *
+ * Returns the prepared line, or NULL if the free list is exhausted or the
+ * line could not be prepared. In the latter case the line is put back on the
+ * free list and the free line count is restored.
+ */
+struct pblk_line *pblk_line_get(struct pblk *pblk)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line *line;
+       int bit;
+
+       lockdep_assert_held(&l_mg->free_lock);
+
+retry_get:
+       if (list_empty(&l_mg->free_list)) {
+               pr_err("pblk: no free lines\n");
+               /* Return NULL explicitly: after a retry, line still points
+                * to the bad line that was just skipped
+                */
+               return NULL;
+       }
+
+       line = list_first_entry(&l_mg->free_list, struct pblk_line, list);
+       list_del(&line->list);
+       l_mg->nr_free_lines--;
+
+       /* A line without a single good block cannot hold any data */
+       bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
+       if (unlikely(bit >= lm->blk_per_line)) {
+               spin_lock(&line->lock);
+               line->state = PBLK_LINESTATE_BAD;
+               spin_unlock(&line->lock);
+
+               list_add_tail(&line->list, &l_mg->bad_list);
+
+               pr_debug("pblk: line %d is bad\n", line->id);
+               goto retry_get;
+       }
+
+       if (pblk_line_prepare(pblk, line)) {
+               pr_err("pblk: failed to prepare line %d\n", line->id);
+               /* The line is free again, so it must be counted as such */
+               list_add(&line->list, &l_mg->free_list);
+               l_mg->nr_free_lines++;
+               return NULL;
+       }
+
+       return line;
+}
+
+/* Replace @line, whose setup failed, with a fresh line from the free list.
+ *
+ * The metadata buffers and the metadata slot of @line are handed over to the
+ * replacement, @line's bitmaps are released, and the replacement becomes
+ * l_mg->data_line before being erased.
+ *
+ * Returns the replacement line, or NULL if no line could be obtained or its
+ * erase failed; l_mg->data_line is NULL in both failure cases.
+ */
+static struct pblk_line *pblk_line_retry(struct pblk *pblk,
+                                        struct pblk_line *line)
+{
+       struct pblk_line_mgmt *mgmt = &pblk->l_mg;
+       struct pblk_line *fresh;
+
+       spin_lock(&mgmt->free_lock);
+       fresh = pblk_line_get(pblk);
+       if (fresh) {
+               /* Inherit the metadata buffers before the old line drops them */
+               fresh->smeta = line->smeta;
+               fresh->emeta = line->emeta;
+               fresh->meta_line = line->meta_line;
+
+               pblk_line_free(pblk, line);
+       }
+       mgmt->data_line = fresh;
+       spin_unlock(&mgmt->free_lock);
+
+       if (!fresh)
+               return NULL;
+
+       if (pblk_line_erase(pblk, fresh)) {
+               spin_lock(&mgmt->free_lock);
+               mgmt->data_line = NULL;
+               spin_unlock(&mgmt->free_lock);
+               return NULL;
+       }
+
+       pblk_rl_free_lines_dec(&pblk->rl, fresh);
+
+       return fresh;
+}
+
+/* Set up the first data line: take a line (and, if available, its successor)
+ * from the free list, assign sequence numbers, reserve a metadata slot, erase
+ * the line and initialize its metadata and bad block bitmaps (smeta is
+ * written, since pblk_line_init_bb() is called with init == 1).
+ *
+ * If metadata or bitmap setup fails, the line is swapped for a new one
+ * through pblk_line_retry() and setup is attempted again.
+ *
+ * Returns the ready data line, or NULL if no line could be obtained, the
+ * erase failed, or no replacement line was available.
+ */
+struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_line *line;
+       int meta_line;
+       int is_next = 0;
+
+       spin_lock(&l_mg->free_lock);
+       line = pblk_line_get(pblk);
+       if (!line) {
+               spin_unlock(&l_mg->free_lock);
+               return NULL;
+       }
+
+       line->seq_nr = l_mg->d_seq_nr++;
+       line->type = PBLK_LINETYPE_DATA;
+       l_mg->data_line = line;
+
+       /* Reserve a metadata slot and attach its smeta/emeta buffers.
+        * NOTE(review): unlike pblk_line_replace_data(), the result is not
+        * checked against PBLK_DATA_LINES - presumably a slot is always free
+        * for the first line; confirm.
+        */
+       meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+       set_bit(meta_line, &l_mg->meta_bitmap);
+       line->smeta = l_mg->sline_meta[meta_line].meta;
+       line->emeta = l_mg->eline_meta[meta_line].meta;
+       line->meta_line = meta_line;
+
+       /* Allocate next line for preparation */
+       l_mg->data_next = pblk_line_get(pblk);
+       if (l_mg->data_next) {
+               l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
+               l_mg->data_next->type = PBLK_LINETYPE_DATA;
+               is_next = 1;
+       }
+       spin_unlock(&l_mg->free_lock);
+
+       /* Rate limiter accounting is done outside of free_lock */
+       pblk_rl_free_lines_dec(&pblk->rl, line);
+       if (is_next)
+               pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
+
+       /* NOTE(review): on erase failure l_mg->data_line still points to
+        * line and its metadata slot stays reserved - confirm callers cope.
+        */
+       if (pblk_line_erase(pblk, line))
+               return NULL;
+
+retry_setup:
+       if (!pblk_line_set_metadata(pblk, line, NULL)) {
+               line = pblk_line_retry(pblk, line);
+               if (!line)
+                       return NULL;
+
+               goto retry_setup;
+       }
+
+       if (!pblk_line_init_bb(pblk, line, 1)) {
+               line = pblk_line_retry(pblk, line);
+               if (!line)
+                       return NULL;
+
+               goto retry_setup;
+       }
+
+       return line;
+}
+
+/* Promote l_mg->data_next to be the current data line.
+ *
+ * Waits until the new line is fully erased, takes another line from the free
+ * list as the next data_next, reserves a metadata slot for the new line
+ * (zeroing its smeta/emeta buffers) and initializes metadata and bad block
+ * bitmaps, linking the new line to the previous one (cur).
+ *
+ * Returns the new data line, or NULL if there was no next line, an erase
+ * failed, or setup failed and no replacement line was available.
+ *
+ * NOTE(review): data_line/data_next are read and data_line is written before
+ * free_lock is taken - presumably only the write thread calls this; confirm.
+ */
+struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
+{
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_line *cur, *new;
+       unsigned int left_seblks;
+       int meta_line;
+       int is_next = 0;
+
+       cur = l_mg->data_line;
+       new = l_mg->data_next;
+       if (!new)
+               return NULL;
+       l_mg->data_line = new;
+
+retry_line:
+       /* Poll until left_seblks reaches zero. While blocks are still left
+        * to be erased (left_eblks), erase them here; otherwise yield and
+        * check again (presumably waiting for in-flight erases to complete).
+        */
+       left_seblks = atomic_read(&new->left_seblks);
+       if (left_seblks) {
+               /* If line is not fully erased, erase it */
+               if (atomic_read(&new->left_eblks)) {
+                       if (pblk_line_erase(pblk, new))
+                               return NULL;
+               } else {
+                       io_schedule();
+               }
+               goto retry_line;
+       }
+
+       spin_lock(&l_mg->free_lock);
+       /* Allocate next line for preparation */
+       l_mg->data_next = pblk_line_get(pblk);
+       if (l_mg->data_next) {
+               l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
+               l_mg->data_next->type = PBLK_LINETYPE_DATA;
+               is_next = 1;
+       }
+
+retry_meta:
+       /* All metadata slots in use: drop the lock, yield and look again
+        * until a slot is released (see test_and_clear_bit() on
+        * meta_bitmap in pblk_line_close())
+        */
+       meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+       if (meta_line == PBLK_DATA_LINES) {
+               spin_unlock(&l_mg->free_lock);
+               io_schedule();
+               spin_lock(&l_mg->free_lock);
+               goto retry_meta;
+       }
+
+       set_bit(meta_line, &l_mg->meta_bitmap);
+       new->smeta = l_mg->sline_meta[meta_line].meta;
+       new->emeta = l_mg->eline_meta[meta_line].meta;
+       new->meta_line = meta_line;
+
+       /* The slot buffers are reused across lines; start from a clean state */
+       memset(new->smeta, 0, lm->smeta_len);
+       memset(new->emeta, 0, lm->emeta_len);
+       spin_unlock(&l_mg->free_lock);
+
+       if (is_next)
+               pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
+
+retry_setup:
+       if (!pblk_line_set_metadata(pblk, new, cur)) {
+               new = pblk_line_retry(pblk, new);
+               if (!new)
+                       return NULL;
+
+               goto retry_setup;
+       }
+
+       if (!pblk_line_init_bb(pblk, new, 1)) {
+               new = pblk_line_retry(pblk, new);
+               if (!new)
+                       return NULL;
+
+               goto retry_setup;
+       }
+
+       return new;
+}
+
+/* Release the bitmaps owned by @line and detach it from its metadata buffers.
+ * The smeta/emeta buffers themselves are not freed here; only the line's
+ * pointers to them are cleared.
+ */
+void pblk_line_free(struct pblk *pblk, struct pblk_line *line)
+{
+       if (line->map_bitmap) {
+               mempool_free(line->map_bitmap, pblk->line_meta_pool);
+               line->map_bitmap = NULL;
+       }
+
+       if (line->invalid_bitmap) {
+               mempool_free(line->invalid_bitmap, pblk->line_meta_pool);
+               line->invalid_bitmap = NULL;
+       }
+
+       line->emeta = NULL;
+       line->smeta = NULL;
+}
+
+/* kref release callback for a line: reset the line to the free state,
+ * release its bitmaps and queue it at the tail of the free list.
+ *
+ * The line is expected to be in PBLK_LINESTATE_GC; any other state triggers
+ * a warning but the line is recycled regardless.
+ */
+void pblk_line_put(struct kref *ref)
+{
+       struct pblk_line *line = container_of(ref, struct pblk_line, ref);
+       struct pblk *pblk = line->pblk;
+       struct pblk_line_mgmt *mgmt = &pblk->l_mg;
+
+       spin_lock(&line->lock);
+       WARN_ON(line->state != PBLK_LINESTATE_GC);
+       line->gc_group = PBLK_LINEGC_NONE;
+       line->state = PBLK_LINESTATE_FREE;
+       pblk_line_free(pblk, line);
+       spin_unlock(&line->lock);
+
+       spin_lock(&mgmt->free_lock);
+       mgmt->nr_free_lines++;
+       list_add_tail(&line->list, &mgmt->free_list);
+       spin_unlock(&mgmt->free_lock);
+
+       /* Let the rate limiter know that a line became available */
+       pblk_rl_free_lines_inc(&pblk->rl, line);
+}
+
+/* Submit an asynchronous erase for the block addressed by @ppa. Completion is
+ * handled by pblk_end_io_erase().
+ *
+ * Returns 0 if the request was submitted, or the pblk_submit_io() error code.
+ *
+ * NOTE(review): the request is not returned to r_rq_pool in this function
+ * when submission fails - confirm who releases it.
+ */
+int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
+{
+       struct nvm_rq *rqd;
+       int ret;
+
+       rqd = mempool_alloc(pblk->r_rq_pool, GFP_KERNEL);
+       memset(rqd, 0, pblk_r_rq_size);
+
+       pblk_setup_e_rq(pblk, rqd, ppa);
+
+       rqd->private = pblk;
+       rqd->end_io = pblk_end_io_erase;
+
+       /* The write thread schedules erases so that it minimizes disturbances
+        * with writes. Thus, there is no need to take the LUN semaphore.
+        */
+       ret = pblk_submit_io(pblk, rqd);
+       if (!ret)
+               return 0;
+
+       pr_err("pblk: could not async erase line:%d,blk:%d\n",
+                               pblk_dev_ppa_to_line(ppa),
+                               pblk_dev_ppa_to_pos(&pblk->dev->geo, ppa));
+
+       return ret;
+}
+
+/* Return the line currently used for data writes (may be NULL). No locking
+ * is done here.
+ */
+struct pblk_line *pblk_line_get_data(struct pblk *pblk)
+{
+       struct pblk_line_mgmt *mgmt = &pblk->l_mg;
+
+       return mgmt->data_line;
+}
+
+/* Return the line prepared to become the next data line (may be NULL). No
+ * locking is done here.
+ */
+struct pblk_line *pblk_line_get_data_next(struct pblk *pblk)
+{
+       struct pblk_line_mgmt *mgmt = &pblk->l_mg;
+
+       return mgmt->data_next;
+}
+
+/* Return 1 when @line has no sectors left to map (left_msecs is zero), 0
+ * otherwise.
+ */
+int pblk_line_is_full(struct pblk_line *line)
+{
+       return !line->left_msecs;
+}
+
+/* Close an open line: finalize and write its end metadata, release its
+ * metadata slot, move it to the closed state and queue it on the GC list
+ * selected by pblk_line_gc_list().
+ *
+ * map_bitmap is returned to the pool; invalid_bitmap is kept (it is read
+ * later by the garbage collector, see pblk_gc_line_ws()).
+ */
+void pblk_line_close(struct pblk *pblk, struct pblk_line *line)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct list_head *move_list;
+
+       /* The CRC must be computed once all other emeta fields are final */
+       line->emeta->crc = cpu_to_le32(pblk_calc_emeta_crc(pblk, line->emeta));
+
+       /* A failed emeta write is only logged; the line is closed anyway */
+       if (pblk_line_submit_emeta_io(pblk, line, line->cur_sec, WRITE))
+               pr_err("pblk: line %d close I/O failed\n", line->id);
+
+       /* NOTE(review): only the first sec_in_line bits are checked, while
+        * map_bitmap is sized for lm->sec_per_line sectors elsewhere in this
+        * file - confirm the intended length.
+        */
+       WARN(!bitmap_full(line->map_bitmap, line->sec_in_line),
+                               "pblk: corrupt closed line %d\n", line->id);
+
+       /* Give the metadata slot back so that another line can use it */
+       spin_lock(&l_mg->free_lock);
+       WARN_ON(!test_and_clear_bit(line->meta_line, &l_mg->meta_bitmap));
+       spin_unlock(&l_mg->free_lock);
+
+       /* Lock order: gc_lock first, then the line lock */
+       spin_lock(&l_mg->gc_lock);
+       spin_lock(&line->lock);
+       WARN_ON(line->state != PBLK_LINESTATE_OPEN);
+       line->state = PBLK_LINESTATE_CLOSED;
+       move_list = pblk_line_gc_list(pblk, line);
+
+       list_add_tail(&line->list, move_list);
+
+       mempool_free(line->map_bitmap, pblk->line_meta_pool);
+       line->map_bitmap = NULL;
+       line->smeta = NULL;
+       line->emeta = NULL;
+
+       spin_unlock(&line->lock);
+       spin_unlock(&l_mg->gc_lock);
+}
+
+/* Work item handler: close the line carried by the work context and return
+ * the context to line_ws_pool.
+ */
+void pblk_line_close_ws(struct work_struct *work)
+{
+       struct pblk_line_ws *ctx = container_of(work, struct pblk_line_ws,
+                                                                       ws);
+       struct pblk *pblk = ctx->pblk;
+
+       pblk_line_close(pblk, ctx->line);
+       mempool_free(ctx, pblk->line_ws_pool);
+}
+
+/* Work item handler: mark the block addressed by the ppa stored in the work
+ * context's priv pointer as grown bad in the device bad block table.
+ *
+ * A failure to update the table is only logged. The ppa is kfree()d and the
+ * work context is returned to line_ws_pool in all cases.
+ */
+void pblk_line_mark_bb(struct work_struct *work)
+{
+       struct pblk_line_ws *ctx = container_of(work, struct pblk_line_ws,
+                                                                       ws);
+       struct pblk *pblk = ctx->pblk;
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct ppa_addr *ppa = ctx->priv;
+
+       if (nvm_set_tgt_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD)) {
+               struct pblk_line *line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)];
+               int pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa);
+
+               pr_err("pblk: failed to mark bb, line:%d, pos:%d\n",
+                               line->id, pos);
+       }
+
+       kfree(ppa);
+       mempool_free(ctx, pblk->line_ws_pool);
+}
+
+/* Queue @work on pblk->kw_wq with a work context carrying @pblk, @line and
+ * @priv.
+ *
+ * The context is allocated with GFP_ATOMIC; if that fails the work is
+ * dropped without any report.
+ * NOTE(review): @priv is not released on that path - confirm ownership with
+ * the callers.
+ */
+void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
+                     void (*work)(struct work_struct *))
+{
+       struct pblk_line_ws *ctx;
+
+       ctx = mempool_alloc(pblk->line_ws_pool, GFP_ATOMIC);
+       if (!ctx)
+               return;
+
+       ctx->priv = priv;
+       ctx->line = line;
+       ctx->pblk = pblk;
+
+       INIT_WORK(&ctx->ws, work);
+       queue_work(pblk->kw_wq, &ctx->ws);
+}
+
+void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+                 unsigned long *lun_bitmap)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_lun *rlun;
+       int lun_id = ppa_list[0].g.ch * geo->luns_per_chnl + ppa_list[0].g.lun;
+       int ret;
+
+       /*
+        * Only send one inflight I/O per LUN. Since we map at a page
+        * granurality, all ppas in the I/O will map to the same LUN
+        */
+#ifdef CONFIG_NVM_DEBUG
+       int i;
+
+       for (i = 1; i < nr_ppas; i++)
+               WARN_ON(ppa_list[0].g.lun != ppa_list[i].g.lun ||
+                               ppa_list[0].g.ch != ppa_list[i].g.ch);
+#endif
+       /* If the LUN has been locked for this same request, do no attempt to
+        * lock it again
+        */
+       if (test_and_set_bit(lun_id, lun_bitmap))
+               return;
+
+       rlun = &pblk->luns[lun_id];
+       ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(5000));
+       if (ret) {
+               switch (ret) {
+               case -ETIME:
+                       pr_err("pblk: lun semaphore timed out\n");
+                       break;
+               case -EINTR:
+                       pr_err("pblk: lun semaphore timed out\n");
+                       break;
+               }
+       }
+}
+
+/* Release the write semaphore of every LUN set in @lun_bitmap and kfree()
+ * the bitmap. @ppa_list and @nr_ppas are not used.
+ */
+void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+               unsigned long *lun_bitmap)
+{
+       int nr_luns = pblk->dev->geo.nr_luns;
+       int lun;
+
+       for (lun = find_next_bit(lun_bitmap, nr_luns, 0); lun < nr_luns;
+                       lun = find_next_bit(lun_bitmap, nr_luns, lun + 1))
+               up(&pblk->luns[lun].wr_sem);
+
+       kfree(lun_bitmap);
+}
+
+/* Point @lba to @ppa in the L2P table.
+ *
+ * If the previous mapping is a device address (neither a cache address nor
+ * empty), it is invalidated first. Out-of-range lbas trigger a warning and
+ * are ignored.
+ */
+void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
+{
+       struct ppa_addr old_ppa;
+
+       /* logic error: lba out-of-bounds. Ignore update */
+       if (lba >= pblk->rl.nr_secs) {
+               WARN(1, "pblk: corrupted L2P map request\n");
+               return;
+       }
+
+       spin_lock(&pblk->trans_lock);
+
+       old_ppa = pblk_trans_map_get(pblk, lba);
+       if (!(pblk_addr_in_cache(old_ppa) || pblk_ppa_empty(old_ppa)))
+               pblk_map_invalidate(pblk, old_ppa);
+
+       pblk_trans_map_set(pblk, lba, ppa);
+
+       spin_unlock(&pblk->trans_lock);
+}
+
+/* Point @lba to a write buffer (cache) address in the L2P table.
+ *
+ * Thin wrapper around pblk_update_map(). With CONFIG_NVM_DEBUG set, it is a
+ * BUG if @ppa is not a cache address or if its cacheline position is out of
+ * bounds for the write buffer.
+ */
+void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
+{
+#ifdef CONFIG_NVM_DEBUG
+       /* Callers must ensure that the ppa points to a cache address */
+       BUG_ON(!pblk_addr_in_cache(ppa));
+       BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa)));
+#endif
+
+       pblk_update_map(pblk, lba, ppa);
+}
+
+/* Point @lba to the cache address @ppa on behalf of the garbage collector.
+ *
+ * The mapping is only replaced if @lba still resolves to a device address
+ * located on @gc_line; otherwise the entry has been updated in the meantime
+ * and the GC copy is stale.
+ *
+ * Returns 1 if the L2P entry was updated, 0 if it was left untouched
+ * (including the out-of-range lba case, which also triggers a warning).
+ */
+int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
+                      struct pblk_line *gc_line)
+{
+       struct ppa_addr cur_ppa;
+       int updated = 0;
+
+#ifdef CONFIG_NVM_DEBUG
+       /* Callers must ensure that the ppa points to a cache address */
+       BUG_ON(!pblk_addr_in_cache(ppa));
+       BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa)));
+#endif
+
+       /* logic error: lba out-of-bounds. Ignore update */
+       if (lba >= pblk->rl.nr_secs) {
+               WARN(1, "pblk: corrupted L2P map request\n");
+               return 0;
+       }
+
+       spin_lock(&pblk->trans_lock);
+       cur_ppa = pblk_trans_map_get(pblk, lba);
+
+       /* Prevent updated entries to be overwritten by GC */
+       if (!pblk_addr_in_cache(cur_ppa) && !pblk_ppa_empty(cur_ppa) &&
+                               pblk_tgt_ppa_to_line(cur_ppa) == gc_line->id) {
+               pblk_trans_map_set(pblk, lba, ppa);
+               updated = 1;
+       }
+       spin_unlock(&pblk->trans_lock);
+
+       return updated;
+}
+
+/* Switch the L2P entry of @lba from its cache address (@entry_line) to the
+ * device address @ppa.
+ *
+ * Padded entries (@lba == ADDR_EMPTY) are invalidated on the device and not
+ * mapped. If the L2P entry no longer matches @entry_line, the entry has been
+ * overwritten in the meantime: the L2P is left alone and @ppa is invalidated
+ * instead (unless it is empty). Out-of-range lbas trigger a warning and are
+ * ignored.
+ */
+void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
+                        struct ppa_addr entry_line)
+{
+       struct ppa_addr cur_ppa;
+
+#ifdef CONFIG_NVM_DEBUG
+       /* Callers must ensure that the ppa points to a device address */
+       BUG_ON(pblk_addr_in_cache(ppa));
+#endif
+       /* Invalidate and discard padded entries */
+       if (lba == ADDR_EMPTY) {
+#ifdef CONFIG_NVM_DEBUG
+               atomic_long_inc(&pblk->padded_wb);
+#endif
+               pblk_map_invalidate(pblk, ppa);
+               return;
+       }
+
+       /* logic error: lba out-of-bounds. Ignore update */
+       if (lba >= pblk->rl.nr_secs) {
+               WARN(1, "pblk: corrupted L2P map request\n");
+               return;
+       }
+
+       spin_lock(&pblk->trans_lock);
+       cur_ppa = pblk_trans_map_get(pblk, lba);
+
+       if (cur_ppa.ppa == entry_line.ppa) {
+#ifdef CONFIG_NVM_DEBUG
+               WARN_ON(!pblk_addr_in_cache(cur_ppa) &&
+                                               !pblk_ppa_empty(cur_ppa));
+#endif
+               pblk_trans_map_set(pblk, lba, ppa);
+       } else if (!pblk_ppa_empty(ppa)) {
+               /* Do not update L2P if the cacheline has been updated. In
+                * this case, the mapped ppa must be invalidated
+                */
+               pblk_map_invalidate(pblk, ppa);
+       }
+       spin_unlock(&pblk->trans_lock);
+}
+
+/* Look up the L2P entries of @nr_secs consecutive lbas starting at @blba and
+ * store them in @ppas. The whole range is read under trans_lock.
+ *
+ * NOTE(review): no range check is done on the lbas here, unlike in
+ * pblk_lookup_l2p_rand() - presumably callers validate them; confirm.
+ */
+void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
+                        sector_t blba, int nr_secs)
+{
+       sector_t lba = blba;
+       int sec;
+
+       spin_lock(&pblk->trans_lock);
+       for (sec = 0; sec < nr_secs; sec++, lba++)
+               ppas[sec] = pblk_trans_map_get(pblk, lba);
+       spin_unlock(&pblk->trans_lock);
+}
+
+/* Look up the L2P entries of the @nr_secs lbas in @lba_list and store them
+ * in @ppas.
+ *
+ * Entries equal to ADDR_EMPTY yield an ADDR_EMPTY ppa. Out-of-range lbas
+ * trigger a warning and the matching @ppas slot is left unwritten.
+ * NOTE(review): callers presumably initialize @ppas beforehand or never pass
+ * out-of-range lbas; confirm.
+ */
+void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
+                         u64 *lba_list, int nr_secs)
+{
+       int i;
+
+       spin_lock(&pblk->trans_lock);
+       for (i = 0; i < nr_secs; i++) {
+               sector_t lba = lba_list[i];
+
+               if (lba == ADDR_EMPTY) {
+                       ppas[i].ppa = ADDR_EMPTY;
+                       continue;
+               }
+
+               /* logic error: lba out-of-bounds. Skip lookup */
+               if (lba >= pblk->rl.nr_secs) {
+                       WARN(1, "pblk: corrupted L2P map request\n");
+                       continue;
+               }
+
+               ppas[i] = pblk_trans_map_get(pblk, lba);
+       }
+       spin_unlock(&pblk->trans_lock);
+}
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
new file mode 100644 (file)
index 0000000..eaf479c
--- /dev/null
@@ -0,0 +1,555 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *                  Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * pblk-gc.c - pblk's garbage collector
+ */
+
+#include "pblk.h"
+#include <linux/delay.h>
+
+/* Free a GC request together with the lba list and data buffer it owns */
+static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq)
+{
+       kfree(gc_rq->lba_list);
+       kfree(gc_rq->data);
+       kfree(gc_rq);
+}
+
+/* Drain the GC write list: every queued request is written to the write
+ * cache, its line reference is dropped and the request is freed.
+ *
+ * The pending requests are first moved to a private list under w_lock so
+ * that the cache writes run without holding the lock.
+ *
+ * Returns 1 if there was nothing to write, 0 otherwise.
+ */
+static int pblk_gc_write(struct pblk *pblk)
+{
+       struct pblk_gc *gc = &pblk->gc;
+       struct pblk_gc_rq *rq;
+       LIST_HEAD(pending);
+
+       spin_lock(&gc->w_lock);
+       if (list_empty(&gc->w_list)) {
+               spin_unlock(&gc->w_lock);
+               return 1;
+       }
+
+       while (!list_empty(&gc->w_list)) {
+               rq = list_first_entry(&gc->w_list, struct pblk_gc_rq, list);
+               list_move_tail(&rq->list, &pending);
+               gc->w_entries--;
+       }
+       spin_unlock(&gc->w_lock);
+
+       while (!list_empty(&pending)) {
+               rq = list_first_entry(&pending, struct pblk_gc_rq, list);
+
+               pblk_write_gc_to_cache(pblk, rq->data, rq->lba_list,
+                               rq->nr_secs, rq->secs_to_gc,
+                               rq->line, PBLK_IOTYPE_GC);
+
+               /* Drop the reference taken when the request was queued */
+               kref_put(&rq->line->ref, pblk_line_put);
+
+               list_del(&rq->list);
+               pblk_gc_free_gc_rq(rq);
+       }
+
+       return 0;
+}
+
+/* Wake up the GC writer thread */
+static void pblk_gc_writer_kick(struct pblk_gc *gc)
+{
+       struct task_struct *writer = gc->gc_writer_ts;
+
+       wake_up_process(writer);
+}
+
+/*
+ * Read the sectors listed in @lba_list from the GC victim @line and queue
+ * them for rewriting through the GC writer thread.
+ *
+ * Responsible for managing all memory related to a gc request, also in case
+ * of failure: this function takes ownership of @lba_list. When a request is
+ * queued, @lba_list and the data buffer are attached to it and released by
+ * the writer (pblk_gc_free_gc_rq()); on every other path they are freed
+ * here.
+ *
+ * A reference on @line is taken for each queued request. Queuing waits while
+ * more than 256 requests are pending on the write list.
+ *
+ * Returns NVM_IO_OK if the request was queued or there was nothing left to
+ * move, NVM_IO_ERR on allocation or read failure.
+ */
+static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_line *line,
+                                  u64 *lba_list, unsigned int nr_secs)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_gc *gc = &pblk->gc;
+       struct pblk_gc_rq *gc_rq;
+       void *data;
+       unsigned int secs_to_gc;
+       int ret = NVM_IO_OK;
+
+       /* kmalloc_array() rejects a size computation that would overflow */
+       data = kmalloc_array(nr_secs, geo->sec_size, GFP_KERNEL);
+       if (!data) {
+               ret = NVM_IO_ERR;
+               goto free_lba_list;
+       }
+
+       /* Read from GC victim block */
+       if (pblk_submit_read_gc(pblk, lba_list, data, nr_secs,
+                                                       &secs_to_gc, line)) {
+               ret = NVM_IO_ERR;
+               goto free_data;
+       }
+
+       /* Nothing valid was read: not an error, but nothing to queue */
+       if (!secs_to_gc)
+               goto free_data;
+
+       gc_rq = kmalloc(sizeof(*gc_rq), GFP_KERNEL);
+       if (!gc_rq) {
+               ret = NVM_IO_ERR;
+               goto free_data;
+       }
+
+       gc_rq->line = line;
+       gc_rq->data = data;
+       gc_rq->lba_list = lba_list;
+       gc_rq->nr_secs = nr_secs;
+       gc_rq->secs_to_gc = secs_to_gc;
+
+       /* Released by the writer once the request has been processed */
+       kref_get(&line->ref);
+
+retry:
+       spin_lock(&gc->w_lock);
+       if (gc->w_entries > 256) {
+               spin_unlock(&gc->w_lock);
+               usleep_range(256, 1024);
+               goto retry;
+       }
+       gc->w_entries++;
+       list_add_tail(&gc_rq->list, &gc->w_list);
+       spin_unlock(&gc->w_lock);
+
+       pblk_gc_writer_kick(&pblk->gc);
+
+       return NVM_IO_OK;
+
+free_data:
+       kfree(data);
+free_lba_list:
+       kfree(lba_list);
+
+       return ret;
+}
+
+/* Abort garbage collection of @line: move it from the GC state back to the
+ * closed state and queue it on the GC list chosen by pblk_line_gc_list(), if
+ * any.
+ */
+static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line)
+{
+       struct pblk_line_mgmt *mgmt = &pblk->l_mg;
+       struct list_head *target;
+
+       spin_lock(&line->lock);
+       WARN_ON(line->state != PBLK_LINESTATE_GC);
+       line->state = PBLK_LINESTATE_CLOSED;
+       target = pblk_line_gc_list(pblk, line);
+       spin_unlock(&line->lock);
+
+       if (!target)
+               return;
+
+       spin_lock(&mgmt->gc_lock);
+       list_add_tail(&line->list, target);
+       spin_unlock(&mgmt->gc_lock);
+}
+
+/* GC reader work item: walk the valid sectors of the victim line and hand
+ * them over, in batches of at most pblk->max_write_pgs lbas, to
+ * pblk_gc_move_valid_secs().
+ *
+ * The work context carries the line and, in priv, the lba list obtained from
+ * the line's emeta (see pblk_gc_line()). On exit the emeta buffer and the
+ * work context are released and the inflight GC counter is decremented. The
+ * line reference is dropped unless the line was handed back with
+ * pblk_put_line_back().
+ */
+static void pblk_gc_line_ws(struct work_struct *work)
+{
+       struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws,
+                                                                       ws);
+       struct pblk *pblk = line_ws->pblk;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_line *line = line_ws->line;
+       struct pblk_line_meta *lm = &pblk->lm;
+       __le64 *lba_list = line_ws->priv;
+       u64 *gc_list;
+       int sec_left;
+       int nr_ppas, bit;
+       int put_line = 1;
+
+       pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id);
+
+       /* Snapshot the valid sector count; nothing to move if it is zero */
+       spin_lock(&line->lock);
+       sec_left = line->vsc;
+       if (!sec_left) {
+               /* Lines are erased before being used (l_mg->data_/log_next) */
+               spin_unlock(&line->lock);
+               goto out;
+       }
+       spin_unlock(&line->lock);
+
+       if (sec_left < 0) {
+               pr_err("pblk: corrupted GC line (%d)\n", line->id);
+               put_line = 0;
+               pblk_put_line_back(pblk, line);
+               goto out;
+       }
+
+       bit = -1;
+next_rq:
+       /* One lba batch per request; ownership of gc_list passes to
+        * pblk_gc_move_valid_secs(), which frees it on every path
+        */
+       gc_list = kmalloc_array(pblk->max_write_pgs, sizeof(u64), GFP_KERNEL);
+       if (!gc_list) {
+               put_line = 0;
+               pblk_put_line_back(pblk, line);
+               goto out;
+       }
+
+       /* Collect the lbas of sectors that are still valid (bit clear in
+        * invalid_bitmap), stopping once the search moves past emeta_ssec
+        */
+       nr_ppas = 0;
+       do {
+               bit = find_next_zero_bit(line->invalid_bitmap, lm->sec_per_line,
+                                                               bit + 1);
+               if (bit > line->emeta_ssec)
+                       break;
+
+               gc_list[nr_ppas++] = le64_to_cpu(lba_list[bit]);
+       } while (nr_ppas < pblk->max_write_pgs);
+
+       if (unlikely(!nr_ppas)) {
+               kfree(gc_list);
+               goto out;
+       }
+
+       if (pblk_gc_move_valid_secs(pblk, line, gc_list, nr_ppas)) {
+               pr_err("pblk: could not GC all sectors: line:%d (%d/%d/%d)\n",
+                                               line->id, line->vsc,
+                                               nr_ppas, nr_ppas);
+               put_line = 0;
+               pblk_put_line_back(pblk, line);
+               goto out;
+       }
+
+       sec_left -= nr_ppas;
+       if (sec_left > 0)
+               goto next_rq;
+
+out:
+       /* NOTE(review): line->emeta is not cleared here; when put_line is 0
+        * the line keeps the stale pointer - confirm nothing reads it.
+        */
+       pblk_mfree(line->emeta, l_mg->emeta_alloc_type);
+       mempool_free(line_ws, pblk->line_ws_pool);
+       atomic_dec(&pblk->gc.inflight_gc);
+       if (put_line)
+               kref_put(&line->ref, pblk_line_put);
+}
+
+/* Start garbage collection of @line: read its emeta, extract the lba list
+ * and queue pblk_gc_line_ws() on the GC reader workqueue.
+ *
+ * On success the work item owns the emeta buffer and the work context.
+ * On failure both are released here and the line is handed back with
+ * pblk_put_line_back().
+ *
+ * Returns 0 if the work was queued, 1 on failure.
+ */
+static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line_ws *line_ws;
+       __le64 *lba_list;
+       int ret;
+
+       /* NOTE(review): the result is not checked - presumably this
+        * allocation cannot fail with GFP_KERNEL; confirm.
+        */
+       line_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL);
+       line->emeta = pblk_malloc(lm->emeta_len, l_mg->emeta_alloc_type,
+                                                               GFP_KERNEL);
+       if (!line->emeta) {
+               pr_err("pblk: cannot use GC emeta\n");
+               goto fail_free_ws;
+       }
+
+       ret = pblk_line_read_emeta(pblk, line);
+       if (ret) {
+               pr_err("pblk: line %d read emeta failed (%d)\n", line->id, ret);
+               goto fail_free_emeta;
+       }
+
+       /* If this read fails, it means that emeta is corrupted. For now, leave
+        * the line untouched. TODO: Implement a recovery routine that scans and
+        * moves all sectors on the line.
+        */
+       lba_list = pblk_recov_get_lba_list(pblk, line->emeta);
+       if (!lba_list) {
+               pr_err("pblk: could not interpret emeta (line %d)\n", line->id);
+               goto fail_free_emeta;
+       }
+
+       line_ws->pblk = pblk;
+       line_ws->line = line;
+       line_ws->priv = lba_list;
+
+       INIT_WORK(&line_ws->ws, pblk_gc_line_ws);
+       queue_work(pblk->gc.gc_reader_wq, &line_ws->ws);
+
+       return 0;
+
+fail_free_emeta:
+       /* NOTE(review): line->emeta is not cleared after being released */
+       pblk_mfree(line->emeta, l_mg->emeta_alloc_type);
+fail_free_ws:
+       mempool_free(line_ws, pblk->line_ws_pool);
+       pblk_put_line_back(pblk, line);
+
+       return 1;
+}
+
+/* Start garbage collection on every line queued on @gc_list.
+ *
+ * Each line is unlinked from @gc_list before pblk_gc_line() is called. Once
+ * that call has been made, the line's list node no longer belongs to
+ * @gc_list: on failure pblk_put_line_back() queues the line on a GC list,
+ * and on success the GC worker may move it to the free list. Unlinking
+ * afterwards would remove the line from one of those lists instead.
+ */
+static void pblk_gc_lines(struct pblk *pblk, struct list_head *gc_list)
+{
+       struct pblk_line *line, *tline;
+
+       list_for_each_entry_safe(line, tline, gc_list, list) {
+               list_del(&line->list);
+               if (pblk_gc_line(pblk, line))
+                       pr_err("pblk: failed to GC line %d\n", line->id);
+       }
+}
+
+/*
+ * Lines with no valid sectors will be returned to the free list immediately. If
+ * GC is activated - either because the free block count is under the determined
+ * threshold, or because it is being forced from user space - only lines with a
+ * high count of invalid sectors will be recycled.
+ *
+ * One pass of the garbage collector:
+ *  1. Every line on gc_full_list is moved to the GC state and its reference
+ *     is dropped (pblk_line_put() is the release callback).
+ *  2. While GC is needed, lines are taken from the GC group lists, starting
+ *     with group 0, and submitted through pblk_gc_lines(). Selection stops
+ *     when enough blocks are expected to be freed or when more lines are in
+ *     flight than gc_jobs_active.
+ */
+static void pblk_gc_run(struct pblk *pblk)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_gc *gc = &pblk->gc;
+       struct pblk_line *line, *tline;
+       unsigned int nr_blocks_free, nr_blocks_need;
+       struct list_head *group_list;
+       int run_gc, gc_group = 0;
+       int prev_gc = 0;
+       /* Local snapshot; kept in step with gc->inflight_gc for the lines
+        * selected below, but not refreshed when workers complete
+        */
+       int inflight_gc = atomic_read(&gc->inflight_gc);
+       LIST_HEAD(gc_list);
+
+       spin_lock(&l_mg->gc_lock);
+       list_for_each_entry_safe(line, tline, &l_mg->gc_full_list, list) {
+               spin_lock(&line->lock);
+               WARN_ON(line->state != PBLK_LINESTATE_CLOSED);
+               line->state = PBLK_LINESTATE_GC;
+               spin_unlock(&line->lock);
+
+               list_del(&line->list);
+               kref_put(&line->ref, pblk_line_put);
+       }
+       spin_unlock(&l_mg->gc_lock);
+
+       nr_blocks_need = pblk_rl_gc_thrs(&pblk->rl);
+       nr_blocks_free = pblk_rl_nr_free_blks(&pblk->rl);
+       run_gc = (nr_blocks_need > nr_blocks_free || gc->gc_forced);
+
+next_gc_group:
+       /* gc_group is advanced here, so below it already names the next group */
+       group_list = l_mg->gc_lists[gc_group++];
+       spin_lock(&l_mg->gc_lock);
+       while (run_gc && !list_empty(group_list)) {
+               /* No need to queue up more GC lines than we can handle */
+               /* NOTE(review): !run_gc cannot be true here given the loop
+                * condition above
+                */
+               if (!run_gc || inflight_gc > gc->gc_jobs_active) {
+                       spin_unlock(&l_mg->gc_lock);
+                       pblk_gc_lines(pblk, &gc_list);
+                       return;
+               }
+
+               line = list_first_entry(group_list, struct pblk_line, list);
+               /* Account the blocks of this line as if already reclaimed so
+                * that run_gc can be re-evaluated below
+                */
+               nr_blocks_free += atomic_read(&line->blk_in_line);
+
+               spin_lock(&line->lock);
+               WARN_ON(line->state != PBLK_LINESTATE_CLOSED);
+               line->state = PBLK_LINESTATE_GC;
+               list_move_tail(&line->list, &gc_list);
+               atomic_inc(&gc->inflight_gc);
+               inflight_gc++;
+               spin_unlock(&line->lock);
+
+               prev_gc = 1;
+               run_gc = (nr_blocks_need > nr_blocks_free || gc->gc_forced);
+       }
+       spin_unlock(&l_mg->gc_lock);
+
+       pblk_gc_lines(pblk, &gc_list);
+
+       /* Move on to the next group only if no line has been selected so far,
+        * the rate limiter state (rb_state) is above the next group index and
+        * there are groups left
+        */
+       if (!prev_gc && pblk->rl.rb_state > gc_group &&
+                                               gc_group < PBLK_NR_GC_LISTS)
+               goto next_gc_group;
+}
+
+
+/*
+ * Wake the main GC thread, kick the GC writer and re-arm the GC timer so that
+ * it expires again GC_TIME_MSECS from now.
+ */
+static void pblk_gc_kick(struct pblk *pblk)
+{
+       struct pblk_gc *gc = &pblk->gc;
+       unsigned long expires;
+
+       wake_up_process(gc->gc_ts);
+       pblk_gc_writer_kick(gc);
+
+       expires = jiffies + msecs_to_jiffies(GC_TIME_MSECS);
+       mod_timer(&gc->gc_timer, expires);
+}
+
+/* GC timer callback; @data carries the struct pblk pointer set at setup time */
+static void pblk_gc_timer(unsigned long data)
+{
+       pblk_gc_kick((struct pblk *)data);
+}
+
+/*
+ * Main GC kthread: run one GC pass, then sleep until the task is woken
+ * (pblk_gc_kick() wakes gc->gc_ts) or kthread_stop() is called.
+ *
+ * kthread_should_stop() is checked again after the task state has been set
+ * to TASK_INTERRUPTIBLE. Without that second check, a kthread_stop() issued
+ * between the loop condition and set_current_state() would have its wake-up
+ * overwritten, and the thread could sleep with nobody left to wake it.
+ *
+ * Always returns 0.
+ */
+static int pblk_gc_ts(void *data)
+{
+       struct pblk *pblk = data;
+
+       while (!kthread_should_stop()) {
+               pblk_gc_run(pblk);
+
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (kthread_should_stop()) {
+                       __set_current_state(TASK_RUNNING);
+                       break;
+               }
+               io_schedule();
+       }
+
+       return 0;
+}
+
+/*
+ * GC writer kthread: keep calling pblk_gc_write() for as long as it returns
+ * 0; once it returns non-zero, sleep until the task is woken or
+ * kthread_stop() is called.
+ *
+ * kthread_should_stop() is checked again after the task state has been set
+ * to TASK_INTERRUPTIBLE so that a stop request arriving between the loop
+ * condition and set_current_state() is not lost.
+ *
+ * Always returns 0.
+ */
+static int pblk_gc_writer_ts(void *data)
+{
+       struct pblk *pblk = data;
+
+       while (!kthread_should_stop()) {
+               if (!pblk_gc_write(pblk))
+                       continue;
+
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (kthread_should_stop()) {
+                       __set_current_state(TASK_RUNNING);
+                       break;
+               }
+               io_schedule();
+       }
+
+       return 0;
+}
+
+/*
+ * Flag GC as active. The caller visible in this file,
+ * __pblk_gc_should_start(), runs with gc->lock held.
+ */
+static void pblk_gc_start(struct pblk *pblk)
+{
+       struct pblk_gc *gc = &pblk->gc;
+
+       gc->gc_active = 1;
+       pr_debug("pblk: gc start\n");
+}
+
+/* Return the current value of gc_active, sampled under gc.lock */
+int pblk_gc_status(struct pblk *pblk)
+{
+       int active;
+
+       spin_lock(&pblk->gc.lock);
+       active = pblk->gc.gc_active;
+       spin_unlock(&pblk->gc.lock);
+
+       return active;
+}
+
+/*
+ * Start GC if it is enabled and not yet active. Must be called with gc->lock
+ * held (asserted through lockdep).
+ */
+static void __pblk_gc_should_start(struct pblk *pblk)
+{
+       struct pblk_gc *gc = &pblk->gc;
+
+       lockdep_assert_held(&gc->lock);
+
+       if (!gc->gc_enabled || gc->gc_active)
+               return;
+
+       pblk_gc_start(pblk);
+}
+
+/* Locked wrapper: take gc.lock and start GC if enabled and not active yet */
+void pblk_gc_should_start(struct pblk *pblk)
+{
+       spin_lock(&pblk->gc.lock);
+       __pblk_gc_should_start(pblk);
+       spin_unlock(&pblk->gc.lock);
+}
+
+/*
+ * Mark GC as inactive; gc_active is cleared under gc.lock.
+ *
+ * If flush_wq == 1 then no lock should be held by the caller, since the
+ * intended flush_workqueue() can sleep. Note that the current body does not
+ * use @flush_wq: no workqueue is flushed here.
+ */
+static void pblk_gc_stop(struct pblk *pblk, int flush_wq)
+{
+       struct pblk_gc *gc = &pblk->gc;
+
+       spin_lock(&gc->lock);
+       gc->gc_active = 0;
+       spin_unlock(&gc->lock);
+
+       pr_debug("pblk: gc stop\n");
+}
+
+/*
+ * Stop GC when it is active and not being forced. gc_active and gc_forced
+ * are read here without taking gc->lock.
+ */
+void pblk_gc_should_stop(struct pblk *pblk)
+{
+       struct pblk_gc *gc = &pblk->gc;
+
+       if (!gc->gc_active || gc->gc_forced)
+               return;
+
+       pblk_gc_stop(pblk, 0);
+}
+
+/*
+ * Report the GC state: *@gc_enabled and *@gc_active receive the matching
+ * fields of pblk->gc, both sampled under gc.lock.
+ */
+void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
+                             int *gc_active)
+{
+       spin_lock(&pblk->gc.lock);
+       *gc_enabled = pblk->gc.gc_enabled;
+       *gc_active = pblk->gc.gc_active;
+       spin_unlock(&pblk->gc.lock);
+}
+
+/*
+ * Set or clear forced GC.
+ *
+ * A non-zero @force also enables GC and passes 64 to pblk_rl_set_gc_rsc();
+ * a zero @force passes 0 and leaves gc_enabled untouched. In both cases
+ * gc_forced is set to @force and GC is started if it is enabled and not yet
+ * active. Everything happens under gc->lock.
+ */
+void pblk_gc_sysfs_force(struct pblk *pblk, int force)
+{
+       struct pblk_gc *gc = &pblk->gc;
+       int rsv = force ? 64 : 0;
+
+       spin_lock(&gc->lock);
+       if (force)
+               gc->gc_enabled = 1;
+       pblk_rl_set_gc_rsc(&pblk->rl, rsv);
+       gc->gc_forced = force;
+       __pblk_gc_should_start(pblk);
+       spin_unlock(&gc->lock);
+}
+
+/*
+ * Set up the GC machinery: state, locks, the two GC kthreads, the reader
+ * workqueue and the periodic GC timer.
+ *
+ * The timer callback (pblk_gc_timer() -> pblk_gc_kick()) wakes the GC
+ * threads, so the timer is armed last, once everything those threads use has
+ * been initialised. This also means no failure path can leave the timer
+ * pending.
+ *
+ * Returns 0 on success, the kthread_create() error if a thread cannot be
+ * created, or -ENOMEM if the workqueue cannot be allocated.
+ */
+int pblk_gc_init(struct pblk *pblk)
+{
+       struct pblk_gc *gc = &pblk->gc;
+       int ret;
+
+       spin_lock_init(&gc->lock);
+       spin_lock_init(&gc->w_lock);
+       INIT_LIST_HEAD(&gc->w_list);
+
+       gc->gc_active = 0;
+       gc->gc_forced = 0;
+       gc->gc_enabled = 1;
+       gc->gc_jobs_active = 8;
+       gc->w_entries = 0;
+       atomic_set(&gc->inflight_gc, 0);
+
+       gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts");
+       if (IS_ERR(gc->gc_ts)) {
+               pr_err("pblk: could not allocate GC main kthread\n");
+               return PTR_ERR(gc->gc_ts);
+       }
+
+       gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk,
+                                                       "pblk-gc-writer-ts");
+       if (IS_ERR(gc->gc_writer_ts)) {
+               pr_err("pblk: could not allocate GC writer kthread\n");
+               ret = PTR_ERR(gc->gc_writer_ts);
+               goto fail_free_main_kthread;
+       }
+
+       gc->gc_reader_wq = alloc_workqueue("pblk-gc-reader-wq",
+                       WQ_MEM_RECLAIM | WQ_UNBOUND, gc->gc_jobs_active);
+       if (!gc->gc_reader_wq) {
+               pr_err("pblk: could not allocate GC reader workqueue\n");
+               ret = -ENOMEM;
+               goto fail_free_writer_kthread;
+       }
+
+       /* Nothing can fail past this point: safe to start the periodic kick */
+       setup_timer(&gc->gc_timer, pblk_gc_timer, (unsigned long)pblk);
+       mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS));
+
+       return 0;
+
+fail_free_writer_kthread:
+       kthread_stop(gc->gc_writer_ts);
+fail_free_main_kthread:
+       kthread_stop(gc->gc_ts);
+
+       return ret;
+}
+
+/*
+ * Tear down the GC machinery: flush pending reader work, stop the periodic
+ * timer, mark GC inactive, then stop the main thread, destroy the reader
+ * workqueue and stop the writer thread.
+ *
+ * The timer handler re-arms the timer itself (pblk_gc_kick() calls
+ * mod_timer()), so del_timer_sync() is used: it waits for a handler running
+ * on another CPU instead of letting it re-arm the timer behind our back and
+ * fire after the threads are gone.
+ */
+void pblk_gc_exit(struct pblk *pblk)
+{
+       struct pblk_gc *gc = &pblk->gc;
+
+       flush_workqueue(gc->gc_reader_wq);
+
+       del_timer_sync(&gc->gc_timer);
+       pblk_gc_stop(pblk, 1);
+
+       if (gc->gc_ts)
+               kthread_stop(gc->gc_ts);
+
+       if (pblk->gc.gc_reader_wq)
+               destroy_workqueue(pblk->gc.gc_reader_wq);
+
+       if (gc->gc_writer_ts)
+               kthread_stop(gc->gc_writer_ts);
+}
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
new file mode 100644 (file)
index 0000000..ae8cd6d
--- /dev/null
@@ -0,0 +1,962 @@
+/*
+ * Copyright (C) 2015 IT University of Copenhagen (rrpc.c)
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *                  Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Implementation of a physical block-device target for Open-channel SSDs.
+ *
+ * pblk-init.c - pblk's initialization.
+ */
+
+#include "pblk.h"
+
+static struct kmem_cache *pblk_blk_ws_cache, *pblk_rec_cache, *pblk_r_rq_cache,
+                                       *pblk_w_rq_cache, *pblk_line_meta_cache;
+static DECLARE_RWSEM(pblk_lock);
+
+/*
+ * Dispatch a read or write bio.
+ *
+ * Reads are always passed through blk_queue_split() and then submitted with
+ * pblk_submit_read(). Writes are split only when their sector count reaches
+ * the value returned by pblk_rl_sysfs_rate_show(), and are then handed to
+ * pblk_write_to_cache() as user I/O.
+ *
+ * Returns the status of pblk_submit_read() or pblk_write_to_cache().
+ *
+ * NOTE(review): blk_queue_split() is given &bio and may replace this
+ * function's local @bio pointer; the caller keeps using the pointer it passed
+ * in. Confirm that the caller completes the intended bio after a split.
+ */
+static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
+                         struct bio *bio)
+{
+       int ret;
+
+       /* Read requests must be <= 256kb due to NVMe's 64 bit completion bitmap
+        * constraint. Writes can be of arbitrary size.
+        */
+       if (bio_data_dir(bio) == READ) {
+               blk_queue_split(q, &bio, q->bio_split);
+               ret = pblk_submit_read(pblk, bio);
+               /* A completed read on a cloned bio drops a bio reference */
+               if (ret == NVM_IO_DONE && bio_flagged(bio, BIO_CLONED))
+                       bio_put(bio);
+
+               return ret;
+       }
+
+       /* Prevent deadlock in the case of a modest LUN configuration and large
+        * user I/Os. Unless stalled, the rate limiter leaves at least 256KB
+        * available for user I/O.
+        */
+       if (unlikely(pblk_get_secs(bio) >= pblk_rl_sysfs_rate_show(&pblk->rl)))
+               blk_queue_split(q, &bio, q->bio_split);
+
+       return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER);
+}
+
+/*
+ * make_request entry point of the pblk target.
+ *
+ * Discards are handled by pblk_discard(); a discard without REQ_PREFLUSH is
+ * completed right away. Everything else goes through pblk_rw_io(), and the
+ * bio is failed on NVM_IO_ERR or completed on NVM_IO_DONE. Any other status
+ * leaves the bio untouched here.
+ *
+ * Always returns BLK_QC_T_NONE.
+ */
+static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio)
+{
+       struct pblk *pblk = q->queuedata;
+       int io_ret;
+
+       if (bio_op(bio) == REQ_OP_DISCARD) {
+               pblk_discard(pblk, bio);
+               if (!(bio->bi_opf & REQ_PREFLUSH)) {
+                       bio_endio(bio);
+                       return BLK_QC_T_NONE;
+               }
+       }
+
+       io_ret = pblk_rw_io(q, pblk, bio);
+       if (io_ret == NVM_IO_ERR)
+               bio_io_error(bio);
+       else if (io_ret == NVM_IO_DONE)
+               bio_endio(bio);
+
+       return BLK_QC_T_NONE;
+}
+
+/* Release the translation map allocated with vmalloc() in pblk_l2p_init() */
+static void pblk_l2p_free(struct pblk *pblk)
+{
+       vfree(pblk->trans_map);
+}
+
+/*
+ * Allocate the translation map and set every entry to the empty address.
+ *
+ * Entries take 4 bytes when the address format needs fewer than 32 bits
+ * (pblk->ppaf_bitsize < 32) and 8 bytes otherwise; there is one entry per
+ * sector in pblk->rl.nr_secs.
+ *
+ * Returns 0 on success or -ENOMEM if the map cannot be allocated.
+ */
+static int pblk_l2p_init(struct pblk *pblk)
+{
+       int entry_size = (pblk->ppaf_bitsize < 32) ? 4 : 8;
+       struct ppa_addr empty;
+       sector_t lba;
+
+       pblk->trans_map = vmalloc(entry_size * pblk->rl.nr_secs);
+       if (!pblk->trans_map)
+               return -ENOMEM;
+
+       pblk_ppa_set_empty(&empty);
+
+       for (lba = 0; lba < pblk->rl.nr_secs; lba++)
+               pblk_trans_map_set(pblk, lba, empty);
+
+       return 0;
+}
+
+/*
+ * Release the write buffer: log an error if the tear-down check fails, then
+ * free the buffer data and the entries array (allocated with vzalloc() in
+ * pblk_rwb_init()).
+ */
+static void pblk_rwb_free(struct pblk *pblk)
+{
+       if (pblk_rb_tear_down_check(&pblk->rwb))
+               pr_err("pblk: write buffer error on tear down\n");
+
+       pblk_rb_data_free(&pblk->rwb);
+       vfree(pblk_rb_entries_ref(&pblk->rwb));
+}
+
+/*
+ * Allocate the write buffer entries and initialise the ring buffer.
+ *
+ * The number of entries comes from pblk_rb_calculate_size(); the buffer and
+ * segment sizes are passed to pblk_rb_init() as powers of two.
+ *
+ * Returns 0 on success, -ENOMEM if the entries cannot be allocated, or the
+ * error returned by pblk_rb_init().
+ */
+static int pblk_rwb_init(struct pblk *pblk)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_rb_entry *entries;
+       unsigned long nr_entries;
+       unsigned int power_size, power_seg_sz;
+       int ret;
+
+       nr_entries = pblk_rb_calculate_size(pblk->pgs_in_buffer);
+
+       entries = vzalloc(nr_entries * sizeof(struct pblk_rb_entry));
+       if (!entries)
+               return -ENOMEM;
+
+       power_size = get_count_order(nr_entries);
+       power_seg_sz = get_count_order(geo->sec_size);
+
+       ret = pblk_rb_init(&pblk->rwb, entries, power_size, power_seg_sz);
+       if (ret) {
+               /* Nobody else releases the entries when initialisation fails.
+                * NOTE(review): assumes pblk_rb_init() does not free @entries
+                * itself on error - confirm.
+                */
+               vfree(entries);
+       }
+
+       return ret;
+}
+
+/* Minimum pages needed within a lun */
+#define PAGE_POOL_SIZE 16
+#define ADDR_POOL_SIZE 64
+
+/*
+ * Build pblk's address format (bit offsets and masks) from the device one.
+ *
+ * Works on a local copy of geo->ppaf in which the channel and LUN field
+ * widths are recomputed from the geometry. Fields are laid out from the least
+ * significant bit upwards as: sector, plane, channel, LUN, page, block.
+ * pblk->ppaf_bitsize ends up as the total number of bits used.
+ *
+ * Returns 0 on success or -EINVAL if the number of channels or of LUNs per
+ * channel is not a power of two.
+ */
+static int pblk_set_ppaf(struct pblk *pblk)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct nvm_addr_format ppaf = geo->ppaf;
+       int power_len;
+
+       /* Re-calculate channel and lun format to adapt to configuration */
+       power_len = get_count_order(geo->nr_chnls);
+       if (1 << power_len != geo->nr_chnls) {
+               pr_err("pblk: supports only power-of-two channel config.\n");
+               return -EINVAL;
+       }
+       ppaf.ch_len = power_len;
+
+       power_len = get_count_order(geo->luns_per_chnl);
+       if (1 << power_len != geo->luns_per_chnl) {
+               pr_err("pblk: supports only power-of-two LUN config.\n");
+               return -EINVAL;
+       }
+       ppaf.lun_len = power_len;
+
+       /* Each field starts right above the previous one */
+       pblk->ppaf.sec_offset = 0;
+       pblk->ppaf.pln_offset = ppaf.sect_len;
+       pblk->ppaf.ch_offset = pblk->ppaf.pln_offset + ppaf.pln_len;
+       pblk->ppaf.lun_offset = pblk->ppaf.ch_offset + ppaf.ch_len;
+       pblk->ppaf.pg_offset = pblk->ppaf.lun_offset + ppaf.lun_len;
+       pblk->ppaf.blk_offset = pblk->ppaf.pg_offset + ppaf.pg_len;
+       /* Mask = <field width> low bits set, shifted to the field's offset */
+       pblk->ppaf.sec_mask = (1ULL << ppaf.sect_len) - 1;
+       pblk->ppaf.pln_mask = ((1ULL << ppaf.pln_len) - 1) <<
+                                                       pblk->ppaf.pln_offset;
+       pblk->ppaf.ch_mask = ((1ULL << ppaf.ch_len) - 1) <<
+                                                       pblk->ppaf.ch_offset;
+       pblk->ppaf.lun_mask = ((1ULL << ppaf.lun_len) - 1) <<
+                                                       pblk->ppaf.lun_offset;
+       pblk->ppaf.pg_mask = ((1ULL << ppaf.pg_len) - 1) <<
+                                                       pblk->ppaf.pg_offset;
+       pblk->ppaf.blk_mask = ((1ULL << ppaf.blk_len) - 1) <<
+                                                       pblk->ppaf.blk_offset;
+
+       pblk->ppaf_bitsize = pblk->ppaf.blk_offset + ppaf.blk_len;
+
+       return 0;
+}
+
+/*
+ * Create the file-scope slab caches under pblk_lock.
+ *
+ * The caches are created in order; if one cannot be created, those created
+ * earlier in this call are destroyed again through a single unwind chain
+ * before the lock is dropped.
+ *
+ * Returns 0 on success or -ENOMEM if any cache cannot be created.
+ */
+static int pblk_init_global_caches(struct pblk *pblk)
+{
+       char cache_name[PBLK_CACHE_NAME_LEN];
+
+       down_write(&pblk_lock);
+
+       pblk_blk_ws_cache = kmem_cache_create("pblk_blk_ws",
+                               sizeof(struct pblk_line_ws), 0, 0, NULL);
+       if (!pblk_blk_ws_cache)
+               goto fail_unlock;
+
+       pblk_rec_cache = kmem_cache_create("pblk_rec",
+                               sizeof(struct pblk_rec_ctx), 0, 0, NULL);
+       if (!pblk_rec_cache)
+               goto fail_free_blk_ws_cache;
+
+       pblk_r_rq_cache = kmem_cache_create("pblk_r_rq", pblk_r_rq_size,
+                               0, 0, NULL);
+       if (!pblk_r_rq_cache)
+               goto fail_free_rec_cache;
+
+       pblk_w_rq_cache = kmem_cache_create("pblk_w_rq", pblk_w_rq_size,
+                               0, 0, NULL);
+       if (!pblk_w_rq_cache)
+               goto fail_free_r_rq_cache;
+
+       /* The line metadata cache is named after the target's disk */
+       snprintf(cache_name, sizeof(cache_name), "pblk_line_m_%s",
+                                                       pblk->disk->disk_name);
+       pblk_line_meta_cache = kmem_cache_create(cache_name,
+                               pblk->lm.sec_bitmap_len, 0, 0, NULL);
+       if (!pblk_line_meta_cache)
+               goto fail_free_w_rq_cache;
+
+       up_write(&pblk_lock);
+
+       return 0;
+
+fail_free_w_rq_cache:
+       kmem_cache_destroy(pblk_w_rq_cache);
+fail_free_r_rq_cache:
+       kmem_cache_destroy(pblk_r_rq_cache);
+fail_free_rec_cache:
+       kmem_cache_destroy(pblk_rec_cache);
+fail_free_blk_ws_cache:
+       kmem_cache_destroy(pblk_blk_ws_cache);
+fail_unlock:
+       up_write(&pblk_lock);
+
+       return -ENOMEM;
+}
+
+/*
+ * Initialise pblk's core resources: write-size limits, the global slab
+ * caches, the mempools built on top of them, the auxiliary workqueue, the
+ * address format and the write buffer.
+ *
+ * On failure everything set up by this call is released again, including the
+ * global slab caches, which no other error path releases.
+ *
+ * Returns 0 on success, -EINVAL for an unsupported geometry, -ENOMEM on
+ * allocation failure, or the error reported by pblk_set_ppaf() or
+ * pblk_rwb_init().
+ */
+static int pblk_core_init(struct pblk *pblk)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       int max_write_ppas;
+       u32 mod;
+       int ret;
+
+       pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE);
+       max_write_ppas = pblk->min_write_pgs * geo->nr_luns;
+       pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ?
+                               max_write_ppas : nvm_max_phys_sects(dev);
+       pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg *
+                                               geo->nr_planes * geo->nr_luns;
+
+       if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
+               pr_err("pblk: cannot support device max_phys_sect\n");
+               return -EINVAL;
+       }
+
+       /* A block must hold a whole number of minimum-sized writes */
+       div_u64_rem(geo->sec_per_blk, pblk->min_write_pgs, &mod);
+       if (mod) {
+               pr_err("pblk: bad configuration of sectors/pages\n");
+               return -EINVAL;
+       }
+
+       if (pblk_init_global_caches(pblk))
+               return -ENOMEM;
+
+       /* Every failure up to the workqueue is an allocation failure */
+       ret = -ENOMEM;
+
+       pblk->page_pool = mempool_create_page_pool(PAGE_POOL_SIZE, 0);
+       if (!pblk->page_pool)
+               goto free_global_caches;
+
+       pblk->line_ws_pool = mempool_create_slab_pool(geo->nr_luns,
+                                                       pblk_blk_ws_cache);
+       if (!pblk->line_ws_pool)
+               goto free_page_pool;
+
+       pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache);
+       if (!pblk->rec_pool)
+               goto free_blk_ws_pool;
+
+       pblk->r_rq_pool = mempool_create_slab_pool(64, pblk_r_rq_cache);
+       if (!pblk->r_rq_pool)
+               goto free_rec_pool;
+
+       pblk->w_rq_pool = mempool_create_slab_pool(64, pblk_w_rq_cache);
+       if (!pblk->w_rq_pool)
+               goto free_r_rq_pool;
+
+       pblk->line_meta_pool =
+                       mempool_create_slab_pool(16, pblk_line_meta_cache);
+       if (!pblk->line_meta_pool)
+               goto free_w_rq_pool;
+
+       pblk->kw_wq = alloc_workqueue("pblk-aux-wq",
+                                       WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+       if (!pblk->kw_wq)
+               goto free_line_meta_pool;
+
+       ret = pblk_set_ppaf(pblk);
+       if (ret)
+               goto free_kw_wq;
+
+       ret = pblk_rwb_init(pblk);
+       if (ret)
+               goto free_kw_wq;
+
+       INIT_LIST_HEAD(&pblk->compl_list);
+       return 0;
+
+free_kw_wq:
+       destroy_workqueue(pblk->kw_wq);
+free_line_meta_pool:
+       mempool_destroy(pblk->line_meta_pool);
+free_w_rq_pool:
+       mempool_destroy(pblk->w_rq_pool);
+free_r_rq_pool:
+       mempool_destroy(pblk->r_rq_pool);
+free_rec_pool:
+       mempool_destroy(pblk->rec_pool);
+free_blk_ws_pool:
+       mempool_destroy(pblk->line_ws_pool);
+free_page_pool:
+       mempool_destroy(pblk->page_pool);
+free_global_caches:
+       /* Same lock as used when the caches were created */
+       down_write(&pblk_lock);
+       kmem_cache_destroy(pblk_line_meta_cache);
+       kmem_cache_destroy(pblk_w_rq_cache);
+       kmem_cache_destroy(pblk_r_rq_cache);
+       kmem_cache_destroy(pblk_rec_cache);
+       kmem_cache_destroy(pblk_blk_ws_cache);
+       up_write(&pblk_lock);
+       return ret;
+}
+
+/*
+ * Release what pblk_core_init() set up: the auxiliary workqueue (if any), the
+ * mempools and finally the file-scope slab caches backing them.
+ */
+static void pblk_core_free(struct pblk *pblk)
+{
+       if (pblk->kw_wq)
+               destroy_workqueue(pblk->kw_wq);
+
+       mempool_destroy(pblk->page_pool);
+       mempool_destroy(pblk->line_ws_pool);
+       mempool_destroy(pblk->rec_pool);
+       mempool_destroy(pblk->r_rq_pool);
+       mempool_destroy(pblk->w_rq_pool);
+       mempool_destroy(pblk->line_meta_pool);
+
+       /* Caches go last: the slab mempools above were created from them */
+       kmem_cache_destroy(pblk_blk_ws_cache);
+       kmem_cache_destroy(pblk_rec_cache);
+       kmem_cache_destroy(pblk_r_rq_cache);
+       kmem_cache_destroy(pblk_w_rq_cache);
+       kmem_cache_destroy(pblk_line_meta_cache);
+}
+
+/* Free the LUN array allocated in pblk_luns_init() */
+static void pblk_luns_free(struct pblk *pblk)
+{
+       kfree(pblk->luns);
+}
+
+/*
+ * For every line: call pblk_line_free() and release its block and erase
+ * bitmaps. The whole walk runs under l_mg->free_lock.
+ */
+static void pblk_lines_free(struct pblk *pblk)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       int id;
+
+       spin_lock(&l_mg->free_lock);
+       for (id = 0; id < l_mg->nr_lines; id++) {
+               struct pblk_line *cur = &pblk->lines[id];
+
+               pblk_line_free(pblk, cur);
+               kfree(cur->blk_bitmap);
+               kfree(cur->erase_bitmap);
+       }
+       spin_unlock(&l_mg->free_lock);
+}
+
+/*
+ * Release the line management metadata: the bad-block template and auxiliary
+ * bitmaps, the smeta/emeta buffers of all PBLK_DATA_LINES slots (through
+ * pblk_mfree() with the matching allocation type) and the lines array.
+ */
+static void pblk_line_meta_free(struct pblk *pblk)
+{
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       int slot = 0;
+
+       kfree(l_mg->bb_template);
+       kfree(l_mg->bb_aux);
+
+       while (slot < PBLK_DATA_LINES) {
+               pblk_mfree(l_mg->sline_meta[slot].meta, l_mg->smeta_alloc_type);
+               pblk_mfree(l_mg->eline_meta[slot].meta, l_mg->emeta_alloc_type);
+               slot++;
+       }
+
+       kfree(pblk->lines);
+}
+
+/*
+ * Read and fold the bad block table of the LUN described by @rlun.
+ *
+ * A table of blks_per_lun * plane_mode bytes is allocated, filled through
+ * nvm_get_tgt_bb_tbl() and folded with nvm_bb_tbl_fold(). On success the
+ * table is stored in rlun->bb_list, which then owns it.
+ *
+ * Returns 0 on success, -ENOMEM if the table cannot be allocated, or the
+ * error from reading or folding it (the table is freed in that case).
+ */
+static int pblk_bb_discovery(struct nvm_tgt_dev *dev, struct pblk_lun *rlun)
+{
+       struct nvm_geo *geo = &dev->geo;
+       struct ppa_addr lun_addr;
+       int nr_entries = geo->blks_per_lun * geo->plane_mode;
+       u8 *bb_tbl;
+       int err;
+
+       bb_tbl = kmalloc(nr_entries, GFP_KERNEL);
+       if (!bb_tbl)
+               return -ENOMEM;
+
+       /* Address the LUN only: channel and LUN set, everything else zero */
+       lun_addr.ppa = 0;
+       lun_addr.g.ch = rlun->bppa.g.ch;
+       lun_addr.g.lun = rlun->bppa.g.lun;
+
+       err = nvm_get_tgt_bb_tbl(dev, lun_addr, bb_tbl);
+       if (!err) {
+               nr_entries = nvm_bb_tbl_fold(dev->parent, bb_tbl, nr_entries);
+               if (nr_entries >= 0) {
+                       rlun->bb_list = bb_tbl;
+                       return 0;
+               }
+               err = nr_entries;
+       }
+
+       kfree(bb_tbl);
+       return err;
+}
+
+/*
+ * Allocate the block and erase bitmaps of @line and mark its bad blocks.
+ *
+ * Block i of the line lives on pblk->luns[i]; it is flagged in blk_bitmap
+ * whenever that LUN's bad block list entry for line->id is anything other
+ * than NVM_BLK_T_FREE.
+ *
+ * Returns the number of blocks flagged, or -ENOMEM if a bitmap cannot be
+ * allocated (nothing stays allocated in that case).
+ */
+static int pblk_bb_line(struct pblk *pblk, struct pblk_line *line)
+{
+       struct pblk_line_meta *lm = &pblk->lm;
+       int nr_bad = 0;
+       int blk;
+
+       line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
+       if (!line->blk_bitmap)
+               return -ENOMEM;
+
+       line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
+       if (!line->erase_bitmap) {
+               kfree(line->blk_bitmap);
+               return -ENOMEM;
+       }
+
+       for (blk = 0; blk < lm->blk_per_line; blk++) {
+               if (pblk->luns[blk].bb_list[line->id] != NVM_BLK_T_FREE) {
+                       set_bit(blk, line->blk_bitmap);
+                       nr_bad++;
+               }
+       }
+
+       return nr_bad;
+}
+
+/*
+ * Allocate pblk's LUN array and discover the bad blocks of every LUN.
+ *
+ * LUN i of pblk maps to entry (i / nr_chnls) + (i % nr_chnls) * luns_per_chnl
+ * of @luns, so that consecutive pblk LUNs sit on different channels.
+ *
+ * On failure the bad block lists read so far and the LUN array itself are
+ * released, and pblk->luns is reset to NULL so that a later kfree() of it is
+ * harmless.
+ *
+ * Returns 0 on success, -EINVAL for an unsupported LUN configuration,
+ * -ENOMEM if the array cannot be allocated, or the error returned by
+ * pblk_bb_discovery().
+ */
+static int pblk_luns_init(struct pblk *pblk, struct ppa_addr *luns)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_lun *rlun;
+       int i, ret;
+
+       /* TODO: Implement unbalanced LUN support */
+       if (geo->luns_per_chnl < 0) {
+               pr_err("pblk: unbalanced LUN config.\n");
+               return -EINVAL;
+       }
+
+       pblk->luns = kcalloc(geo->nr_luns, sizeof(struct pblk_lun), GFP_KERNEL);
+       if (!pblk->luns)
+               return -ENOMEM;
+
+       for (i = 0; i < geo->nr_luns; i++) {
+               /* Stripe across channels */
+               int ch = i % geo->nr_chnls;
+               int lun_raw = i / geo->nr_chnls;
+               int lunid = lun_raw + ch * geo->luns_per_chnl;
+
+               rlun = &pblk->luns[i];
+               rlun->bppa = luns[lunid];
+
+               sema_init(&rlun->wr_sem, 1);
+
+               ret = pblk_bb_discovery(dev, rlun);
+               if (ret)
+                       goto fail_free_luns;
+       }
+
+       return 0;
+
+fail_free_luns:
+       /* LUN i itself owns no list: pblk_bb_discovery() frees on failure */
+       while (--i >= 0)
+               kfree(pblk->luns[i].bb_list);
+       kfree(pblk->luns);
+       pblk->luns = NULL;
+
+       return ret;
+}
+
+/*
+ * Pick the line that will receive user data.
+ *
+ * Unless NVM_TARGET_FACTORY is set in @flags, the L2P table is recovered
+ * first; if recovery yields a line, nothing else is done. Otherwise the
+ * first data line is requested from the line manager.
+ *
+ * Returns 0 on success or -EFAULT if recovery fails or no line is available.
+ */
+static int pblk_lines_configure(struct pblk *pblk, int flags)
+{
+       struct pblk_line *line = NULL;
+
+       if (!(flags & NVM_TARGET_FACTORY)) {
+               line = pblk_recov_l2p(pblk);
+               if (IS_ERR(line)) {
+                       pr_err("pblk: could not recover l2p table\n");
+                       return -EFAULT;
+               }
+       }
+
+       if (line)
+               return 0;
+
+       /* Configure next line for user data */
+       line = pblk_line_get_first_data(pblk);
+       if (!line) {
+               pr_err("pblk: line list corrupted\n");
+               return -EFAULT;
+       }
+
+       return 0;
+}
+
+/*
+ * See comment over struct line_emeta definition.
+ *
+ * The length is the sum of: the line_emeta header, one u64 per sector of the
+ * line that is not an emeta sector, one u32 per line known to the line
+ * manager, and the per-line block bitmap.
+ */
+static unsigned int calc_emeta_len(struct pblk *pblk, struct pblk_line_meta *lm)
+{
+       size_t len = sizeof(struct line_emeta);
+
+       len += (lm->sec_per_line - lm->emeta_sec) * sizeof(u64);
+       len += pblk->l_mg.nr_lines * sizeof(u32);
+       len += lm->blk_bitmap_len;
+
+       return len;
+}
+
+/*
+ * Derive the capacity figures from the number of usable blocks.
+ *
+ * Over-provisioning is fixed at 20%: the user-visible capacity is computed
+ * from (100 - 20)% of @nr_free_blks, while the rate limiter is told about all
+ * of them.
+ */
+static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
+{
+       struct nvm_geo *geo = &pblk->dev->geo;
+       sector_t provisioned = nr_free_blks;
+
+       pblk->over_pct = 20;
+
+       provisioned *= (100 - pblk->over_pct);
+       sector_div(provisioned, 100);
+
+       /* Internally pblk manages all free blocks, but all calculations based
+        * on user capacity consider only provisioned blocks
+        */
+       pblk->rl.total_blocks = nr_free_blks;
+       pblk->rl.nr_secs = nr_free_blks * geo->sec_per_blk;
+       pblk->capacity = provisioned * geo->sec_per_blk;
+       atomic_set(&pblk->rl.free_blocks, nr_free_blks);
+}
+
+/*
+ * Initialise line metadata and the line array.
+ *
+ * Computes the per-line geometry and the smeta/emeta sizes, allocates the
+ * metadata buffers and bad-block bitmaps, then builds every line, sorting it
+ * onto the free or the bad list depending on how many good blocks it has.
+ * The per-LUN bad block lists are released before returning, on success and
+ * on failure alike.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, the error from
+ * pblk_bb_line(), or -EINVAL if a line reports an impossible bad block count.
+ * On failure everything allocated by this call is freed.
+ */
+static int pblk_lines_init(struct pblk *pblk)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line *line;
+       unsigned int smeta_len, emeta_len;
+       long nr_bad_blks, nr_meta_blks, nr_free_blks;
+       /* Number of entries that need freeing on the error path */
+       int nr_smeta = 0, nr_emeta = 0, nr_lines_done = 0;
+       int bb_distance;
+       int i;
+       int ret;
+
+       lm->sec_per_line = geo->sec_per_blk * geo->nr_luns;
+       lm->blk_per_line = geo->nr_luns;
+       lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long);
+       lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
+       lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long);
+       lm->high_thrs = lm->sec_per_line / 2;
+       lm->mid_thrs = lm->sec_per_line / 4;
+
+       /* calc_emeta_len() reads l_mg->nr_lines, so it has to be known before
+        * the emeta size is computed below.
+        */
+       l_mg->nr_lines = geo->blks_per_lun;
+
+       /* Calculate necessary pages for smeta. See comment over struct
+        * line_smeta definition
+        */
+       smeta_len = sizeof(struct line_smeta) +
+                               PBLK_LINE_NR_LUN_BITMAP * lm->lun_bitmap_len;
+       i = 0;
+       do {
+               i++;
+               lm->smeta_sec = i * geo->sec_per_pl;
+               lm->smeta_len = lm->smeta_sec * geo->sec_size;
+       } while (smeta_len > lm->smeta_len);
+
+       /* Calculate necessary pages for emeta. See comment over struct
+        * line_emeta definition
+        */
+       i = 0;
+       do {
+               i++;
+               lm->emeta_sec = i * geo->sec_per_pl;
+               lm->emeta_len = lm->emeta_sec * geo->sec_size;
+               emeta_len = calc_emeta_len(pblk, lm);
+       } while (emeta_len > lm->emeta_len);
+       lm->emeta_bb = geo->nr_luns - i;
+
+       nr_meta_blks = (lm->smeta_sec + lm->emeta_sec +
+                               (geo->sec_per_blk / 2)) / geo->sec_per_blk;
+       lm->min_blk_line = nr_meta_blks + 1;
+
+       l_mg->log_line = l_mg->data_line = NULL;
+       l_mg->l_seq_nr = l_mg->d_seq_nr = 0;
+       l_mg->nr_free_lines = 0;
+       bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+
+       /* smeta is always small enough to fit on a kmalloc memory allocation,
+        * emeta depends on the number of LUNs allocated to the pblk instance
+        */
+       l_mg->smeta_alloc_type = PBLK_KMALLOC_META;
+       for (nr_smeta = 0; nr_smeta < PBLK_DATA_LINES; nr_smeta++) {
+               l_mg->sline_meta[nr_smeta].meta =
+                                       kmalloc(lm->smeta_len, GFP_KERNEL);
+               if (!l_mg->sline_meta[nr_smeta].meta) {
+                       ret = -ENOMEM;
+                       goto fail_free_smeta;
+               }
+       }
+
+       if (lm->emeta_len > KMALLOC_MAX_CACHE_SIZE)
+               l_mg->emeta_alloc_type = PBLK_VMALLOC_META;
+       else
+               l_mg->emeta_alloc_type = PBLK_KMALLOC_META;
+
+       for (nr_emeta = 0; nr_emeta < PBLK_DATA_LINES; nr_emeta++) {
+               if (l_mg->emeta_alloc_type == PBLK_VMALLOC_META)
+                       l_mg->eline_meta[nr_emeta].meta =
+                                       vmalloc(lm->emeta_len);
+               else
+                       l_mg->eline_meta[nr_emeta].meta =
+                                       kmalloc(lm->emeta_len, GFP_KERNEL);
+               if (!l_mg->eline_meta[nr_emeta].meta) {
+                       ret = -ENOMEM;
+                       goto fail_free_emeta;
+               }
+       }
+
+       l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
+       if (!l_mg->bb_template) {
+               ret = -ENOMEM;
+               goto fail_free_emeta;
+       }
+
+       l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
+       if (!l_mg->bb_aux) {
+               ret = -ENOMEM;
+               goto fail_free_bb_template;
+       }
+
+       bb_distance = (geo->nr_luns) * geo->sec_per_pl;
+       for (i = 0; i < lm->sec_per_line; i += bb_distance)
+               bitmap_set(l_mg->bb_template, i, geo->sec_per_pl);
+
+       INIT_LIST_HEAD(&l_mg->free_list);
+       INIT_LIST_HEAD(&l_mg->corrupt_list);
+       INIT_LIST_HEAD(&l_mg->bad_list);
+       INIT_LIST_HEAD(&l_mg->gc_full_list);
+       INIT_LIST_HEAD(&l_mg->gc_high_list);
+       INIT_LIST_HEAD(&l_mg->gc_mid_list);
+       INIT_LIST_HEAD(&l_mg->gc_low_list);
+       INIT_LIST_HEAD(&l_mg->gc_empty_list);
+
+       l_mg->gc_lists[0] = &l_mg->gc_high_list;
+       l_mg->gc_lists[1] = &l_mg->gc_mid_list;
+       l_mg->gc_lists[2] = &l_mg->gc_low_list;
+
+       spin_lock_init(&l_mg->free_lock);
+       spin_lock_init(&l_mg->gc_lock);
+
+       pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line),
+                                                               GFP_KERNEL);
+       if (!pblk->lines) {
+               ret = -ENOMEM;
+               goto fail_free_bb_aux;
+       }
+
+       nr_free_blks = 0;
+       for (i = 0; i < l_mg->nr_lines; i++) {
+               int blk_in_line;
+
+               line = &pblk->lines[i];
+
+               line->pblk = pblk;
+               line->id = i;
+               line->type = PBLK_LINETYPE_FREE;
+               line->state = PBLK_LINESTATE_FREE;
+               line->gc_group = PBLK_LINEGC_NONE;
+               spin_lock_init(&line->lock);
+
+               nr_bad_blks = pblk_bb_line(pblk, line);
+               if (nr_bad_blks < 0) {
+                       /* pblk_bb_line() keeps nothing allocated on error */
+                       ret = nr_bad_blks;
+                       nr_lines_done = i;
+                       goto fail_free_lines;
+               }
+               if (nr_bad_blks > lm->blk_per_line) {
+                       /* Bitmaps of this line were allocated: free them too */
+                       ret = -EINVAL;
+                       nr_lines_done = i + 1;
+                       goto fail_free_lines;
+               }
+
+               blk_in_line = lm->blk_per_line - nr_bad_blks;
+               if (blk_in_line < lm->min_blk_line) {
+                       line->state = PBLK_LINESTATE_BAD;
+                       list_add_tail(&line->list, &l_mg->bad_list);
+                       continue;
+               }
+
+               nr_free_blks += blk_in_line;
+               atomic_set(&line->blk_in_line, blk_in_line);
+
+               l_mg->nr_free_lines++;
+               list_add_tail(&line->list, &l_mg->free_list);
+       }
+
+       pblk_set_provision(pblk, nr_free_blks);
+
+       sema_init(&pblk->erase_sem, 1);
+
+       /* Cleanup per-LUN bad block lists - managed within lines on run-time */
+       for (i = 0; i < geo->nr_luns; i++)
+               kfree(pblk->luns[i].bb_list);
+
+       return 0;
+
+fail_free_lines:
+       while (--nr_lines_done >= 0) {
+               kfree(pblk->lines[nr_lines_done].blk_bitmap);
+               kfree(pblk->lines[nr_lines_done].erase_bitmap);
+       }
+       kfree(pblk->lines);
+fail_free_bb_aux:
+       kfree(l_mg->bb_aux);
+fail_free_bb_template:
+       kfree(l_mg->bb_template);
+fail_free_emeta:
+       while (--nr_emeta >= 0)
+               pblk_mfree(l_mg->eline_meta[nr_emeta].meta,
+                                               l_mg->emeta_alloc_type);
+fail_free_smeta:
+       while (--nr_smeta >= 0)
+               pblk_mfree(l_mg->sline_meta[nr_smeta].meta,
+                                               l_mg->smeta_alloc_type);
+
+       for (i = 0; i < geo->nr_luns; i++)
+               kfree(pblk->luns[i].bb_list);
+
+       return ret;
+}
+
+/*
+ * Create the writer kthread and start the write timer.
+ *
+ * The timer is armed only after the thread has been created, so a failed
+ * kthread_create() never leaves a pending timer behind. On failure
+ * pblk->writer_ts is reset to NULL, which pblk_writer_stop() treats as
+ * "no thread".
+ *
+ * Returns 0 on success or the kthread_create() error.
+ */
+static int pblk_writer_init(struct pblk *pblk)
+{
+       pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
+       if (IS_ERR(pblk->writer_ts)) {
+               int err = PTR_ERR(pblk->writer_ts);
+
+               pr_err("pblk: could not allocate writer kthread\n");
+               pblk->writer_ts = NULL;
+               return err;
+       }
+
+       setup_timer(&pblk->wtimer, pblk_write_timer_fn, (unsigned long)pblk);
+       mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
+
+       return 0;
+}
+
+/*
+ * pblk_writer_stop - stop the write timer and the write thread
+ * @pblk: pblk instance being torn down
+ *
+ * Callers free the pblk instance shortly after this returns, so the timer is
+ * removed synchronously: del_timer_sync() also waits for a callback that is
+ * currently running on another CPU. The timer is stopped before the thread;
+ * the callback is not visible from here, but this ordering guarantees it can
+ * never run against a writer thread that has already exited.
+ *
+ * May sleep; must not be called from the timer callback itself.
+ */
+static void pblk_writer_stop(struct pblk *pblk)
+{
+       del_timer_sync(&pblk->wtimer);
+
+       if (pblk->writer_ts)
+               kthread_stop(pblk->writer_ts);
+}
+
+/*
+ * Release everything owned by a pblk instance by calling the per-subsystem
+ * free helpers (LUNs, lines, line metadata, core, L2P) and finally freeing
+ * the instance itself. @pblk must not be used after this returns.
+ */
+static void pblk_free(struct pblk *pblk)
+{
+       pblk_luns_free(pblk);
+       pblk_lines_free(pblk);
+       pblk_line_meta_free(pblk);
+       pblk_core_free(pblk);
+       pblk_l2p_free(pblk);
+
+       kfree(pblk);
+}
+
+/*
+ * Quiesce the write path before the instance is freed: flush and stop the
+ * writer, point the L2P table at device addresses for everything still
+ * referenced through the write buffer, then release the write buffer and the
+ * rate limiter.
+ */
+static void pblk_tear_down(struct pblk *pblk)
+{
+       pblk_flush_writer(pblk);
+       pblk_writer_stop(pblk);
+       pblk_rb_sync_l2p(&pblk->rwb);
+       /* NOTE(review): defined elsewhere; presumably pads open lines */
+       pblk_recov_pad(pblk);
+       pblk_rwb_free(pblk);
+       pblk_rl_free(&pblk->rl);
+
+       pr_debug("pblk: consistent tear down\n");
+}
+
+/*
+ * Target exit callback (tt_pblk.exit).
+ * @private: the struct pblk returned by pblk_init()
+ *
+ * Stops GC, tears down the write path and frees the instance, all with
+ * pblk_lock held for writing.
+ */
+static void pblk_exit(void *private)
+{
+       struct pblk *pblk = private;
+
+       down_write(&pblk_lock);
+       pblk_gc_exit(pblk);
+       pblk_tear_down(pblk);
+       pblk_free(pblk);
+       up_write(&pblk_lock);
+}
+
+/*
+ * Target capacity callback (tt_pblk.capacity).
+ * @private: the struct pblk returned by pblk_init()
+ *
+ * Returns pblk->capacity scaled by NR_PHY_IN_LOG.
+ */
+static sector_t pblk_capacity(void *private)
+{
+       struct pblk *instance = private;
+       sector_t nr_sects = instance->capacity * NR_PHY_IN_LOG;
+
+       return nr_sects;
+}
+
+static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
+                      int flags)
+{
+       struct nvm_geo *geo = &dev->geo;
+       struct request_queue *bqueue = dev->q;
+       struct request_queue *tqueue = tdisk->queue;
+       struct pblk *pblk;
+       int ret;
+
+       if (dev->identity.dom & NVM_RSP_L2P) {
+               pr_err("pblk: device-side L2P table not supported. (%x)\n",
+                                                       dev->identity.dom);
+               return ERR_PTR(-EINVAL);
+       }
+
+       pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
+       if (!pblk)
+               return ERR_PTR(-ENOMEM);
+
+       pblk->dev = dev;
+       pblk->disk = tdisk;
+
+       spin_lock_init(&pblk->trans_lock);
+       spin_lock_init(&pblk->lock);
+
+       if (flags & NVM_TARGET_FACTORY)
+               pblk_setup_uuid(pblk);
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_set(&pblk->inflight_writes, 0);
+       atomic_long_set(&pblk->padded_writes, 0);
+       atomic_long_set(&pblk->padded_wb, 0);
+       atomic_long_set(&pblk->nr_flush, 0);
+       atomic_long_set(&pblk->req_writes, 0);
+       atomic_long_set(&pblk->sub_writes, 0);
+       atomic_long_set(&pblk->sync_writes, 0);
+       atomic_long_set(&pblk->compl_writes, 0);
+       atomic_long_set(&pblk->inflight_reads, 0);
+       atomic_long_set(&pblk->sync_reads, 0);
+       atomic_long_set(&pblk->recov_writes, 0);
+       atomic_long_set(&pblk->recov_writes, 0);
+       atomic_long_set(&pblk->recov_gc_writes, 0);
+#endif
+
+       atomic_long_set(&pblk->read_failed, 0);
+       atomic_long_set(&pblk->read_empty, 0);
+       atomic_long_set(&pblk->read_high_ecc, 0);
+       atomic_long_set(&pblk->read_failed_gc, 0);
+       atomic_long_set(&pblk->write_failed, 0);
+       atomic_long_set(&pblk->erase_failed, 0);
+
+       ret = pblk_luns_init(pblk, dev->luns);
+       if (ret) {
+               pr_err("pblk: could not initialize luns\n");
+               goto fail;
+       }
+
+       ret = pblk_lines_init(pblk);
+       if (ret) {
+               pr_err("pblk: could not initialize lines\n");
+               goto fail_free_luns;
+       }
+
+       ret = pblk_core_init(pblk);
+       if (ret) {
+               pr_err("pblk: could not initialize core\n");
+               goto fail_free_line_meta;
+       }
+
+       ret = pblk_l2p_init(pblk);
+       if (ret) {
+               pr_err("pblk: could not initialize maps\n");
+               goto fail_free_core;
+       }
+
+       ret = pblk_lines_configure(pblk, flags);
+       if (ret) {
+               pr_err("pblk: could not configure lines\n");
+               goto fail_free_l2p;
+       }
+
+       ret = pblk_writer_init(pblk);
+       if (ret) {
+               pr_err("pblk: could not initialize write thread\n");
+               goto fail_free_lines;
+       }
+
+       ret = pblk_gc_init(pblk);
+       if (ret) {
+               pr_err("pblk: could not initialize gc\n");
+               goto fail_stop_writer;
+       }
+
+       /* inherit the size from the underlying device */
+       blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue));
+       blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue));
+
+       blk_queue_write_cache(tqueue, true, false);
+
+       tqueue->limits.discard_granularity = geo->pgs_per_blk * geo->pfpg_size;
+       tqueue->limits.discard_alignment = 0;
+       blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
+       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, tqueue);
+
+       pr_info("pblk init: luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
+                       geo->nr_luns, pblk->l_mg.nr_lines,
+                       (unsigned long long)pblk->rl.nr_secs,
+                       pblk->rwb.nr_entries);
+
+       wake_up_process(pblk->writer_ts);
+       return pblk;
+
+fail_stop_writer:
+       pblk_writer_stop(pblk);
+fail_free_lines:
+       pblk_lines_free(pblk);
+fail_free_l2p:
+       pblk_l2p_free(pblk);
+fail_free_core:
+       pblk_core_free(pblk);
+fail_free_line_meta:
+       pblk_line_meta_free(pblk);
+fail_free_luns:
+       pblk_luns_free(pblk);
+fail:
+       kfree(pblk);
+       return ERR_PTR(ret);
+}
+
+/*
+ * physical block device target
+ *
+ * Target type descriptor handed to the lightnvm core by pblk_module_init();
+ * it wires the pblk entry points (request submission, capacity, init/exit
+ * and sysfs hooks) under the target name "pblk".
+ */
+static struct nvm_tgt_type tt_pblk = {
+       .name           = "pblk",
+       .version        = {1, 0, 0},
+
+       .make_rq        = pblk_make_rq,
+       .capacity       = pblk_capacity,
+
+       .init           = pblk_init,
+       .exit           = pblk_exit,
+
+       .sysfs_init     = pblk_sysfs_init,
+       .sysfs_exit     = pblk_sysfs_exit,
+};
+
+/* Module entry point: register the pblk target type with the lightnvm core */
+static int __init pblk_module_init(void)
+{
+       return nvm_register_tgt_type(&tt_pblk);
+}
+
+/*
+ * Module exit point: unregister the pblk target type. Marked __exit, like
+ * its __init counterpart, so the code can be discarded when it can never be
+ * called (built-in configuration).
+ */
+static void __exit pblk_module_exit(void)
+{
+       nvm_unregister_tgt_type(&tt_pblk);
+}
+
+module_init(pblk_module_init);
+module_exit(pblk_module_exit);
+MODULE_AUTHOR("Javier Gonzalez <javier@cnexlabs.com>");
+MODULE_AUTHOR("Matias Bjorling <matias@cnexlabs.com>");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Physical Block-Device for Open-Channel SSDs");
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
new file mode 100644 (file)
index 0000000..17c1695
--- /dev/null
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *                  Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * pblk-map.c - pblk's lba-ppa mapping strategy
+ *
+ */
+
+#include "pblk.h"
+
+/*
+ * Map one minimum write unit (pblk->min_write_pgs sectors) of the current
+ * data line.
+ * @sentry:     write buffer position of the first sector of this unit
+ * @ppa_list:   output, device addresses for the sectors of this unit
+ * @lun_bitmap: passed through to pblk_down_rq()
+ * @meta_list:  output, per-sector metadata (lba) for this unit
+ * @valid_secs: number of leading sectors that carry data; the remaining
+ *              ones are marked ADDR_EMPTY and invalidated as padding
+ *
+ * If the line becomes full it is replaced; when no replacement line is
+ * available the function returns without calling pblk_down_rq().
+ */
+static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
+                              struct ppa_addr *ppa_list,
+                              unsigned long *lun_bitmap,
+                              struct pblk_sec_meta *meta_list,
+                              unsigned int valid_secs)
+{
+       struct pblk_line *line = pblk_line_get_data(pblk);
+       struct line_emeta *emeta = line->emeta;
+       struct pblk_w_ctx *w_ctx;
+       __le64 *lba_list = pblk_line_emeta_to_lbas(emeta);
+       u64 paddr;
+       int nr_secs = pblk->min_write_pgs;
+       int i;
+
+       /* Reserve nr_secs consecutive sectors on the line */
+       paddr = pblk_alloc_page(pblk, line, nr_secs);
+
+       for (i = 0; i < nr_secs; i++, paddr++) {
+               /* ppa to be sent to the device */
+               ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
+
+               /* Write context for target bio completion on write buffer. Note
+                * that the write buffer is protected by the sync backpointer,
+                * and a single writer thread have access to each specific entry
+                * at a time. Thus, it is safe to modify the context for the
+                * entry we are setting up for submission without taking any
+                * lock or memory barrier.
+                */
+               if (i < valid_secs) {
+                       /* One line reference per mapped sector; dropped in
+                        * __pblk_rb_update_l2p()
+                        */
+                       kref_get(&line->ref);
+                       w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i);
+                       w_ctx->ppa = ppa_list[i];
+                       meta_list[i].lba = cpu_to_le64(w_ctx->lba);
+                       lba_list[paddr] = cpu_to_le64(w_ctx->lba);
+                       le64_add_cpu(&line->emeta->nr_valid_lbas, 1);
+               } else {
+                       /* Padding sector: no lba associated with it */
+                       meta_list[i].lba = cpu_to_le64(ADDR_EMPTY);
+                       lba_list[paddr] = cpu_to_le64(ADDR_EMPTY);
+                       pblk_map_pad_invalidate(pblk, line, paddr);
+               }
+       }
+
+       if (pblk_line_is_full(line)) {
+               line = pblk_line_replace_data(pblk);
+               if (!line)
+                       return;
+       }
+
+       pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap);
+}
+
+/*
+ * Map a write request to physical addresses, one minimum write unit
+ * (pblk->min_write_pgs sectors) at a time, starting at request offset @off.
+ * @sentry:     write buffer position of the first entry of the request
+ * @lun_bitmap: passed through to pblk_map_page_data()
+ * @valid_secs: number of sectors in the request that carry data
+ */
+void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
+                unsigned long *lun_bitmap, unsigned int valid_secs,
+                unsigned int off)
+{
+       struct pblk_sec_meta *meta = rqd->meta_list;
+       int unit = pblk->min_write_pgs;
+       int pos = off;
+
+       while (pos < rqd->nr_ppas) {
+               unsigned int nr_valid;
+
+               /* A unit crossing valid_secs only holds the remainder */
+               if (pos + unit > valid_secs)
+                       nr_valid = valid_secs % unit;
+               else
+                       nr_valid = unit;
+
+               pblk_map_page_data(pblk, sentry + pos, &rqd->ppa_list[pos],
+                                       lun_bitmap, &meta[pos], nr_valid);
+               pos += unit;
+       }
+}
+
+/*
+ * Map a write request and, at the same time, pick one block of the next data
+ * line to be erased.
+ * @erase_ppa: output; set to the address of the block to erase, with g.blk
+ *             set to the id of the next data line
+ *
+ * only if erase_ppa is set, acquire erase semaphore
+ *
+ * For each mapped unit, the LUN it landed on is checked against the erase
+ * bitmap of the next line. The first LUN not yet scheduled for erase is
+ * selected if the erase semaphore can be taken without blocking; the rest of
+ * the request is then mapped with pblk_map_rq().
+ */
+void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
+                      unsigned int sentry, unsigned long *lun_bitmap,
+                      unsigned int valid_secs, struct ppa_addr *erase_ppa)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line *e_line = pblk_line_get_data_next(pblk);
+       struct pblk_sec_meta *meta_list = rqd->meta_list;
+       unsigned int map_secs;
+       int min = pblk->min_write_pgs;
+       int i, erase_lun;
+
+       for (i = 0; i < rqd->nr_ppas; i += min) {
+               map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
+               pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
+                                       lun_bitmap, &meta_list[i], map_secs);
+
+               /* Index into the erase bitmap for this unit's channel/LUN */
+               erase_lun = rqd->ppa_list[i].g.lun * geo->nr_chnls +
+                                                       rqd->ppa_list[i].g.ch;
+
+               if (!test_bit(erase_lun, e_line->erase_bitmap)) {
+                       /* Semaphore busy: keep mapping, try the next unit */
+                       if (down_trylock(&pblk->erase_sem))
+                               continue;
+
+                       set_bit(erase_lun, e_line->erase_bitmap);
+                       atomic_dec(&e_line->left_eblks);
+                       *erase_ppa = rqd->ppa_list[i];
+                       erase_ppa->g.blk = e_line->id;
+
+                       /* Avoid evaluating e_line->left_eblks */
+                       return pblk_map_rq(pblk, rqd, sentry, lun_bitmap,
+                                                       valid_secs, i + min);
+               }
+       }
+
+       /* Erase blocks that are bad in this line but might not be in next */
+       if (unlikely(ppa_empty(*erase_ppa))) {
+               struct pblk_line_meta *lm = &pblk->lm;
+
+               i = find_first_zero_bit(e_line->erase_bitmap, lm->blk_per_line);
+               if (i == lm->blk_per_line)
+                       return;
+
+               /* NOTE(review): unlike the loop above, this path does not
+                * take pblk->erase_sem - confirm that is intended
+                */
+               set_bit(i, e_line->erase_bitmap);
+               atomic_dec(&e_line->left_eblks);
+               *erase_ppa = pblk->luns[i].bppa; /* set ch and lun */
+               erase_ppa->g.blk = e_line->id;
+       }
+}
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
new file mode 100644 (file)
index 0000000..045384d
--- /dev/null
@@ -0,0 +1,852 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *
+ * Based upon the circular ringbuffer.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * pblk-rb.c - pblk's write buffer
+ */
+
+#include <linux/circ_buf.h>
+
+#include "pblk.h"
+
+static DECLARE_RWSEM(pblk_rb_lock);
+
+/*
+ * Free every page set on rb->pages together with its bookkeeping structure.
+ * Takes pblk_rb_lock for writing, so it must not be called with that lock
+ * already held.
+ */
+void pblk_rb_data_free(struct pblk_rb *rb)
+{
+       struct pblk_rb_pages *set, *next;
+
+       down_write(&pblk_rb_lock);
+       list_for_each_entry_safe(set, next, &rb->pages, list) {
+               unsigned long vaddr = (unsigned long)page_address(set->pages);
+
+               list_del(&set->list);
+               free_pages(vaddr, set->order);
+               kfree(set);
+       }
+       up_write(&pblk_rb_lock);
+}
+
+/*
+ * Initialize ring buffer. The data and metadata buffers must be previously
+ * allocated and their size must be a power of two
+ * (Documentation/circular-buffers.txt)
+ */
+int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
+                unsigned int power_size, unsigned int power_seg_sz)
+{
+       struct pblk *pblk = container_of(rb, struct pblk, rwb);
+       unsigned int init_entry = 0;
+       unsigned int alloc_order = power_size;
+       unsigned int max_order = MAX_ORDER - 1;
+       unsigned int order, iter;
+
+       down_write(&pblk_rb_lock);
+       rb->entries = rb_entry_base;
+       rb->seg_size = (1 << power_seg_sz);
+       rb->nr_entries = (1 << power_size);
+       rb->mem = rb->subm = rb->sync = rb->l2p_update = 0;
+       rb->sync_point = EMPTY_ENTRY;
+
+       spin_lock_init(&rb->w_lock);
+       spin_lock_init(&rb->s_lock);
+
+       INIT_LIST_HEAD(&rb->pages);
+
+       if (alloc_order >= max_order) {
+               order = max_order;
+               iter = (1 << (alloc_order - max_order));
+       } else {
+               order = alloc_order;
+               iter = 1;
+       }
+
+       do {
+               struct pblk_rb_entry *entry;
+               struct pblk_rb_pages *page_set;
+               void *kaddr;
+               unsigned long set_size;
+               int i;
+
+               page_set = kmalloc(sizeof(struct pblk_rb_pages), GFP_KERNEL);
+               if (!page_set) {
+                       up_write(&pblk_rb_lock);
+                       return -ENOMEM;
+               }
+
+               page_set->order = order;
+               page_set->pages = alloc_pages(GFP_KERNEL, order);
+               if (!page_set->pages) {
+                       kfree(page_set);
+                       pblk_rb_data_free(rb);
+                       up_write(&pblk_rb_lock);
+                       return -ENOMEM;
+               }
+               kaddr = page_address(page_set->pages);
+
+               entry = &rb->entries[init_entry];
+               entry->data = kaddr;
+               entry->cacheline = pblk_cacheline_to_addr(init_entry++);
+               entry->w_ctx.flags = PBLK_WRITABLE_ENTRY;
+
+               set_size = (1 << order);
+               for (i = 1; i < set_size; i++) {
+                       entry = &rb->entries[init_entry];
+                       entry->cacheline = pblk_cacheline_to_addr(init_entry++);
+                       entry->data = kaddr + (i * rb->seg_size);
+                       entry->w_ctx.flags = PBLK_WRITABLE_ENTRY;
+                       bio_list_init(&entry->w_ctx.bios);
+               }
+
+               list_add_tail(&page_set->list, &rb->pages);
+               iter--;
+       } while (iter > 0);
+       up_write(&pblk_rb_lock);
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_set(&rb->inflight_sync_point, 0);
+#endif
+
+       /*
+        * Initialize rate-limiter, which controls access to the write buffer
+        * but user and GC I/O
+        */
+       pblk_rl_init(&pblk->rl, rb->nr_entries);
+
+       return 0;
+}
+
+/*
+ * pblk_rb_calculate_size -- calculate the size of the write buffer
+ *
+ * Returns @nr_entries rounded up to a power of two, with a lower bound of
+ * 128 entries.
+ */
+unsigned int pblk_rb_calculate_size(unsigned int nr_entries)
+{
+       int order = get_count_order(nr_entries);
+
+       /* Never go below 2^7 = 128 entries */
+       if (order < 7)
+               order = 7;
+
+       return 1 << order;
+}
+
+/* Return the entry array that was handed to pblk_rb_init() */
+void *pblk_rb_entries_ref(struct pblk_rb *rb)
+{
+       return rb->entries;
+}
+
+/*
+ * Make a write context reusable: wait for the entry to be flagged as
+ * submitted, then mark it writable again and clear its physical address.
+ */
+static void clean_wctx(struct pblk_w_ctx *w_ctx)
+{
+       /* Busy-wait until the entry has been marked as submitted */
+       while (!(READ_ONCE(w_ctx->flags) & PBLK_SUBMITTED_ENTRY))
+               ;
+
+       /* Release flags on context. Protect from writes and reads */
+       smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY);
+       pblk_ppa_set_empty(&w_ctx->ppa);
+}
+
+/*
+ * Thin wrappers around the <linux/circ_buf.h> helpers; @size must be a power
+ * of two. The @rb argument of pblk_rb_ring_space() is not used.
+ */
+#define pblk_rb_ring_count(head, tail, size) CIRC_CNT(head, tail, size)
+#define pblk_rb_ring_space(rb, head, tail, size) \
+                                       (CIRC_SPACE(head, tail, size))
+
+/*
+ * Free space in the buffer, measured from the write pointer (mem) up to the
+ * back pointer signaling entries already synchronized to the media (sync).
+ */
+static unsigned int pblk_rb_space(struct pblk_rb *rb)
+{
+       unsigned int wr_ptr = READ_ONCE(rb->mem);
+       unsigned int sync_ptr = READ_ONCE(rb->sync);
+
+       return pblk_rb_ring_space(rb, wr_ptr, sync_ptr, rb->nr_entries);
+}
+
+/*
+ * Number of entries available to be sent to the media, i.e. the entries
+ * between the submission pointer (subm) and the write pointer (mem).
+ */
+unsigned int pblk_rb_read_count(struct pblk_rb *rb)
+{
+       unsigned int wr_ptr = READ_ONCE(rb->mem);
+       unsigned int subm_ptr = READ_ONCE(rb->subm);
+
+       return pblk_rb_ring_count(wr_ptr, subm_ptr, rb->nr_entries);
+}
+
+/*
+ * Commit a read of @nr_entries entries by advancing the submission pointer,
+ * wrapping at the buffer size.
+ *
+ * Returns the submission pointer as it was before the commit.
+ */
+unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries)
+{
+       unsigned int prev = READ_ONCE(rb->subm);
+       unsigned int mask = rb->nr_entries - 1;
+
+       smp_store_release(&rb->subm, (prev + nr_entries) & mask);
+
+       return prev;
+}
+
+/*
+ * Move @to_update entries, starting at *@l2p_upd, out of the cache mapping.
+ * @l2p_upd:   in/out ring position; advanced (with wrap) per entry handled
+ * @to_update: number of entries to process
+ *
+ * For each entry: update the L2P mapping to the entry's device address, drop
+ * the line reference taken when the sector was mapped, and recycle the write
+ * context. Always returns 0.
+ */
+static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd,
+                               unsigned int to_update)
+{
+       struct pblk *pblk = container_of(rb, struct pblk, rwb);
+       struct pblk_line *line;
+       struct pblk_rb_entry *entry;
+       struct pblk_w_ctx *w_ctx;
+       unsigned int i;
+
+       for (i = 0; i < to_update; i++) {
+               entry = &rb->entries[*l2p_upd];
+               w_ctx = &entry->w_ctx;
+
+               pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa,
+                                                       entry->cacheline);
+
+               /* Look up the line before clean_wctx() clears w_ctx->ppa */
+               line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)];
+               kref_put(&line->ref, pblk_line_put);
+               clean_wctx(w_ctx);
+               *l2p_upd = (*l2p_upd + 1) & (rb->nr_entries - 1);
+       }
+
+       return 0;
+}
+
+/*
+ * Advance the l2p_update pointer when an incoming write of @nr_entries would
+ * otherwise overwrite entries whose L2P mapping still points to the write
+ * buffer. Once an entry is passed, lookups resolve to its physical address
+ * instead of to the cacheline.
+ *
+ * Must be called with rb->w_lock held. @sync is currently unused.
+ * Returns the result of __pblk_rb_update_l2p(), or 0 if nothing was needed.
+ */
+static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries,
+                             unsigned int mem, unsigned int sync)
+{
+       unsigned int free_slots;
+
+       lockdep_assert_held(&rb->w_lock);
+
+       /* Update l2p only as buffer entries are being overwritten */
+       free_slots = pblk_rb_ring_space(rb, mem, rb->l2p_update,
+                                                       rb->nr_entries);
+       if (free_slots > nr_entries)
+               return 0;
+
+       /* l2p_update used exclusively under rb->w_lock */
+       return __pblk_rb_update_l2p(rb, &rb->l2p_update,
+                                               nr_entries - free_slots);
+}
+
+/*
+ * Update the l2p entry for all sectors stored on the write buffer. This means
+ * that all future lookups to the l2p table will point to a device address, not
+ * to the cacheline in the write buffer.
+ *
+ * Processes every entry between l2p_update and the current sync pointer,
+ * under rb->w_lock.
+ */
+void pblk_rb_sync_l2p(struct pblk_rb *rb)
+{
+       unsigned int sync;
+       unsigned int to_update;
+
+       spin_lock(&rb->w_lock);
+
+       /* Protect from reads and writes */
+       sync = smp_load_acquire(&rb->sync);
+
+       to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries);
+       __pblk_rb_update_l2p(rb, &rb->l2p_update, to_update);
+
+       spin_unlock(&rb->w_lock);
+}
+
+/*
+ * Fill a single ring buffer entry: copy rb->seg_size bytes from @data into
+ * the entry's data segment and record the lba and ppa of @w_ctx.
+ *
+ * No space check is done here and the entry flags are left untouched; the
+ * callers publish the entry by updating its flags afterwards.
+ */
+static void __pblk_rb_write_entry(struct pblk_rb *rb, void *data,
+                                 struct pblk_w_ctx w_ctx,
+                                 struct pblk_rb_entry *entry)
+{
+       memcpy(entry->data, data, rb->seg_size);
+
+       entry->w_ctx.lba = w_ctx.lba;
+       entry->w_ctx.ppa = w_ctx.ppa;
+}
+
+/*
+ * Write one user I/O sector into the entry at @ring_pos.
+ * @data:     source buffer, rb->seg_size bytes
+ * @w_ctx:    write context (lba, ppa, flags) for the sector
+ * @ring_pos: index of a free (writable) entry
+ *
+ * After copying the data, the L2P mapping for the lba is pointed at the
+ * entry's cacheline and the entry is published with PBLK_WRITTEN_DATA set.
+ */
+void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
+                             struct pblk_w_ctx w_ctx, unsigned int ring_pos)
+{
+       struct pblk *pblk = container_of(rb, struct pblk, rwb);
+       struct pblk_rb_entry *entry;
+       int flags;
+
+       entry = &rb->entries[ring_pos];
+       flags = READ_ONCE(entry->w_ctx.flags);
+#ifdef CONFIG_NVM_DEBUG
+       /* Caller must guarantee that the entry is free */
+       BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
+#endif
+
+       __pblk_rb_write_entry(rb, data, w_ctx, entry);
+
+       pblk_update_map_cache(pblk, w_ctx.lba, entry->cacheline);
+       flags = w_ctx.flags | PBLK_WRITTEN_DATA;
+
+       /* Release flags on write context. Protect from writes */
+       smp_store_release(&entry->w_ctx.flags, flags);
+}
+
+/*
+ * Write one GC I/O sector into the entry at @ring_pos.
+ * @data:     source buffer, rb->seg_size bytes
+ * @w_ctx:    write context (lba, ppa, flags) for the sector
+ * @gc_line:  line the sector is being moved from
+ * @ring_pos: index of a free (writable) entry
+ *
+ * Same as the user variant, except that the mapping update may be refused by
+ * pblk_update_map_gc(); in that case the entry's lba is set to ADDR_EMPTY.
+ */
+void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
+                           struct pblk_w_ctx w_ctx, struct pblk_line *gc_line,
+                           unsigned int ring_pos)
+{
+       struct pblk *pblk = container_of(rb, struct pblk, rwb);
+       struct pblk_rb_entry *entry;
+       int flags;
+
+       entry = &rb->entries[ring_pos];
+       flags = READ_ONCE(entry->w_ctx.flags);
+#ifdef CONFIG_NVM_DEBUG
+       /* Caller must guarantee that the entry is free */
+       BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
+#endif
+
+       __pblk_rb_write_entry(rb, data, w_ctx, entry);
+
+       if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, gc_line))
+               entry->w_ctx.lba = ADDR_EMPTY;
+
+       flags = w_ctx.flags | PBLK_WRITTEN_DATA;
+
+       /* Release flags on write context. Protect from writes */
+       smp_store_release(&entry->w_ctx.flags, flags);
+}
+
+/*
+ * Set a sync point on the entry preceding @pos and attach @bio to it.
+ * @bio: flush bio, queued on the sync point entry's bio list
+ * @pos: ring position following the last entry covered by the flush
+ *
+ * Returns 0 if @pos equals the submission pointer (no sync point is set and
+ * @bio is not queued), 1 if the sync point was set.
+ */
+static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio,
+                                 unsigned int pos)
+{
+       struct pblk_rb_entry *entry;
+       unsigned int subm, sync_point;
+       int flags;
+
+       subm = READ_ONCE(rb->subm);
+
+#ifdef CONFIG_NVM_DEBUG
+       /* NOTE(review): incremented even when returning 0 below, while the
+        * only decrement visible in this file is done per flagged entry in
+        * pblk_rb_read_to_bio() - confirm the counter balances
+        */
+       atomic_inc(&rb->inflight_sync_point);
+#endif
+
+       if (pos == subm)
+               return 0;
+
+       /* The sync point is the entry right before pos, with wrap-around */
+       sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
+       entry = &rb->entries[sync_point];
+
+       flags = READ_ONCE(entry->w_ctx.flags);
+       flags |= PBLK_FLUSH_ENTRY;
+
+       /* Release flags on context. Protect from writes */
+       smp_store_release(&entry->w_ctx.flags, flags);
+
+       /* Protect syncs */
+       smp_store_release(&rb->sync_point, sync_point);
+
+       spin_lock_irq(&rb->s_lock);
+       bio_list_add(&entry->w_ctx.bios, bio);
+       spin_unlock_irq(&rb->s_lock);
+
+       return 1;
+}
+
+/*
+ * Check whether @nr_entries entries fit in the buffer and, if so, make sure
+ * the L2P table no longer points at the entries about to be overwritten.
+ * @pos: output, set to the current write pointer on success
+ *
+ * Does not advance the write pointer. Returns 1 on success, 0 otherwise.
+ */
+static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
+                              unsigned int *pos)
+{
+       unsigned int sync_ptr = READ_ONCE(rb->sync);
+       unsigned int wr_ptr = READ_ONCE(rb->mem);
+       unsigned int room;
+
+       room = pblk_rb_ring_space(rb, wr_ptr, sync_ptr, rb->nr_entries);
+       if (room < nr_entries ||
+           pblk_rb_update_l2p(rb, nr_entries, wr_ptr, sync_ptr))
+               return 0;
+
+       *pos = wr_ptr;
+
+       return 1;
+}
+
+/*
+ * Reserve @nr_entries entries in the buffer: on success *@pos holds the
+ * first reserved position and the write pointer is advanced past them.
+ *
+ * Returns 1 on success, 0 if the entries cannot be reserved.
+ */
+static int pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
+                            unsigned int *pos)
+{
+       unsigned int new_mem;
+
+       if (__pblk_rb_may_write(rb, nr_entries, pos)) {
+               new_mem = (*pos + nr_entries) & (rb->nr_entries - 1);
+
+               /* Protect from read count */
+               smp_store_release(&rb->mem, new_mem);
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Reserve @nr_entries entries like pblk_rb_may_write(), additionally
+ * handling REQ_PREFLUSH bios.
+ * @pos:    output, first reserved position
+ * @bio:    bio being written; checked for REQ_PREFLUSH
+ * @io_ret: output, only written on success: NVM_IO_DONE by default,
+ *          NVM_IO_OK when a sync point was set for a flush bio
+ *
+ * Returns 1 on success, 0 if the entries cannot be reserved.
+ */
+static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries,
+                                  unsigned int *pos, struct bio *bio,
+                                  int *io_ret)
+{
+       unsigned int mem;
+
+       if (!__pblk_rb_may_write(rb, nr_entries, pos))
+               return 0;
+
+       mem = (*pos + nr_entries) & (rb->nr_entries - 1);
+       *io_ret = NVM_IO_DONE;
+
+       if (bio->bi_opf & REQ_PREFLUSH) {
+               struct pblk *pblk = container_of(rb, struct pblk, rwb);
+
+#ifdef CONFIG_NVM_DEBUG
+               atomic_long_inc(&pblk->nr_flush);
+#endif
+               /* The sync point covers the entries reserved by this call */
+               if (pblk_rb_sync_point_set(&pblk->rwb, bio, mem))
+                       *io_ret = NVM_IO_OK;
+       }
+
+       /* Protect from read count */
+       smp_store_release(&rb->mem, mem);
+       return 1;
+}
+
+/*
+ * Under rb->w_lock, check that (i) the user I/O budget of the rate-limiter
+ * allows @nr_entries more entries and (ii) the write buffer has room for
+ * them; if both hold, reserve the entries and account them as user I/O.
+ *
+ * Returns NVM_IO_REQUEUE if the entries cannot be inserted, otherwise the
+ * I/O status reported by pblk_rb_may_write_flush().
+ */
+int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio,
+                          unsigned int nr_entries, unsigned int *pos)
+{
+       struct pblk *pblk = container_of(rb, struct pblk, rwb);
+       int io_ret;
+       int ret = NVM_IO_REQUEUE;
+
+       spin_lock(&rb->w_lock);
+       if (pblk_rl_user_may_insert(&pblk->rl, nr_entries) &&
+           pblk_rb_may_write_flush(rb, nr_entries, pos, bio, &io_ret)) {
+               pblk_rl_user_in(&pblk->rl, nr_entries);
+               ret = io_ret;
+       }
+       spin_unlock(&rb->w_lock);
+
+       return ret;
+}
+
+/*
+ * GC counterpart of pblk_rb_may_write_user(): under rb->w_lock, check the GC
+ * budget of the rate-limiter and the buffer space, then reserve @nr_entries
+ * entries and account them as GC I/O.
+ *
+ * Returns 1 if the entries were reserved, 0 otherwise.
+ */
+int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries,
+                        unsigned int *pos)
+{
+       struct pblk *pblk = container_of(rb, struct pblk, rwb);
+       int reserved = 0;
+
+       spin_lock(&rb->w_lock);
+       if (pblk_rl_gc_may_insert(&pblk->rl, nr_entries) &&
+           pblk_rb_may_write(rb, nr_entries, pos)) {
+               pblk_rl_gc_in(&pblk->rl, nr_entries);
+               reserved = 1;
+       }
+       spin_unlock(&rb->w_lock);
+
+       return reserved;
+}
+
+/*
+ * Add the data page of every entry on @list to @bio and remove the entry
+ * from the list.
+ * @list: entries linked through their index member
+ * @max:  bound on the number of entries to take from the list
+ *
+ * The caller of this function must ensure that the backpointer will not
+ * overwrite the entries passed on the list.
+ *
+ * Returns the number of entries added; processing stops at the first error,
+ * leaving the remaining entries on the list.
+ */
+unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
+                                     struct list_head *list,
+                                     unsigned int max)
+{
+       struct pblk_rb_entry *entry, *tentry;
+       struct page *page;
+       unsigned int read = 0;
+       int ret;
+
+       list_for_each_entry_safe(entry, tentry, list, index) {
+               /* NOTE(review): with '>' up to max + 1 entries are accepted
+                * before bailing out - confirm whether '>=' was intended
+                */
+               if (read > max) {
+                       pr_err("pblk: too many entries on list\n");
+                       goto out;
+               }
+
+               /* NOTE(review): nothing is allocated here; virt_to_page() is
+                * an address translation, so this check looks unreachable
+                */
+               page = virt_to_page(entry->data);
+               if (!page) {
+                       pr_err("pblk: could not allocate write bio page\n");
+                       goto out;
+               }
+
+               ret = bio_add_page(bio, page, rb->seg_size, 0);
+               if (ret != rb->seg_size) {
+                       pr_err("pblk: could not add page to write bio\n");
+                       goto out;
+               }
+
+               list_del(&entry->index);
+               read++;
+       }
+
+out:
+       return read;
+}
+
+/*
+ * Read available entries on rb and add them to the given bio. To avoid a memory
+ * copy, a page reference to the write buffer is used to be added to the bio.
+ *
+ * This function is used by the write thread to form the write bio that will
+ * persist data on the write buffer to the media.
+ *
+ * @c_ctx:      completion context; sentry, nr_valid and nr_padded are set
+ * @pos:        ring position of the first entry to read
+ * @nr_entries: number of sectors the request will carry
+ * @count:      number of entries available; if smaller than @nr_entries the
+ *              difference is recorded as padding
+ *
+ * Returns the number of entries read (min(@nr_entries, @count)) on success.
+ * On failure to add a page, 0 is returned; entries already processed keep
+ * their PBLK_SUBMITTED_ENTRY flag and the rate-limiter is not updated.
+ */
+unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio,
+                                struct pblk_c_ctx *c_ctx,
+                                unsigned int pos,
+                                unsigned int nr_entries,
+                                unsigned int count)
+{
+       struct pblk *pblk = container_of(rb, struct pblk, rwb);
+       struct pblk_rb_entry *entry;
+       struct page *page;
+       unsigned int pad = 0, read = 0, to_read = nr_entries;
+       unsigned int user_io = 0, gc_io = 0;
+       unsigned int i;
+       int flags;
+       int ret;
+
+       if (count < nr_entries) {
+               pad = nr_entries - count;
+               to_read = count;
+       }
+
+       c_ctx->sentry = pos;
+       c_ctx->nr_valid = to_read;
+       c_ctx->nr_padded = pad;
+
+       for (i = 0; i < to_read; i++) {
+               entry = &rb->entries[pos];
+
+               /* A write has been allowed into the buffer, but data is still
+                * being copied to it. It is ok to busy wait.
+                */
+try:
+               flags = READ_ONCE(entry->w_ctx.flags);
+               if (!(flags & PBLK_WRITTEN_DATA))
+                       goto try;
+
+               /* Per-type accounting, handed to the rate-limiter below */
+               if (flags & PBLK_IOTYPE_USER)
+                       user_io++;
+               else if (flags & PBLK_IOTYPE_GC)
+                       gc_io++;
+               else
+                       WARN(1, "pblk: unknown IO type\n");
+
+               /* NOTE(review): nothing is allocated here; virt_to_page() is
+                * an address translation, so this check looks unreachable
+                */
+               page = virt_to_page(entry->data);
+               if (!page) {
+                       pr_err("pblk: could not allocate write bio page\n");
+                       flags &= ~PBLK_WRITTEN_DATA;
+                       flags |= PBLK_SUBMITTED_ENTRY;
+                       /* Release flags on context. Protect from writes */
+                       smp_store_release(&entry->w_ctx.flags, flags);
+                       goto out;
+               }
+
+               ret = bio_add_page(bio, page, rb->seg_size, 0);
+               if (ret != rb->seg_size) {
+                       pr_err("pblk: could not add page to write bio\n");
+                       flags &= ~PBLK_WRITTEN_DATA;
+                       flags |= PBLK_SUBMITTED_ENTRY;
+                       /* Release flags on context. Protect from writes */
+                       smp_store_release(&entry->w_ctx.flags, flags);
+                       goto out;
+               }
+
+               if (flags & PBLK_FLUSH_ENTRY) {
+                       unsigned int sync_point;
+
+                       /* Clear the buffer-wide sync point only if it still
+                        * refers to this entry
+                        */
+                       sync_point = READ_ONCE(rb->sync_point);
+                       if (sync_point == pos) {
+                               /* Protect syncs */
+                               smp_store_release(&rb->sync_point, EMPTY_ENTRY);
+                       }
+
+                       flags &= ~PBLK_FLUSH_ENTRY;
+#ifdef CONFIG_NVM_DEBUG
+                       atomic_dec(&rb->inflight_sync_point);
+#endif
+               }
+
+               flags &= ~PBLK_WRITTEN_DATA;
+               flags |= PBLK_SUBMITTED_ENTRY;
+
+               /* Release flags on context. Protect from writes */
+               smp_store_release(&entry->w_ctx.flags, flags);
+
+               pos = (pos + 1) & (rb->nr_entries - 1);
+       }
+
+       read = to_read;
+       pblk_rl_out(&pblk->rl, user_io, gc_io);
+#ifdef CONFIG_NVM_DEBUG
+       /* Same instance as the local pblk above */
+       atomic_long_add(pad, &((struct pblk *)
+                       (container_of(rb, struct pblk, rwb)))->padded_writes);
+#endif
+out:
+       return read;
+}
+
+/*
+ * Copy to bio only if the lba matches the one on the given cache entry.
+ * Otherwise, it means that the entry has been overwritten, and the bio should
+ * be directed to disk.
+ */
+int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
+                       u64 pos, int bio_iter)
+{
+       struct pblk_rb_entry *entry;
+       struct pblk_w_ctx *w_ctx;
+       void *data;
+       int flags;
+       int ret = 1;
+
+       spin_lock(&rb->w_lock);
+
+#ifdef CONFIG_NVM_DEBUG
+       /* Caller must ensure that the access will not cause an overflow */
+       BUG_ON(pos >= rb->nr_entries);
+#endif
+       entry = &rb->entries[pos];
+       w_ctx = &entry->w_ctx;
+       flags = READ_ONCE(w_ctx->flags);
+
+       /* Check if the entry has been overwritten or is scheduled to be */
+       if (w_ctx->lba != lba || flags & PBLK_WRITABLE_ENTRY) {
+               ret = 0;
+               goto out;
+       }
+
+       /* Only advance the bio if it hasn't been advanced already. If advanced,
+        * this bio is at least a partial bio (i.e., it has partially been
+        * filled with data from the cache). If part of the data resides on the
+        * media, we will read later on
+        */
+       if (unlikely(!bio->bi_iter.bi_idx))
+               bio_advance(bio, bio_iter * PBLK_EXPOSED_PAGE_SIZE);
+
+       data = bio_data(bio);
+       memcpy(data, entry->data, rb->seg_size);
+
+out:
+       spin_unlock(&rb->w_lock);
+       return ret;
+}
+
+/* Return the write context of the entry at @pos masked by nr_entries - 1 */
+struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos)
+{
+       struct pblk_rb_entry *slot = &rb->entries[pos & (rb->nr_entries - 1)];
+
+       return &slot->w_ctx;
+}
+
+/*
+ * Take the sync lock and return the current sync pointer. When @flags is
+ * given the interrupt state is saved into it; otherwise interrupts are simply
+ * disabled. Pair with pblk_rb_sync_end() using the same @flags argument.
+ */
+unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags)
+       __acquires(&rb->s_lock)
+{
+       if (!flags)
+               spin_lock_irq(&rb->s_lock);
+       else
+               spin_lock_irqsave(&rb->s_lock, *flags);
+
+       return rb->sync;
+}
+
+/*
+ * Drop the sync lock. With @flags the saved interrupt state is restored;
+ * without it interrupts are unconditionally re-enabled.
+ */
+void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags)
+       __releases(&rb->s_lock)
+{
+       lockdep_assert_held(&rb->s_lock);
+
+       if (!flags)
+               spin_unlock_irq(&rb->s_lock);
+       else
+               spin_unlock_irqrestore(&rb->s_lock, *flags);
+}
+
+/*
+ * Move the sync pointer forward by @nr_entries positions, wrapping with the
+ * nr_entries - 1 mask, and publish the new value. The sync lock must be held.
+ *
+ * Returns the new sync position.
+ */
+unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries)
+{
+       unsigned int pos;
+       unsigned int left;
+
+       lockdep_assert_held(&rb->s_lock);
+
+       pos = READ_ONCE(rb->sync);
+
+       for (left = nr_entries; left; left--)
+               pos = (pos + 1) & (rb->nr_entries - 1);
+
+       /* Protect from counts */
+       smp_store_release(&rb->sync, pos);
+
+       return pos;
+}
+
+/*
+ * Number of entries between the submission pointer and the current sync
+ * point, the sync point itself included. Returns 0 when no sync point is set.
+ */
+unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb)
+{
+       unsigned int target, submitted;
+
+       /* Protect syncs */
+       target = smp_load_acquire(&rb->sync_point);
+       if (target == EMPTY_ENTRY)
+               return 0;
+
+       submitted = READ_ONCE(rb->subm);
+
+       /* The sync point itself counts as a sector to sync */
+       return pblk_rb_ring_count(target, submitted, rb->nr_entries) + 1;
+}
+
+/*
+ * Scan from the current position of the sync pointer to find the entry that
+ * corresponds to the given ppa. This is necessary since write requests can be
+ * completed out of order. The assumption is that the ppa is close to the sync
+ * pointer, thus the search will not take long.
+ *
+ * The caller of this function must guarantee that the sync pointer will not
+ * reach the entry while it is using the metadata associated with it. With this
+ * assumption in mind, there is no need to take the sync lock.
+ *
+ * NOTE(review): as written, the walk below never inspects an entry or @ppa
+ * and the function always returns NULL - confirm whether the comparison was
+ * left out on purpose.
+ */
+struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb,
+                                             struct ppa_addr *ppa)
+{
+       unsigned int pos, head, left;
+
+       pos = READ_ONCE(rb->sync);
+       head = READ_ONCE(rb->subm);
+
+       /* Step over every entry between the sync and submission pointers */
+       for (left = pblk_rb_ring_count(head, pos, rb->nr_entries); left; left--)
+               pos = (pos + 1) & (rb->nr_entries - 1);
+
+       return NULL;
+}
+
+/*
+ * Sanity check run with both ring buffer locks held.
+ *
+ * Returns 0 when all ring pointers coincide and no sync point is pending.
+ * Otherwise returns 1 if the entry array, or the data pointer of any entry,
+ * is missing, and 0 if every entry still has its data pointer.
+ */
+int pblk_rb_tear_down_check(struct pblk_rb *rb)
+{
+       int idx;
+       int failed = 0;
+
+       spin_lock(&rb->w_lock);
+       spin_lock_irq(&rb->s_lock);
+
+       /* Pointers aligned and no sync point: nothing else to look at */
+       if (rb->mem == rb->subm && rb->subm == rb->sync &&
+           rb->sync == rb->l2p_update && rb->sync_point == EMPTY_ENTRY)
+               goto unlock;
+
+       if (!rb->entries) {
+               failed = 1;
+               goto unlock;
+       }
+
+       for (idx = 0; idx < rb->nr_entries; idx++) {
+               if (!rb->entries[idx].data) {
+                       failed = 1;
+                       break;
+               }
+       }
+
+unlock:
+       spin_unlock(&rb->w_lock);
+       spin_unlock_irq(&rb->s_lock);
+
+       return failed;
+}
+
+/* Fold @pos into the ring by masking it with nr_entries - 1 */
+unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos)
+{
+       unsigned int mask = rb->nr_entries - 1;
+
+       return pos & mask;
+}
+
+/* Non-zero when @pos does not index a valid ring buffer entry */
+int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos)
+{
+       return !(pos < rb->nr_entries);
+}
+
+ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
+{
+       struct pblk *pblk = container_of(rb, struct pblk, rwb);
+       struct pblk_c_ctx *c;
+       ssize_t offset;
+       int queued_entries = 0;
+
+       spin_lock_irq(&rb->s_lock);
+       list_for_each_entry(c, &pblk->compl_list, list)
+               queued_entries++;
+       spin_unlock_irq(&rb->s_lock);
+
+       if (rb->sync_point != EMPTY_ENTRY)
+               offset = scnprintf(buf, PAGE_SIZE,
+                       "%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n",
+                       rb->nr_entries,
+                       rb->mem,
+                       rb->subm,
+                       rb->sync,
+                       rb->l2p_update,
+#ifdef CONFIG_NVM_DEBUG
+                       atomic_read(&rb->inflight_sync_point),
+#else
+                       0,
+#endif
+                       rb->sync_point,
+                       pblk_rb_read_count(rb),
+                       pblk_rb_space(rb),
+                       pblk_rb_sync_point_count(rb),
+                       queued_entries);
+       else
+               offset = scnprintf(buf, PAGE_SIZE,
+                       "%u\t%u\t%u\t%u\t%u\t%u\tNULL - %u/%u/%u - %d\n",
+                       rb->nr_entries,
+                       rb->mem,
+                       rb->subm,
+                       rb->sync,
+                       rb->l2p_update,
+#ifdef CONFIG_NVM_DEBUG
+                       atomic_read(&rb->inflight_sync_point),
+#else
+                       0,
+#endif
+                       pblk_rb_read_count(rb),
+                       pblk_rb_space(rb),
+                       pblk_rb_sync_point_count(rb),
+                       queued_entries);
+
+       return offset;
+}
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
new file mode 100644 (file)
index 0000000..4a12f14
--- /dev/null
@@ -0,0 +1,529 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *                  Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * pblk-read.c - pblk's read path
+ */
+
+#include "pblk.h"
+
+/*
+ * There is no guarantee that the value read from cache has not been updated and
+ * resides at another location in the cache. We guarantee though that if the
+ * value is read from the cache, it belongs to the mapped lba. In order to
+ * guarantee and order between writes and reads are ordered, a flush must be
+ * issued.
+ */
+static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
+                               sector_t lba, struct ppa_addr ppa,
+                               int bio_iter)
+{
+#ifdef CONFIG_NVM_DEBUG
+       /* Callers must ensure that the ppa points to a cache address */
+       BUG_ON(pblk_ppa_empty(ppa));
+       BUG_ON(!pblk_addr_in_cache(ppa));
+#endif
+
+       return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba,
+                                       pblk_addr_to_cacheline(ppa), bio_iter);
+}
+
+static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
+                                unsigned long *read_bitmap)
+{
+       struct bio *bio = rqd->bio;
+       struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
+       sector_t blba = pblk_get_lba(bio);
+       int nr_secs = rqd->nr_ppas;
+       int advanced_bio = 0;
+       int i, j = 0;
+
+       /* logic error: lba out-of-bounds. Ignore read request */
+       if (blba + nr_secs >= pblk->rl.nr_secs) {
+               WARN(1, "pblk: read lbas out of bounds\n");
+               return;
+       }
+
+       pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs);
+
+       for (i = 0; i < nr_secs; i++) {
+               struct ppa_addr p = ppas[i];
+               sector_t lba = blba + i;
+
+retry:
+               if (pblk_ppa_empty(p)) {
+                       WARN_ON(test_and_set_bit(i, read_bitmap));
+                       continue;
+               }
+
+               /* Try to read from write buffer. The address is later checked
+                * on the write buffer to prevent retrieving overwritten data.
+                */
+               if (pblk_addr_in_cache(p)) {
+                       if (!pblk_read_from_cache(pblk, bio, lba, p, i)) {
+                               pblk_lookup_l2p_seq(pblk, &p, lba, 1);
+                               goto retry;
+                       }
+                       WARN_ON(test_and_set_bit(i, read_bitmap));
+                       advanced_bio = 1;
+               } else {
+                       /* Read from media non-cached sectors */
+                       rqd->ppa_list[j++] = p;
+               }
+
+               if (advanced_bio)
+                       bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
+       }
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_add(nr_secs, &pblk->inflight_reads);
+#endif
+}
+
+/*
+ * Set the read mode flags on @rqd and submit it. Any submission error is
+ * reported as NVM_IO_ERR; success as NVM_IO_OK.
+ */
+static int pblk_submit_read_io(struct pblk *pblk, struct nvm_rq *rqd)
+{
+       rqd->flags = pblk_set_read_mode(pblk);
+
+       return pblk_submit_io(pblk, rqd) ? NVM_IO_ERR : NVM_IO_OK;
+}
+
+/*
+ * Finish a read request: log a failed request, release the ppa list of
+ * multi-sector requests, drop the reference on rqd->bio, complete and drop
+ * the original bio if one was stashed in the read context, and finally free
+ * the request itself. The pblk instance is taken from rqd->private.
+ */
+static void pblk_end_io_read(struct nvm_rq *rqd)
+{
+       struct pblk *pblk = rqd->private;
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+       struct bio *bio = rqd->bio;
+
+       if (rqd->error)
+               pblk_log_read_err(pblk, rqd);
+#ifdef CONFIG_NVM_DEBUG
+       else
+               WARN_ONCE(bio->bi_error, "pblk: corrupted read error\n");
+#endif
+
+       /* Only requests with more than one ppa carry a ppa list to free */
+       if (rqd->nr_ppas > 1)
+               nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
+
+       bio_put(bio);
+       /* orig_bio is set when rqd->bio is an internal clone of the user bio */
+       if (r_ctx->orig_bio) {
+#ifdef CONFIG_NVM_DEBUG
+               WARN_ONCE(r_ctx->orig_bio->bi_error,
+                                               "pblk: corrupted read bio\n");
+#endif
+               bio_endio(r_ctx->orig_bio);
+               bio_put(r_ctx->orig_bio);
+       }
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_add(rqd->nr_ppas, &pblk->sync_reads);
+       atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads);
+#endif
+
+       pblk_free_rqd(pblk, rqd, READ);
+}
+
+static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
+                                     unsigned int bio_init_idx,
+                                     unsigned long *read_bitmap)
+{
+       struct bio *new_bio, *bio = rqd->bio;
+       struct bio_vec src_bv, dst_bv;
+       void *ppa_ptr = NULL;
+       void *src_p, *dst_p;
+       dma_addr_t dma_ppa_list = 0;
+       int nr_secs = rqd->nr_ppas;
+       int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
+       int i, ret, hole;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       new_bio = bio_alloc(GFP_KERNEL, nr_holes);
+       if (!new_bio) {
+               pr_err("pblk: could not alloc read bio\n");
+               return NVM_IO_ERR;
+       }
+
+       if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
+               goto err;
+
+       if (nr_holes != new_bio->bi_vcnt) {
+               pr_err("pblk: malformed bio\n");
+               goto err;
+       }
+
+       new_bio->bi_iter.bi_sector = 0; /* internal bio */
+       bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
+       new_bio->bi_private = &wait;
+       new_bio->bi_end_io = pblk_end_bio_sync;
+
+       rqd->bio = new_bio;
+       rqd->nr_ppas = nr_holes;
+       rqd->end_io = NULL;
+
+       if (unlikely(nr_secs > 1 && nr_holes == 1)) {
+               ppa_ptr = rqd->ppa_list;
+               dma_ppa_list = rqd->dma_ppa_list;
+               rqd->ppa_addr = rqd->ppa_list[0];
+       }
+
+       ret = pblk_submit_read_io(pblk, rqd);
+       if (ret) {
+               bio_put(rqd->bio);
+               pr_err("pblk: read IO submission failed\n");
+               goto err;
+       }
+
+       if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+               pr_err("pblk: partial read I/O timed out\n");
+       }
+
+       if (rqd->error) {
+               atomic_long_inc(&pblk->read_failed);
+#ifdef CONFIG_NVM_DEBUG
+               pblk_print_failed_rqd(pblk, rqd, rqd->error);
+#endif
+       }
+
+       if (unlikely(nr_secs > 1 && nr_holes == 1)) {
+               rqd->ppa_list = ppa_ptr;
+               rqd->dma_ppa_list = dma_ppa_list;
+       }
+
+       /* Fill the holes in the original bio */
+       i = 0;
+       hole = find_first_zero_bit(read_bitmap, nr_secs);
+       do {
+               src_bv = new_bio->bi_io_vec[i++];
+               dst_bv = bio->bi_io_vec[bio_init_idx + hole];
+
+               src_p = kmap_atomic(src_bv.bv_page);
+               dst_p = kmap_atomic(dst_bv.bv_page);
+
+               memcpy(dst_p + dst_bv.bv_offset,
+                       src_p + src_bv.bv_offset,
+                       PBLK_EXPOSED_PAGE_SIZE);
+
+               kunmap_atomic(src_p);
+               kunmap_atomic(dst_p);
+
+               mempool_free(src_bv.bv_page, pblk->page_pool);
+
+               hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1);
+       } while (hole < nr_secs);
+
+       bio_put(new_bio);
+
+       /* Complete the original bio and associated request */
+       rqd->bio = bio;
+       rqd->nr_ppas = nr_secs;
+       rqd->private = pblk;
+
+       bio_endio(bio);
+       pblk_end_io_read(rqd);
+       return NVM_IO_OK;
+
+err:
+       /* Free allocated pages in new bio */
+       pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt);
+       rqd->private = pblk;
+       pblk_end_io_read(rqd);
+       return NVM_IO_ERR;
+}
+
+static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd,
+                        unsigned long *read_bitmap)
+{
+       struct bio *bio = rqd->bio;
+       struct ppa_addr ppa;
+       sector_t lba = pblk_get_lba(bio);
+
+       /* logic error: lba out-of-bounds. Ignore read request */
+       if (lba >= pblk->rl.nr_secs) {
+               WARN(1, "pblk: read lba out of bounds\n");
+               return;
+       }
+
+       pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_inc(&pblk->inflight_reads);
+#endif
+
+retry:
+       if (pblk_ppa_empty(ppa)) {
+               WARN_ON(test_and_set_bit(0, read_bitmap));
+               return;
+       }
+
+       /* Try to read from write buffer. The address is later checked on the
+        * write buffer to prevent retrieving overwritten data.
+        */
+       if (pblk_addr_in_cache(ppa)) {
+               if (!pblk_read_from_cache(pblk, bio, lba, ppa, 0)) {
+                       pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
+                       goto retry;
+               }
+               WARN_ON(test_and_set_bit(0, read_bitmap));
+       } else {
+               rqd->ppa_addr = ppa;
+       }
+}
+
+/*
+ * Entry point of the read path for a user bio.
+ *
+ * A read request is allocated and every sector is resolved against the L2P
+ * table. Depending on how many sectors could be served without touching the
+ * device the bio is then
+ *  - completed right away (everything resolved),
+ *  - cloned and submitted to the device (nothing resolved), or
+ *  - handed to pblk_fill_partial_read_bio() (a mix of both).
+ *
+ * Returns NVM_IO_OK on success and NVM_IO_ERR on failure. On every failure
+ * after the extra bio reference has been taken, the request, its ppa list
+ * and that reference are released through pblk_end_io_read().
+ */
+int pblk_submit_read(struct pblk *pblk, struct bio *bio)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       unsigned int nr_secs = pblk_get_secs(bio);
+       struct nvm_rq *rqd;
+       unsigned long read_bitmap; /* Max 64 ppas per request */
+       unsigned int bio_init_idx;
+       int ret = NVM_IO_ERR;
+
+       if (nr_secs > PBLK_MAX_REQ_ADDRS)
+               return NVM_IO_ERR;
+
+       bitmap_zero(&read_bitmap, nr_secs);
+
+       rqd = pblk_alloc_rqd(pblk, READ);
+       if (IS_ERR(rqd)) {
+               pr_err_ratelimited("pblk: not able to alloc rqd");
+               return NVM_IO_ERR;
+       }
+
+       rqd->opcode = NVM_OP_PREAD;
+       rqd->bio = bio;
+       rqd->nr_ppas = nr_secs;
+       rqd->private = pblk;
+       rqd->end_io = pblk_end_io_read;
+
+       /* Save the index for this bio's start. This is needed in case
+        * we need to fill a partial read.
+        */
+       bio_init_idx = pblk_get_bi_idx(bio);
+
+       if (nr_secs > 1) {
+               rqd->ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+                                               &rqd->dma_ppa_list);
+               if (!rqd->ppa_list) {
+                       pr_err("pblk: not able to allocate ppa list\n");
+                       goto fail_rqd_free;
+               }
+
+               pblk_read_ppalist_rq(pblk, rqd, &read_bitmap);
+       } else {
+               pblk_read_rq(pblk, rqd, &read_bitmap);
+       }
+
+       /* This reference is dropped by pblk_end_io_read() */
+       bio_get(bio);
+       if (bitmap_full(&read_bitmap, nr_secs)) {
+               bio_endio(bio);
+               pblk_end_io_read(rqd);
+               return NVM_IO_OK;
+       }
+
+       /* All sectors are to be read from the device */
+       if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) {
+               struct bio *int_bio = NULL;
+               struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+
+               /* Clone read bio to deal with read errors internally */
+               int_bio = bio_clone_bioset(bio, GFP_KERNEL, fs_bio_set);
+               if (!int_bio) {
+                       pr_err("pblk: could not clone read bio\n");
+                       goto fail_end_io;
+               }
+
+               rqd->bio = int_bio;
+               r_ctx->orig_bio = bio;
+
+               ret = pblk_submit_read_io(pblk, rqd);
+               if (ret) {
+                       pr_err("pblk: read IO submission failed\n");
+                       bio_put(int_bio);
+                       /* Undo the clone hand-over: the request must point
+                        * at the user bio again, and must not complete it,
+                        * before being torn down.
+                        */
+                       rqd->bio = bio;
+                       r_ctx->orig_bio = NULL;
+                       goto fail_end_io;
+               }
+
+               return NVM_IO_OK;
+       }
+
+       /* The read bio request could be partially filled by the write buffer,
+        * but there are some holes that need to be read from the drive.
+        * pblk_fill_partial_read_bio() ends the request itself on failure.
+        */
+       ret = pblk_fill_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap);
+       if (ret) {
+               pr_err("pblk: failed to perform partial read\n");
+               return ret;
+       }
+
+       return NVM_IO_OK;
+
+fail_rqd_free:
+       pblk_free_rqd(pblk, rqd, READ);
+       return ret;
+
+fail_end_io:
+       /* Drops the bio reference, the ppa list (if any) and the request */
+       pblk_end_io_read(rqd);
+       return ret;
+}
+
+static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
+                             struct pblk_line *line, u64 *lba_list,
+                             unsigned int nr_secs)
+{
+       struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
+       int valid_secs = 0;
+       int i;
+
+       pblk_lookup_l2p_rand(pblk, ppas, lba_list, nr_secs);
+
+       for (i = 0; i < nr_secs; i++) {
+               if (pblk_addr_in_cache(ppas[i]) || ppas[i].g.blk != line->id ||
+                                               pblk_ppa_empty(ppas[i])) {
+                       lba_list[i] = ADDR_EMPTY;
+                       continue;
+               }
+
+               rqd->ppa_list[valid_secs++] = ppas[i];
+       }
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_add(valid_secs, &pblk->inflight_reads);
+#endif
+       return valid_secs;
+}
+
+static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
+                     struct pblk_line *line, sector_t lba)
+{
+       struct ppa_addr ppa;
+       int valid_secs = 0;
+
+       if (lba == ADDR_EMPTY)
+               goto out;
+
+       /* logic error: lba out-of-bounds */
+       if (lba >= pblk->rl.nr_secs) {
+               WARN(1, "pblk: read lba out of bounds\n");
+               goto out;
+       }
+
+       spin_lock(&pblk->trans_lock);
+       ppa = pblk_trans_map_get(pblk, lba);
+       spin_unlock(&pblk->trans_lock);
+
+       /* Ignore updated values until the moment */
+       if (pblk_addr_in_cache(ppa) || ppa.g.blk != line->id ||
+                                                       pblk_ppa_empty(ppa))
+               goto out;
+
+       rqd->ppa_addr = ppa;
+       valid_secs = 1;
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_inc(&pblk->inflight_reads);
+#endif
+
+out:
+       return valid_secs;
+}
+
+int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data,
+                       unsigned int nr_secs, unsigned int *secs_to_gc,
+                       struct pblk_line *line)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct request_queue *q = dev->q;
+       struct bio *bio;
+       struct nvm_rq rqd;
+       int ret, data_len;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       memset(&rqd, 0, sizeof(struct nvm_rq));
+
+       if (nr_secs > 1) {
+               rqd.ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+                                                       &rqd.dma_ppa_list);
+               if (!rqd.ppa_list)
+                       return NVM_IO_ERR;
+
+               *secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, line, lba_list,
+                                                               nr_secs);
+               if (*secs_to_gc == 1) {
+                       struct ppa_addr ppa;
+
+                       ppa = rqd.ppa_list[0];
+                       nvm_dev_dma_free(dev->parent, rqd.ppa_list,
+                                                       rqd.dma_ppa_list);
+                       rqd.ppa_addr = ppa;
+               }
+       } else {
+               *secs_to_gc = read_rq_gc(pblk, &rqd, line, lba_list[0]);
+       }
+
+       if (!(*secs_to_gc))
+               goto out;
+
+       data_len = (*secs_to_gc) * geo->sec_size;
+       bio = bio_map_kern(q, data, data_len, GFP_KERNEL);
+       if (IS_ERR(bio)) {
+               pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio));
+               goto err_free_dma;
+       }
+
+       bio->bi_iter.bi_sector = 0; /* internal bio */
+       bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+       rqd.opcode = NVM_OP_PREAD;
+       rqd.end_io = pblk_end_io_sync;
+       rqd.private = &wait;
+       rqd.nr_ppas = *secs_to_gc;
+       rqd.bio = bio;
+
+       ret = pblk_submit_read_io(pblk, &rqd);
+       if (ret) {
+               bio_endio(bio);
+               pr_err("pblk: GC read request failed\n");
+               goto err_free_dma;
+       }
+
+       if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+               pr_err("pblk: GC read I/O timed out\n");
+       }
+
+       if (rqd.error) {
+               atomic_long_inc(&pblk->read_failed_gc);
+#ifdef CONFIG_NVM_DEBUG
+               pblk_print_failed_rqd(pblk, &rqd, rqd.error);
+#endif
+       }
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_add(*secs_to_gc, &pblk->sync_reads);
+       atomic_long_add(*secs_to_gc, &pblk->recov_gc_reads);
+       atomic_long_sub(*secs_to_gc, &pblk->inflight_reads);
+#endif
+
+out:
+       if (rqd.nr_ppas > 1)
+               nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list);
+       return NVM_IO_OK;
+
+err_free_dma:
+       if (rqd.nr_ppas > 1)
+               nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list);
+       return NVM_IO_ERR;
+}
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
new file mode 100644 (file)
index 0000000..f8f8508
--- /dev/null
@@ -0,0 +1,998 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial: Javier Gonzalez <javier@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * pblk-recovery.c - pblk's recovery path
+ */
+
+#include "pblk.h"
+
+void pblk_submit_rec(struct work_struct *work)
+{
+       struct pblk_rec_ctx *recovery =
+                       container_of(work, struct pblk_rec_ctx, ws_rec);
+       struct pblk *pblk = recovery->pblk;
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_rq *rqd = recovery->rqd;
+       struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+       int max_secs = nvm_max_phys_sects(dev);
+       struct bio *bio;
+       unsigned int nr_rec_secs;
+       unsigned int pgs_read;
+       int ret;
+
+       nr_rec_secs = bitmap_weight((unsigned long int *)&rqd->ppa_status,
+                                                               max_secs);
+
+       bio = bio_alloc(GFP_KERNEL, nr_rec_secs);
+       if (!bio) {
+               pr_err("pblk: not able to create recovery bio\n");
+               return;
+       }
+
+       bio->bi_iter.bi_sector = 0;
+       bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+       rqd->bio = bio;
+       rqd->nr_ppas = nr_rec_secs;
+
+       pgs_read = pblk_rb_read_to_bio_list(&pblk->rwb, bio, &recovery->failed,
+                                                               nr_rec_secs);
+       if (pgs_read != nr_rec_secs) {
+               pr_err("pblk: could not read recovery entries\n");
+               goto err;
+       }
+
+       if (pblk_setup_w_rec_rq(pblk, rqd, c_ctx)) {
+               pr_err("pblk: could not setup recovery request\n");
+               goto err;
+       }
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_add(nr_rec_secs, &pblk->recov_writes);
+#endif
+
+       ret = pblk_submit_io(pblk, rqd);
+       if (ret) {
+               pr_err("pblk: I/O submission failed: %d\n", ret);
+               goto err;
+       }
+
+       mempool_free(recovery, pblk->rec_pool);
+       return;
+
+err:
+       bio_put(bio);
+       pblk_free_rqd(pblk, rqd, WRITE);
+}
+
+/*
+ * Split a partially completed write between its original context @c_ctx and
+ * a newly allocated recovery request.
+ *
+ * The first @comp entries stay with @c_ctx; the rest are described by the
+ * recovery request, whose ppa_status is @comp_bits shifted right by @comp.
+ * The new request is stored in @recovery together with @pblk.
+ *
+ * Returns 0 on success or -ENOMEM if the request cannot be allocated.
+ */
+int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
+                       struct pblk_rec_ctx *recovery, u64 *comp_bits,
+                       unsigned int comp)
+{
+       int max_secs = nvm_max_phys_sects(pblk->dev);
+       int total = c_ctx->nr_valid + c_ctx->nr_padded;
+       struct pblk_c_ctx *rec_ctx;
+       struct nvm_rq *rec_rqd;
+
+       rec_rqd = pblk_alloc_rqd(pblk, WRITE);
+       if (IS_ERR(rec_rqd)) {
+               pr_err("pblk: could not create recovery req.\n");
+               return -ENOMEM;
+       }
+
+       rec_ctx = nvm_rq_to_pdu(rec_rqd);
+
+       /* Copy completion bitmap, but exclude the first X completed entries */
+       bitmap_shift_right((unsigned long int *)&rec_rqd->ppa_status,
+                               (unsigned long int *)comp_bits,
+                               comp, max_secs);
+
+       /* The recovery context starts right after the completed entries */
+       rec_ctx->sentry = pblk_rb_wrap_pos(&pblk->rwb, c_ctx->sentry + comp);
+
+       if (comp < c_ctx->nr_valid) {
+               /* Some valid entries, and all of the padding, are pending */
+               rec_ctx->nr_valid = c_ctx->nr_valid - comp;
+               rec_ctx->nr_padded = c_ctx->nr_padded;
+
+               c_ctx->nr_valid = comp;
+               c_ctx->nr_padded = 0;
+       } else {
+               /* Every valid entry completed; only padding is pending */
+               rec_ctx->nr_valid = 0;
+               rec_ctx->nr_padded = total - comp;
+
+               c_ctx->nr_padded = comp - c_ctx->nr_valid;
+       }
+
+       recovery->rqd = rec_rqd;
+       recovery->pblk = pblk;
+
+       return 0;
+}
+
+/*
+ * Validate @emeta (stored CRC against the computed one, then the header
+ * identifier against PBLK_MAGIC) and return its lba list, or NULL when
+ * either check fails.
+ */
+__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta)
+{
+       u32 expected_crc = pblk_calc_emeta_crc(pblk, emeta);
+
+       if (le32_to_cpu(emeta->crc) != expected_crc ||
+           le32_to_cpu(emeta->header.identifier) != PBLK_MAGIC)
+               return NULL;
+
+       return pblk_line_emeta_to_lbas(emeta);
+}
+
+/*
+ * Rebuild the L2P mappings of @line from the lba list in its end metadata.
+ *
+ * Sectors on bad blocks are skipped. Sectors whose lba is ADDR_EMPTY are
+ * marked in the line's invalid bitmap (decrementing the valid sector count);
+ * every other sector is mapped with pblk_update_map(). The scan stops once
+ * the number of valid lbas recorded in the metadata has been mapped.
+ *
+ * Returns 1 if the metadata does not yield a valid lba list, 0 otherwise.
+ */
+static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct line_emeta *emeta = line->emeta;
+       __le64 *lba_list;
+       int data_start;
+       int nr_data_lbas, nr_valid_lbas, nr_lbas = 0;
+       int i;
+
+       lba_list = pblk_recov_get_lba_list(pblk, emeta);
+       if (!lba_list)
+               return 1;
+
+       /* Data sectors lie between the start and the end metadata */
+       data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec;
+       nr_data_lbas = lm->sec_per_line - lm->emeta_sec;
+       nr_valid_lbas = le64_to_cpu(emeta->nr_valid_lbas);
+
+       for (i = data_start; i < nr_data_lbas && nr_lbas < nr_valid_lbas; i++) {
+               struct ppa_addr ppa;
+               int pos;
+
+               ppa = addr_to_pblk_ppa(pblk, i, line->id);
+               pos = pblk_ppa_to_pos(geo, ppa);
+
+               /* Do not update bad blocks */
+               if (test_bit(pos, line->blk_bitmap))
+                       continue;
+
+               if (le64_to_cpu(lba_list[i]) == ADDR_EMPTY) {
+                       spin_lock(&line->lock);
+                       if (test_and_set_bit(i, line->invalid_bitmap))
+                               WARN_ONCE(1, "pblk: rec. double invalidate:\n");
+                       else
+                               line->vsc--;
+                       spin_unlock(&line->lock);
+
+                       continue;
+               }
+
+               pblk_update_map(pblk, le64_to_cpu(lba_list[i]), ppa);
+               nr_lbas++;
+       }
+
+       /* The stored count is little-endian: convert it before printing so
+        * that the message is right on big-endian hosts too.
+        */
+       if (nr_valid_lbas != nr_lbas)
+               pr_err("pblk: line %d - inconsistent lba list(%llu/%d)\n",
+                               line->id,
+                               (unsigned long long)
+                               le64_to_cpu(emeta->nr_valid_lbas), nr_lbas);
+
+       line->left_msecs = 0;
+
+       return 0;
+}
+
+/*
+ * Number of sectors in @line left after subtracting the start and end
+ * metadata sectors and all sectors that belong to bad blocks.
+ */
+static int pblk_calc_sec_in_line(struct pblk *pblk, struct pblk_line *line)
+{
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct nvm_geo *geo = &pblk->dev->geo;
+       int bad_blks = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
+       int secs;
+
+       secs = lm->sec_per_line - lm->smeta_sec - lm->emeta_sec -
+                               bad_blks * geo->sec_per_blk;
+
+       return secs;
+}
+
+/*
+ * Bundle of pre-allocated resources handed by value to the recovery read
+ * helpers, which copy each field into the request they build.
+ */
+struct pblk_recov_alloc {
+       struct ppa_addr *ppa_list;      /* becomes rqd->ppa_list */
+       struct pblk_sec_meta *meta_list; /* becomes rqd->meta_list */
+       struct nvm_rq *rqd;             /* request reused for each read */
+       void *data;                     /* buffer mapped into the read bio */
+       dma_addr_t dma_ppa_list;        /* becomes rqd->dma_ppa_list */
+       dma_addr_t dma_meta_list;       /* becomes rqd->dma_meta_list */
+};
+
+/* Read the sectors of @line from @r_ptr up to line->cur_sec and update the
+ * L2P table with the lba stored in the out-of-band metadata of each sector.
+ *
+ * The range is read in chunks sized by pblk_calc_secs(), falling back to
+ * pblk->min_write_pgs when that returns 0. Addresses whose position is set
+ * in line->blk_bitmap are skipped. The request and buffers in @p are reused
+ * for every chunk.
+ *
+ * Returns 0 on success or when there is nothing to read, the error reported
+ * by bio_map_kern() or pblk_submit_io(), or -EINTR when a read times out or
+ * completes with an error.
+ */
+static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line,
+                              struct pblk_recov_alloc p, u64 r_ptr)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct ppa_addr *ppa_list;
+       struct pblk_sec_meta *meta_list;
+       struct nvm_rq *rqd;
+       struct bio *bio;
+       void *data;
+       dma_addr_t dma_ppa_list, dma_meta_list;
+       u64 r_ptr_int;
+       int left_ppas;
+       int rq_ppas, rq_len;
+       int i, j;
+       int ret = 0;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       ppa_list = p.ppa_list;
+       meta_list = p.meta_list;
+       rqd = p.rqd;
+       data = p.data;
+       dma_ppa_list = p.dma_ppa_list;
+       dma_meta_list = p.dma_meta_list;
+
+       left_ppas = line->cur_sec - r_ptr;
+       if (!left_ppas)
+               return 0;
+
+       r_ptr_int = r_ptr;
+
+next_read_rq:
+       memset(rqd, 0, pblk_r_rq_size);
+
+       rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+       if (!rq_ppas)
+               rq_ppas = pblk->min_write_pgs;
+       rq_len = rq_ppas * geo->sec_size;
+
+       bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
+       if (IS_ERR(bio))
+               return PTR_ERR(bio);
+
+       bio->bi_iter.bi_sector = 0; /* internal bio */
+       bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+       rqd->bio = bio;
+       rqd->opcode = NVM_OP_PREAD;
+       rqd->flags = pblk_set_read_mode(pblk);
+       rqd->meta_list = meta_list;
+       rqd->nr_ppas = rq_ppas;
+       rqd->ppa_list = ppa_list;
+       rqd->dma_ppa_list = dma_ppa_list;
+       rqd->dma_meta_list = dma_meta_list;
+       rqd->end_io = pblk_end_io_sync;
+       rqd->private = &wait;
+
+       /* Fill the request min_write_pgs addresses at a time */
+       for (i = 0; i < rqd->nr_ppas; ) {
+               struct ppa_addr ppa;
+               int pos;
+
+               ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
+               pos = pblk_dev_ppa_to_pos(geo, ppa);
+
+               /* Advance past positions set in the line's block bitmap */
+               while (test_bit(pos, line->blk_bitmap)) {
+                       r_ptr_int += pblk->min_write_pgs;
+                       ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
+                       pos = pblk_dev_ppa_to_pos(geo, ppa);
+               }
+
+               for (j = 0; j < pblk->min_write_pgs; j++, i++, r_ptr_int++)
+                       rqd->ppa_list[i] =
+                               addr_to_gen_ppa(pblk, r_ptr_int, line->id);
+       }
+
+       /* If read fails, more padding is needed */
+       ret = pblk_submit_io(pblk, rqd);
+       if (ret) {
+               pr_err("pblk: I/O submission failed: %d\n", ret);
+               /* The request never started: drop the bio mapped above, as
+                * pblk_recov_scan_oob() does on this path
+                */
+               bio_put(bio);
+               return ret;
+       }
+
+       if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+               pr_err("pblk: L2P recovery read timed out\n");
+               return -EINTR;
+       }
+
+       reinit_completion(&wait);
+
+       /* At this point, the read should not fail. If it does, it is a problem
+        * we cannot recover from here. Need FTL log.
+        */
+       if (rqd->error) {
+               pr_err("pblk: L2P recovery failed (%d)\n", rqd->error);
+               return -EINTR;
+       }
+
+       /* Map every sector that carries an lba; empty and out-of-range lbas
+        * are ignored.
+        * NOTE(review): "lba > nr_secs" lets lba == nr_secs through - confirm
+        * whether nr_secs is itself a valid lba.
+        */
+       for (i = 0; i < rqd->nr_ppas; i++) {
+               u64 lba = le64_to_cpu(meta_list[i].lba);
+
+               if (lba == ADDR_EMPTY || lba > pblk->rl.nr_secs)
+                       continue;
+
+               pblk_update_map(pblk, lba, rqd->ppa_list[i]);
+       }
+
+       left_ppas -= rq_ppas;
+       if (left_ppas > 0)
+               goto next_read_rq;
+
+       return 0;
+}
+
+/* Write padding to @line, starting at the pages handed out by
+ * pblk_alloc_page(), until @left_ppas sectors have been padded or the line
+ * has no sectors left (tracked with a local copy of line->left_msecs).
+ *
+ * Every padded sector is invalidated in the map and tagged with ADDR_EMPTY
+ * both in its OOB metadata and in the line's emeta lba list. The request and
+ * buffers in @p are reused for every chunk.
+ *
+ * A write that times out is only logged. Returns 0, or the error reported by
+ * bio_map_kern() or pblk_submit_io().
+ */
+static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line,
+                             struct pblk_recov_alloc p, int left_ppas)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct ppa_addr *ppa_list;
+       struct pblk_sec_meta *meta_list;
+       struct nvm_rq *rqd;
+       struct bio *bio;
+       void *data;
+       dma_addr_t dma_ppa_list, dma_meta_list;
+       __le64 *lba_list = pblk_line_emeta_to_lbas(line->emeta);
+       u64 w_ptr = line->cur_sec;
+       int left_line_ppas = line->left_msecs;
+       int rq_ppas, rq_len;
+       int i, j;
+       int ret = 0;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       ppa_list = p.ppa_list;
+       meta_list = p.meta_list;
+       rqd = p.rqd;
+       data = p.data;
+       dma_ppa_list = p.dma_ppa_list;
+       dma_meta_list = p.dma_meta_list;
+
+next_pad_rq:
+       rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+       if (!rq_ppas)
+               rq_ppas = pblk->min_write_pgs;
+       rq_len = rq_ppas * geo->sec_size;
+
+       bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
+       if (IS_ERR(bio))
+               return PTR_ERR(bio);
+
+       bio->bi_iter.bi_sector = 0; /* internal bio */
+       bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+       memset(rqd, 0, pblk_r_rq_size);
+
+       rqd->bio = bio;
+       rqd->opcode = NVM_OP_PWRITE;
+       rqd->flags = pblk_set_progr_mode(pblk, WRITE);
+       rqd->meta_list = meta_list;
+       rqd->nr_ppas = rq_ppas;
+       rqd->ppa_list = ppa_list;
+       rqd->dma_ppa_list = dma_ppa_list;
+       rqd->dma_meta_list = dma_meta_list;
+       rqd->end_io = pblk_end_io_sync;
+       rqd->private = &wait;
+
+       /* Fill the request min_write_pgs addresses at a time */
+       for (i = 0; i < rqd->nr_ppas; ) {
+               struct ppa_addr ppa;
+               int pos;
+
+               w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
+               ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id);
+               pos = pblk_ppa_to_pos(geo, ppa);
+
+               /* Advance past positions set in the line's block bitmap */
+               while (test_bit(pos, line->blk_bitmap)) {
+                       w_ptr += pblk->min_write_pgs;
+                       ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id);
+                       pos = pblk_ppa_to_pos(geo, ppa);
+               }
+
+               for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) {
+                       struct ppa_addr dev_ppa;
+
+                       dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
+
+                       /* Padded sectors carry no user data */
+                       pblk_map_invalidate(pblk, dev_ppa);
+                       meta_list[i].lba = cpu_to_le64(ADDR_EMPTY);
+                       lba_list[w_ptr] = cpu_to_le64(ADDR_EMPTY);
+                       rqd->ppa_list[i] = dev_ppa;
+               }
+       }
+
+       ret = pblk_submit_io(pblk, rqd);
+       if (ret) {
+               pr_err("pblk: I/O submission failed: %d\n", ret);
+               /* The request never started: drop the bio mapped above, as
+                * pblk_recov_scan_oob() does on this path
+                */
+               bio_put(bio);
+               return ret;
+       }
+
+       if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+               pr_err("pblk: L2P recovery write timed out\n");
+       }
+       reinit_completion(&wait);
+
+       left_line_ppas -= rq_ppas;
+       left_ppas -= rq_ppas;
+       if (left_ppas > 0 && left_line_ppas)
+               goto next_pad_rq;
+
+       return 0;
+}
+
+/* When this function is called, it means that not all upper pages have been
+ * written in a page that contains valid data. In order to recover this data, we
+ * first find the write pointer on the device, then we pad all necessary
+ * sectors, and finally attempt to read the valid data
+ *
+ * The line is read chunk by chunk from the pages handed out by
+ * pblk_alloc_page(). When a read completes with NVM_RSP_ERR_EMPTYPAGE the
+ * failed sectors are rolled back, the line is padded with
+ * pblk_recov_pad_oob() and the sectors from the saved read pointer are
+ * re-read with pblk_recov_read_oob().
+ *
+ * Returns the error from bio_map_kern() or pblk_submit_io(), otherwise 0.
+ */
+static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line,
+                                  struct pblk_recov_alloc p)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct ppa_addr *ppa_list;
+       struct pblk_sec_meta *meta_list;
+       struct nvm_rq *rqd;
+       struct bio *bio;
+       void *data;
+       dma_addr_t dma_ppa_list, dma_meta_list;
+       u64 w_ptr = 0, r_ptr;
+       int rq_ppas, rq_len;
+       int i, j;
+       int ret = 0;
+       int rec_round;
+       int left_ppas = pblk_calc_sec_in_line(pblk, line) - line->cur_sec;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       ppa_list = p.ppa_list;
+       meta_list = p.meta_list;
+       rqd = p.rqd;
+       data = p.data;
+       dma_ppa_list = p.dma_ppa_list;
+       dma_meta_list = p.dma_meta_list;
+
+       /* we could recover up until the line write pointer */
+       r_ptr = line->cur_sec;
+       rec_round = 0;
+
+next_rq:
+       memset(rqd, 0, pblk_r_rq_size);
+
+       rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+       if (!rq_ppas)
+               rq_ppas = pblk->min_write_pgs;
+       rq_len = rq_ppas * geo->sec_size;
+
+       bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
+       if (IS_ERR(bio))
+               return PTR_ERR(bio);
+
+       bio->bi_iter.bi_sector = 0; /* internal bio */
+       bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+       rqd->bio = bio;
+       rqd->opcode = NVM_OP_PREAD;
+       rqd->flags = pblk_set_read_mode(pblk);
+       rqd->meta_list = meta_list;
+       rqd->nr_ppas = rq_ppas;
+       rqd->ppa_list = ppa_list;
+       rqd->dma_ppa_list = dma_ppa_list;
+       rqd->dma_meta_list = dma_meta_list;
+       rqd->end_io = pblk_end_io_sync;
+       rqd->private = &wait;
+
+       /* Fill the request min_write_pgs addresses at a time */
+       for (i = 0; i < rqd->nr_ppas; ) {
+               struct ppa_addr ppa;
+               int pos;
+
+               w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
+               ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
+               pos = pblk_dev_ppa_to_pos(geo, ppa);
+
+               /* Advance past positions set in the line's block bitmap */
+               while (test_bit(pos, line->blk_bitmap)) {
+                       w_ptr += pblk->min_write_pgs;
+                       ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
+                       pos = pblk_dev_ppa_to_pos(geo, ppa);
+               }
+
+               for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++)
+                       rqd->ppa_list[i] =
+                               addr_to_gen_ppa(pblk, w_ptr, line->id);
+       }
+
+       ret = pblk_submit_io(pblk, rqd);
+       if (ret) {
+               pr_err("pblk: I/O submission failed: %d\n", ret);
+               /* The request never started: drop the bio mapped above, as
+                * pblk_recov_scan_oob() does on this path
+                */
+               bio_put(bio);
+               return ret;
+       }
+
+       if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+               pr_err("pblk: L2P recovery read timed out\n");
+       }
+       reinit_completion(&wait);
+
+       /* This should not happen since the read failed during normal recovery,
+        * but the media works funny sometimes...
+        *
+        * rec_round stays 0 for as long as reads succeed; after the first
+        * failed read it is non-zero and later chunks are no longer mapped.
+        */
+       if (!rec_round++ && !rqd->error) {
+               rec_round = 0;
+               for (i = 0; i < rqd->nr_ppas; i++, r_ptr++) {
+                       u64 lba = le64_to_cpu(meta_list[i].lba);
+
+                       if (lba == ADDR_EMPTY || lba > pblk->rl.nr_secs)
+                               continue;
+
+                       pblk_update_map(pblk, lba, rqd->ppa_list[i]);
+               }
+       }
+
+       /* Reached the end of the written line */
+       if (rqd->error == NVM_RSP_ERR_EMPTYPAGE) {
+               int pad_secs, nr_error_bits, bit;
+               /* NOTE(review): this declaration shadows the function-level
+                * 'ret', so pad/read failures below are logged but never
+                * returned to the caller - confirm this is intended.
+                */
+               int ret;
+
+               bit = find_first_bit((void *)&rqd->ppa_status, rqd->nr_ppas);
+               nr_error_bits = rqd->nr_ppas - bit;
+
+               /* Roll back failed sectors */
+               line->cur_sec -= nr_error_bits;
+               line->left_msecs += nr_error_bits;
+               bitmap_clear(line->map_bitmap, line->cur_sec, nr_error_bits);
+
+               pad_secs = pblk_pad_distance(pblk);
+               if (pad_secs > line->left_msecs)
+                       pad_secs = line->left_msecs;
+
+               ret = pblk_recov_pad_oob(pblk, line, p, pad_secs);
+               if (ret)
+                       pr_err("pblk: OOB padding failed (err:%d)\n", ret);
+
+               ret = pblk_recov_read_oob(pblk, line, p, r_ptr);
+               if (ret)
+                       pr_err("pblk: OOB read failed (err:%d)\n", ret);
+
+               line->left_ssecs = line->left_msecs;
+               /* Forces the scan loop to end below */
+               left_ppas = 0;
+       }
+
+       left_ppas -= rq_ppas;
+       if (left_ppas > 0)
+               goto next_rq;
+
+       return ret;
+}
+
+/* Scan @line from the pages handed out by pblk_alloc_page(), chunk by chunk,
+ * and update the L2P table with the lba found in each sector's OOB metadata.
+ *
+ * The scan stops at the first read that completes with an error: the failed
+ * sectors are rolled back and only the sectors before the first one flagged
+ * in rqd->ppa_status are mapped.
+ *
+ * @done is set to 1 on entry and cleared to 0 when the scan stopped on an
+ * error other than NVM_RSP_ERR_EMPTYPAGE.
+ *
+ * Returns the error from bio_map_kern() or pblk_submit_io(), otherwise 0. A
+ * read that times out is only logged.
+ */
+static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
+                              struct pblk_recov_alloc p, int *done)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct ppa_addr *ppa_list;
+       struct pblk_sec_meta *meta_list;
+       struct nvm_rq *rqd;
+       struct bio *bio;
+       void *data;
+       dma_addr_t dma_ppa_list, dma_meta_list;
+       u64 paddr;
+       int rq_ppas, rq_len;
+       int i, j;
+       int ret = 0;
+       int left_ppas = pblk_calc_sec_in_line(pblk, line);
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       ppa_list = p.ppa_list;
+       meta_list = p.meta_list;
+       rqd = p.rqd;
+       data = p.data;
+       dma_ppa_list = p.dma_ppa_list;
+       dma_meta_list = p.dma_meta_list;
+
+       *done = 1;
+
+next_rq:
+       memset(rqd, 0, pblk_r_rq_size);
+
+       rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+       if (!rq_ppas)
+               rq_ppas = pblk->min_write_pgs;
+       rq_len = rq_ppas * geo->sec_size;
+
+       bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
+       if (IS_ERR(bio))
+               return PTR_ERR(bio);
+
+       bio->bi_iter.bi_sector = 0; /* internal bio */
+       bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+       rqd->bio = bio;
+       rqd->opcode = NVM_OP_PREAD;
+       rqd->flags = pblk_set_read_mode(pblk);
+       rqd->meta_list = meta_list;
+       rqd->nr_ppas = rq_ppas;
+       rqd->ppa_list = ppa_list;
+       rqd->dma_ppa_list = dma_ppa_list;
+       rqd->dma_meta_list = dma_meta_list;
+       rqd->end_io = pblk_end_io_sync;
+       rqd->private = &wait;
+
+       /* Fill the request min_write_pgs addresses at a time */
+       for (i = 0; i < rqd->nr_ppas; ) {
+               struct ppa_addr ppa;
+               int pos;
+
+               paddr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
+               ppa = addr_to_gen_ppa(pblk, paddr, line->id);
+               pos = pblk_dev_ppa_to_pos(geo, ppa);
+
+               /* Advance past positions set in the line's block bitmap */
+               while (test_bit(pos, line->blk_bitmap)) {
+                       paddr += pblk->min_write_pgs;
+                       ppa = addr_to_gen_ppa(pblk, paddr, line->id);
+                       pos = pblk_dev_ppa_to_pos(geo, ppa);
+               }
+
+               for (j = 0; j < pblk->min_write_pgs; j++, i++, paddr++)
+                       rqd->ppa_list[i] =
+                               addr_to_gen_ppa(pblk, paddr, line->id);
+       }
+
+       ret = pblk_submit_io(pblk, rqd);
+       if (ret) {
+               pr_err("pblk: I/O submission failed: %d\n", ret);
+               bio_put(bio);
+               return ret;
+       }
+
+       if (!wait_for_completion_io_timeout(&wait,
+                               msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+               pr_err("pblk: L2P recovery read timed out\n");
+       }
+       reinit_completion(&wait);
+
+       /* Reached the end of the written line */
+       if (rqd->error) {
+               int nr_error_bits, bit;
+
+               /* Everything from the first flagged sector on is discarded */
+               bit = find_first_bit((void *)&rqd->ppa_status, rqd->nr_ppas);
+               nr_error_bits = rqd->nr_ppas - bit;
+
+               /* Roll back failed sectors */
+               line->cur_sec -= nr_error_bits;
+               line->left_msecs += nr_error_bits;
+               line->left_ssecs = line->left_msecs;
+               bitmap_clear(line->map_bitmap, line->cur_sec, nr_error_bits);
+
+               /* Ends the scan below; only the first 'bit' sectors of this
+                * chunk are mapped
+                */
+               left_ppas = 0;
+               rqd->nr_ppas = bit;
+
+               if (rqd->error != NVM_RSP_ERR_EMPTYPAGE)
+                       *done = 0;
+       }
+
+       /* Empty and out-of-range lbas are ignored */
+       for (i = 0; i < rqd->nr_ppas; i++) {
+               u64 lba = le64_to_cpu(meta_list[i].lba);
+
+               if (lba == ADDR_EMPTY || lba > pblk->rl.nr_secs)
+                       continue;
+
+               pblk_update_map(pblk, lba, rqd->ppa_list[i]);
+       }
+
+       left_ppas -= rq_ppas;
+       if (left_ppas > 0)
+               goto next_rq;
+
+       return ret;
+}
+
+/* Recover the L2P entries of @line from the lbas stored in the out-of-band
+ * area of its sectors.
+ *
+ * Allocates one request, one DMA buffer (metadata followed by the ppa list)
+ * and one data buffer, runs pblk_recov_scan_oob() and, if that scan reports
+ * it is not done, pblk_recov_scan_all_oob(). A line that ends up full is
+ * closed with pblk_line_recov_close().
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct pblk_recov_alloc p;
+       int scan_done;
+       int err;
+
+       p.rqd = pblk_alloc_rqd(pblk, READ);
+       if (IS_ERR(p.rqd))
+               return PTR_ERR(p.rqd);
+
+       p.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+                                                       &p.dma_meta_list);
+       if (!p.meta_list) {
+               err = -ENOMEM;
+               goto free_rqd;
+       }
+
+       /* The ppa list shares the DMA buffer, right after the metadata */
+       p.ppa_list = (void *)(p.meta_list) + pblk_dma_meta_size;
+       p.dma_ppa_list = p.dma_meta_list + pblk_dma_meta_size;
+
+       p.data = kcalloc(pblk->max_write_pgs, dev->geo.sec_size, GFP_KERNEL);
+       if (!p.data) {
+               err = -ENOMEM;
+               goto free_meta_list;
+       }
+
+       err = pblk_recov_scan_oob(pblk, line, p, &scan_done);
+       if (err) {
+               pr_err("pblk: could not recover L2P from OOB\n");
+               goto free_data;
+       }
+
+       if (!scan_done) {
+               err = pblk_recov_scan_all_oob(pblk, line, p);
+               if (err) {
+                       pr_err("pblk: could not recover L2P from OOB\n");
+                       goto free_data;
+               }
+       }
+
+       if (pblk_line_is_full(line))
+               pblk_line_recov_close(pblk, line);
+
+free_data:
+       kfree(p.data);
+free_meta_list:
+       nvm_dev_dma_free(dev->parent, p.meta_list, p.dma_meta_list);
+free_rqd:
+       pblk_free_rqd(pblk, p.rqd, READ);
+
+       return err;
+}
+
+/* Insert @line into the list at @head, keeping the list sorted by ascending
+ * sequence number (seq_nr)
+ */
+static void pblk_recov_line_add_ordered(struct list_head *head,
+                                       struct pblk_line *line)
+{
+       struct pblk_line *pos;
+
+       list_for_each_entry(pos, head, list) {
+               if (pos->seq_nr > line->seq_nr)
+                       break;
+       }
+
+       /* &pos->list is the first entry with a larger seq_nr, or the list
+        * head itself when the loop ran to the end: insert right before it
+        */
+       list_add_tail(&line->list, &pos->list);
+}
+
+/* Scan-based recovery of the L2P table.
+ *
+ * Reads the smeta of every line, keeps the lines that carry a valid CRC, the
+ * pblk magic, version 1 and the instance uuid (taken from the first valid
+ * line), and sorts them by sequence number. Each of those lines is then
+ * recovered from its emeta, falling back to the per-sector OOB metadata when
+ * emeta cannot be read or used. Full lines are closed and moved to a GC
+ * list; a line that is not full is kept as the open data line.
+ *
+ * Returns the last line found that is not full, NULL when there is none, or
+ * ERR_PTR(-EINVAL) when a line with an incompatible version is found.
+ */
+struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_line *line, *tline, *data_line = NULL;
+       struct line_smeta *smeta;
+       struct line_emeta *emeta;
+       int found_lines = 0, recovered_lines = 0, open_lines = 0;
+       int is_next = 0;
+       int meta_line;
+       int i, valid_uuid = 0;
+       LIST_HEAD(recov_list);
+
+       /* TODO: Implement FTL snapshot */
+
+       /* Scan recovery - takes place when FTL snapshot fails */
+       /* Reserve one metadata slot; its smeta/emeta buffers are reused for
+        * every line read below
+        */
+       spin_lock(&l_mg->free_lock);
+       meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+       set_bit(meta_line, &l_mg->meta_bitmap);
+       smeta = l_mg->sline_meta[meta_line].meta;
+       emeta = l_mg->eline_meta[meta_line].meta;
+       spin_unlock(&l_mg->free_lock);
+
+       /* Order data lines using their sequence number */
+       for (i = 0; i < l_mg->nr_lines; i++) {
+               u32 crc;
+
+               line = &pblk->lines[i];
+
+               memset(smeta, 0, lm->smeta_len);
+               line->smeta = smeta;
+               line->lun_bitmap = ((void *)(smeta)) +
+                                               sizeof(struct line_smeta);
+
+               /* Lines that cannot be read are assumed as not written here */
+               if (pblk_line_read_smeta(pblk, line))
+                       continue;
+
+               crc = pblk_calc_smeta_crc(pblk, smeta);
+               if (le32_to_cpu(smeta->crc) != crc)
+                       continue;
+
+               if (le32_to_cpu(smeta->header.identifier) != PBLK_MAGIC)
+                       continue;
+
+               if (le16_to_cpu(smeta->header.version) != 1) {
+                       /* The on-media field is little-endian: convert it so
+                        * the logged version is right on big-endian hosts too
+                        */
+                       pr_err("pblk: found incompatible line version %u\n",
+                                       le16_to_cpu(smeta->header.version));
+                       /* NOTE(review): meta_line stays set in meta_bitmap on
+                        * this return - confirm the caller tears down l_mg.
+                        */
+                       return ERR_PTR(-EINVAL);
+               }
+
+               /* The first valid instance uuid is used for initialization */
+               if (!valid_uuid) {
+                       memcpy(pblk->instance_uuid, smeta->header.uuid, 16);
+                       valid_uuid = 1;
+               }
+
+               if (memcmp(pblk->instance_uuid, smeta->header.uuid, 16)) {
+                       pr_debug("pblk: ignore line %u due to uuid mismatch\n",
+                                       i);
+                       continue;
+               }
+
+               /* Update line metadata */
+               spin_lock(&line->lock);
+               line->id = le32_to_cpu(line->smeta->header.id);
+               line->type = le16_to_cpu(line->smeta->header.type);
+               line->seq_nr = le64_to_cpu(line->smeta->seq_nr);
+               spin_unlock(&line->lock);
+
+               /* Update general metadata */
+               spin_lock(&l_mg->free_lock);
+               if (line->seq_nr >= l_mg->d_seq_nr)
+                       l_mg->d_seq_nr = line->seq_nr + 1;
+               l_mg->nr_free_lines--;
+               spin_unlock(&l_mg->free_lock);
+
+               if (pblk_line_recov_alloc(pblk, line))
+                       goto out;
+
+               pblk_recov_line_add_ordered(&recov_list, line);
+               found_lines++;
+               /* seq_nr is little-endian on media: convert before logging */
+               pr_debug("pblk: recovering data line %d, seq:%llu\n",
+                                               line->id,
+                                               le64_to_cpu(smeta->seq_nr));
+       }
+
+       if (!found_lines) {
+               /* Nothing to recover: set up a new instance uuid and release
+                * the metadata slot reserved above
+                */
+               pblk_setup_uuid(pblk);
+
+               spin_lock(&l_mg->free_lock);
+               WARN_ON_ONCE(!test_and_clear_bit(meta_line,
+                                                       &l_mg->meta_bitmap));
+               spin_unlock(&l_mg->free_lock);
+
+               goto out;
+       }
+
+       /* Verify closed blocks and recover this portion of L2P table*/
+       list_for_each_entry_safe(line, tline, &recov_list, list) {
+               int off, nr_bb;
+
+               recovered_lines++;
+               /* Calculate where emeta starts based on the line bb */
+               off = lm->sec_per_line - lm->emeta_sec;
+               nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
+               off -= nr_bb * geo->sec_per_pl;
+
+               memset(emeta, 0, lm->emeta_len);
+               line->emeta = emeta;
+               line->emeta_ssec = off;
+
+               /* Fall back to the OOB area when emeta cannot be read or used */
+               if (pblk_line_read_emeta(pblk, line)) {
+                       pblk_recov_l2p_from_oob(pblk, line);
+                       goto next;
+               }
+
+               if (pblk_recov_l2p_from_emeta(pblk, line))
+                       pblk_recov_l2p_from_oob(pblk, line);
+
+next:
+               if (pblk_line_is_full(line)) {
+                       struct list_head *move_list;
+
+                       spin_lock(&line->lock);
+                       line->state = PBLK_LINESTATE_CLOSED;
+                       move_list = pblk_line_gc_list(pblk, line);
+                       spin_unlock(&line->lock);
+
+                       spin_lock(&l_mg->gc_lock);
+                       list_move_tail(&line->list, move_list);
+                       spin_unlock(&l_mg->gc_lock);
+
+                       mempool_free(line->map_bitmap, pblk->line_meta_pool);
+                       line->map_bitmap = NULL;
+                       line->smeta = NULL;
+                       line->emeta = NULL;
+               } else {
+                       /* NOTE(review): this only fires once two open lines
+                        * have already been seen - confirm whether a single
+                        * open line is the intended limit.
+                        */
+                       if (open_lines > 1)
+                               pr_err("pblk: failed to recover L2P\n");
+
+                       open_lines++;
+                       line->meta_line = meta_line;
+                       data_line = line;
+               }
+       }
+
+       spin_lock(&l_mg->free_lock);
+       if (!open_lines) {
+               /* No open line kept the metadata slot: release it */
+               WARN_ON_ONCE(!test_and_clear_bit(meta_line,
+                                                       &l_mg->meta_bitmap));
+               pblk_line_replace_data(pblk);
+       } else {
+               /* Allocate next line for preparation */
+               l_mg->data_next = pblk_line_get(pblk);
+               if (l_mg->data_next) {
+                       l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
+                       l_mg->data_next->type = PBLK_LINETYPE_DATA;
+                       is_next = 1;
+               }
+       }
+       spin_unlock(&l_mg->free_lock);
+
+       /* Erase outside of free_lock */
+       if (is_next) {
+               pblk_line_erase(pblk, l_mg->data_next);
+               pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
+       }
+
+out:
+       if (found_lines != recovered_lines)
+               pr_err("pblk: failed to recover all found lines %d/%d\n",
+                                               found_lines, recovered_lines);
+
+       return data_line;
+}
+
+/*
+ * Pad the sectors left on the current data line (line->left_msecs) and close
+ * the line if the padding succeeded. Allocation failures make the function
+ * return without padding.
+ */
+void pblk_recov_pad(struct pblk *pblk)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_recov_alloc p;
+       struct pblk_line *line;
+
+       spin_lock(&l_mg->free_lock);
+       line = l_mg->data_line;
+       spin_unlock(&l_mg->free_lock);
+
+       p.rqd = pblk_alloc_rqd(pblk, READ);
+       if (IS_ERR(p.rqd))
+               return;
+
+       p.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+                                                       &p.dma_meta_list);
+       if (!p.meta_list)
+               goto free_rqd;
+
+       /* The ppa list shares the DMA buffer, right after the metadata */
+       p.ppa_list = (void *)(p.meta_list) + pblk_dma_meta_size;
+       p.dma_ppa_list = p.dma_meta_list + pblk_dma_meta_size;
+
+       p.data = kcalloc(pblk->max_write_pgs, dev->geo.sec_size, GFP_KERNEL);
+       if (!p.data)
+               goto free_meta_list;
+
+       if (pblk_recov_pad_oob(pblk, line, p, line->left_msecs))
+               pr_err("pblk: Tear down padding failed\n");
+       else
+               pblk_line_close(pblk, line);
+
+       kfree(p.data);
+free_meta_list:
+       nvm_dev_dma_free(dev->parent, p.meta_list, p.dma_meta_list);
+free_rqd:
+       pblk_free_rqd(pblk, p.rqd, READ);
+}
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c
new file mode 100644 (file)
index 0000000..ab7cbb1
--- /dev/null
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *                  Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * pblk-rl.c - pblk's rate limiter for user I/O
+ *
+ */
+
+#include "pblk.h"
+
+/* (Re)arm the user I/O timer so that it expires 5000 ms from now */
+static void pblk_rl_kick_u_timer(struct pblk_rl *rl)
+{
+       unsigned long expires = jiffies + msecs_to_jiffies(5000);
+
+       mod_timer(&rl->u_timer, expires);
+}
+
+/* Returns 1 if @nr_entries more user entries fit within the user budget
+ * (rb_user_max), 0 otherwise
+ */
+int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries)
+{
+       int user_entries = atomic_read(&rl->rb_user_cnt);
+
+       return user_entries + nr_entries <= rl->rb_user_max;
+}
+
+/* Returns 1 if GC may insert @nr_entries more entries, 0 otherwise. GC is
+ * only held to its budget (rb_gc_max) while user I/O is flagged as active.
+ */
+int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries)
+{
+       int gc_entries = atomic_read(&rl->rb_gc_cnt) + nr_entries;
+       int user_active = READ_ONCE(rl->rb_user_active);
+
+       /* If there is no user I/O let GC take over space on the write buffer */
+       if (!user_active)
+               return 1;
+
+       return gc_entries <= rl->rb_gc_max;
+}
+
+/* Account @nr_entries new user entries, flag user I/O as active and re-arm
+ * the timer that clears that flag again (see pblk_rl_u_timer())
+ */
+void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries)
+{
+       atomic_add(nr_entries, &rl->rb_user_cnt);
+
+       /* Release user I/O state. Protect from GC */
+       smp_store_release(&rl->rb_user_active, 1);
+       pblk_rl_kick_u_timer(rl);
+}
+
+/* Account @nr_entries new GC entries */
+void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries)
+{
+       atomic_add(nr_entries, &rl->rb_gc_cnt);
+}
+
+/* Remove @nr_user user entries and @nr_gc GC entries from the accounting */
+void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc)
+{
+       atomic_sub(nr_user, &rl->rb_user_cnt);
+       atomic_sub(nr_gc, &rl->rb_gc_cnt);
+}
+
+/* Current value of the free block counter.
+ * NOTE(review): the atomic counter is converted to unsigned long, so a
+ * negative count would show up as a huge value - confirm it cannot underflow.
+ */
+unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl)
+{
+       return atomic_read(&rl->free_blocks);
+}
+
+/*
+ * Recompute the user and GC shares of the write buffer budget (@max) from
+ * the total number of free blocks in the pblk instance.
+ *
+ * At or above the high threshold the user gets everything but the GC
+ * reserve. Below it the user share is derived from the free block count and
+ * GC gets the remainder, but never less than its reserve.
+ *
+ * Returns the new rate limiter state (PBLK_RL_HIGH, PBLK_RL_MID or
+ * PBLK_RL_LOW).
+ */
+static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max)
+{
+       unsigned long free_blocks = pblk_rl_nr_free_blks(rl);
+       int shift, user_windows, user_max, gc_max;
+
+       if (free_blocks >= rl->high) {
+               rl->rb_user_max = max - rl->rb_gc_rsv;
+               rl->rb_gc_max = rl->rb_gc_rsv;
+               rl->rb_state = PBLK_RL_HIGH;
+
+               return rl->rb_state;
+       }
+
+       /* Below the high threshold: the user share follows the free blocks */
+       shift = rl->high_pw - rl->rb_windows_pw;
+       user_windows = free_blocks >> shift;
+       user_max = user_windows << PBLK_MAX_REQ_ADDRS_PW;
+
+       rl->rb_user_max = user_max;
+       gc_max = max - rl->rb_user_max;
+       rl->rb_gc_max = max(gc_max, rl->rb_gc_rsv);
+
+       rl->rb_state = (free_blocks > rl->low) ? PBLK_RL_MID : PBLK_RL_LOW;
+
+       return rl->rb_state;
+}
+
+/* Set both the current GC budget and the GC reserve to @rsv */
+void pblk_rl_set_gc_rsc(struct pblk_rl *rl, int rsv)
+{
+       rl->rb_gc_max = rsv;
+       rl->rb_gc_rsv = rl->rb_gc_max;
+}
+
+/* Add the blocks of @line to the free block counter, refresh the rates and
+ * start or stop GC according to the resulting rate limiter state.
+ */
+void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line)
+{
+       struct pblk *pblk = container_of(rl, struct pblk, rl);
+       int blk_in_line = atomic_read(&line->blk_in_line);
+       int ret;
+
+       atomic_add(blk_in_line, &rl->free_blocks);
+       /* Rates will not change that often - no need to lock update */
+       ret = pblk_rl_update_rates(rl, rl->rb_budget);
+
+       /* pblk_rl_update_rates() returns exactly one state, so comparing it
+        * against (PBLK_RL_MID | PBLK_RL_LOW) never matched the intended
+        * "MID or LOW" case; test each state explicitly.
+        */
+       if (ret == PBLK_RL_MID || ret == PBLK_RL_LOW)
+               pblk_gc_should_start(pblk);
+       else
+               pblk_gc_should_stop(pblk);
+}
+
+/* Remove the blocks of @line from the free block counter, refresh the rates
+ * and start or stop GC according to the resulting rate limiter state.
+ */
+void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line)
+{
+       struct pblk *pblk = container_of(rl, struct pblk, rl);
+       int blk_in_line = atomic_read(&line->blk_in_line);
+       int ret;
+
+       atomic_sub(blk_in_line, &rl->free_blocks);
+
+       /* Rates will not change that often - no need to lock update */
+       ret = pblk_rl_update_rates(rl, rl->rb_budget);
+
+       /* pblk_rl_update_rates() returns exactly one state, so comparing it
+        * against (PBLK_RL_MID | PBLK_RL_LOW) never matched the intended
+        * "MID or LOW" case; test each state explicitly.
+        */
+       if (ret == PBLK_RL_MID || ret == PBLK_RL_LOW)
+               pblk_gc_should_start(pblk);
+       else
+               pblk_gc_should_stop(pblk);
+}
+
+/* Returns the high free-block threshold (rl->high) */
+int pblk_rl_gc_thrs(struct pblk_rl *rl)
+{
+       return rl->high;
+}
+
+/* Returns the current user budget (rb_user_max) */
+int pblk_rl_sysfs_rate_show(struct pblk_rl *rl)
+{
+       return rl->rb_user_max;
+}
+
+/* Timer callback: @data is the struct pblk_rl passed to setup_timer(). When
+ * the timer armed by pblk_rl_kick_u_timer() expires, user I/O is flagged as
+ * no longer active.
+ */
+static void pblk_rl_u_timer(unsigned long data)
+{
+       struct pblk_rl *rl = (struct pblk_rl *)data;
+
+       /* Clear the user I/O state; GC is then no longer held to its budget */
+       smp_store_release(&rl->rb_user_active, 0);
+}
+
+/* Tear down the rate limiter: deactivate the user I/O timer.
+ * NOTE(review): del_timer() does not wait for a running callback - confirm
+ * whether del_timer_sync() is needed on this teardown path.
+ */
+void pblk_rl_free(struct pblk_rl *rl)
+{
+       del_timer(&rl->u_timer);
+}
+
+/* Initialize the rate limiter with a write buffer budget of @budget entries.
+ *
+ * The high/low thresholds are derived from rl->total_blocks, which is read
+ * here and not set by this function. The whole budget starts out assigned to
+ * user I/O, GC gets none, and the user I/O timer is set up but not armed.
+ */
+void pblk_rl_init(struct pblk_rl *rl, int budget)
+{
+       unsigned int nr_windows;
+
+       /* Free block thresholds */
+       rl->high = rl->total_blocks / PBLK_USER_HIGH_THRS;
+       rl->low = rl->total_blocks / PBLK_USER_LOW_THRS;
+       rl->high_pw = get_count_order(rl->high);
+
+       /* This will always be a power-of-2 */
+       nr_windows = budget / PBLK_MAX_REQ_ADDRS;
+       rl->rb_windows_pw = get_count_order(nr_windows) + 1;
+
+       /* To start with, all buffer is available to user I/O writers */
+       rl->rb_budget = budget;
+       rl->rb_user_max = budget;
+       atomic_set(&rl->rb_user_cnt, 0);
+
+       /* GC starts with no budget and no entries */
+       rl->rb_gc_max = 0;
+       rl->rb_state = PBLK_RL_HIGH;
+       atomic_set(&rl->rb_gc_cnt, 0);
+
+       setup_timer(&rl->u_timer, pblk_rl_u_timer, (unsigned long)rl);
+       rl->rb_user_active = 0;
+}
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c
new file mode 100644 (file)
index 0000000..f0af1d1
--- /dev/null
@@ -0,0 +1,507 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *                  Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Implementation of a physical block-device target for Open-channel SSDs.
+ *
+ * pblk-sysfs.c - pblk's sysfs
+ *
+ */
+
+#include "pblk.h"
+
+/*
+ * Show one line per LUN of the target: its position in pblk->luns, its
+ * channel and LUN address, and whether its write semaphore is held.
+ *
+ * Each line has the form "pblk: pos:P, ch:C, lun:L - A", where A is 1 if
+ * wr_sem could not be taken without blocking and 0 otherwise.
+ *
+ * Returns the number of bytes stored in @page (never more than PAGE_SIZE).
+ */
+static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_lun *rlun;
+       ssize_t sz = 0;
+       int i;
+
+       for (i = 0; i < geo->nr_luns; i++) {
+               int active = 1;
+
+               rlun = &pblk->luns[i];
+               /* down_trylock() returns 0 when the semaphore was acquired */
+               if (!down_trylock(&rlun->wr_sem)) {
+                       active = 0;
+                       up(&rlun->wr_sem);
+               }
+
+               /*
+                * scnprintf() returns what was actually stored, so sz cannot
+                * move past the page. snprintf() returns the untruncated
+                * length instead; with enough LUNs sz would exceed PAGE_SIZE
+                * and PAGE_SIZE - sz would wrap to a huge size.
+                */
+               sz += scnprintf(page + sz, PAGE_SIZE - sz,
+                               "pblk: pos:%d, ch:%d, lun:%d - %d\n",
+                               i, rlun->bppa.g.ch, rlun->bppa.g.lun, active);
+       }
+
+       return sz;
+}
+
+/*
+ * Show a one-line snapshot of the rate limiter:
+ *
+ *   u:<user cnt>/<user max>,gc:<gc cnt>/<gc max>/<gc rsv>(<state>/<budget>)
+ *   (stop:<low,full:>high,free:<free blocks>/<total blocks>)-<user active>
+ *
+ * The values are sampled one by one without a common lock, so they are not
+ * guaranteed to be mutually consistent.
+ */
+static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page)
+{
+       struct nvm_geo *geo = &pblk->dev->geo;
+       struct pblk_rl *rl = &pblk->rl;
+       int free_blocks = atomic_read(&rl->free_blocks);
+       int rb_user_max = rl->rb_user_max;
+       int rb_user_cnt = atomic_read(&rl->rb_user_cnt);
+       int rb_gc_max = rl->rb_gc_max;
+       int rb_gc_rsv = rl->rb_gc_rsv;
+       int rb_gc_cnt = atomic_read(&rl->rb_gc_cnt);
+       int rb_budget = rl->rb_budget;
+       int rb_state = rl->rb_state;
+       int total_blocks = geo->blks_per_lun * geo->nr_luns;
+
+       return snprintf(page, PAGE_SIZE,
+               "u:%u/%u,gc:%u/%u/%u(%u/%u)(stop:<%u,full:>%u,free:%d/%d)-%d\n",
+               rb_user_cnt, rb_user_max,
+               rb_gc_cnt, rb_gc_max, rb_gc_rsv,
+               rb_state, rb_budget,
+               rl->low, rl->high,
+               free_blocks, total_blocks,
+               READ_ONCE(rl->rb_user_active));
+}
+
+/*
+ * Show the two GC flags reported by pblk_gc_sysfs_state_show() as
+ * "gc_enabled=E, gc_active=A".
+ */
+static ssize_t pblk_sysfs_gc_state_show(struct pblk *pblk, char *page)
+{
+       int enabled, active;
+
+       pblk_gc_sysfs_state_show(pblk, &enabled, &active);
+
+       return snprintf(page, PAGE_SIZE, "gc_enabled=%d, gc_active=%d\n",
+                       enabled, active);
+}
+
+/*
+ * Show the error counters kept in struct pblk (failed/high-ECC/empty reads,
+ * failed GC reads, failed writes and failed erases) on a single line.
+ */
+static ssize_t pblk_sysfs_stats(struct pblk *pblk, char *page)
+{
+       return snprintf(page, PAGE_SIZE,
+                       "read_failed=%lu, read_high_ecc=%lu, read_empty=%lu, read_failed_gc=%lu, write_failed=%lu, erase_failed=%lu\n",
+                       atomic_long_read(&pblk->read_failed),
+                       atomic_long_read(&pblk->read_high_ecc),
+                       atomic_long_read(&pblk->read_empty),
+                       atomic_long_read(&pblk->read_failed_gc),
+                       atomic_long_read(&pblk->write_failed),
+                       atomic_long_read(&pblk->erase_failed));
+}
+/* Show the write buffer state; formatting is delegated to pblk_rb_sysfs(). */
+static ssize_t pblk_sysfs_write_buffer(struct pblk *pblk, char *page)
+{
+       return pblk_rb_sysfs(&pblk->rwb, page);
+}
+
+/*
+ * Show the physical address format as two lines of offset/length pairs:
+ * "g:" for the format stored in pblk->ppaf (prefixed with its bit size) and
+ * "d:" for the format reported in the device geometry (geo->ppaf).
+ *
+ * Returns the number of bytes stored in @page.
+ */
+static ssize_t pblk_sysfs_ppaf(struct pblk *pblk, char *page)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       ssize_t sz;
+
+       /*
+        * scnprintf() returns the number of characters actually stored, which
+        * keeps PAGE_SIZE - sz from wrapping when the output is accumulated.
+        */
+       sz = scnprintf(page, PAGE_SIZE,
+               "g:(b:%d)blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n",
+               pblk->ppaf_bitsize,
+               pblk->ppaf.blk_offset, geo->ppaf.blk_len,
+               pblk->ppaf.pg_offset, geo->ppaf.pg_len,
+               pblk->ppaf.lun_offset, geo->ppaf.lun_len,
+               pblk->ppaf.ch_offset, geo->ppaf.ch_len,
+               pblk->ppaf.pln_offset, geo->ppaf.pln_len,
+               pblk->ppaf.sec_offset, geo->ppaf.sect_len);
+
+       sz += scnprintf(page + sz, PAGE_SIZE - sz,
+               "d:blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n",
+               geo->ppaf.blk_offset, geo->ppaf.blk_len,
+               geo->ppaf.pg_offset, geo->ppaf.pg_len,
+               geo->ppaf.lun_offset, geo->ppaf.lun_len,
+               geo->ppaf.ch_offset, geo->ppaf.ch_len,
+               geo->ppaf.pln_offset, geo->ppaf.pln_len,
+               geo->ppaf.sect_offset, geo->ppaf.sect_len);
+
+       return sz;
+}
+
+/*
+ * Walk one GC list and return the number of lines on it. Lines of type
+ * PBLK_LINETYPE_DATA and PBLK_LINETYPE_LOG are additionally added to *@d_cnt
+ * and *@l_cnt. The caller must hold the lock that protects @head.
+ */
+static int pblk_sysfs_count_gc_list(struct list_head *head, int *d_cnt,
+                                   int *l_cnt)
+{
+       struct pblk_line *line;
+       int nr = 0;
+
+       list_for_each_entry(line, head, list) {
+               if (line->type == PBLK_LINETYPE_DATA)
+                       (*d_cnt)++;
+               else if (line->type == PBLK_LINETYPE_LOG)
+                       (*l_cnt)++;
+               nr++;
+       }
+
+       return nr;
+}
+
+/*
+ * Show line management statistics in four lines:
+ *   - line geometry (LUNs, blocks and sectors per line),
+ *   - current data/log line ids and free/bad/corrupt/closed line counts,
+ *   - population of each GC list plus the number of in-flight GC requests,
+ *   - write progress of the current data line.
+ *
+ * A mismatch between l_mg->nr_free_lines and the length of the free list is
+ * reported with pr_err().
+ *
+ * Returns the number of bytes stored in @page.
+ */
+static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+       struct pblk_line *line;
+       ssize_t sz;
+       int nr_free_lines;
+       int cur_data, cur_log;
+       int free_line_cnt = 0, closed_line_cnt;
+       int d_line_cnt = 0, l_line_cnt = 0;
+       int gc_full, gc_high, gc_mid, gc_low, gc_empty;
+       int free = 0, bad = 0, cor = 0;
+       int msecs = 0, ssecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0;
+       int map_weight = 0, meta_weight = 0;
+
+       spin_lock(&l_mg->free_lock);
+       cur_data = (l_mg->data_line) ? l_mg->data_line->id : -1;
+       cur_log = (l_mg->log_line) ? l_mg->log_line->id : -1;
+       nr_free_lines = l_mg->nr_free_lines;
+
+       list_for_each_entry(line, &l_mg->free_list, list)
+               free_line_cnt++;
+       spin_unlock(&l_mg->free_lock);
+
+       spin_lock(&l_mg->gc_lock);
+       gc_full = pblk_sysfs_count_gc_list(&l_mg->gc_full_list,
+                                          &d_line_cnt, &l_line_cnt);
+       gc_high = pblk_sysfs_count_gc_list(&l_mg->gc_high_list,
+                                          &d_line_cnt, &l_line_cnt);
+       gc_mid = pblk_sysfs_count_gc_list(&l_mg->gc_mid_list,
+                                         &d_line_cnt, &l_line_cnt);
+       gc_low = pblk_sysfs_count_gc_list(&l_mg->gc_low_list,
+                                         &d_line_cnt, &l_line_cnt);
+       gc_empty = pblk_sysfs_count_gc_list(&l_mg->gc_empty_list,
+                                           &d_line_cnt, &l_line_cnt);
+
+       /* Every line found on a GC list counts as closed */
+       closed_line_cnt = gc_full + gc_high + gc_mid + gc_low + gc_empty;
+
+       /*
+        * NOTE(review): free_list was walked under free_lock above but is
+        * walked under gc_lock here, together with bad_list and
+        * corrupt_list -- confirm which lock protects these three lists.
+        */
+       list_for_each_entry(line, &l_mg->free_list, list)
+               free++;
+       list_for_each_entry(line, &l_mg->bad_list, list)
+               bad++;
+       list_for_each_entry(line, &l_mg->corrupt_list, list)
+               cor++;
+       spin_unlock(&l_mg->gc_lock);
+
+       spin_lock(&l_mg->free_lock);
+       if (l_mg->data_line) {
+               cur_sec = l_mg->data_line->cur_sec;
+               msecs = l_mg->data_line->left_msecs;
+               ssecs = l_mg->data_line->left_ssecs;
+               vsc = l_mg->data_line->vsc;
+               sec_in_line = l_mg->data_line->sec_in_line;
+               meta_weight = bitmap_weight(&l_mg->meta_bitmap,
+                                           PBLK_DATA_LINES);
+               map_weight = bitmap_weight(l_mg->data_line->map_bitmap,
+                                          lm->sec_per_line);
+       }
+       spin_unlock(&l_mg->free_lock);
+
+       if (nr_free_lines != free_line_cnt)
+               pr_err("pblk: corrupted free line list\n");
+
+       /*
+        * scnprintf() returns the number of characters actually stored, which
+        * keeps PAGE_SIZE - sz from wrapping while the output is accumulated.
+        */
+       sz = scnprintf(page, PAGE_SIZE,
+               "line: nluns:%d, nblks:%d, nsecs:%d\n",
+               geo->nr_luns, lm->blk_per_line, lm->sec_per_line);
+
+       sz += scnprintf(page + sz, PAGE_SIZE - sz,
+               "lines:d:%d,l:%d-f:%d(%d),b:%d,co:%d,c:%d(d:%d,l:%d)t:%d\n",
+               cur_data, cur_log,
+               free, nr_free_lines, bad, cor,
+               closed_line_cnt,
+               d_line_cnt, l_line_cnt,
+               l_mg->nr_lines);
+
+       sz += scnprintf(page + sz, PAGE_SIZE - sz,
+               "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, queue:%d\n",
+               gc_full, gc_high, gc_mid, gc_low, gc_empty,
+               atomic_read(&pblk->gc.inflight_gc));
+
+       sz += scnprintf(page + sz, PAGE_SIZE - sz,
+               "data (%d) cur:%d, left:%d/%d, vsc:%d, s:%d, map:%d/%d (%d)\n",
+               cur_data, cur_sec, msecs, ssecs, vsc, sec_in_line,
+               map_weight, lm->sec_per_line, meta_weight);
+
+       return sz;
+}
+
+/*
+ * Show the static line metadata layout taken from pblk->lm: smeta/emeta
+ * lengths and sectors, bitmap lengths, and blocks/sectors per line together
+ * with the device's sectors per block.
+ *
+ * Returns the number of bytes stored in @page.
+ */
+static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct nvm_geo *geo = &dev->geo;
+       struct pblk_line_meta *lm = &pblk->lm;
+       ssize_t sz;
+
+       /*
+        * scnprintf() returns the number of characters actually stored, which
+        * keeps PAGE_SIZE - sz from wrapping while the output is accumulated.
+        */
+       sz = scnprintf(page, PAGE_SIZE,
+                      "smeta - len:%d, secs:%d\n",
+                      lm->smeta_len, lm->smeta_sec);
+       sz += scnprintf(page + sz, PAGE_SIZE - sz,
+                       "emeta - len:%d, sec:%d, bb_start:%d\n",
+                       lm->emeta_len, lm->emeta_sec, lm->emeta_bb);
+       sz += scnprintf(page + sz, PAGE_SIZE - sz,
+                       "bitmap lengths: sec:%d, blk:%d, lun:%d\n",
+                       lm->sec_bitmap_len, lm->blk_bitmap_len,
+                       lm->lun_bitmap_len);
+       sz += scnprintf(page + sz, PAGE_SIZE - sz,
+                       "blk_line:%d, sec_line:%d, sec_blk:%d\n",
+                       lm->blk_per_line, lm->sec_per_line,
+                       geo->sec_per_blk);
+
+       return sz;
+}
+/*
+ * Debug-only (CONFIG_NVM_DEBUG) counters, printed tab-separated on one line
+ * in this order: inflight_writes, inflight_reads, req_writes, nr_flush,
+ * padded_writes, padded_wb, sub_writes, sync_writes, compl_writes,
+ * recov_writes, recov_gc_writes, recov_gc_reads, sync_reads.
+ */
+#ifdef CONFIG_NVM_DEBUG
+static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page)
+{
+       return snprintf(page, PAGE_SIZE,
+               "%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\n",
+                       atomic_long_read(&pblk->inflight_writes),
+                       atomic_long_read(&pblk->inflight_reads),
+                       atomic_long_read(&pblk->req_writes),
+                       atomic_long_read(&pblk->nr_flush),
+                       atomic_long_read(&pblk->padded_writes),
+                       atomic_long_read(&pblk->padded_wb),
+                       atomic_long_read(&pblk->sub_writes),
+                       atomic_long_read(&pblk->sync_writes),
+                       atomic_long_read(&pblk->compl_writes),
+                       atomic_long_read(&pblk->recov_writes),
+                       atomic_long_read(&pblk->recov_gc_writes),
+                       atomic_long_read(&pblk->recov_gc_reads),
+                       atomic_long_read(&pblk->sync_reads));
+}
+#endif
+
+/*
+ * Store handler for "gc_rl_max": parse an unsigned integer from @page and
+ * hand it to pblk_rl_set_gc_rsc() under gc->lock.
+ *
+ * The input must contain a newline within its first @len bytes.
+ *
+ * Returns @len on success, -EINVAL if the newline is missing or the value
+ * cannot be parsed.
+ */
+static ssize_t pblk_sysfs_rate_store(struct pblk *pblk, const char *page,
+                                    size_t len)
+{
+       struct pblk_gc *gc = &pblk->gc;
+       size_t c_len;
+       /* kstrtouint() stores through an unsigned int pointer */
+       unsigned int value;
+
+       c_len = strcspn(page, "\n");
+       if (c_len >= len)
+               return -EINVAL;
+
+       if (kstrtouint(page, 0, &value))
+               return -EINVAL;
+
+       spin_lock(&gc->lock);
+       pblk_rl_set_gc_rsc(&pblk->rl, value);
+       spin_unlock(&gc->lock);
+
+       return len;
+}
+
+/*
+ * Store handler for "gc_force": accept 0 or 1 and pass it to
+ * pblk_gc_sysfs_force().
+ *
+ * The input must contain a newline within its first @len bytes.
+ *
+ * Returns @len on success, -EINVAL if the newline is missing, the value
+ * cannot be parsed, or it is greater than 1.
+ */
+static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page,
+                                  size_t len)
+{
+       size_t c_len;
+       /* kstrtouint() stores through an unsigned int pointer */
+       unsigned int force;
+
+       c_len = strcspn(page, "\n");
+       if (c_len >= len)
+               return -EINVAL;
+
+       if (kstrtouint(page, 0, &force))
+               return -EINVAL;
+
+       /* Unsigned, so only the upper bound needs checking */
+       if (force > 1)
+               return -EINVAL;
+
+       pblk_gc_sysfs_force(pblk, force);
+
+       return len;
+}
+/*
+ * sysfs attributes of the pblk kobject. The 0444 attributes are served by
+ * pblk_sysfs_show() and the 0200 ones (gc_force, gc_rl_max) by
+ * pblk_sysfs_store(); both dispatch on the attribute name.
+ */
+static struct attribute sys_write_luns = {
+       .name = "write_luns",
+       .mode = 0444,
+};
+
+static struct attribute sys_rate_limiter_attr = {
+       .name = "rate_limiter",
+       .mode = 0444,
+};
+
+static struct attribute sys_gc_state = {
+       .name = "gc_state",
+       .mode = 0444,
+};
+
+static struct attribute sys_errors_attr = {
+       .name = "errors",
+       .mode = 0444,
+};
+
+static struct attribute sys_rb_attr = {
+       .name = "write_buffer",
+       .mode = 0444,
+};
+
+static struct attribute sys_stats_ppaf_attr = {
+       .name = "ppa_format",
+       .mode = 0444,
+};
+
+static struct attribute sys_lines_attr = {
+       .name = "lines",
+       .mode = 0444,
+};
+
+static struct attribute sys_lines_info_attr = {
+       .name = "lines_info",
+       .mode = 0444,
+};
+
+static struct attribute sys_gc_force = {
+       .name = "gc_force",
+       .mode = 0200,
+};
+
+static struct attribute sys_gc_rl_max = {
+       .name = "gc_rl_max",
+       .mode = 0200,
+};
+
+#ifdef CONFIG_NVM_DEBUG
+static struct attribute sys_stats_debug_attr = {
+       .name = "stats",
+       .mode = 0444,
+};
+#endif
+/* NULL-terminated list installed as default_attrs of pblk_ktype */
+static struct attribute *pblk_attrs[] = {
+       &sys_write_luns,
+       &sys_rate_limiter_attr,
+       &sys_errors_attr,
+       &sys_gc_state,
+       &sys_gc_force,
+       &sys_gc_rl_max,
+       &sys_rb_attr,
+       &sys_stats_ppaf_attr,
+       &sys_lines_attr,
+       &sys_lines_info_attr,
+#ifdef CONFIG_NVM_DEBUG
+       &sys_stats_debug_attr,
+#endif
+       NULL,
+};
+
+static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr,
+                              char *buf)
+{
+       struct pblk *pblk = container_of(kobj, struct pblk, kobj);
+
+       if (strcmp(attr->name, "rate_limiter") == 0)
+               return pblk_sysfs_rate_limiter(pblk, buf);
+       else if (strcmp(attr->name, "write_luns") == 0)
+               return pblk_sysfs_luns_show(pblk, buf);
+       else if (strcmp(attr->name, "gc_state") == 0)
+               return pblk_sysfs_gc_state_show(pblk, buf);
+       else if (strcmp(attr->name, "errors") == 0)
+               return pblk_sysfs_stats(pblk, buf);
+       else if (strcmp(attr->name, "write_buffer") == 0)
+               return pblk_sysfs_write_buffer(pblk, buf);
+       else if (strcmp(attr->name, "ppa_format") == 0)
+               return pblk_sysfs_ppaf(pblk, buf);
+       else if (strcmp(attr->name, "lines") == 0)
+               return pblk_sysfs_lines(pblk, buf);
+       else if (strcmp(attr->name, "lines_info") == 0)
+               return pblk_sysfs_lines_info(pblk, buf);
+#ifdef CONFIG_NVM_DEBUG
+       else if (strcmp(attr->name, "stats") == 0)
+               return pblk_sysfs_stats_debug(pblk, buf);
+#endif
+       return 0;
+}
+
+/*
+ * sysfs ->store() entry point: look up the handler by attribute name and
+ * pass it the user buffer. Returns the handler's result, or 0 for an unknown
+ * name.
+ */
+static ssize_t pblk_sysfs_store(struct kobject *kobj, struct attribute *attr,
+                               const char *buf, size_t len)
+{
+       struct pblk *pblk = container_of(kobj, struct pblk, kobj);
+       const char *name = attr->name;
+
+       if (!strcmp(name, "gc_rl_max"))
+               return pblk_sysfs_rate_store(pblk, buf, len);
+       if (!strcmp(name, "gc_force"))
+               return pblk_sysfs_gc_force(pblk, buf, len);
+
+       return 0;
+}
+/*
+ * kobject type of pblk->kobj: routes sysfs reads and writes to
+ * pblk_sysfs_show()/pblk_sysfs_store() and installs pblk_attrs as the
+ * default attributes. No ->release() callback is set here.
+ */
+static const struct sysfs_ops pblk_sysfs_ops = {
+       .show = pblk_sysfs_show,
+       .store = pblk_sysfs_store,
+};
+
+static struct kobj_type pblk_ktype = {
+       .sysfs_ops      = &pblk_sysfs_ops,
+       .default_attrs  = pblk_attrs,
+};
+
+/*
+ * Register the "pblk" kobject below the target disk's device and announce
+ * it with a KOBJ_ADD uevent.
+ *
+ * Returns 0 on success or the error from kobject_init_and_add().
+ */
+int pblk_sysfs_init(struct gendisk *tdisk)
+{
+       struct pblk *pblk = tdisk->private_data;
+       struct device *parent_dev = disk_to_dev(pblk->disk);
+       int ret;
+
+       /*
+        * kobject_add() takes its own reference on the parent and drops it
+        * again in kobject_del(); an additional kobject_get() here would
+        * never be released.
+        */
+       ret = kobject_init_and_add(&pblk->kobj, &pblk_ktype,
+                                  &parent_dev->kobj, "%s", "pblk");
+       if (ret) {
+               pr_err("pblk: could not register %s/pblk\n",
+                                               tdisk->disk_name);
+               /* A failed kobject_init_and_add() must be undone by a put */
+               kobject_put(&pblk->kobj);
+               return ret;
+       }
+
+       kobject_uevent(&pblk->kobj, KOBJ_ADD);
+       return 0;
+}
+
+/*
+ * Tear down the "pblk" kobject: emit KOBJ_REMOVE, unlink it from sysfs and
+ * drop the reference held on it.
+ */
+void pblk_sysfs_exit(struct gendisk *tdisk)
+{
+       struct pblk *pblk = tdisk->private_data;
+       struct kobject *kobj = &pblk->kobj;
+
+       kobject_uevent(kobj, KOBJ_REMOVE);
+       kobject_del(kobj);
+       kobject_put(kobj);
+}
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
new file mode 100644 (file)
index 0000000..aef6fd7
--- /dev/null
@@ -0,0 +1,414 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *                  Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * pblk-write.c - pblk's write path from write buffer to media
+ */
+
+#include "pblk.h"
+
+/*
+ * Account for one sector of @line having been synced. When the last
+ * outstanding sector completes (left_ssecs reaches zero), schedule
+ * pblk_line_close_ws() for the line.
+ */
+static void pblk_sync_line(struct pblk *pblk, struct pblk_line *line)
+{
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_inc(&pblk->sync_writes);
+#endif
+
+       /* Counter protected by rb sync lock */
+       if (--line->left_ssecs)
+               return;
+
+       pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws);
+}
+
+static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
+                                   struct pblk_c_ctx *c_ctx)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+       struct bio *original_bio;
+       unsigned long ret;
+       int i;
+
+       for (i = 0; i < c_ctx->nr_valid; i++) {
+               struct pblk_w_ctx *w_ctx;
+               struct ppa_addr p;
+               struct pblk_line *line;
+
+               w_ctx = pblk_rb_w_ctx(&pblk->rwb, c_ctx->sentry + i);
+
+               p = rqd->ppa_list[i];
+               line = &pblk->lines[pblk_dev_ppa_to_line(p)];
+               pblk_sync_line(pblk, line);
+
+               while ((original_bio = bio_list_pop(&w_ctx->bios)))
+                       bio_endio(original_bio);
+       }
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_add(c_ctx->nr_valid, &pblk->compl_writes);
+#endif
+
+       ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid);
+
+       if (rqd->meta_list)
+               nvm_dev_dma_free(dev->parent, rqd->meta_list,
+                                                       rqd->dma_meta_list);
+
+       bio_put(rqd->bio);
+       pblk_free_rqd(pblk, rqd, WRITE);
+
+       return ret;
+}
+/*
+ * Finish a request that had been parked on pblk->compl_list: unlink its
+ * completion context from the list, then complete it via pblk_end_w_bio().
+ */
+static unsigned long pblk_end_queued_w_bio(struct pblk *pblk,
+                                          struct nvm_rq *rqd,
+                                          struct pblk_c_ctx *c_ctx)
+{
+       list_del(&c_ctx->list);
+       return pblk_end_w_bio(pblk, rqd, c_ctx);
+}
+
+static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd,
+                               struct pblk_c_ctx *c_ctx)
+{
+       struct pblk_c_ctx *c, *r;
+       unsigned long flags;
+       unsigned long pos;
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes);
+#endif
+
+       pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap);
+
+       pos = pblk_rb_sync_init(&pblk->rwb, &flags);
+       if (pos == c_ctx->sentry) {
+               pos = pblk_end_w_bio(pblk, rqd, c_ctx);
+
+retry:
+               list_for_each_entry_safe(c, r, &pblk->compl_list, list) {
+                       rqd = nvm_rq_from_c_ctx(c);
+                       if (c->sentry == pos) {
+                               pos = pblk_end_queued_w_bio(pblk, rqd, c);
+                               goto retry;
+                       }
+               }
+       } else {
+               WARN_ON(nvm_rq_from_c_ctx(c_ctx) != rqd);
+               list_add_tail(&c_ctx->list, &pblk->compl_list);
+       }
+       pblk_rb_sync_end(&pblk->rwb, &flags);
+}
+
+/* When a write fails, we are not sure whether the block has grown bad or a page
+ * range is more susceptible to write errors. If a high number of pages fail, we
+ * assume that the block is bad and we mark it accordingly. In all cases, we
+ * remap and resubmit the failed entries as fast as possible; if a flush is
+ * waiting on a completion, the whole stack would stall otherwise.
+ *
+ * The failed sectors are identified by the bits set in rqd->ppa_status. Their
+ * buffer entries are collected in a recovery context that is handed to
+ * pblk_submit_rec() on pblk->kw_wq. Whether or not recovery could be set up,
+ * the original request is always completed through pblk_complete_write().
+ */
+static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
+{
+       void *comp_bits = &rqd->ppa_status;
+       struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+       struct pblk_rec_ctx *recovery;
+       struct ppa_addr *ppa_list = rqd->ppa_list;
+       int nr_ppas = rqd->nr_ppas;
+       unsigned int c_entries;
+       int bit, ret;
+
+       /* Single-sector requests carry their address in rqd->ppa_addr */
+       if (unlikely(nr_ppas == 1))
+               ppa_list = &rqd->ppa_addr;
+
+       recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);
+       if (!recovery) {
+               pr_err("pblk: could not allocate recovery context\n");
+               /*
+                * Still complete the request, like every other failure
+                * branch below; returning here would skip
+                * pblk_complete_write() and the request would never be
+                * completed.
+                */
+               goto out;
+       }
+       INIT_LIST_HEAD(&recovery->failed);
+
+       bit = -1;
+       while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) {
+               struct pblk_rb_entry *entry;
+               struct ppa_addr ppa;
+
+               /* Logic error */
+               if (bit > c_ctx->nr_valid) {
+                       WARN_ONCE(1, "pblk: corrupted write request\n");
+                       mempool_free(recovery, pblk->rec_pool);
+                       goto out;
+               }
+
+               ppa = ppa_list[bit];
+               entry = pblk_rb_sync_scan_entry(&pblk->rwb, &ppa);
+               if (!entry) {
+                       pr_err("pblk: could not scan entry on write failure\n");
+                       mempool_free(recovery, pblk->rec_pool);
+                       goto out;
+               }
+
+               /* The list is filled first and emptied afterwards. No need for
+                * protecting it with a lock
+                */
+               list_add_tail(&entry->index, &recovery->failed);
+       }
+
+       /* Index of the first failed sector in the request */
+       c_entries = find_first_bit(comp_bits, nr_ppas);
+       ret = pblk_recov_setup_rq(pblk, c_ctx, recovery, comp_bits, c_entries);
+       if (ret) {
+               pr_err("pblk: could not recover from write failure\n");
+               mempool_free(recovery, pblk->rec_pool);
+               goto out;
+       }
+
+       INIT_WORK(&recovery->ws_rec, pblk_submit_rec);
+       queue_work(pblk->kw_wq, &recovery->ws_rec);
+
+out:
+       pblk_complete_write(pblk, rqd, c_ctx);
+}
+
+static void pblk_end_io_write(struct nvm_rq *rqd)
+{
+       struct pblk *pblk = rqd->private;
+       struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+
+       if (rqd->error) {
+               pblk_log_write_err(pblk, rqd);
+               return pblk_end_w_fail(pblk, rqd);
+       }
+#ifdef CONFIG_NVM_DEBUG
+       else
+               WARN_ONCE(rqd->bio->bi_error, "pblk: corrupted write error\n");
+#endif
+
+       pblk_complete_write(pblk, rqd, c_ctx);
+}
+
+/*
+ * Fill in the generic fields of a write request for @nr_secs sectors and
+ * allocate its DMA metadata buffer.
+ *
+ * For requests of more than one sector the PPA list is placed inside the
+ * same DMA buffer, pblk_dma_meta_size bytes after its start. Single-sector
+ * requests get no PPA list.
+ *
+ * Returns 0 on success, -ENOMEM if the DMA buffer cannot be allocated.
+ */
+static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
+                          unsigned int nr_secs)
+{
+       struct nvm_tgt_dev *dev = pblk->dev;
+
+       /* Setup write request */
+       rqd->private = pblk;
+       rqd->end_io = pblk_end_io_write;
+       rqd->opcode = NVM_OP_PWRITE;
+       rqd->flags = pblk_set_progr_mode(pblk, WRITE);
+       rqd->nr_ppas = nr_secs;
+
+       rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+                                          &rqd->dma_meta_list);
+       if (!rqd->meta_list)
+               return -ENOMEM;
+
+       if (likely(nr_secs != 1)) {
+               rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
+               rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
+       }
+
+       return 0;
+}
+
+/*
+ * Prepare a write request: allocate its LUN bitmap and DMA buffers and map
+ * the buffer entries starting at c_ctx->sentry to physical addresses.
+ *
+ * If the next data line still has blocks left to erase, the mapping step may
+ * also pick one block of that line; an asynchronous erase is then submitted
+ * for it. Should that submission fail, the erase accounting of the line is
+ * rolled back.
+ *
+ * Returns 0 on success or -ENOMEM if an allocation fails. On failure nothing
+ * allocated here is left behind and c_ctx->lun_bitmap is not modified.
+ */
+static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
+                          struct pblk_c_ctx *c_ctx)
+{
+       struct pblk_line_meta *lm = &pblk->lm;
+       struct pblk_line *e_line = pblk_line_get_data_next(pblk);
+       struct ppa_addr erase_ppa;
+       unsigned int valid = c_ctx->nr_valid;
+       unsigned int padded = c_ctx->nr_padded;
+       unsigned int nr_secs = valid + padded;
+       unsigned long *lun_bitmap;
+       int ret;
+
+       /*
+        * The allocation failures return before any mapping took place, so
+        * there is never an erase to submit for them. (Jumping to the erase
+        * handling from there would read erase_ppa before it is initialised.)
+        */
+       lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
+       if (!lun_bitmap)
+               return -ENOMEM;
+
+       ret = pblk_alloc_w_rq(pblk, rqd, nr_secs);
+       if (ret) {
+               kfree(lun_bitmap);
+               return ret;
+       }
+
+       /* Publish the bitmap only once nothing can fail any more */
+       c_ctx->lun_bitmap = lun_bitmap;
+
+       ppa_set_empty(&erase_ppa);
+       if (likely(!e_line || !atomic_read(&e_line->left_eblks)))
+               pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, 0);
+       else
+               pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
+                                                       valid, &erase_ppa);
+
+       if (unlikely(e_line && !ppa_empty(erase_ppa))) {
+               if (pblk_blk_erase_async(pblk, erase_ppa)) {
+                       struct nvm_tgt_dev *dev = pblk->dev;
+                       struct nvm_geo *geo = &dev->geo;
+                       int bit;
+
+                       /* Erase not submitted: undo the line's accounting */
+                       atomic_inc(&e_line->left_eblks);
+                       bit = erase_ppa.g.lun * geo->nr_chnls + erase_ppa.g.ch;
+                       WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap));
+                       up(&pblk->erase_sem);
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Prepare a recovery write request: allocate its LUN bitmap and DMA buffers
+ * and map the c_ctx->nr_valid entries starting at c_ctx->sentry.
+ *
+ * rqd->nr_ppas must already be set by the caller.
+ *
+ * Returns 0 on success or -ENOMEM if an allocation fails. On failure nothing
+ * allocated here is left behind and c_ctx->lun_bitmap is not modified.
+ */
+int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
+                       struct pblk_c_ctx *c_ctx)
+{
+       struct pblk_line_meta *lm = &pblk->lm;
+       unsigned long *lun_bitmap;
+       int ret;
+
+       lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
+       if (!lun_bitmap)
+               return -ENOMEM;
+
+       ret = pblk_alloc_w_rq(pblk, rqd, rqd->nr_ppas);
+       if (ret) {
+               /* Do not leak the bitmap, same as in pblk_setup_w_rq() */
+               kfree(lun_bitmap);
+               return ret;
+       }
+
+       c_ctx->lun_bitmap = lun_bitmap;
+
+       pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0);
+
+       rqd->ppa_status = (u64)0;
+       rqd->flags = pblk_set_progr_mode(pblk, WRITE);
+
+       return ret;
+}
+
+static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
+                                 unsigned int secs_to_flush)
+{
+       int secs_to_sync;
+
+       secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush);
+
+#ifdef CONFIG_NVM_DEBUG
+       if ((!secs_to_sync && secs_to_flush)
+                       || (secs_to_sync < 0)
+                       || (secs_to_sync > secs_avail && !secs_to_flush)) {
+               pr_err("pblk: bad sector calculation (a:%d,s:%d,f:%d)\n",
+                               secs_avail, secs_to_sync, secs_to_flush);
+       }
+#endif
+
+       return secs_to_sync;
+}
+/*
+ * Build one write request from the entries pending in the write buffer and
+ * submit it to the device.
+ *
+ * Returns 0 when a request was submitted. Returns 1 when nothing was
+ * submitted: the buffer holds no readable sectors, fewer than min_write_pgs
+ * sectors are available while secs_to_flush is zero, or an allocation, setup
+ * or submission step failed.
+ *
+ * NOTE(review): the fail_free_bio path drops the bio and the request but not
+ * what pblk_setup_w_rq() allocated (rqd->meta_list, c_ctx->lun_bitmap) --
+ * confirm whether those are released elsewhere when pblk_submit_io() fails.
+ */
+static int pblk_submit_write(struct pblk *pblk)
+{
+       struct bio *bio;
+       struct nvm_rq *rqd;
+       struct pblk_c_ctx *c_ctx;
+       unsigned int pgs_read;
+       unsigned int secs_avail, secs_to_sync, secs_to_com;
+       unsigned int secs_to_flush;
+       unsigned long pos;
+       int err;
+
+       /* If there are no sectors in the cache, flushes (bios without data)
+        * will be cleared on the cache threads
+        */
+       secs_avail = pblk_rb_read_count(&pblk->rwb);
+       if (!secs_avail)
+               return 1;
+
+       secs_to_flush = pblk_rb_sync_point_count(&pblk->rwb);
+       if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
+               return 1;
+
+       rqd = pblk_alloc_rqd(pblk, WRITE);
+       if (IS_ERR(rqd)) {
+               pr_err("pblk: cannot allocate write req.\n");
+               return 1;
+       }
+       c_ctx = nvm_rq_to_pdu(rqd);
+
+       bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs);
+       if (!bio) {
+               pr_err("pblk: cannot allocate write bio\n");
+               goto fail_free_rqd;
+       }
+       bio->bi_iter.bi_sector = 0; /* internal bio */
+       bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+       rqd->bio = bio;
+
+       secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
+       if (secs_to_sync > pblk->max_write_pgs) {
+               pr_err("pblk: bad buffer sync calculation\n");
+               goto fail_put_bio;
+       }
+
+       secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync;
+       pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
+
+       pgs_read = pblk_rb_read_to_bio(&pblk->rwb, bio, c_ctx, pos,
+                                               secs_to_sync, secs_avail);
+       if (!pgs_read) {
+               pr_err("pblk: corrupted write bio\n");
+               goto fail_put_bio;
+       }
+
+       if (c_ctx->nr_padded)
+               if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, c_ctx->nr_padded))
+                       goto fail_put_bio;
+
+       /* Assign lbas to ppas and populate request structure */
+       err = pblk_setup_w_rq(pblk, rqd, c_ctx);
+       if (err) {
+               pr_err("pblk: could not setup write request\n");
+               goto fail_free_bio;
+       }
+
+       err = pblk_submit_io(pblk, rqd);
+       if (err) {
+               pr_err("pblk: I/O submission failed: %d\n", err);
+               goto fail_free_bio;
+       }
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_long_add(secs_to_sync, &pblk->sub_writes);
+#endif
+
+       return 0;
+
+fail_free_bio:
+       if (c_ctx->nr_padded)
+               pblk_bio_free_pages(pblk, bio, secs_to_sync, c_ctx->nr_padded);
+fail_put_bio:
+       bio_put(bio);
+fail_free_rqd:
+       pblk_free_rqd(pblk, rqd, WRITE);
+
+       return 1;
+}
+/*
+ * Writer kthread body: keep calling pblk_submit_write() for as long as it
+ * submits requests, and sleep (TASK_INTERRUPTIBLE + io_schedule()) whenever
+ * it reports that nothing was submitted. Returns 0 once
+ * kthread_should_stop() is true.
+ *
+ * NOTE(review): the task state is only set after pblk_submit_write() has
+ * returned, so a wake-up arriving in between appears to be lost until the
+ * next one -- confirm that the wakers tolerate this.
+ */
+int pblk_write_ts(void *data)
+{
+       struct pblk *pblk = data;
+
+       while (!kthread_should_stop()) {
+               if (!pblk_submit_write(pblk))
+                       continue;
+               set_current_state(TASK_INTERRUPTIBLE);
+               io_schedule();
+       }
+
+       return 0;
+}
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
new file mode 100644 (file)
index 0000000..99f3186
--- /dev/null
@@ -0,0 +1,1121 @@
+/*
+ * Copyright (C) 2015 IT University of Copenhagen (rrpc.h)
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Matias Bjorling <matias@cnexlabs.com>
+ * Write buffering: Javier Gonzalez <javier@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Implementation of a Physical Block-device target for Open-channel SSDs.
+ *
+ */
+
+#ifndef PBLK_H_
+#define PBLK_H_
+
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/bio.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/vmalloc.h>
+#include <linux/crc32.h>
+#include <linux/uuid.h>
+
+#include <linux/lightnvm.h>
+
+/* Run only GC if less than 1/X blocks are free */
+#define GC_LIMIT_INVERSE 5
+#define GC_TIME_MSECS 1000
+
+#define PBLK_SECTOR (512)
+#define PBLK_EXPOSED_PAGE_SIZE (4096)
+#define PBLK_MAX_REQ_ADDRS (64)
+#define PBLK_MAX_REQ_ADDRS_PW (6)
+
+#define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16)
+
+#define PBLK_COMMAND_TIMEOUT_MS 30000
+
+/* Max 512 LUNs per device */
+#define PBLK_MAX_LUNS_BITMAP (4)
+
+#define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR)
+
+#define pblk_for_each_lun(pblk, rlun, i) \
+               for ((i) = 0, rlun = &(pblk)->luns[0]; \
+                       (i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)])
+
+#define ERASE 2 /* READ = 0, WRITE = 1 */
+
+enum {
+       /* IO types (bits 0-1) */
+       PBLK_IOTYPE_USER        = 1 << 0,
+       PBLK_IOTYPE_GC          = 1 << 1,
+
+       /* Write buffer flags (bits 2-5, disjoint from the IO types) */
+       PBLK_FLUSH_ENTRY        = 1 << 2,
+       PBLK_WRITTEN_DATA       = 1 << 3,
+       PBLK_SUBMITTED_ENTRY    = 1 << 4,
+       PBLK_WRITABLE_ENTRY     = 1 << 5,
+};
+
+enum {         /* presumably block states, going by the prefix - confirm */
+       PBLK_BLK_ST_OPEN =      0x1,
+       PBLK_BLK_ST_CLOSED =    0x2,
+};
+
+/* The number of GC lists and the rate-limiter states go together. This way the
+ * rate-limiter can dictate how much GC is needed based on resource utilization.
+ */
+#define PBLK_NR_GC_LISTS 3
+#define PBLK_MAX_GC_JOBS 32
+
+enum {         /* Rate-limiter states; three, like PBLK_NR_GC_LISTS */
+       PBLK_RL_HIGH = 1,
+       PBLK_RL_MID = 2,
+       PBLK_RL_LOW = 3,
+};
+
+struct pblk_sec_meta { /* pblk_dma_meta_size = PBLK_MAX_REQ_ADDRS of these */
+       u64 reserved;
+       __le64 lba;             /* lba, stored little-endian */
+};
+
+#define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS)
+
+/* Write completion context; laid out after the nvm_rq (pblk_w_rq_size) */
+struct pblk_c_ctx {
+       struct list_head list;          /* Head for out-of-order completion */
+
+       unsigned long *lun_bitmap;      /* Luns used on current request */
+       unsigned int sentry;            /* presumably first entry - confirm */
+       unsigned int nr_valid;          /* presumably data entries - confirm */
+       unsigned int nr_padded;         /* Pages added to the bio as padding */
+};
+
+/* Read context; pblk_r_rq_size accounts for one next to the nvm_rq */
+struct pblk_r_ctx {
+       struct bio *orig_bio;   /* presumably the submitter's bio - confirm */
+};
+
+/* Recovery context; handed to pblk_recov_setup_rq() as @recovery */
+struct pblk_rec_ctx {
+       struct pblk *pblk;
+       struct nvm_rq *rqd;
+       struct list_head failed;
+       struct work_struct ws_rec;      /* presumably pblk_submit_rec() work */
+};
+
+/* Write context; one is embedded in every pblk_rb_entry (w_ctx) */
+struct pblk_w_ctx {
+       struct bio_list bios;           /* Original bios - used for completion
+                                        * in REQ_FUA, REQ_FLUSH case
+                                        */
+       u64 lba;                        /* Logical addr. associated with entry */
+       struct ppa_addr ppa;            /* Physical addr. associated with entry */
+       int flags;                      /* Write context flags */
+};
+
+struct pblk_rb_entry {         /* One element of pblk_rb.entries */
+       struct ppa_addr cacheline;      /* Cacheline for this entry */
+       void *data;                     /* Pointer to data on this entry */
+       struct pblk_w_ctx w_ctx;        /* Context for this entry */
+       struct list_head index;         /* List head to enable indexes */
+};
+
+#define EMPTY_ENTRY (~0U)
+
+struct pblk_rb_pages { /* presumably a node of pblk_rb.pages - confirm */
+       struct page *pages;
+       int order;              /* presumably allocation order of @pages */
+       struct list_head list;
+};
+
+struct pblk_rb {               /* Write buffer; embedded in pblk as rwb */
+       struct pblk_rb_entry *entries;  /* Ring buffer entries */
+       unsigned int mem;               /* Write offset - points to next
+                                        * writable entry in memory
+                                        */
+       unsigned int subm;              /* Read offset - points to last entry
+                                        * that has been submitted to the media
+                                        * to be persisted
+                                        */
+       unsigned int sync;              /* Synced - backpointer that signals
+                                        * the last submitted entry that has
+                                        * been successfully persisted to media
+                                        */
+       unsigned int sync_point;        /* Sync point - last entry that must be
+                                        * flushed to the media. Used with
+                                        * REQ_FLUSH and REQ_FUA
+                                        */
+       unsigned int l2p_update;        /* l2p update point - next entry for
+                                        * which l2p mapping will be updated to
+                                        * contain a device ppa address (instead
+                                        * of a cacheline)
+                                        */
+       unsigned int nr_entries;        /* Number of entries in write buffer -
+                                        * must be a power of two
+                                        */
+       unsigned int seg_size;          /* Size of the data segments being
+                                        * stored on each entry. Typically this
+                                        * will be 4KB
+                                        */
+
+       struct list_head pages;         /* List of data pages */
+
+       spinlock_t w_lock;              /* Write lock */
+       spinlock_t s_lock;              /* Sync lock */
+
+#ifdef CONFIG_NVM_DEBUG
+       atomic_t inflight_sync_point;   /* Not served REQ_FLUSH | REQ_FUA */
+#endif
+};
+
+#define PBLK_RECOVERY_SECTORS 16
+
+struct pblk_lun {              /* Per-LUN state; element of pblk->luns */
+       struct ppa_addr bppa;
+
+       u8 *bb_list;                    /* Bad block list for LUN. Only used on
+                                        * bring up. Bad blocks are managed
+                                        * within lines on run-time.
+                                        */
+
+       struct semaphore wr_sem;        /* presumably per-LUN write lock */
+};
+
+struct pblk_gc_rq {    /* Fields mirror pblk_submit_read_gc()'s arguments */
+       struct pblk_line *line;
+       void *data;
+       u64 *lba_list;
+       int nr_secs;
+       int secs_to_gc;
+       struct list_head list;  /* presumably links into pblk_gc.w_list */
+};
+
+struct pblk_gc {               /* GC state; embedded in struct pblk as gc */
+       int gc_active;
+       int gc_enabled;
+       int gc_forced;          /* presumably set by pblk_gc_sysfs_force() */
+       int gc_jobs_active;
+       atomic_t inflight_gc;
+
+       struct task_struct *gc_ts;
+       struct task_struct *gc_writer_ts;
+       struct workqueue_struct *gc_reader_wq;
+       struct timer_list gc_timer;
+
+       int w_entries;
+       struct list_head w_list;
+
+       spinlock_t lock;
+       spinlock_t w_lock;      /* presumably guards w_list - confirm */
+};
+
+struct pblk_rl {               /* Rate limiter; embedded in pblk as rl */
+       unsigned int high;      /* Upper threshold for rate limiter (free run -
+                                * user I/O rate limiter)
+                                */
+       unsigned int low;       /* Lower threshold for rate limiter (user I/O
+                                * rate limiter - stall)
+                                */
+       unsigned int high_pw;   /* High rounded up as a power of 2 */
+
+#define PBLK_USER_HIGH_THRS 2  /* Begin write limit at 50 percent
+                                * available blks
+                                */
+#define PBLK_USER_LOW_THRS 20  /* Aggressive GC at 5% available blocks */
+
+       int rb_windows_pw;      /* Number of rate windows in the write buffer
+                                * given as a power-of-2. This guarantees that
+                                * when user I/O is being rate limited, there
+                                * will be reserved enough space for the GC to
+                                * place its payload. A window is of
+                                * pblk->max_write_pgs size, which in NVMe is
+                                * 64, i.e., 256kb.
+                                */
+       int rb_budget;          /* Total number of entries available for I/O */
+       int rb_user_max;        /* Max buffer entries available for user I/O */
+       atomic_t rb_user_cnt;   /* User I/O buffer counter */
+       int rb_gc_max;          /* Max buffer entries available for GC I/O */
+       int rb_gc_rsv;          /* Reserved buffer entries for GC I/O */
+       int rb_state;           /* Rate-limiter current state */
+       atomic_t rb_gc_cnt;     /* GC I/O buffer counter */
+
+       int rb_user_active;
+       struct timer_list u_timer;
+
+       unsigned long long nr_secs;
+       unsigned long total_blocks;
+       atomic_t free_blocks;
+};
+
+#define PBLK_LINE_NR_LUN_BITMAP 2
+#define PBLK_LINE_NR_SEC_BITMAP 2
+#define PBLK_LINE_EMPTY (~0U)
+
+enum {         /* Values for pblk_line.type, .state and .gc_group */
+       /* Line Types */
+       PBLK_LINETYPE_FREE = 0,
+       PBLK_LINETYPE_LOG = 1,
+       PBLK_LINETYPE_DATA = 2,
+
+       /* Line state */
+       PBLK_LINESTATE_FREE = 10,
+       PBLK_LINESTATE_OPEN = 11,
+       PBLK_LINESTATE_CLOSED = 12,
+       PBLK_LINESTATE_GC = 13,
+       PBLK_LINESTATE_BAD = 14,
+       PBLK_LINESTATE_CORRUPT = 15,
+
+       /* GC group */
+       PBLK_LINEGC_NONE = 20,
+       PBLK_LINEGC_EMPTY = 21,
+       PBLK_LINEGC_LOW = 22,
+       PBLK_LINEGC_MID = 23,
+       PBLK_LINEGC_HIGH = 24,
+       PBLK_LINEGC_FULL = 25,
+};
+
+#define PBLK_MAGIC 0x70626c6b /*pblk*/
+
+struct line_header {   /* First member of both line_smeta and line_emeta */
+       __le32 crc;
+       __le32 identifier;      /* pblk identifier */
+       __u8 uuid[16];          /* instance uuid */
+       __le16 type;            /* line type */
+       __le16 version;         /* type version */
+       __le32 id;              /* line id for current line */
+};
+
+struct line_smeta {            /* Start metadata of a line (pblk_line.smeta) */
+       struct line_header header;
+
+       __le32 crc;             /* Full structure including struct crc */
+       /* Previous line metadata */
+       __le32 prev_id;         /* Line id for previous line */
+
+       /* Current line metadata */
+       __le64 seq_nr;          /* Sequence number for current line */
+
+       /* Active writers */
+       __le32 window_wr_lun;   /* Number of parallel LUNs to write */
+
+       __le32 rsvd[2];         /* Reserved */
+};
+
+/*
+ * Metadata Layout:
+ *     1. struct line_emeta
+ *     2. nr_lbas u64 forming lba list (see pblk_line_emeta_to_lbas())
+ *     3. nr_lines (all) u32 valid sector count (vsc) (~0U: non-alloc line)
+ *     4. nr_luns bits (u64 format) forming line bad block bitmap
+ *
+ *     3. and 4. will be part of FTL log
+ */
+struct line_emeta {
+       struct line_header header;
+
+       __le32 crc;             /* Full structure including struct crc */
+
+       /* Previous line metadata */
+       __le32 prev_id;         /* Line id for prev line */
+
+       /* Current line metadata */
+       __le64 seq_nr;          /* Sequence number for current line */
+
+       /* Active writers */
+       __le32 window_wr_lun;   /* Number of parallel LUNs to write */
+
+       /* Bookkeeping for recovery */
+       __le32 next_id;         /* Line id for next line */
+       __le64 nr_lbas;         /* Number of lbas mapped in line */
+       __le64 nr_valid_lbas;   /* Number of valid lbas mapped in line */
+};
+
+struct pblk_line {             /* Per-line state; array kept in pblk->lines */
+       struct pblk *pblk;
+       unsigned int id;                /* Line number corresponds to the
+                                        * block line
+                                        */
+       unsigned int seq_nr;            /* Unique line sequence number */
+
+       int state;                      /* PBLK_LINESTATE_X */
+       int type;                       /* PBLK_LINETYPE_X */
+       int gc_group;                   /* PBLK_LINEGC_X */
+       struct list_head list;          /* Free, GC lists */
+
+       unsigned long *lun_bitmap;      /* Bitmap for LUNs mapped in line */
+
+       struct line_smeta *smeta;       /* Start metadata */
+       struct line_emeta *emeta;       /* End metadata */
+       int meta_line;                  /* Metadata line id */
+       u64 smeta_ssec;                 /* Sector where smeta starts */
+       u64 emeta_ssec;                 /* Sector where emeta starts */
+
+       unsigned int sec_in_line;       /* Number of usable secs in line */
+
+       atomic_t blk_in_line;           /* Number of good blocks in line */
+       unsigned long *blk_bitmap;      /* Bitmap for valid/invalid blocks */
+       unsigned long *erase_bitmap;    /* Bitmap for erased blocks */
+
+       unsigned long *map_bitmap;      /* Bitmap for mapped sectors in line */
+       unsigned long *invalid_bitmap;  /* Bitmap for invalid sectors in line */
+
+       atomic_t left_eblks;            /* Blocks left for erasing */
+       atomic_t left_seblks;           /* Blocks left for sync erasing */
+
+       int left_msecs;                 /* Sectors left for mapping */
+       int left_ssecs;                 /* Sectors left to sync */
+       unsigned int cur_sec;           /* Sector map pointer */
+       unsigned int vsc;               /* Valid sector count in line */
+
+       struct kref ref;                /* Write buffer L2P references */
+
+       spinlock_t lock;                /* Necessary for invalid_bitmap only */
+};
+
+#define PBLK_DATA_LINES 4
+
+enum{  /* @type of pblk_malloc()/pblk_mfree(): kmalloc vs. vmalloc family */
+       PBLK_KMALLOC_META = 1,
+       PBLK_VMALLOC_META = 2,
+};
+
+struct pblk_line_metadata {    /* Slot type of pblk_line_mgmt.[se]line_meta */
+       void *meta;
+};
+
+struct pblk_line_mgmt {        /* Line management; embedded in pblk as l_mg */
+       int nr_lines;                   /* Total number of full lines */
+       int nr_free_lines;              /* Number of full lines in free list */
+
+       /* Free lists - use free_lock */
+       struct list_head free_list;     /* Full lines ready to use */
+       struct list_head corrupt_list;  /* Full lines corrupted */
+       struct list_head bad_list;      /* Full lines bad */
+
+       /* GC lists - use gc_lock */
+       struct list_head *gc_lists[PBLK_NR_GC_LISTS];
+       struct list_head gc_high_list;  /* Full lines ready to GC, high isc */
+       struct list_head gc_mid_list;   /* Full lines ready to GC, mid isc */
+       struct list_head gc_low_list;   /* Full lines ready to GC, low isc */
+
+       struct list_head gc_full_list;  /* Full lines ready to GC, no valid */
+       struct list_head gc_empty_list; /* Full lines close, all valid */
+
+       struct pblk_line *log_line;     /* Current FTL log line */
+       struct pblk_line *data_line;    /* Current data line */
+       struct pblk_line *log_next;     /* Next FTL log line */
+       struct pblk_line *data_next;    /* Next data line */
+
+       /* Metadata allocation type: VMALLOC | KMALLOC */
+       int smeta_alloc_type;
+       int emeta_alloc_type;
+
+       /* Pre-allocated metadata for data lines */
+       struct pblk_line_metadata sline_meta[PBLK_DATA_LINES];
+       struct pblk_line_metadata eline_meta[PBLK_DATA_LINES];
+       unsigned long meta_bitmap;
+
+       /* Helpers for fast bitmap calculations */
+       unsigned long *bb_template;
+       unsigned long *bb_aux;
+
+       unsigned long d_seq_nr;         /* Data line unique sequence number */
+       unsigned long l_seq_nr;         /* Log line unique sequence number */
+
+       spinlock_t free_lock;           /* For the free lists above */
+       spinlock_t gc_lock;             /* For the GC lists above */
+};
+
+struct pblk_line_meta {        /* Line metadata sizes; embedded in pblk as lm */
+       unsigned int smeta_len;         /* Total length for smeta */
+       unsigned int smeta_sec;         /* Sectors needed for smeta */
+       unsigned int emeta_len;         /* Total length for emeta */
+       unsigned int emeta_sec;         /* Sectors needed for emeta */
+       unsigned int emeta_bb;          /* Boundary for bb that affects emeta */
+       unsigned int sec_bitmap_len;    /* Length for sector bitmap in line */
+       unsigned int blk_bitmap_len;    /* Length for block bitmap in line */
+       unsigned int lun_bitmap_len;    /* Length for lun bitmap in line */
+
+       unsigned int blk_per_line;      /* Number of blocks in a full line */
+       unsigned int sec_per_line;      /* Number of sectors in a line */
+       unsigned int min_blk_line;      /* Min. number of good blocks in line */
+
+       unsigned int mid_thrs;          /* Threshold for GC mid list */
+       unsigned int high_thrs;         /* Threshold for GC high list */
+};
+
+struct pblk_addr_format {      /* Field layout of the packed 32-bit ppa */
+       u64     ch_mask;        /* *_mask: applied to the packed value ... */
+       u64     lun_mask;
+       u64     pln_mask;
+       u64     blk_mask;
+       u64     pg_mask;
+       u64     sec_mask;
+       u8      ch_offset;      /* ... *_offset: shift of the same field */
+       u8      lun_offset;
+       u8      pln_offset;
+       u8      blk_offset;
+       u8      pg_offset;
+       u8      sec_offset;
+};
+
+struct pblk {                  /* State of one pblk target instance */
+       struct nvm_tgt_dev *dev;
+       struct gendisk *disk;
+
+       struct kobject kobj;
+
+       struct pblk_lun *luns;  /* Walked with pblk_for_each_lun() */
+
+       struct pblk_line *lines;                /* Line array */
+       struct pblk_line_mgmt l_mg;             /* Line management */
+       struct pblk_line_meta lm;               /* Line metadata */
+
+       int ppaf_bitsize;       /* < 32 selects the packed u32 trans_map */
+       struct pblk_addr_format ppaf;
+
+       struct pblk_rb rwb;     /* Write buffer */
+
+       int min_write_pgs; /* Minimum amount of pages required by controller */
+       int max_write_pgs; /* Maximum amount of pages supported by controller */
+       int pgs_in_buffer; /* Number of pages that need to be held in buffer to
+                           * guarantee successful reads.
+                           */
+
+       sector_t capacity; /* Device capacity when bad blocks are subtracted */
+       int over_pct;      /* Percentage of device used for over-provisioning */
+
+       /* pblk provisioning values. Used by rate limiter */
+       struct pblk_rl rl;
+
+       struct semaphore erase_sem;
+
+       unsigned char instance_uuid[16];
+#ifdef CONFIG_NVM_DEBUG
+       /* All debug counters apply to 4kb sector I/Os */
+       atomic_long_t inflight_writes;  /* Inflight writes (user and gc) */
+       atomic_long_t padded_writes;    /* Sectors padded due to flush/fua */
+       atomic_long_t padded_wb;        /* Sectors padded in write buffer */
+       atomic_long_t nr_flush;         /* Number of flush/fua I/O */
+       atomic_long_t req_writes;       /* Sectors stored on write buffer */
+       atomic_long_t sub_writes;       /* Sectors submitted from buffer */
+       atomic_long_t sync_writes;      /* Sectors synced to media */
+       atomic_long_t compl_writes;     /* Sectors completed in write bio */
+       atomic_long_t inflight_reads;   /* Inflight sector read requests */
+       atomic_long_t sync_reads;       /* Completed sector read requests */
+       atomic_long_t recov_writes;     /* Sectors submitted from recovery */
+       atomic_long_t recov_gc_writes;  /* Sectors submitted from write GC */
+       atomic_long_t recov_gc_reads;   /* Sectors submitted from read GC */
+#endif
+
+       spinlock_t lock;
+
+       atomic_long_t read_failed;
+       atomic_long_t read_empty;
+       atomic_long_t read_high_ecc;
+       atomic_long_t read_failed_gc;
+       atomic_long_t write_failed;
+       atomic_long_t erase_failed;
+
+       struct task_struct *writer_ts;
+
+       /* Simple translation map of logical addresses to physical addresses.
+        * The logical addresses are known by the host system, while the
+        * physical addresses are used when writing to the disk block device.
+        */
+       unsigned char *trans_map;       /* u32 or 64-bit entries, lba-indexed */
+       spinlock_t trans_lock;
+
+       struct list_head compl_list;
+
+       mempool_t *page_pool;
+       mempool_t *line_ws_pool;
+       mempool_t *rec_pool;
+       mempool_t *r_rq_pool;
+       mempool_t *w_rq_pool;
+       mempool_t *line_meta_pool;
+
+       struct workqueue_struct *kw_wq;
+       struct timer_list wtimer;
+
+       struct pblk_gc gc;      /* Garbage collection state */
+};
+
+struct pblk_line_ws {  /* Fields mirror the pblk_line_run_ws() arguments */
+       struct pblk *pblk;
+       struct pblk_line *line;
+       void *priv;
+       struct work_struct ws;  /* presumably queued by pblk_line_run_ws() */
+};
+
+#define pblk_r_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_r_ctx))
+#define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx))
+
+/*
+ * pblk ring buffer operations
+ */
+int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
+                unsigned int power_size, unsigned int power_seg_sz);
+unsigned int pblk_rb_calculate_size(unsigned int nr_entries);
+void *pblk_rb_entries_ref(struct pblk_rb *rb);
+int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio,
+                          unsigned int nr_entries, unsigned int *pos);
+int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries,
+                        unsigned int *pos);
+void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
+                             struct pblk_w_ctx w_ctx, unsigned int pos);
+void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
+                           struct pblk_w_ctx w_ctx, struct pblk_line *gc_line,
+                           unsigned int pos);
+struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos);
+
+void pblk_rb_sync_l2p(struct pblk_rb *rb);
+unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio,
+                                struct pblk_c_ctx *c_ctx,
+                                unsigned int pos,
+                                unsigned int nr_entries,
+                                unsigned int count);
+unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
+                                     struct list_head *list,
+                                     unsigned int max);
+int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
+                       u64 pos, int bio_iter);
+unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries);
+
+unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags);
+unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries);
+struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb,
+                                             struct ppa_addr *ppa);
+void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags);
+unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb);
+
+unsigned int pblk_rb_read_count(struct pblk_rb *rb);
+unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos);
+
+int pblk_rb_tear_down_check(struct pblk_rb *rb);
+int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos);
+void pblk_rb_data_free(struct pblk_rb *rb);
+ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf);
+
+/*
+ * pblk core
+ */
+struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw);
+int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
+                       struct pblk_c_ctx *c_ctx);
+void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw);
+void pblk_flush_writer(struct pblk *pblk);
+struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba);
+void pblk_discard(struct pblk *pblk, struct bio *bio);
+void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd);
+void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd);
+int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd);
+struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
+                             unsigned int nr_secs, unsigned int len,
+                             gfp_t gfp_mask);
+struct pblk_line *pblk_line_get(struct pblk *pblk);
+struct pblk_line *pblk_line_get_first_data(struct pblk *pblk);
+struct pblk_line *pblk_line_replace_data(struct pblk *pblk);
+int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line);
+void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line);
+struct pblk_line *pblk_line_get_data(struct pblk *pblk);
+struct pblk_line *pblk_line_get_data_next(struct pblk *pblk);
+int pblk_line_erase(struct pblk *pblk, struct pblk_line *line);
+int pblk_line_is_full(struct pblk_line *line);
+void pblk_line_free(struct pblk *pblk, struct pblk_line *line);
+void pblk_line_close_ws(struct work_struct *work);
+void pblk_line_close(struct pblk *pblk, struct pblk_line *line);
+void pblk_line_mark_bb(struct work_struct *work);
+void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
+                     void (*work)(struct work_struct *));
+u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line);
+int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line);
+int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line);
+int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa);
+void pblk_line_put(struct kref *ref);
+struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line);
+u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
+int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
+                  unsigned long secs_to_flush);
+void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+                 unsigned long *lun_bitmap);
+void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+               unsigned long *lun_bitmap);
+void pblk_end_bio_sync(struct bio *bio);
+void pblk_end_io_sync(struct nvm_rq *rqd);
+int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
+                      int nr_pages);
+void pblk_map_pad_invalidate(struct pblk *pblk, struct pblk_line *line,
+                            u64 paddr);
+void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
+                        int nr_pages);
+void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa);
+void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa);
+void pblk_update_map_cache(struct pblk *pblk, sector_t lba,
+                          struct ppa_addr ppa);
+void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
+                        struct ppa_addr ppa, struct ppa_addr entry_line);
+int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
+                      struct pblk_line *gc_line);
+void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
+                         u64 *lba_list, int nr_secs);
+void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
+                        sector_t blba, int nr_secs);
+
+/*
+ * pblk user I/O write path
+ */
+int pblk_write_to_cache(struct pblk *pblk, struct bio *bio,
+                       unsigned long flags);
+int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list,
+                          unsigned int nr_entries, unsigned int nr_rec_entries,
+                          struct pblk_line *gc_line, unsigned long flags);
+
+/*
+ * pblk map
+ */
+void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
+                      unsigned int sentry, unsigned long *lun_bitmap,
+                      unsigned int valid_secs, struct ppa_addr *erase_ppa);
+void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
+                unsigned long *lun_bitmap, unsigned int valid_secs,
+                unsigned int off);
+
+/*
+ * pblk write thread
+ */
+int pblk_write_ts(void *data);
+void pblk_write_timer_fn(unsigned long data);
+void pblk_write_should_kick(struct pblk *pblk);
+
+/*
+ * pblk read path
+ */
+int pblk_submit_read(struct pblk *pblk, struct bio *bio);
+int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data,
+                       unsigned int nr_secs, unsigned int *secs_to_gc,
+                       struct pblk_line *line);
+/*
+ * pblk recovery
+ */
+void pblk_submit_rec(struct work_struct *work);
+struct pblk_line *pblk_recov_l2p(struct pblk *pblk);
+void pblk_recov_pad(struct pblk *pblk);
+__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta);
+int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
+                       struct pblk_rec_ctx *recovery, u64 *comp_bits,
+                       unsigned int comp);
+
+/*
+ * pblk gc
+ */
+#define PBLK_GC_TRIES 3
+
+int pblk_gc_init(struct pblk *pblk);
+void pblk_gc_exit(struct pblk *pblk);
+void pblk_gc_should_start(struct pblk *pblk);
+void pblk_gc_should_stop(struct pblk *pblk);
+int pblk_gc_status(struct pblk *pblk);
+void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
+                             int *gc_active);
+void pblk_gc_sysfs_force(struct pblk *pblk, int force);
+
+/*
+ * pblk rate limiter
+ */
+void pblk_rl_init(struct pblk_rl *rl, int budget);
+void pblk_rl_free(struct pblk_rl *rl);
+int pblk_rl_gc_thrs(struct pblk_rl *rl);
+unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl);
+int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries);
+void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries);
+int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries);
+void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries);
+void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc);
+void pblk_rl_set_gc_rsc(struct pblk_rl *rl, int rsv);
+int pblk_rl_sysfs_rate_show(struct pblk_rl *rl);
+void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line);
+void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line);
+
+/*
+ * pblk sysfs
+ */
+int pblk_sysfs_init(struct gendisk *tdisk);
+void pblk_sysfs_exit(struct gendisk *tdisk);
+
+static inline void *pblk_malloc(size_t size, int type, gfp_t flags)
+{
+       if (type != PBLK_KMALLOC_META)  /* flags are not passed to vmalloc() */
+               return vmalloc(size);
+       return kmalloc(size, flags);
+}
+
+static inline void pblk_mfree(void *ptr, int type)
+{
+       if (type != PBLK_KMALLOC_META)  /* pblk_malloc() used vmalloc() here */
+               vfree(ptr);
+       else
+               kfree(ptr);
+}
+
+static inline struct nvm_rq *nvm_rq_from_c_ctx(void *c_ctx)
+{
+       return (struct nvm_rq *)c_ctx - 1;      /* one nvm_rq before the ctx */
+}
+
+static inline void *pblk_line_emeta_to_lbas(struct line_emeta *emeta)
+{
+       return &emeta[1];       /* first byte past the emeta structure */
+}
+
+#define NVM_MEM_PAGE_WRITE (8)
+
+static inline int pblk_pad_distance(struct pblk *pblk)
+{
+       /* NVM_MEM_PAGE_WRITE scaled by the geometry's nr_luns and sec_per_pl */
+       struct nvm_geo *geo = &pblk->dev->geo;
+
+       return NVM_MEM_PAGE_WRITE * geo->nr_luns * geo->sec_per_pl;
+}
+
+static inline int pblk_dev_ppa_to_line(struct ppa_addr p)
+{
+       return (int)p.g.blk;    /* the line id is the address' blk field */
+}
+
+static inline int pblk_tgt_ppa_to_line(struct ppa_addr p)
+{
+       return (int)p.g.blk;    /* same mapping as pblk_dev_ppa_to_line() */
+}
+
+static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
+{
+       return p.g.ch + p.g.lun * geo->nr_chnls;        /* lun-major index */
+}
+
+/* A line holds one block per lun, so the position follows from (lun, ch) */
+static inline int pblk_dev_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
+{
+       return p.g.ch + p.g.lun * geo->nr_chnls;
+}
+
+static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32)
+{
+       const struct pblk_addr_format *ppaf = &pblk->ppaf;
+       struct ppa_addr ppa64;
+
+       /*
+        * Three encodings are told apart here: all ones means empty,
+        * bit 31 marks a cache address, and anything else is a device
+        * address packed according to pblk->ppaf.
+        */
+       ppa64.ppa = 0;
+
+       if (ppa32 == ~0U) {
+               ppa64.ppa = ADDR_EMPTY;
+       } else if (ppa32 & (1U << 31)) {
+               ppa64.c.line = ppa32 & (~0U >> 1);
+               ppa64.c.is_cached = 1;
+       } else {
+               ppa64.g.blk = (ppa32 & ppaf->blk_mask) >> ppaf->blk_offset;
+               ppa64.g.pg = (ppa32 & ppaf->pg_mask) >> ppaf->pg_offset;
+               ppa64.g.lun = (ppa32 & ppaf->lun_mask) >> ppaf->lun_offset;
+               ppa64.g.ch = (ppa32 & ppaf->ch_mask) >> ppaf->ch_offset;
+               ppa64.g.pl = (ppa32 & ppaf->pln_mask) >> ppaf->pln_offset;
+               ppa64.g.sec = (ppa32 & ppaf->sec_mask) >> ppaf->sec_offset;
+       }
+
+       return ppa64;
+}
+
+/*
+ * Look up the address stored for @lba in the translation map. The map holds
+ * packed u32 entries when ppaf_bitsize is below 32 and full struct ppa_addr
+ * entries otherwise.
+ */
+static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk,
+                                                               sector_t lba)
+{
+       if (pblk->ppaf_bitsize < 32) {
+               u32 *map32 = (u32 *)pblk->trans_map;
+
+               return pblk_ppa32_to_ppa64(pblk, map32[lba]);
+       }
+
+       return ((struct ppa_addr *)pblk->trans_map)[lba];
+}
+
+/*
+ * Pack a struct ppa_addr into 32 bits: ADDR_EMPTY becomes all ones, a cached
+ * address becomes its cache line with bit 31 set, and a device address is
+ * assembled from its geometry fields shifted by the pblk->ppaf offsets.
+ */
+static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64)
+{
+       u32 packed = 0;
+
+       if (ppa64.ppa == ADDR_EMPTY)
+               return ~0U;
+
+       if (ppa64.c.is_cached) {
+               packed |= ppa64.c.line;
+               packed |= 1U << 31;
+               return packed;
+       }
+
+       packed |= ppa64.g.blk << pblk->ppaf.blk_offset;
+       packed |= ppa64.g.pg << pblk->ppaf.pg_offset;
+       packed |= ppa64.g.lun << pblk->ppaf.lun_offset;
+       packed |= ppa64.g.ch << pblk->ppaf.ch_offset;
+       packed |= ppa64.g.pl << pblk->ppaf.pln_offset;
+       packed |= ppa64.g.sec << pblk->ppaf.sec_offset;
+
+       return packed;
+}
+
+/*
+ * Store @ppa as the mapping for @lba. Entries are written in packed 32-bit
+ * form when ppaf_bitsize is below 32, and as the raw 64-bit value otherwise.
+ */
+static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba,
+                                               struct ppa_addr ppa)
+{
+       u32 *map32;
+       u64 *map64;
+
+       if (pblk->ppaf_bitsize < 32) {
+               map32 = (u32 *)pblk->trans_map;
+               map32[lba] = pblk_ppa64_to_ppa32(pblk, ppa);
+               return;
+       }
+
+       map64 = (u64 *)pblk->trans_map;
+       map64[lba] = ppa.ppa;
+}
+
+/*
+ * Build the in-line address of a device ppa: every geometry field except the
+ * block is shifted to its pblk->ppaf offset and OR-ed together.
+ */
+static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk,
+                                                       struct ppa_addr p)
+{
+       u64 paddr = (u64)p.g.pg << pblk->ppaf.pg_offset;
+
+       paddr |= (u64)p.g.lun << pblk->ppaf.lun_offset;
+       paddr |= (u64)p.g.ch << pblk->ppaf.ch_offset;
+       paddr |= (u64)p.g.pl << pblk->ppaf.pln_offset;
+       paddr |= (u64)p.g.sec << pblk->ppaf.sec_offset;
+
+       return paddr;
+}
+
+/* Return 1 when the address equals ADDR_EMPTY, 0 otherwise */
+static inline int pblk_ppa_empty(struct ppa_addr ppa_addr)
+{
+       if (ppa_addr.ppa == ADDR_EMPTY)
+               return 1;
+
+       return 0;
+}
+
+/* Overwrite the whole address word with ADDR_EMPTY */
+static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr)
+{
+       (*ppa_addr).ppa = ADDR_EMPTY;
+}
+
+/*
+ * Return 1 when the address is not ADDR_EMPTY and has its is_cached flag
+ * set, 0 otherwise.
+ */
+static inline int pblk_addr_in_cache(struct ppa_addr ppa)
+{
+       if (ppa.ppa == ADDR_EMPTY)
+               return 0;
+
+       return ppa.c.is_cached ? 1 : 0;
+}
+
+/* Return the cache line field of the address, converted to int */
+static inline int pblk_addr_to_cacheline(struct ppa_addr ppa)
+{
+       int line = ppa.c.line;
+
+       return line;
+}
+
+/*
+ * Build a cached address that refers to cache line @addr.
+ *
+ * The word is cleared before the cache fields are set, as the other address
+ * constructors in this header do, so the returned value never carries stack
+ * garbage in bits the cache view might not cover. Callers compare the whole
+ * .ppa word against ADDR_EMPTY, so every bit has to be defined.
+ */
+static inline struct ppa_addr pblk_cacheline_to_addr(int addr)
+{
+       struct ppa_addr p;
+
+       p.ppa = 0;
+       p.c.line = addr;
+       p.c.is_cached = 1;
+
+       return p;
+}
+
+/*
+ * Turn an in-line address plus a line id into a generic ppa: the block field
+ * is the line id and the remaining fields are extracted from @paddr with the
+ * pblk->ppaf masks and offsets.
+ */
+static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr,
+                                             u64 line_id)
+{
+       struct ppa_addr gen;
+
+       /* Start from a cleared word so untouched bits are zero */
+       gen.ppa = 0;
+
+       gen.g.blk = line_id;
+       gen.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset;
+       gen.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset;
+       gen.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset;
+       gen.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset;
+       gen.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset;
+
+       return gen;
+}
+
+/* Same conversion as addr_to_gen_ppa(); this is a direct forwarder to it */
+static inline struct ppa_addr addr_to_pblk_ppa(struct pblk *pblk, u64 paddr,
+                                        u64 line_id)
+{
+       return addr_to_gen_ppa(pblk, paddr, line_id);
+}
+
+/*
+ * CRC32 (little-endian, seed ~0) over the line header at the start of
+ * @smeta, skipping the first sizeof(u32) bytes of it.
+ */
+static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk,
+                                           struct line_smeta *smeta)
+{
+       u32 crc = ~(u32)0;
+       unsigned char *start = (unsigned char *)smeta + sizeof(crc);
+
+       return crc32_le(crc, start, sizeof(struct line_header) - sizeof(crc));
+}
+
+/*
+ * CRC32 (little-endian, seed ~0) over the part of @smeta that follows the
+ * line header plus one u32, up to lm->smeta_len bytes from the start.
+ */
+static inline u32 pblk_calc_smeta_crc(struct pblk *pblk,
+                                     struct line_smeta *smeta)
+{
+       struct pblk_line_meta *lm = &pblk->lm;
+       u32 crc = ~(u32)0;
+       unsigned char *start = (unsigned char *)smeta +
+                               sizeof(struct line_header) + sizeof(crc);
+
+       return crc32_le(crc, start,
+                       lm->smeta_len -
+                       sizeof(struct line_header) - sizeof(crc));
+}
+
+/*
+ * CRC32 (little-endian, seed ~0) over the part of @emeta that follows the
+ * line header plus one u32, up to lm->emeta_len bytes from the start.
+ */
+static inline u32 pblk_calc_emeta_crc(struct pblk *pblk,
+                                     struct line_emeta *emeta)
+{
+       struct pblk_line_meta *lm = &pblk->lm;
+       u32 crc = ~(u32)0;
+       unsigned char *start = (unsigned char *)emeta +
+                               sizeof(struct line_header) + sizeof(crc);
+
+       return crc32_le(crc, start,
+                       lm->emeta_len -
+                       sizeof(struct line_header) - sizeof(crc));
+}
+
+/*
+ * Compute request flags: the geometry's plane_mode shifted right by one,
+ * with NVM_IO_SCRAMBLE_ENABLE added when @type is WRITE.
+ */
+static inline int pblk_set_progr_mode(struct pblk *pblk, int type)
+{
+       int flags = pblk->dev->geo.plane_mode >> 1;
+
+       if (type == WRITE)
+               flags |= NVM_IO_SCRAMBLE_ENABLE;
+
+       return flags;
+}
+
+/* Fixed flag set for reads: single access, suspend and scramble enabled */
+static inline int pblk_set_read_mode(struct pblk *pblk)
+{
+       int flags = NVM_IO_SNGL_ACCESS | NVM_IO_SUSPEND | NVM_IO_SCRAMBLE_ENABLE;
+
+       return flags;
+}
+
+#ifdef CONFIG_NVM_DEBUG
+/*
+ * Log one address with pr_err(): as a cache line when its is_cached flag is
+ * set, otherwise as ch/lun/blk/pg/pl/sec. @msg and @error (printed in hex)
+ * prefix the output.
+ */
+static inline void print_ppa(struct ppa_addr *p, char *msg, int error)
+{
+       if (!p->c.is_cached) {
+               pr_err("ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n",
+                       msg, error,
+                       p->g.ch, p->g.lun, p->g.blk,
+                       p->g.pg, p->g.pl, p->g.sec);
+               return;
+       }
+
+       pr_err("ppa: (%s: %x) cache line: %llu\n",
+                       msg, error, (u64)p->c.line);
+}
+
+/*
+ * Log the addresses of a failed request. A single-address request prints
+ * rqd->ppa_addr and returns; otherwise every entry of rqd->ppa_list whose
+ * bit is set in rqd->ppa_status is printed, followed by a summary line.
+ */
+static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd,
+                                        int error)
+{
+       void *status = (void *)&rqd->ppa_status;
+       int pos;
+
+       if (rqd->nr_ppas ==  1) {
+               print_ppa(&rqd->ppa_addr, "rqd", error);
+               return;
+       }
+
+       for (pos = find_next_bit(status, rqd->nr_ppas, 0);
+            pos < rqd->nr_ppas;
+            pos = find_next_bit(status, rqd->nr_ppas, pos + 1))
+               print_ppa(&rqd->ppa_list[pos], "rqd", error);
+
+       pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status);
+}
+#endif
+
+static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
+                                      struct ppa_addr *ppas, int nr_ppas)
+{
+       struct nvm_geo *geo = &tgt_dev->geo;
+       struct ppa_addr *ppa;
+       int i;
+
+       for (i = 0; i < nr_ppas; i++) {
+               ppa = &ppas[i];
+
+               if (!ppa->c.is_cached &&
+                               ppa->g.ch < geo->nr_chnls &&
+                               ppa->g.lun < geo->luns_per_chnl &&
+                               ppa->g.pl < geo->nr_planes &&
+                               ppa->g.blk < geo->blks_per_lun &&
+                               ppa->g.pg < geo->pgs_per_blk &&
+                               ppa->g.sec < geo->sec_per_pg)
+                       continue;
+
+#ifdef CONFIG_NVM_DEBUG
+               print_ppa(ppa, "boundary", i);
+#endif
+               return 1;
+       }
+       return 0;
+}
+
+/*
+ * Return 1 when @paddr is strictly greater than lm->sec_per_line, 0
+ * otherwise. Note that a value equal to sec_per_line is accepted.
+ */
+static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr)
+{
+       return (paddr > pblk->lm.sec_per_line) ? 1 : 0;
+}
+
+/* Return the bio iterator's current bi_idx */
+static inline unsigned int pblk_get_bi_idx(struct bio *bio)
+{
+       unsigned int idx = bio->bi_iter.bi_idx;
+
+       return idx;
+}
+
+/* Convert the bio's starting sector to an lba by dividing by NR_PHY_IN_LOG */
+static inline sector_t pblk_get_lba(struct bio *bio)
+{
+       sector_t lba = bio->bi_iter.bi_sector / NR_PHY_IN_LOG;
+
+       return lba;
+}
+
+/* Number of PBLK_EXPOSED_PAGE_SIZE units covered by the bio's bi_size */
+static inline unsigned int pblk_get_secs(struct bio *bio)
+{
+       unsigned int nr_secs = bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE;
+
+       return nr_secs;
+}
+
+/* Inverse of pblk_get_lba(): scale an lba by NR_PHY_IN_LOG */
+static inline sector_t pblk_get_sector(sector_t lba)
+{
+       sector_t sector = lba * NR_PHY_IN_LOG;
+
+       return sector;
+}
+
+/* Generate a fresh uuid and copy its 16 bytes into pblk->instance_uuid */
+static inline void pblk_setup_uuid(struct pblk *pblk)
+{
+       uuid_le fresh;
+
+       uuid_le_gen(&fresh);
+       memcpy(pblk->instance_uuid, fresh.b, 16);
+}
+#endif /* PBLK_H_ */
index e00b1d7..cf0e28a 100644 (file)
@@ -318,10 +318,6 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk)
        }
 
        page = mempool_alloc(rrpc->page_pool, GFP_NOIO);
-       if (!page) {
-               bio_put(bio);
-               return -ENOMEM;
-       }
 
        while ((slot = find_first_zero_bit(rblk->invalid_pages,
                                            nr_sec_per_blk)) < nr_sec_per_blk) {
@@ -414,7 +410,6 @@ static void rrpc_block_gc(struct work_struct *work)
        struct rrpc *rrpc = gcb->rrpc;
        struct rrpc_block *rblk = gcb->rblk;
        struct rrpc_lun *rlun = rblk->rlun;
-       struct nvm_tgt_dev *dev = rrpc->dev;
        struct ppa_addr ppa;
 
        mempool_free(gcb, rrpc->gcb_pool);
@@ -430,7 +425,7 @@ static void rrpc_block_gc(struct work_struct *work)
        ppa.g.lun = rlun->bppa.g.lun;
        ppa.g.blk = rblk->id;
 
-       if (nvm_erase_blk(dev, &ppa, 0))
+       if (nvm_erase_sync(rrpc->dev, &ppa, 1))
                goto put_back;
 
        rrpc_put_blk(rrpc, rblk);
@@ -822,7 +817,7 @@ static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
 
        for (i = 0; i < npages; i++) {
                /* We assume that mapping occurs at 4KB granularity */
-               BUG_ON(!(laddr + i >= 0 && laddr + i < rrpc->nr_sects));
+               BUG_ON(!(laddr + i < rrpc->nr_sects));
                gp = &rrpc->trans_map[laddr + i];
 
                if (gp->rblk) {
@@ -851,7 +846,7 @@ static int rrpc_read_rq(struct rrpc *rrpc, struct bio *bio, struct nvm_rq *rqd,
        if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd))
                return NVM_IO_REQUEUE;
 
-       BUG_ON(!(laddr >= 0 && laddr < rrpc->nr_sects));
+       BUG_ON(!(laddr < rrpc->nr_sects));
        gp = &rrpc->trans_map[laddr];
 
        if (gp->rblk) {
@@ -1007,11 +1002,6 @@ static blk_qc_t rrpc_make_rq(struct request_queue *q, struct bio *bio)
        }
 
        rqd = mempool_alloc(rrpc->rq_pool, GFP_KERNEL);
-       if (!rqd) {
-               pr_err_ratelimited("rrpc: not able to queue bio.");
-               bio_io_error(bio);
-               return BLK_QC_T_NONE;
-       }
        memset(rqd, 0, sizeof(struct nvm_rq));
 
        err = rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_NONE);
@@ -1275,8 +1265,10 @@ static int rrpc_bb_discovery(struct nvm_tgt_dev *dev, struct rrpc_lun *rlun)
        }
 
        nr_blks = nvm_bb_tbl_fold(dev->parent, blks, nr_blks);
-       if (nr_blks < 0)
-               return nr_blks;
+       if (nr_blks < 0) {
+               ret = nr_blks;
+               goto out;
+       }
 
        for (i = 0; i < nr_blks; i++) {
                if (blks[i] == NVM_BLK_T_FREE)
@@ -1514,7 +1506,8 @@ err:
 
 static struct nvm_tgt_type tt_rrpc;
 
-static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk)
+static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
+                      int flags)
 {
        struct request_queue *bqueue = dev->q;
        struct request_queue *tqueue = tdisk->queue;
index ceff415..ee1a3d9 100644 (file)
@@ -144,12 +144,22 @@ config XGENE_SLIMPRO_MBOX
          want to use the APM X-Gene SLIMpro IPCM support.
 
 config BCM_PDC_MBOX
-       tristate "Broadcom PDC Mailbox"
-       depends on ARM64 || COMPILE_TEST
+       tristate "Broadcom FlexSparx DMA Mailbox"
+       depends on ARCH_BCM_IPROC || COMPILE_TEST
        depends on HAS_DMA
+       help
+         Mailbox implementation for the Broadcom FlexSparx DMA ring manager,
+         which provides access to various offload engines on Broadcom
+         SoCs, including FA2/FA+ on Northstar Plus and PDC on Northstar 2.
+
+config BCM_FLEXRM_MBOX
+       tristate "Broadcom FlexRM Mailbox"
+       depends on ARM64
+       depends on HAS_DMA
+       select GENERIC_MSI_IRQ_DOMAIN
        default ARCH_BCM_IPROC
        help
-         Mailbox implementation for the Broadcom PDC ring manager,
+         Mailbox implementation of the Broadcom FlexRM ring manager,
          which provides access to various offload engines on Broadcom
-         SoCs. Say Y here if you want to use the Broadcom PDC.
+         SoCs. Say Y here if you want to use the Broadcom FlexRM.
 endif
index 7dde4f6..e2bcb03 100644 (file)
@@ -30,4 +30,6 @@ obj-$(CONFIG_HI6220_MBOX)     += hi6220-mailbox.o
 
 obj-$(CONFIG_BCM_PDC_MBOX)     += bcm-pdc-mailbox.o
 
+obj-$(CONFIG_BCM_FLEXRM_MBOX)  += bcm-flexrm-mailbox.o
+
 obj-$(CONFIG_TEGRA_HSP_MBOX)   += tegra-hsp.o
diff --git a/drivers/mailbox/bcm-flexrm-mailbox.c b/drivers/mailbox/bcm-flexrm-mailbox.c
new file mode 100644 (file)
index 0000000..da67882
--- /dev/null
@@ -0,0 +1,1595 @@
+/* Broadcom FlexRM Mailbox Driver
+ *
+ * Copyright (C) 2017 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Each Broadcom FlexSparx4 offload engine is implemented as an
+ * extension to Broadcom FlexRM ring manager. The FlexRM ring
+ * manager provides a set of rings which can be used to submit
+ * work to a FlexSparx4 offload engine.
+ *
+ * This driver creates a mailbox controller using a set of FlexRM
+ * rings where each mailbox channel represents a separate FlexRM ring.
+ */
+
+#include <asm/barrier.h>
+#include <asm/byteorder.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmapool.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/mailbox_controller.h>
+#include <linux/mailbox_client.h>
+#include <linux/mailbox/brcm-message.h>
+#include <linux/module.h>
+#include <linux/msi.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/platform_device.h>
+#include <linux/spinlock.h>
+
+/* ====== FlexRM register defines ===== */
+
+/* FlexRM configuration */
+#define RING_REGS_SIZE                                 0x10000
+#define RING_DESC_SIZE                                 8
+#define RING_DESC_INDEX(offset)                                \
+                       ((offset) / RING_DESC_SIZE)
+#define RING_DESC_OFFSET(index)                                \
+                       ((index) * RING_DESC_SIZE)
+#define RING_MAX_REQ_COUNT                             1024
+#define RING_BD_ALIGN_ORDER                            12
+#define RING_BD_ALIGN_CHECK(addr)                      \
+                       (!((addr) & ((0x1 << RING_BD_ALIGN_ORDER) - 1)))
+#define RING_BD_TOGGLE_INVALID(offset)                 \
+                       (((offset) >> RING_BD_ALIGN_ORDER) & 0x1)
+#define RING_BD_TOGGLE_VALID(offset)                   \
+                       (!RING_BD_TOGGLE_INVALID(offset))
+#define RING_BD_DESC_PER_REQ                           32
+#define RING_BD_DESC_COUNT                             \
+                       (RING_MAX_REQ_COUNT * RING_BD_DESC_PER_REQ)
+#define RING_BD_SIZE                                   \
+                       (RING_BD_DESC_COUNT * RING_DESC_SIZE)
+#define RING_CMPL_ALIGN_ORDER                          13
+#define RING_CMPL_DESC_COUNT                           RING_MAX_REQ_COUNT
+#define RING_CMPL_SIZE                                 \
+                       (RING_CMPL_DESC_COUNT * RING_DESC_SIZE)
+#define RING_VER_MAGIC                                 0x76303031
+
+/* Per-Ring register offsets */
+#define RING_VER                                       0x000
+#define RING_BD_START_ADDR                             0x004
+#define RING_BD_READ_PTR                               0x008
+#define RING_BD_WRITE_PTR                              0x00c
+#define RING_BD_READ_PTR_DDR_LS                                0x010
+#define RING_BD_READ_PTR_DDR_MS                                0x014
+#define RING_CMPL_START_ADDR                           0x018
+#define RING_CMPL_WRITE_PTR                            0x01c
+#define RING_NUM_REQ_RECV_LS                           0x020
+#define RING_NUM_REQ_RECV_MS                           0x024
+#define RING_NUM_REQ_TRANS_LS                          0x028
+#define RING_NUM_REQ_TRANS_MS                          0x02c
+#define RING_NUM_REQ_OUTSTAND                          0x030
+#define RING_CONTROL                                   0x034
+#define RING_FLUSH_DONE                                        0x038
+#define RING_MSI_ADDR_LS                               0x03c
+#define RING_MSI_ADDR_MS                               0x040
+#define RING_MSI_CONTROL                               0x048
+#define RING_BD_READ_PTR_DDR_CONTROL                   0x04c
+#define RING_MSI_DATA_VALUE                            0x064
+
+/* Register RING_BD_START_ADDR fields */
+#define BD_LAST_UPDATE_HW_SHIFT                                28
+#define BD_LAST_UPDATE_HW_MASK                         0x1
+#define BD_START_ADDR_VALUE(pa)                                \
+       ((u32)((((dma_addr_t)(pa)) >> RING_BD_ALIGN_ORDER) & 0x0fffffff))
+#define BD_START_ADDR_DECODE(val)                      \
+       ((dma_addr_t)((val) & 0x0fffffff) << RING_BD_ALIGN_ORDER)
+
+/*
+ * Register RING_CMPL_START_ADDR fields
+ *
+ * The address is stored shifted right by RING_CMPL_ALIGN_ORDER (13), so 27
+ * bits are kept to cover the same 40-bit address range that
+ * BD_START_ADDR_VALUE() covers with 28 bits after a 12-bit shift. A 26-bit
+ * mask would silently drop address bit 39.
+ */
+#define CMPL_START_ADDR_VALUE(pa)                      \
+       ((u32)((((u64)(pa)) >> RING_CMPL_ALIGN_ORDER) & 0x07ffffff))
+
+/* Register RING_CONTROL fields */
+#define CONTROL_MASK_DISABLE_CONTROL                   12
+#define CONTROL_FLUSH_SHIFT                            5
+#define CONTROL_ACTIVE_SHIFT                           4
+#define CONTROL_RATE_ADAPT_MASK                                0xf
+#define CONTROL_RATE_DYNAMIC                           0x0
+#define CONTROL_RATE_FAST                              0x8
+#define CONTROL_RATE_MEDIUM                            0x9
+#define CONTROL_RATE_SLOW                              0xa
+#define CONTROL_RATE_IDLE                              0xb
+
+/* Register RING_FLUSH_DONE fields */
+#define FLUSH_DONE_MASK                                        0x1
+
+/* Register RING_MSI_CONTROL fields */
+#define MSI_TIMER_VAL_SHIFT                            16
+#define MSI_TIMER_VAL_MASK                             0xffff
+#define MSI_ENABLE_SHIFT                               15
+#define MSI_ENABLE_MASK                                        0x1
+#define MSI_COUNT_SHIFT                                        0
+#define MSI_COUNT_MASK                                 0x3ff
+
+/* Register RING_BD_READ_PTR_DDR_CONTROL fields */
+#define BD_READ_PTR_DDR_TIMER_VAL_SHIFT                        16
+#define BD_READ_PTR_DDR_TIMER_VAL_MASK                 0xffff
+#define BD_READ_PTR_DDR_ENABLE_SHIFT                   15
+#define BD_READ_PTR_DDR_ENABLE_MASK                    0x1
+
+/* ====== FlexRM ring descriptor defines ===== */
+
+/* Completion descriptor format */
+#define CMPL_OPAQUE_SHIFT                      0
+#define CMPL_OPAQUE_MASK                       0xffff
+#define CMPL_ENGINE_STATUS_SHIFT               16
+#define CMPL_ENGINE_STATUS_MASK                        0xffff
+#define CMPL_DME_STATUS_SHIFT                  32
+#define CMPL_DME_STATUS_MASK                   0xffff
+#define CMPL_RM_STATUS_SHIFT                   48
+#define CMPL_RM_STATUS_MASK                    0xffff
+
+/* Completion DME status code */
+#define DME_STATUS_MEM_COR_ERR                 BIT(0)
+#define DME_STATUS_MEM_UCOR_ERR                        BIT(1)
+#define DME_STATUS_FIFO_UNDERFLOW              BIT(2)
+#define DME_STATUS_FIFO_OVERFLOW               BIT(3)
+#define DME_STATUS_RRESP_ERR                   BIT(4)
+#define DME_STATUS_BRESP_ERR                   BIT(5)
+#define DME_STATUS_ERROR_MASK                  (DME_STATUS_MEM_COR_ERR | \
+                                                DME_STATUS_MEM_UCOR_ERR | \
+                                                DME_STATUS_FIFO_UNDERFLOW | \
+                                                DME_STATUS_FIFO_OVERFLOW | \
+                                                DME_STATUS_RRESP_ERR | \
+                                                DME_STATUS_BRESP_ERR)
+
+/* Completion RM status code */
+#define RM_STATUS_CODE_SHIFT                   0
+#define RM_STATUS_CODE_MASK                    0x3ff
+#define RM_STATUS_CODE_GOOD                    0x0
+#define RM_STATUS_CODE_AE_TIMEOUT              0x3ff
+
+/* General descriptor format */
+#define DESC_TYPE_SHIFT                                60
+#define DESC_TYPE_MASK                         0xf
+#define DESC_PAYLOAD_SHIFT                     0
+#define DESC_PAYLOAD_MASK                      0x0fffffffffffffff
+
+/* Null descriptor format  */
+#define NULL_TYPE                              0
+#define NULL_TOGGLE_SHIFT                      58
+#define NULL_TOGGLE_MASK                       0x1
+
+/* Header descriptor format */
+#define HEADER_TYPE                            1
+#define HEADER_TOGGLE_SHIFT                    58
+#define HEADER_TOGGLE_MASK                     0x1
+#define HEADER_ENDPKT_SHIFT                    57
+#define HEADER_ENDPKT_MASK                     0x1
+#define HEADER_STARTPKT_SHIFT                  56
+#define HEADER_STARTPKT_MASK                   0x1
+#define HEADER_BDCOUNT_SHIFT                   36
+#define HEADER_BDCOUNT_MASK                    0x1f
+#define HEADER_BDCOUNT_MAX                     HEADER_BDCOUNT_MASK
+#define HEADER_FLAGS_SHIFT                     16
+#define HEADER_FLAGS_MASK                      0xffff
+#define HEADER_OPAQUE_SHIFT                    0
+#define HEADER_OPAQUE_MASK                     0xffff
+
+/* Source (SRC) descriptor format */
+#define SRC_TYPE                               2
+#define SRC_LENGTH_SHIFT                       44
+#define SRC_LENGTH_MASK                                0xffff
+#define SRC_ADDR_SHIFT                         0
+#define SRC_ADDR_MASK                          0x00000fffffffffff
+
+/* Destination (DST) descriptor format */
+#define DST_TYPE                               3
+#define DST_LENGTH_SHIFT                       44
+#define DST_LENGTH_MASK                                0xffff
+#define DST_ADDR_SHIFT                         0
+#define DST_ADDR_MASK                          0x00000fffffffffff
+
+/* Immediate (IMM) descriptor format */
+#define IMM_TYPE                               4
+#define IMM_DATA_SHIFT                         0
+#define IMM_DATA_MASK                          0x0fffffffffffffff
+
+/* Next pointer (NPTR) descriptor format */
+#define NPTR_TYPE                              5
+#define NPTR_TOGGLE_SHIFT                      58
+#define NPTR_TOGGLE_MASK                       0x1
+#define NPTR_ADDR_SHIFT                                0
+#define NPTR_ADDR_MASK                         0x00000fffffffffff
+
+/* Mega source (MSRC) descriptor format */
+#define MSRC_TYPE                              6
+#define MSRC_LENGTH_SHIFT                      44
+#define MSRC_LENGTH_MASK                       0xffff
+#define MSRC_ADDR_SHIFT                                0
+#define MSRC_ADDR_MASK                         0x00000fffffffffff
+
+/* Mega destination (MDST) descriptor format */
+#define MDST_TYPE                              7
+#define MDST_LENGTH_SHIFT                      44
+#define MDST_LENGTH_MASK                       0xffff
+#define MDST_ADDR_SHIFT                                0
+#define MDST_ADDR_MASK                         0x00000fffffffffff
+
+/* Source with tlast (SRCT) descriptor format */
+#define SRCT_TYPE                              8
+#define SRCT_LENGTH_SHIFT                      44
+#define SRCT_LENGTH_MASK                       0xffff
+#define SRCT_ADDR_SHIFT                                0
+#define SRCT_ADDR_MASK                         0x00000fffffffffff
+
+/* Destination with tlast (DSTT) descriptor format */
+#define DSTT_TYPE                              9
+#define DSTT_LENGTH_SHIFT                      44
+#define DSTT_LENGTH_MASK                       0xffff
+#define DSTT_ADDR_SHIFT                                0
+#define DSTT_ADDR_MASK                         0x00000fffffffffff
+
+/* Immediate with tlast (IMMT) descriptor format */
+#define IMMT_TYPE                              10
+#define IMMT_DATA_SHIFT                                0
+#define IMMT_DATA_MASK                         0x0fffffffffffffff
+
+/* Descriptor helper macros */
+#define DESC_DEC(_d, _s, _m)                   (((_d) >> (_s)) & (_m))
+#define DESC_ENC(_d, _v, _s, _m)               \
+                       do { \
+                               (_d) &= ~((u64)(_m) << (_s)); \
+                               (_d) |= (((u64)(_v) & (_m)) << (_s)); \
+                       } while (0)
+
+/* ====== FlexRM data structures ===== */
+
+/*
+ * Per-ring state. One instance exists for each FlexRM ring; the file header
+ * states that each mailbox channel represents a separate ring.
+ *
+ * requests[] has one slot per possible in-flight request
+ * (RING_MAX_REQ_COUNT). The "Protected members" group sits after @lock and
+ * is presumably guarded by it - NOTE(review): confirm against the users.
+ */
+struct flexrm_ring {
+       /* Unprotected members */
+       int num;
+       struct flexrm_mbox *mbox;
+       void __iomem *regs;
+       bool irq_requested;
+       unsigned int irq;
+       unsigned int msi_timer_val;
+       unsigned int msi_count_threshold;
+       struct ida requests_ida;
+       struct brcm_message *requests[RING_MAX_REQ_COUNT];
+       /* BD area: CPU pointer, DMA address and current write offset */
+       void *bd_base;
+       dma_addr_t bd_dma_base;
+       u32 bd_write_offset;
+       /* Completion area: CPU pointer and DMA address */
+       void *cmpl_base;
+       dma_addr_t cmpl_dma_base;
+       /* Protected members */
+       spinlock_t lock;
+       struct brcm_message *last_pending_msg;
+       u32 cmpl_read_offset;
+};
+
+/*
+ * Controller-wide state: the owning device, the mapped register base, the
+ * array of @num_rings rings, the DMA pools for BD and completion memory, and
+ * the embedded mailbox controller.
+ */
+struct flexrm_mbox {
+       struct device *dev;
+       void __iomem *regs;
+       u32 num_rings;
+       struct flexrm_ring *rings;
+       struct dma_pool *bd_pool;
+       struct dma_pool *cmpl_pool;
+       struct mbox_controller controller;
+};
+
+/* ====== FlexRM ring descriptor helper routines ===== */
+
+/* Load the 64-bit little-endian descriptor at @desc_ptr as a CPU value */
+static u64 flexrm_read_desc(void *desc_ptr)
+{
+       u64 *slot = (u64 *)desc_ptr;
+
+       return le64_to_cpu(*slot);
+}
+
+/* Store @desc at @desc_ptr in little-endian byte order */
+static void flexrm_write_desc(void *desc_ptr, u64 desc)
+{
+       u64 *slot = (u64 *)desc_ptr;
+
+       *slot = cpu_to_le64(desc);
+}
+
+/* Extract the request id, carried in the OPAQUE field of a completion */
+static u32 flexrm_cmpl_desc_to_reqid(u64 cmpl_desc)
+{
+       return (u32)DESC_DEC(cmpl_desc, CMPL_OPAQUE_SHIFT, CMPL_OPAQUE_MASK);
+}
+
+/*
+ * Translate a completion descriptor into an errno.
+ *
+ * Returns -EIO when any DME error bit is set, -ETIMEDOUT when the RM status
+ * code is RM_STATUS_CODE_AE_TIMEOUT, and 0 otherwise. The DME status is
+ * checked first.
+ */
+static int flexrm_cmpl_desc_to_error(u64 cmpl_desc)
+{
+       u32 dme_status, rm_code;
+
+       dme_status = DESC_DEC(cmpl_desc, CMPL_DME_STATUS_SHIFT,
+                             CMPL_DME_STATUS_MASK);
+       if (dme_status & DME_STATUS_ERROR_MASK)
+               return -EIO;
+
+       rm_code = DESC_DEC(cmpl_desc, CMPL_RM_STATUS_SHIFT,
+                          CMPL_RM_STATUS_MASK);
+       rm_code &= RM_STATUS_CODE_MASK;
+
+       return (rm_code == RM_STATUS_CODE_AE_TIMEOUT) ? -ETIMEDOUT : 0;
+}
+
+/* True when the descriptor at @desc_ptr has type NPTR (next pointer) */
+static bool flexrm_is_next_table_desc(void *desc_ptr)
+{
+       u64 desc = flexrm_read_desc(desc_ptr);
+
+       return DESC_DEC(desc, DESC_TYPE_SHIFT, DESC_TYPE_MASK) == NPTR_TYPE;
+}
+
+/* Build an NPTR descriptor carrying @toggle and the address @next_addr */
+static u64 flexrm_next_table_desc(u32 toggle, dma_addr_t next_addr)
+{
+       u64 nptr = 0;
+
+       /* The three fields occupy disjoint bit ranges */
+       DESC_ENC(nptr, next_addr, NPTR_ADDR_SHIFT, NPTR_ADDR_MASK);
+       DESC_ENC(nptr, toggle, NPTR_TOGGLE_SHIFT, NPTR_TOGGLE_MASK);
+       DESC_ENC(nptr, NPTR_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+
+       return nptr;
+}
+
+/* Build a NULL descriptor whose only payload is the @toggle bit */
+static u64 flexrm_null_desc(u32 toggle)
+{
+       u64 null_desc = 0;
+
+       DESC_ENC(null_desc, toggle, NULL_TOGGLE_SHIFT, NULL_TOGGLE_MASK);
+       DESC_ENC(null_desc, NULL_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+
+       return null_desc;
+}
+
+/*
+ * Number of HEADER descriptors needed for @nhcnt non-HEADER descriptors.
+ *
+ * flexrm_enqueue_desc() emits a HEADER whenever the non-HEADER position is a
+ * multiple of HEADER_BDCOUNT_MAX, so the count is nhcnt / HEADER_BDCOUNT_MAX
+ * rounded up: a partial final group still needs its own HEADER, while an
+ * exact multiple needs no extra one.
+ */
+static u32 flexrm_estimate_header_desc_count(u32 nhcnt)
+{
+       u32 hcnt = nhcnt / HEADER_BDCOUNT_MAX;
+
+       /* Round up when the last group is only partially filled */
+       if (nhcnt % HEADER_BDCOUNT_MAX)
+               hcnt += 1;
+
+       return hcnt;
+}
+
+/* Invert the TOGGLE bit of the HEADER descriptor stored at @desc_ptr */
+static void flexrm_flip_header_toogle(void *desc_ptr)
+{
+       u64 hdr = flexrm_read_desc(desc_ptr);
+
+       /* XOR clears the bit when it is set and sets it when it is clear */
+       hdr ^= ((u64)0x1 << HEADER_TOGGLE_SHIFT);
+
+       flexrm_write_desc(desc_ptr, hdr);
+}
+
+/*
+ * Build a HEADER descriptor from its fields. Each argument is masked to its
+ * field width by DESC_ENC(); the fields occupy disjoint bit ranges.
+ */
+static u64 flexrm_header_desc(u32 toggle, u32 startpkt, u32 endpkt,
+                              u32 bdcount, u32 flags, u32 opaque)
+{
+       u64 hdr = 0;
+
+       /* Filled from the lowest field upwards */
+       DESC_ENC(hdr, opaque, HEADER_OPAQUE_SHIFT, HEADER_OPAQUE_MASK);
+       DESC_ENC(hdr, flags, HEADER_FLAGS_SHIFT, HEADER_FLAGS_MASK);
+       DESC_ENC(hdr, bdcount, HEADER_BDCOUNT_SHIFT, HEADER_BDCOUNT_MASK);
+       DESC_ENC(hdr, startpkt, HEADER_STARTPKT_SHIFT, HEADER_STARTPKT_MASK);
+       DESC_ENC(hdr, endpkt, HEADER_ENDPKT_SHIFT, HEADER_ENDPKT_MASK);
+       DESC_ENC(hdr, toggle, HEADER_TOGGLE_SHIFT, HEADER_TOGGLE_MASK);
+       DESC_ENC(hdr, HEADER_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+
+       return hdr;
+}
+
+/*
+ * Move *desc_ptr past the slot that was just written, wrapping from
+ * @end_desc back to @start_desc, then step over any next-table (NPTR)
+ * descriptors, inverting *toggle once for each one skipped.
+ */
+static void flexrm_desc_ptr_advance(void **desc_ptr, u32 *toggle,
+                                    void *start_desc, void *end_desc)
+{
+       /* Point to next descriptor */
+       *desc_ptr += sizeof(u64);
+       if (*desc_ptr == end_desc)
+               *desc_ptr = start_desc;
+
+       /* Skip next pointer descriptors */
+       while (flexrm_is_next_table_desc(*desc_ptr)) {
+               *toggle = (*toggle) ? 0 : 1;
+               *desc_ptr += sizeof(u64);
+               if (*desc_ptr == end_desc)
+                       *desc_ptr = start_desc;
+       }
+}
+
+/*
+ * Write one non-HEADER descriptor into the ring, preceded by a HEADER
+ * descriptor when @nhpos starts a new group of HEADER_BDCOUNT_MAX.
+ *
+ * @nhpos:      zero-based position of @desc among the request's non-HEADER
+ *              descriptors
+ * @nhcnt:      total number of non-HEADER descriptors in the request
+ * @reqid:      request id placed in the OPAQUE field of generated HEADERs
+ * @desc:       the non-HEADER descriptor to write
+ * @desc_ptr:   in/out cursor into the ring; advanced past what was written
+ * @toggle:     in/out toggle state; inverted for every NPTR slot skipped
+ * @start_desc: first slot of the ring
+ * @end_desc:   one past the last slot of the ring
+ *
+ * Does nothing when @nhpos is not below @nhcnt.
+ */
+static void flexrm_enqueue_desc(u32 nhpos, u32 nhcnt, u32 reqid,
+                                u64 desc, void **desc_ptr, u32 *toggle,
+                                void *start_desc, void *end_desc)
+{
+       u64 d;
+       u32 nhavail, _toggle, _startpkt, _endpkt, _bdcount;
+
+       /* Sanity check */
+       if (nhcnt <= nhpos)
+               return;
+
+       /*
+        * Each request or packet starts with a HEADER descriptor followed
+        * by one or more non-HEADER descriptors (SRC, SRCT, MSRC, DST,
+        * DSTT, MDST, IMM, and IMMT). The number of non-HEADER descriptors
+        * following a HEADER descriptor is represented by BDCOUNT field
+        * of HEADER descriptor. The max value of BDCOUNT field is 31 which
+        * means we can only have 31 non-HEADER descriptors following one
+        * HEADER descriptor.
+        *
+        * In general use, number of non-HEADER descriptors can easily go
+        * beyond 31. To tackle this situation, we have packet (or request)
+        * extension bits (STARTPKT and ENDPKT) in the HEADER descriptor.
+        *
+        * To use packet extension, the first HEADER descriptor of request
+        * (or packet) will have STARTPKT=1 and ENDPKT=0. The intermediate
+        * HEADER descriptors will have STARTPKT=0 and ENDPKT=0. The last
+        * HEADER descriptor will have STARTPKT=0 and ENDPKT=1. Also, the
+        * TOGGLE bit of the first HEADER will be set to invalid state to
+        * ensure that FlexRM does not start fetching descriptors till all
+        * descriptors are enqueued. The user of this function will flip
+        * the TOGGLE bit of first HEADER after all descriptors are
+        * enqueued.
+        */
+
+       /* nhcnt > nhpos is guaranteed by the sanity check above */
+       if (nhpos % HEADER_BDCOUNT_MAX == 0) {
+               /* Prepare the header descriptor */
+               nhavail = (nhcnt - nhpos);
+               _toggle = (nhpos == 0) ? !(*toggle) : (*toggle);
+               _startpkt = (nhpos == 0) ? 0x1 : 0x0;
+               _endpkt = (nhavail <= HEADER_BDCOUNT_MAX) ? 0x1 : 0x0;
+               _bdcount = (nhavail <= HEADER_BDCOUNT_MAX) ?
+                               nhavail : HEADER_BDCOUNT_MAX;
+               d = flexrm_header_desc(_toggle, _startpkt, _endpkt,
+                                       _bdcount, 0x0, reqid);
+
+               /* Write header descriptor */
+               flexrm_write_desc(*desc_ptr, d);
+
+               flexrm_desc_ptr_advance(desc_ptr, toggle,
+                                       start_desc, end_desc);
+       }
+
+       /* Write desired descriptor */
+       flexrm_write_desc(*desc_ptr, desc);
+
+       flexrm_desc_ptr_advance(desc_ptr, toggle, start_desc, end_desc);
+}
+
+/*
+ * Build a SRC descriptor word: the SRC_TYPE tag, @length and @addr are
+ * each packed into their own field of a zeroed 64-bit word via DESC_ENC().
+ */
+static u64 flexrm_src_desc(dma_addr_t addr, unsigned int length)
+{
+       u64 word = 0;
+
+       /* Descriptor type tag */
+       DESC_ENC(word, SRC_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+       /* Payload fields: length, then address */
+       DESC_ENC(word, length, SRC_LENGTH_SHIFT, SRC_LENGTH_MASK);
+       DESC_ENC(word, addr, SRC_ADDR_SHIFT, SRC_ADDR_MASK);
+
+       return word;
+}
+
+/*
+ * Build an MSRC descriptor word from the MSRC_TYPE tag, @length_div_16
+ * and @addr. The length is encoded exactly as passed; the SPU caller in
+ * this file supplies the DMA length already divided by 16.
+ */
+static u64 flexrm_msrc_desc(dma_addr_t addr, unsigned int length_div_16)
+{
+       u64 word = 0;
+
+       /* Descriptor type tag */
+       DESC_ENC(word, MSRC_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+       /* Payload fields: scaled length, then address */
+       DESC_ENC(word, length_div_16, MSRC_LENGTH_SHIFT, MSRC_LENGTH_MASK);
+       DESC_ENC(word, addr, MSRC_ADDR_SHIFT, MSRC_ADDR_MASK);
+
+       return word;
+}
+
+/*
+ * Build a DST descriptor word: the DST_TYPE tag, @length and @addr are
+ * each packed into their own field of a zeroed 64-bit word via DESC_ENC().
+ */
+static u64 flexrm_dst_desc(dma_addr_t addr, unsigned int length)
+{
+       u64 word = 0;
+
+       /* Descriptor type tag */
+       DESC_ENC(word, DST_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+       /* Payload fields: length, then address */
+       DESC_ENC(word, length, DST_LENGTH_SHIFT, DST_LENGTH_MASK);
+       DESC_ENC(word, addr, DST_ADDR_SHIFT, DST_ADDR_MASK);
+
+       return word;
+}
+
+/*
+ * Build an MDST descriptor word from the MDST_TYPE tag, @length_div_16
+ * and @addr. The length is encoded exactly as passed; the SPU caller in
+ * this file supplies the DMA length already divided by 16.
+ */
+static u64 flexrm_mdst_desc(dma_addr_t addr, unsigned int length_div_16)
+{
+       u64 word = 0;
+
+       /* Descriptor type tag */
+       DESC_ENC(word, MDST_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+       /* Payload fields: scaled length, then address */
+       DESC_ENC(word, length_div_16, MDST_LENGTH_SHIFT, MDST_LENGTH_MASK);
+       DESC_ENC(word, addr, MDST_ADDR_SHIFT, MDST_ADDR_MASK);
+
+       return word;
+}
+
+/*
+ * Build an IMM (immediate data) descriptor word carrying @data in the
+ * IMM data field, tagged with IMM_TYPE.
+ */
+static u64 flexrm_imm_desc(u64 data)
+{
+       u64 word = 0;
+
+       /* Descriptor type tag */
+       DESC_ENC(word, IMM_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+       /* Immediate payload */
+       DESC_ENC(word, data, IMM_DATA_SHIFT, IMM_DATA_MASK);
+
+       return word;
+}
+
+/*
+ * Build a SRCT descriptor word (used by the SBA path in this file as the
+ * "source with tlast" descriptor) from the SRCT_TYPE tag, @length and
+ * @addr.
+ */
+static u64 flexrm_srct_desc(dma_addr_t addr, unsigned int length)
+{
+       u64 word = 0;
+
+       /* Descriptor type tag */
+       DESC_ENC(word, SRCT_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+       /* Payload fields: length, then address */
+       DESC_ENC(word, length, SRCT_LENGTH_SHIFT, SRCT_LENGTH_MASK);
+       DESC_ENC(word, addr, SRCT_ADDR_SHIFT, SRCT_ADDR_MASK);
+
+       return word;
+}
+
+/*
+ * Build a DSTT descriptor word (used by the SBA path in this file as the
+ * "destination with tlast" descriptor) from the DSTT_TYPE tag, @length
+ * and @addr.
+ */
+static u64 flexrm_dstt_desc(dma_addr_t addr, unsigned int length)
+{
+       u64 word = 0;
+
+       /* Descriptor type tag */
+       DESC_ENC(word, DSTT_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+       /* Payload fields: length, then address */
+       DESC_ENC(word, length, DSTT_LENGTH_SHIFT, DSTT_LENGTH_MASK);
+       DESC_ENC(word, addr, DSTT_ADDR_SHIFT, DSTT_ADDR_MASK);
+
+       return word;
+}
+
+/*
+ * Build an IMMT descriptor word (used by the SBA path in this file as
+ * the "immediate with tlast" descriptor) carrying @data, tagged with
+ * IMMT_TYPE.
+ */
+static u64 flexrm_immt_desc(u64 data)
+{
+       u64 word = 0;
+
+       /* Descriptor type tag */
+       DESC_ENC(word, IMMT_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+       /* Immediate payload */
+       DESC_ENC(word, data, IMMT_DATA_SHIFT, IMMT_DATA_MASK);
+
+       return word;
+}
+
+/*
+ * Check that an SPU message can be expressed as FlexRM descriptors.
+ *
+ * Both scatterlists must be present. For every entry, a length that is
+ * not a multiple of 16 must not exceed the SRC/DST length mask, and a
+ * length that is a multiple of 16 must not exceed 16 times the MSRC/MDST
+ * length mask.
+ *
+ * Returns true when the message passes all checks.
+ */
+static bool flexrm_spu_sanity_check(struct brcm_message *msg)
+{
+       struct scatterlist *entry;
+
+       if (!msg->spu.src || !msg->spu.dst)
+               return false;
+
+       /* Source entries */
+       for (entry = msg->spu.src; entry; entry = sg_next(entry)) {
+               if ((entry->length & 0xf) &&
+                   (entry->length > SRC_LENGTH_MASK))
+                       return false;
+               if (!(entry->length & 0xf) &&
+                   (entry->length > (MSRC_LENGTH_MASK * 16)))
+                       return false;
+       }
+
+       /* Destination entries */
+       for (entry = msg->spu.dst; entry; entry = sg_next(entry)) {
+               if ((entry->length & 0xf) &&
+                   (entry->length > DST_LENGTH_MASK))
+                       return false;
+               if (!(entry->length & 0xf) &&
+                   (entry->length > (MDST_LENGTH_MASK * 16)))
+                       return false;
+       }
+
+       return true;
+}
+
+/*
+ * Count the non-header descriptors an SPU message will need.
+ *
+ * Mirrors the interleaving used when descriptors are written: one
+ * descriptor per source entry, followed by one descriptor per
+ * destination entry until the destination lengths add up to the source
+ * entry's length. Once the source list is exhausted the budget becomes
+ * UINT_MAX so the remaining destination entries are counted as well.
+ */
+static u32 flexrm_spu_estimate_nonheader_desc_count(struct brcm_message *msg)
+{
+       struct scatterlist *src = msg->spu.src;
+       struct scatterlist *dst = msg->spu.dst;
+       unsigned int budget;
+       u32 total = 0;
+
+       while (src || dst) {
+               if (!src) {
+                       budget = UINT_MAX;
+               } else {
+                       total++;
+                       budget = src->length;
+                       src = sg_next(src);
+               }
+
+               /* Destination entries covered by the current budget */
+               for (; budget && dst; dst = sg_next(dst)) {
+                       total++;
+                       budget = (dst->length < budget) ?
+                                       budget - dst->length : 0;
+               }
+       }
+
+       return total;
+}
+
+/*
+ * DMA-map both scatterlists of an SPU message.
+ *
+ * The source list is mapped DMA_TO_DEVICE and the destination list
+ * DMA_FROM_DEVICE. If the destination mapping fails the source mapping
+ * is undone so nothing stays mapped on error.
+ *
+ * Returns 0 on success or -EIO when either list cannot be mapped.
+ */
+static int flexrm_spu_dma_map(struct device *dev, struct brcm_message *msg)
+{
+       int rc;
+
+       /*
+        * dma_map_sg() reports failure by returning 0 (the number of
+        * mapped entries), never a negative errno, so test for zero.
+        */
+       rc = dma_map_sg(dev, msg->spu.src, sg_nents(msg->spu.src),
+                       DMA_TO_DEVICE);
+       if (!rc)
+               return -EIO;
+
+       rc = dma_map_sg(dev, msg->spu.dst, sg_nents(msg->spu.dst),
+                       DMA_FROM_DEVICE);
+       if (!rc) {
+               dma_unmap_sg(dev, msg->spu.src, sg_nents(msg->spu.src),
+                            DMA_TO_DEVICE);
+               return -EIO;
+       }
+
+       return 0;
+}
+
+/*
+ * Undo flexrm_spu_dma_map(): unmap the destination list
+ * (DMA_FROM_DEVICE) and then the source list (DMA_TO_DEVICE).
+ */
+static void flexrm_spu_dma_unmap(struct device *dev, struct brcm_message *msg)
+{
+       struct scatterlist *dst = msg->spu.dst;
+       struct scatterlist *src = msg->spu.src;
+
+       dma_unmap_sg(dev, dst, sg_nents(dst), DMA_FROM_DEVICE);
+       dma_unmap_sg(dev, src, sg_nents(src), DMA_TO_DEVICE);
+}
+
+/*
+ * Write the descriptors for one SPU message into the BD ring.
+ *
+ * The source and destination scatterlists are walked together: each
+ * source entry is enqueued first, followed by destination entries until
+ * their DMA lengths add up to that source entry's DMA length. Entries
+ * whose DMA length is a multiple of 16 use the MSRC/MDST form (length
+ * divided by 16); all others use the SRC/DST form.
+ *
+ * @nhcnt and @reqid are passed through to flexrm_enqueue_desc(), which
+ * also advances desc_ptr/toggle between @start_desc and @end_desc.
+ *
+ * Returns the ring position after the last enqueued descriptor, which
+ * is where the terminating null descriptor has been written.
+ */
+static void *flexrm_spu_write_descs(struct brcm_message *msg, u32 nhcnt,
+                                    u32 reqid, void *desc_ptr, u32 toggle,
+                                    void *start_desc, void *end_desc)
+{
+       u64 d;
+       u32 nhpos = 0;
+       /* Remember where the first HEADER lands so it can be flipped last */
+       void *orig_desc_ptr = desc_ptr;
+       unsigned int dst_target = 0;
+       struct scatterlist *src_sg = msg->spu.src, *dst_sg = msg->spu.dst;
+
+       while (src_sg || dst_sg) {
+               if (src_sg) {
+                       /* Length not a multiple of 16: plain SRC form */
+                       if (sg_dma_len(src_sg) & 0xf)
+                               d = flexrm_src_desc(sg_dma_address(src_sg),
+                                                    sg_dma_len(src_sg));
+                       else
+                               d = flexrm_msrc_desc(sg_dma_address(src_sg),
+                                                     sg_dma_len(src_sg)/16);
+                       flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+                                            d, &desc_ptr, &toggle,
+                                            start_desc, end_desc);
+                       nhpos++;
+                       /* Destination bytes to emit before the next source */
+                       dst_target = sg_dma_len(src_sg);
+                       src_sg = sg_next(src_sg);
+               } else
+                       /* No source left: let remaining destinations drain */
+                       dst_target = UINT_MAX;
+
+               while (dst_target && dst_sg) {
+                       /* Length not a multiple of 16: plain DST form */
+                       if (sg_dma_len(dst_sg) & 0xf)
+                               d = flexrm_dst_desc(sg_dma_address(dst_sg),
+                                                    sg_dma_len(dst_sg));
+                       else
+                               d = flexrm_mdst_desc(sg_dma_address(dst_sg),
+                                                     sg_dma_len(dst_sg)/16);
+                       flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+                                            d, &desc_ptr, &toggle,
+                                            start_desc, end_desc);
+                       nhpos++;
+                       /* Saturating subtraction of this entry's length */
+                       if (sg_dma_len(dst_sg) < dst_target)
+                               dst_target -= sg_dma_len(dst_sg);
+                       else
+                               dst_target = 0;
+                       dst_sg = sg_next(dst_sg);
+               }
+       }
+
+       /* Null descriptor with invalid toggle bit */
+       flexrm_write_desc(desc_ptr, flexrm_null_desc(!toggle));
+
+       /* Ensure that descriptors have been written to memory */
+       wmb();
+
+       /*
+        * Flip toggle bit in header. The first HEADER was enqueued with an
+        * invalid toggle; flipping it only after the barrier publishes the
+        * fully written request.
+        */
+       flexrm_flip_header_toogle(orig_desc_ptr);
+
+       return desc_ptr;
+}
+
+/*
+ * Check that an SBA message can be expressed as FlexRM descriptors.
+ *
+ * The command array must be present and non-empty. A command is
+ * rejected when:
+ *  - it is TYPE_B or TYPE_C and also has HAS_OUTPUT set,
+ *  - it is TYPE_B or TYPE_C and data_len exceeds SRCT_LENGTH_MASK,
+ *  - it has HAS_RESP set and resp_len exceeds DSTT_LENGTH_MASK,
+ *  - it has HAS_OUTPUT set and data_len exceeds DSTT_LENGTH_MASK.
+ *
+ * Returns true when every command passes.
+ */
+static bool flexrm_sba_sanity_check(struct brcm_message *msg)
+{
+       struct brcm_sba_command *cmd;
+       bool takes_src, has_output;
+       u32 idx;
+
+       if (!msg->sba.cmds || !msg->sba.cmds_count)
+               return false;
+
+       for (idx = 0; idx < msg->sba.cmds_count; idx++) {
+               cmd = &msg->sba.cmds[idx];
+               takes_src = (cmd->flags & BRCM_SBA_CMD_TYPE_B) ||
+                           (cmd->flags & BRCM_SBA_CMD_TYPE_C);
+               has_output = (cmd->flags & BRCM_SBA_CMD_HAS_OUTPUT) != 0;
+
+               if (takes_src && has_output)
+                       return false;
+               if (takes_src && (cmd->data_len > SRCT_LENGTH_MASK))
+                       return false;
+               if ((cmd->flags & BRCM_SBA_CMD_HAS_RESP) &&
+                   (cmd->resp_len > DSTT_LENGTH_MASK))
+                       return false;
+               if (has_output && (cmd->data_len > DSTT_LENGTH_MASK))
+                       return false;
+       }
+
+       return true;
+}
+
+/*
+ * Count the non-header descriptors an SBA message will need: one per
+ * command, plus one more for each of "TYPE_B or TYPE_C", HAS_RESP and
+ * HAS_OUTPUT that applies to the command.
+ */
+static u32 flexrm_sba_estimate_nonheader_desc_count(struct brcm_message *msg)
+{
+       struct brcm_sba_command *cmd;
+       u32 idx, total = 0;
+
+       for (idx = 0; idx < msg->sba.cmds_count; idx++) {
+               cmd = &msg->sba.cmds[idx];
+
+               /* The command descriptor itself */
+               total++;
+
+               /* Source data descriptor */
+               if ((cmd->flags & BRCM_SBA_CMD_TYPE_B) ||
+                   (cmd->flags & BRCM_SBA_CMD_TYPE_C))
+                       total++;
+
+               /* Response descriptor */
+               if (cmd->flags & BRCM_SBA_CMD_HAS_RESP)
+                       total++;
+
+               /* Output descriptor */
+               if (cmd->flags & BRCM_SBA_CMD_HAS_OUTPUT)
+                       total++;
+       }
+
+       return total;
+}
+
+/*
+ * Write the descriptors for one SBA message into the BD ring.
+ *
+ * For each command, in this order: an optional response descriptor, an
+ * optional output descriptor, the command itself as an immediate
+ * descriptor, and an optional source data descriptor.
+ *
+ * @nhcnt and @reqid are passed through to flexrm_enqueue_desc(), which
+ * also advances desc_ptr/toggle between @start_desc and @end_desc.
+ *
+ * Returns the ring position after the last enqueued descriptor, which
+ * is where the terminating null descriptor has been written.
+ */
+static void *flexrm_sba_write_descs(struct brcm_message *msg, u32 nhcnt,
+                                    u32 reqid, void *desc_ptr, u32 toggle,
+                                    void *start_desc, void *end_desc)
+{
+       u64 d;
+       u32 i, nhpos = 0;
+       struct brcm_sba_command *c;
+       /* Remember where the first HEADER lands so it can be flipped last */
+       void *orig_desc_ptr = desc_ptr;
+
+       /* Convert SBA commands into descriptors */
+       for (i = 0; i < msg->sba.cmds_count; i++) {
+               c = &msg->sba.cmds[i];
+
+               /*
+                * The response uses the plain DST form when an output
+                * descriptor follows it, and the tlast (DSTT) form when it
+                * is the only destination of this command.
+                */
+               if ((c->flags & BRCM_SBA_CMD_HAS_RESP) &&
+                   (c->flags & BRCM_SBA_CMD_HAS_OUTPUT)) {
+                       /* Destination response descriptor */
+                       d = flexrm_dst_desc(c->resp, c->resp_len);
+                       flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+                                            d, &desc_ptr, &toggle,
+                                            start_desc, end_desc);
+                       nhpos++;
+               } else if (c->flags & BRCM_SBA_CMD_HAS_RESP) {
+                       /* Destination response with tlast descriptor */
+                       d = flexrm_dstt_desc(c->resp, c->resp_len);
+                       flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+                                            d, &desc_ptr, &toggle,
+                                            start_desc, end_desc);
+                       nhpos++;
+               }
+
+               if (c->flags & BRCM_SBA_CMD_HAS_OUTPUT) {
+                       /* Destination with tlast descriptor */
+                       d = flexrm_dstt_desc(c->data, c->data_len);
+                       flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+                                            d, &desc_ptr, &toggle,
+                                            start_desc, end_desc);
+                       nhpos++;
+               }
+
+               /*
+                * TYPE_B commands use the plain immediate form (a source
+                * descriptor is enqueued after it below); every other
+                * command uses the tlast form.
+                */
+               if (c->flags & BRCM_SBA_CMD_TYPE_B) {
+                       /* Command as immediate descriptor */
+                       d = flexrm_imm_desc(c->cmd);
+                       flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+                                            d, &desc_ptr, &toggle,
+                                            start_desc, end_desc);
+                       nhpos++;
+               } else {
+                       /* Command as immediate descriptor with tlast */
+                       d = flexrm_immt_desc(c->cmd);
+                       flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+                                            d, &desc_ptr, &toggle,
+                                            start_desc, end_desc);
+                       nhpos++;
+               }
+
+               if ((c->flags & BRCM_SBA_CMD_TYPE_B) ||
+                   (c->flags & BRCM_SBA_CMD_TYPE_C)) {
+                       /* Source with tlast descriptor */
+                       d = flexrm_srct_desc(c->data, c->data_len);
+                       flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+                                            d, &desc_ptr, &toggle,
+                                            start_desc, end_desc);
+                       nhpos++;
+               }
+       }
+
+       /* Null descriptor with invalid toggle bit */
+       flexrm_write_desc(desc_ptr, flexrm_null_desc(!toggle));
+
+       /* Ensure that descriptors have been written to memory */
+       wmb();
+
+       /*
+        * Flip toggle bit in header. The first HEADER was enqueued with an
+        * invalid toggle; flipping it only after the barrier publishes the
+        * fully written request.
+        */
+       flexrm_flip_header_toogle(orig_desc_ptr);
+
+       return desc_ptr;
+}
+
+/*
+ * Validate a message by dispatching on its type to the SPU or SBA
+ * checker. A NULL message or any other message type is rejected.
+ */
+static bool flexrm_sanity_check(struct brcm_message *msg)
+{
+       if (!msg)
+               return false;
+
+       if (msg->type == BRCM_MESSAGE_SPU)
+               return flexrm_spu_sanity_check(msg);
+
+       if (msg->type == BRCM_MESSAGE_SBA)
+               return flexrm_sba_sanity_check(msg);
+
+       return false;
+}
+
+/*
+ * Return the number of non-header descriptors a message needs by
+ * dispatching on its type. A NULL message or an unsupported type
+ * yields 0.
+ */
+static u32 flexrm_estimate_nonheader_desc_count(struct brcm_message *msg)
+{
+       if (!msg)
+               return 0;
+
+       if (msg->type == BRCM_MESSAGE_SPU)
+               return flexrm_spu_estimate_nonheader_desc_count(msg);
+
+       if (msg->type == BRCM_MESSAGE_SBA)
+               return flexrm_sba_estimate_nonheader_desc_count(msg);
+
+       return 0;
+}
+
+/*
+ * DMA-map a message's buffers. Only SPU messages are mapped here; every
+ * other message type is a no-op that reports success.
+ *
+ * Returns -EINVAL when @dev or @msg is NULL, otherwise the result of the
+ * SPU mapping or 0.
+ */
+static int flexrm_dma_map(struct device *dev, struct brcm_message *msg)
+{
+       if (!dev || !msg)
+               return -EINVAL;
+
+       if (msg->type == BRCM_MESSAGE_SPU)
+               return flexrm_spu_dma_map(dev, msg);
+
+       return 0;
+}
+
+/*
+ * Undo flexrm_dma_map(). Only SPU messages are unmapped; a NULL @dev or
+ * @msg, or any other message type, is a no-op.
+ */
+static void flexrm_dma_unmap(struct device *dev, struct brcm_message *msg)
+{
+       if (!dev || !msg)
+               return;
+
+       if (msg->type == BRCM_MESSAGE_SPU)
+               flexrm_spu_dma_unmap(dev, msg);
+}
+
+/*
+ * Write a message's descriptors into the ring by dispatching on the
+ * message type.
+ *
+ * Returns the new descriptor write position on success, or an ERR_PTR:
+ * -ENOTSUPP for a NULL argument or unsupported message type, -ERANGE
+ * when @desc_ptr lies outside [@start_desc, @end_desc).
+ */
+static void *flexrm_write_descs(struct brcm_message *msg, u32 nhcnt,
+                               u32 reqid, void *desc_ptr, u32 toggle,
+                               void *start_desc, void *end_desc)
+{
+       if (!msg || !desc_ptr || !start_desc || !end_desc)
+               return ERR_PTR(-ENOTSUPP);
+
+       if (desc_ptr < start_desc || desc_ptr >= end_desc)
+               return ERR_PTR(-ERANGE);
+
+       if (msg->type == BRCM_MESSAGE_SPU)
+               return flexrm_spu_write_descs(msg, nhcnt, reqid,
+                                              desc_ptr, toggle,
+                                              start_desc, end_desc);
+
+       if (msg->type == BRCM_MESSAGE_SBA)
+               return flexrm_sba_write_descs(msg, nhcnt, reqid,
+                                              desc_ptr, toggle,
+                                              start_desc, end_desc);
+
+       return ERR_PTR(-ENOTSUPP);
+}
+
+/* ====== FlexRM driver helper routines ===== */
+
+/*
+ * Queue one message on a ring.
+ *
+ * @ring:      ring to queue on
+ * @batch_msg: enclosing batch message, or NULL when @msg is standalone
+ * @msg:       message to convert into descriptors
+ *
+ * When no request id or not enough free descriptors are available the
+ * message is not queued. Instead it (or @batch_msg when given) is parked
+ * in ring->last_pending_msg, from where flexrm_process_completions()
+ * resubmits it, and 0 is returned.
+ *
+ * Returns 0 when queued or parked, -EIO when the sanity check fails,
+ * -ENOSPC when a message is already parked, or the error reported by
+ * DMA mapping or descriptor writing. Except for the sanity-check and
+ * request-id paths, the result is also stored in msg->error.
+ */
+static int flexrm_new_request(struct flexrm_ring *ring,
+                               struct brcm_message *batch_msg,
+                               struct brcm_message *msg)
+{
+       void *next;
+       unsigned long flags;
+       u32 val, count, nhcnt;
+       u32 read_offset, write_offset;
+       bool exit_cleanup = false;
+       int ret = 0, reqid;
+
+       /* Do sanity check on message */
+       if (!flexrm_sanity_check(msg))
+               return -EIO;
+       msg->error = 0;
+
+       /*
+        * If no requests possible then save data pointer and goto done.
+        *
+        * This function is reached from the mailbox send_data callback,
+        * which the mailbox framework requires not to sleep, so the id
+        * must be allocated with GFP_ATOMIC rather than GFP_KERNEL.
+        */
+       reqid = ida_simple_get(&ring->requests_ida, 0,
+                               RING_MAX_REQ_COUNT, GFP_ATOMIC);
+       if (reqid < 0) {
+               spin_lock_irqsave(&ring->lock, flags);
+               if (batch_msg)
+                       ring->last_pending_msg = batch_msg;
+               else
+                       ring->last_pending_msg = msg;
+               spin_unlock_irqrestore(&ring->lock, flags);
+               return 0;
+       }
+       ring->requests[reqid] = msg;
+
+       /* Do DMA mappings for the message */
+       ret = flexrm_dma_map(ring->mbox->dev, msg);
+       if (ret < 0) {
+               ring->requests[reqid] = NULL;
+               ida_simple_remove(&ring->requests_ida, reqid);
+               return ret;
+       }
+
+       /* If last_pending_msg is already set then goto done with error */
+       spin_lock_irqsave(&ring->lock, flags);
+       if (ring->last_pending_msg)
+               ret = -ENOSPC;
+       spin_unlock_irqrestore(&ring->lock, flags);
+       if (ret < 0) {
+               dev_warn(ring->mbox->dev, "no space in ring %d\n", ring->num);
+               exit_cleanup = true;
+               goto exit;
+       }
+
+       /* Determine current HW BD read offset */
+       read_offset = readl_relaxed(ring->regs + RING_BD_READ_PTR);
+       val = readl_relaxed(ring->regs + RING_BD_START_ADDR);
+       read_offset *= RING_DESC_SIZE;
+       read_offset += (u32)(BD_START_ADDR_DECODE(val) - ring->bd_dma_base);
+
+       /*
+        * Number required descriptors = number of non-header descriptors +
+        *                               number of header descriptors +
+        *                               1x null descriptor
+        */
+       nhcnt = flexrm_estimate_nonheader_desc_count(msg);
+       count = flexrm_estimate_header_desc_count(nhcnt) + nhcnt + 1;
+
+       /*
+        * Check for available descriptor space: walk forward from the
+        * write offset, skipping next-table slots, until either enough
+        * slots are found or the HW read offset is reached.
+        */
+       write_offset = ring->bd_write_offset;
+       while (count) {
+               if (!flexrm_is_next_table_desc(ring->bd_base + write_offset))
+                       count--;
+               write_offset += RING_DESC_SIZE;
+               if (write_offset == RING_BD_SIZE)
+                       write_offset = 0x0;
+               if (write_offset == read_offset)
+                       break;
+       }
+       if (count) {
+               /* Not enough room: park the message for a later retry */
+               spin_lock_irqsave(&ring->lock, flags);
+               if (batch_msg)
+                       ring->last_pending_msg = batch_msg;
+               else
+                       ring->last_pending_msg = msg;
+               spin_unlock_irqrestore(&ring->lock, flags);
+               ret = 0;
+               exit_cleanup = true;
+               goto exit;
+       }
+
+       /* Write descriptors to ring */
+       next = flexrm_write_descs(msg, nhcnt, reqid,
+                       ring->bd_base + ring->bd_write_offset,
+                       RING_BD_TOGGLE_VALID(ring->bd_write_offset),
+                       ring->bd_base, ring->bd_base + RING_BD_SIZE);
+       if (IS_ERR(next)) {
+               ret = PTR_ERR(next);
+               exit_cleanup = true;
+               goto exit;
+       }
+
+       /* Save ring BD write offset */
+       ring->bd_write_offset = (unsigned long)(next - ring->bd_base);
+
+exit:
+       /* Update error status in message */
+       msg->error = ret;
+
+       /* Cleanup if we failed (or parked): drop mappings and the reqid */
+       if (exit_cleanup) {
+               flexrm_dma_unmap(ring->mbox->dev, msg);
+               ring->requests[reqid] = NULL;
+               ida_simple_remove(&ring->requests_ida, reqid);
+       }
+
+       return ret;
+}
+
+/*
+ * Drain the completion ring of @ring.
+ *
+ * Under ring->lock the parked message (if any) is taken and the range
+ * of new completion descriptors is snapshotted. The parked message is
+ * then resubmitted through the mailbox, and for every completion the
+ * originating message is looked up by request id, its id released, its
+ * DMA mappings undone and the message handed back to the client with
+ * msg->error set from the descriptor.
+ *
+ * Returns the number of completions delivered to the client.
+ */
+static int flexrm_process_completions(struct flexrm_ring *ring)
+{
+       u64 desc;
+       int err, count = 0;
+       unsigned long flags;
+       struct brcm_message *msg = NULL;
+       u32 reqid, cmpl_read_offset, cmpl_write_offset;
+       struct mbox_chan *chan = &ring->mbox->controller.chans[ring->num];
+
+       spin_lock_irqsave(&ring->lock, flags);
+
+       /* Check last_pending_msg */
+       if (ring->last_pending_msg) {
+               msg = ring->last_pending_msg;
+               ring->last_pending_msg = NULL;
+       }
+
+       /*
+        * Get current completion read and write offset
+        *
+        * Note: We should read completion write pointer at least once
+        * after we get a MSI interrupt because HW maintains internal
+        * MSI status which will allow next MSI interrupt only after
+        * completion write pointer is read.
+        */
+       cmpl_write_offset = readl_relaxed(ring->regs + RING_CMPL_WRITE_PTR);
+       cmpl_write_offset *= RING_DESC_SIZE;
+       cmpl_read_offset = ring->cmpl_read_offset;
+       ring->cmpl_read_offset = cmpl_write_offset;
+
+       spin_unlock_irqrestore(&ring->lock, flags);
+
+       /* If last_pending_msg was set then queue it back */
+       if (msg)
+               mbox_send_message(chan, msg);
+
+       /* For each completed request notify mailbox clients */
+       while (cmpl_read_offset != cmpl_write_offset) {
+               /* Dequeue next completion descriptor */
+               desc = *((u64 *)(ring->cmpl_base + cmpl_read_offset));
+
+               /* Next read offset */
+               cmpl_read_offset += RING_DESC_SIZE;
+               if (cmpl_read_offset == RING_CMPL_SIZE)
+                       cmpl_read_offset = 0;
+
+               /*
+                * Decode error from completion descriptor. The full 64-bit
+                * descriptor is printed so nothing is lost on 32-bit builds.
+                */
+               err = flexrm_cmpl_desc_to_error(desc);
+               if (err < 0) {
+                       dev_warn(ring->mbox->dev,
+                                "got completion desc=0x%llx with error %d\n",
+                                (unsigned long long)desc, err);
+               }
+
+               /* Determine request id from completion descriptor */
+               reqid = flexrm_cmpl_desc_to_reqid(desc);
+
+               /* Determine message pointer based on reqid */
+               msg = ring->requests[reqid];
+               if (!msg) {
+                       dev_warn(ring->mbox->dev,
+                                "null msg pointer for completion desc=0x%llx\n",
+                                (unsigned long long)desc);
+                       continue;
+               }
+
+               /* Release reqid for recycling */
+               ring->requests[reqid] = NULL;
+               ida_simple_remove(&ring->requests_ida, reqid);
+
+               /* Unmap DMA mappings */
+               flexrm_dma_unmap(ring->mbox->dev, msg);
+
+               /* Give-back message to mailbox client */
+               msg->error = err;
+               mbox_chan_received_data(chan, msg);
+
+               /* Increment number of completions processed */
+               count++;
+       }
+
+       return count;
+}
+
+/* ====== FlexRM interrupt handler ===== */
+
+/*
+ * Hard-IRQ half of the ring interrupt: does no work itself and always
+ * defers to the threaded handler by returning IRQ_WAKE_THREAD.
+ */
+static irqreturn_t flexrm_irq_event(int irq, void *dev_id)
+{
+       /* We only have MSI for completions so just wakeup IRQ thread */
+       /* Ring related errors will be informed via completion descriptors */
+
+       return IRQ_WAKE_THREAD;
+}
+
+/*
+ * Threaded half of the ring interrupt: drains the completion ring of
+ * the ring passed as @dev_id (registered in flexrm_startup()) and
+ * always reports IRQ_HANDLED; the completion count is not used here.
+ */
+static irqreturn_t flexrm_irq_thread(int irq, void *dev_id)
+{
+       flexrm_process_completions(dev_id);
+
+       return IRQ_HANDLED;
+}
+
+/* ====== FlexRM mailbox callbacks ===== */
+
+/*
+ * Mailbox .send_data callback.
+ *
+ * A plain message is queued directly. For a BRCM_MESSAGE_BATCH message
+ * the sub-messages from batch.msgs_queued up to batch.msgs_count are
+ * queued one by one; batch.msgs_queued is advanced after each success
+ * so a later call resumes where this one stopped. The first failure is
+ * recorded in msg->error and returned.
+ */
+static int flexrm_send_data(struct mbox_chan *chan, void *data)
+{
+       struct flexrm_ring *ring = chan->con_priv;
+       struct brcm_message *msg = data;
+       int i, rc;
+
+       /* Single message: no batch bookkeeping needed */
+       if (msg->type != BRCM_MESSAGE_BATCH)
+               return flexrm_new_request(ring, NULL, data);
+
+       i = msg->batch.msgs_queued;
+       while (i < msg->batch.msgs_count) {
+               rc = flexrm_new_request(ring, msg, &msg->batch.msgs[i]);
+               if (rc) {
+                       msg->error = rc;
+                       return rc;
+               }
+               msg->batch.msgs_queued++;
+               i++;
+       }
+
+       return 0;
+}
+
+/*
+ * Mailbox .peek_data callback: process any pending completions on the
+ * channel's ring and report whether at least one was delivered.
+ */
+static bool flexrm_peek_data(struct mbox_chan *chan)
+{
+       return flexrm_process_completions(chan->con_priv) > 0;
+}
+
+/*
+ * Mailbox .startup callback: allocate and initialise the BD and
+ * completion memory of the channel's ring, request its IRQ, program the
+ * ring registers and activate the ring.
+ *
+ * Returns 0 on success. On failure returns -ENOMEM (a pool allocation
+ * failed), -ENODEV (ring->irq is UINT_MAX) or the error from
+ * request_threaded_irq(); memory allocated so far is released.
+ */
+static int flexrm_startup(struct mbox_chan *chan)
+{
+       u64 d;
+       u32 val, off;
+       int ret = 0;
+       dma_addr_t next_addr;
+       struct flexrm_ring *ring = chan->con_priv;
+
+       /* Allocate BD memory */
+       ring->bd_base = dma_pool_alloc(ring->mbox->bd_pool,
+                                      GFP_KERNEL, &ring->bd_dma_base);
+       if (!ring->bd_base) {
+               dev_err(ring->mbox->dev, "can't allocate BD memory\n");
+               ret = -ENOMEM;
+               goto fail;
+       }
+
+       /*
+        * Configure next table pointer entries in BD memory.
+        *
+        * Every slot is pre-filled: when RING_BD_ALIGN_CHECK() accepts the
+        * DMA address of the following slot (wrapping to the ring start
+        * after the last one) the slot gets a next-table descriptor
+        * pointing there, otherwise a null descriptor with an invalid
+        * toggle.
+        */
+       for (off = 0; off < RING_BD_SIZE; off += RING_DESC_SIZE) {
+               next_addr = off + RING_DESC_SIZE;
+               if (next_addr == RING_BD_SIZE)
+                       next_addr = 0;
+               next_addr += ring->bd_dma_base;
+               if (RING_BD_ALIGN_CHECK(next_addr))
+                       d = flexrm_next_table_desc(RING_BD_TOGGLE_VALID(off),
+                                                   next_addr);
+               else
+                       d = flexrm_null_desc(RING_BD_TOGGLE_INVALID(off));
+               flexrm_write_desc(ring->bd_base + off, d);
+       }
+
+       /* Allocate completion memory */
+       ring->cmpl_base = dma_pool_alloc(ring->mbox->cmpl_pool,
+                                        GFP_KERNEL, &ring->cmpl_dma_base);
+       if (!ring->cmpl_base) {
+               dev_err(ring->mbox->dev, "can't allocate completion memory\n");
+               ret = -ENOMEM;
+               goto fail_free_bd_memory;
+       }
+       memset(ring->cmpl_base, 0, RING_CMPL_SIZE);
+
+       /* Request IRQ (UINT_MAX marks a ring without an IRQ) */
+       if (ring->irq == UINT_MAX) {
+               dev_err(ring->mbox->dev, "ring IRQ not available\n");
+               ret = -ENODEV;
+               goto fail_free_cmpl_memory;
+       }
+       ret = request_threaded_irq(ring->irq,
+                                  flexrm_irq_event,
+                                  flexrm_irq_thread,
+                                  0, dev_name(ring->mbox->dev), ring);
+       if (ret) {
+               dev_err(ring->mbox->dev, "failed to request ring IRQ\n");
+               goto fail_free_cmpl_memory;
+       }
+       /* Lets flexrm_shutdown() know the IRQ must be freed */
+       ring->irq_requested = true;
+
+       /* Disable/inactivate ring */
+       writel_relaxed(0x0, ring->regs + RING_CONTROL);
+
+       /* Program BD start address */
+       val = BD_START_ADDR_VALUE(ring->bd_dma_base);
+       writel_relaxed(val, ring->regs + RING_BD_START_ADDR);
+
+       /* BD write pointer will be same as HW write pointer */
+       ring->bd_write_offset =
+                       readl_relaxed(ring->regs + RING_BD_WRITE_PTR);
+       ring->bd_write_offset *= RING_DESC_SIZE;
+
+       /* Program completion start address */
+       val = CMPL_START_ADDR_VALUE(ring->cmpl_dma_base);
+       writel_relaxed(val, ring->regs + RING_CMPL_START_ADDR);
+
+       /* Ensure last pending message is cleared */
+       ring->last_pending_msg = NULL;
+
+       /* Completion read pointer will be same as HW write pointer */
+       ring->cmpl_read_offset =
+                       readl_relaxed(ring->regs + RING_CMPL_WRITE_PTR);
+       ring->cmpl_read_offset *= RING_DESC_SIZE;
+
+       /* Read ring Tx, Rx, and Outstanding counts to clear */
+       readl_relaxed(ring->regs + RING_NUM_REQ_RECV_LS);
+       readl_relaxed(ring->regs + RING_NUM_REQ_RECV_MS);
+       readl_relaxed(ring->regs + RING_NUM_REQ_TRANS_LS);
+       readl_relaxed(ring->regs + RING_NUM_REQ_TRANS_MS);
+       readl_relaxed(ring->regs + RING_NUM_REQ_OUTSTAND);
+
+       /* Configure RING_MSI_CONTROL: timer value, enable bit, threshold */
+       val = 0;
+       val |= (ring->msi_timer_val << MSI_TIMER_VAL_SHIFT);
+       val |= BIT(MSI_ENABLE_SHIFT);
+       val |= (ring->msi_count_threshold & MSI_COUNT_MASK) << MSI_COUNT_SHIFT;
+       writel_relaxed(val, ring->regs + RING_MSI_CONTROL);
+
+       /* Enable/activate ring */
+       val = BIT(CONTROL_ACTIVE_SHIFT);
+       writel_relaxed(val, ring->regs + RING_CONTROL);
+
+       return 0;
+
+       /* Error unwinding: release in reverse order of allocation */
+fail_free_cmpl_memory:
+       dma_pool_free(ring->mbox->cmpl_pool,
+                     ring->cmpl_base, ring->cmpl_dma_base);
+       ring->cmpl_base = NULL;
+fail_free_bd_memory:
+       dma_pool_free(ring->mbox->bd_pool,
+                     ring->bd_base, ring->bd_dma_base);
+       ring->bd_base = NULL;
+fail:
+       return ret;
+}
+
+/*
+ * Mailbox .shutdown callback: deactivate and flush the ring, hand every
+ * still-outstanding request back to the client with -EIO, then release
+ * the IRQ and the ring memory obtained in flexrm_startup().
+ */
+static void flexrm_shutdown(struct mbox_chan *chan)
+{
+       u32 reqid;
+       unsigned int timeout;
+       struct brcm_message *msg;
+       struct flexrm_ring *ring = chan->con_priv;
+
+       /* Disable/inactivate ring */
+       writel_relaxed(0x0, ring->regs + RING_CONTROL);
+
+       /*
+        * Flush ring with timeout of 1s: RING_FLUSH_DONE is polled once
+        * per millisecond. NOTE(review): if the flush never completes the
+        * loop just falls through without any diagnostic.
+        */
+       timeout = 1000;
+       writel_relaxed(BIT(CONTROL_FLUSH_SHIFT),
+                       ring->regs + RING_CONTROL);
+       do {
+               if (readl_relaxed(ring->regs + RING_FLUSH_DONE) &
+                   FLUSH_DONE_MASK)
+                       break;
+               mdelay(1);
+       } while (timeout--);
+
+       /*
+        * Abort all in-flight requests.
+        * NOTE(review): the IRQ is still registered at this point, so the
+        * completion thread could presumably run concurrently - confirm.
+        */
+       for (reqid = 0; reqid < RING_MAX_REQ_COUNT; reqid++) {
+               msg = ring->requests[reqid];
+               if (!msg)
+                       continue;
+
+               /* Release reqid for recycling */
+               ring->requests[reqid] = NULL;
+               ida_simple_remove(&ring->requests_ida, reqid);
+
+               /* Unmap DMA mappings */
+               flexrm_dma_unmap(ring->mbox->dev, msg);
+
+               /* Give-back message to mailbox client */
+               msg->error = -EIO;
+               mbox_chan_received_data(chan, msg);
+       }
+
+       /* Release IRQ (only if flexrm_startup() got as far as requesting it) */
+       if (ring->irq_requested) {
+               free_irq(ring->irq, ring);
+               ring->irq_requested = false;
+       }
+
+       /* Free-up completion descriptor ring */
+       if (ring->cmpl_base) {
+               dma_pool_free(ring->mbox->cmpl_pool,
+                             ring->cmpl_base, ring->cmpl_dma_base);
+               ring->cmpl_base = NULL;
+       }
+
+       /* Free-up BD descriptor ring */
+       if (ring->bd_base) {
+               dma_pool_free(ring->mbox->bd_pool,
+                             ring->bd_base, ring->bd_dma_base);
+               ring->bd_base = NULL;
+       }
+}
+
+/*
+ * Mailbox .last_tx_done callback: transmission counts as done when no
+ * message is parked in ring->last_pending_msg. The field is sampled
+ * under ring->lock.
+ */
+static bool flexrm_last_tx_done(struct mbox_chan *chan)
+{
+       struct flexrm_ring *ring = chan->con_priv;
+       unsigned long flags;
+       bool done;
+
+       spin_lock_irqsave(&ring->lock, flags);
+       done = !ring->last_pending_msg;
+       spin_unlock_irqrestore(&ring->lock, flags);
+
+       return done;
+}
+
+/* Mailbox channel operations, listed in channel life-cycle order */
+static const struct mbox_chan_ops flexrm_mbox_chan_ops = {
+       .startup        = flexrm_startup,
+       .send_data      = flexrm_send_data,
+       .last_tx_done   = flexrm_last_tx_done,
+       .peek_data      = flexrm_peek_data,
+       .shutdown       = flexrm_shutdown,
+};
+
+/*
+ * Translate a mailbox specifier into a channel.
+ *
+ * At least three cells are required: args[0] selects the channel,
+ * args[1] is stored as the ring's MSI count threshold and args[2] as
+ * its MSI timer value.
+ *
+ * Returns the channel, or ERR_PTR(-EINVAL) for too few cells or an
+ * MSI value above its mask, or ERR_PTR(-ENOENT) for a channel index
+ * beyond cntlr->num_chans.
+ */
+static struct mbox_chan *flexrm_mbox_of_xlate(struct mbox_controller *cntlr,
+                                       const struct of_phandle_args *pa)
+{
+       struct flexrm_ring *ring;
+       u32 chan_idx, msi_count, msi_timer;
+
+       if (pa->args_count < 3)
+               return ERR_PTR(-EINVAL);
+
+       chan_idx = pa->args[0];
+       msi_count = pa->args[1];
+       msi_timer = pa->args[2];
+
+       if (chan_idx >= cntlr->num_chans)
+               return ERR_PTR(-ENOENT);
+
+       if (msi_count > MSI_COUNT_MASK || msi_timer > MSI_TIMER_VAL_MASK)
+               return ERR_PTR(-EINVAL);
+
+       /* Record the per-ring MSI settings for flexrm_startup() */
+       ring = cntlr->chans[chan_idx].con_priv;
+       ring->msi_count_threshold = msi_count;
+       ring->msi_timer_val = msi_timer;
+
+       return &cntlr->chans[chan_idx];
+}
+
+/* ====== FlexRM platform driver ===== */
+/*
+ * flexrm_mbox_msi_write() - program one ring's MSI registers
+ * @desc: MSI descriptor; desc->platform.msi_index selects the ring
+ * @msg:  MSI message whose address and data are written to the ring
+ *
+ * Passed to platform_msi_domain_alloc_irqs() by flexrm_mbox_probe(). The
+ * mailbox state is fetched from the driver data of the descriptor's device.
+ */
+static void flexrm_mbox_msi_write(struct msi_desc *desc, struct msi_msg *msg)
+{
+       struct device *dev = msi_desc_to_dev(desc);
+       struct flexrm_mbox *mbox = dev_get_drvdata(dev);
+       struct flexrm_ring *ring = &mbox->rings[desc->platform.msi_index];
+
+       /* Configure per-Ring MSI registers */
+       writel_relaxed(msg->address_lo, ring->regs + RING_MSI_ADDR_LS);
+       writel_relaxed(msg->address_hi, ring->regs + RING_MSI_ADDR_MS);
+       writel_relaxed(msg->data, ring->regs + RING_MSI_DATA_VALUE);
+}
+/*
+ * flexrm_mbox_probe() - set up one FlexRM mailbox instance
+ * @pdev: platform device being probed
+ *
+ * Maps the register window, counts the rings found in it, initializes the
+ * per-ring state, sets the DMA mask, creates the BD and completion DMA
+ * pools, allocates one platform MSI per ring and finally registers a
+ * mailbox controller with one channel per ring.
+ *
+ * The error labels at the bottom undo the MSI allocation and the two DMA
+ * pools in reverse order of creation; the devm_* allocations are not
+ * released explicitly here.
+ *
+ * Return: 0 on success, a negative errno value otherwise.
+ */
+static int flexrm_mbox_probe(struct platform_device *pdev)
+{
+       int index, ret = 0;
+       void __iomem *regs;
+       void __iomem *regs_end;
+       struct msi_desc *desc;
+       struct resource *iomem;
+       struct flexrm_ring *ring;
+       struct flexrm_mbox *mbox;
+       struct device *dev = &pdev->dev;
+
+       /* Allocate driver mailbox struct */
+       mbox = devm_kzalloc(dev, sizeof(*mbox), GFP_KERNEL);
+       if (!mbox) {
+               ret = -ENOMEM;
+               goto fail;
+       }
+       mbox->dev = dev;
+       platform_set_drvdata(pdev, mbox);
+
+       /* Get resource for registers (must hold at least one ring's worth) */
+       iomem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       if (!iomem || (resource_size(iomem) < RING_REGS_SIZE)) {
+               ret = -ENODEV;
+               goto fail;
+       }
+
+       /* Map registers of all rings */
+       mbox->regs = devm_ioremap_resource(&pdev->dev, iomem);
+       if (IS_ERR(mbox->regs)) {
+               ret = PTR_ERR(mbox->regs);
+               dev_err(&pdev->dev, "Failed to remap mailbox regs: %d\n", ret);
+               goto fail;
+       }
+       regs_end = mbox->regs + resource_size(iomem);
+
+       /*
+        * Scan and count available rings: a RING_REGS_SIZE slot counts as a
+        * ring when its RING_VER register reads RING_VER_MAGIC.
+        */
+       mbox->num_rings = 0;
+       for (regs = mbox->regs; regs < regs_end; regs += RING_REGS_SIZE) {
+               if (readl_relaxed(regs + RING_VER) == RING_VER_MAGIC)
+                       mbox->num_rings++;
+       }
+       if (!mbox->num_rings) {
+               ret = -ENODEV;
+               goto fail;
+       }
+
+       /* Allocate driver ring structs */
+       ring = devm_kcalloc(dev, mbox->num_rings, sizeof(*ring), GFP_KERNEL);
+       if (!ring) {
+               ret = -ENOMEM;
+               goto fail;
+       }
+       mbox->rings = ring;
+
+       /*
+        * Initialize members of driver ring structs. Slots whose RING_VER
+        * does not match are skipped, so ring numbers are assigned
+        * consecutively to the matching slots only.
+        */
+       regs = mbox->regs;
+       for (index = 0; index < mbox->num_rings; index++) {
+               ring = &mbox->rings[index];
+               ring->num = index;
+               ring->mbox = mbox;
+               while ((regs < regs_end) &&
+                      (readl_relaxed(regs + RING_VER) != RING_VER_MAGIC))
+                       regs += RING_REGS_SIZE;
+               if (regs_end <= regs) {
+                       ret = -ENODEV;
+                       goto fail;
+               }
+               ring->regs = regs;
+               regs += RING_REGS_SIZE;
+               ring->irq = UINT_MAX;
+               ring->irq_requested = false;
+               ring->msi_timer_val = MSI_TIMER_VAL_MASK;
+               ring->msi_count_threshold = 0x1;
+               ida_init(&ring->requests_ida);
+               memset(ring->requests, 0, sizeof(ring->requests));
+               ring->bd_base = NULL;
+               ring->bd_dma_base = 0;
+               ring->cmpl_base = NULL;
+               ring->cmpl_dma_base = 0;
+               spin_lock_init(&ring->lock);
+               ring->last_pending_msg = NULL;
+               ring->cmpl_read_offset = 0;
+       }
+
+       /*
+        * FlexRM is capable of 40-bit physical addresses only; fall back to
+        * a 32-bit mask when the 40-bit mask is rejected.
+        */
+       ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(40));
+       if (ret) {
+               ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
+               if (ret)
+                       goto fail;
+       }
+
+       /* Create DMA pool for ring BD memory */
+       mbox->bd_pool = dma_pool_create("bd", dev, RING_BD_SIZE,
+                                       1 << RING_BD_ALIGN_ORDER, 0);
+       if (!mbox->bd_pool) {
+               ret = -ENOMEM;
+               goto fail;
+       }
+
+       /* Create DMA pool for ring completion memory */
+       mbox->cmpl_pool = dma_pool_create("cmpl", dev, RING_CMPL_SIZE,
+                                         1 << RING_CMPL_ALIGN_ORDER, 0);
+       if (!mbox->cmpl_pool) {
+               ret = -ENOMEM;
+               goto fail_destroy_bd_pool;
+       }
+
+       /* Allocate platform MSIs for each ring */
+       ret = platform_msi_domain_alloc_irqs(dev, mbox->num_rings,
+                                               flexrm_mbox_msi_write);
+       if (ret)
+               goto fail_destroy_cmpl_pool;
+
+       /* Save allocated IRQ numbers for each ring, keyed by MSI index */
+       for_each_msi_entry(desc, dev) {
+               ring = &mbox->rings[desc->platform.msi_index];
+               ring->irq = desc->irq;
+       }
+
+       /*
+        * Initialize mailbox controller: TX completion is detected by
+        * polling (txdone_poll), not by interrupt, and each ring is exposed
+        * as one channel.
+        */
+       mbox->controller.txdone_irq = false;
+       mbox->controller.txdone_poll = true;
+       mbox->controller.txpoll_period = 1;
+       mbox->controller.ops = &flexrm_mbox_chan_ops;
+       mbox->controller.dev = dev;
+       mbox->controller.num_chans = mbox->num_rings;
+       mbox->controller.of_xlate = flexrm_mbox_of_xlate;
+       mbox->controller.chans = devm_kcalloc(dev, mbox->num_rings,
+                               sizeof(*mbox->controller.chans), GFP_KERNEL);
+       if (!mbox->controller.chans) {
+               ret = -ENOMEM;
+               goto fail_free_msis;
+       }
+       for (index = 0; index < mbox->num_rings; index++)
+               mbox->controller.chans[index].con_priv = &mbox->rings[index];
+
+       /* Register mailbox controller */
+       ret = mbox_controller_register(&mbox->controller);
+       if (ret)
+               goto fail_free_msis;
+
+       dev_info(dev, "registered flexrm mailbox with %d channels\n",
+                       mbox->controller.num_chans);
+
+       return 0;
+
+fail_free_msis:
+       platform_msi_domain_free_irqs(dev);
+fail_destroy_cmpl_pool:
+       dma_pool_destroy(mbox->cmpl_pool);
+fail_destroy_bd_pool:
+       dma_pool_destroy(mbox->bd_pool);
+fail:
+       return ret;
+}
+/*
+ * flexrm_mbox_remove() - tear down what flexrm_mbox_probe() set up
+ * @pdev: platform device being removed
+ *
+ * Unregisters the mailbox controller, frees the platform MSIs, destroys the
+ * completion and BD DMA pools and then destroys every ring's request IDA.
+ *
+ * Return: always 0.
+ */
+static int flexrm_mbox_remove(struct platform_device *pdev)
+{
+       int index;
+       struct device *dev = &pdev->dev;
+       struct flexrm_ring *ring;
+       struct flexrm_mbox *mbox = platform_get_drvdata(pdev);
+
+       mbox_controller_unregister(&mbox->controller);
+
+       platform_msi_domain_free_irqs(dev);
+
+       dma_pool_destroy(mbox->cmpl_pool);
+       dma_pool_destroy(mbox->bd_pool);
+
+       for (index = 0; index < mbox->num_rings; index++) {
+               ring = &mbox->rings[index];
+               ida_destroy(&ring->requests_ida);
+       }
+
+       return 0;
+}
+/* Device tree compatible strings matched by this driver. */
+static const struct of_device_id flexrm_mbox_of_match[] = {
+       { .compatible = "brcm,iproc-flexrm-mbox", },
+       {},
+};
+MODULE_DEVICE_TABLE(of, flexrm_mbox_of_match);
+/*
+ * Platform driver glue: binds flexrm_mbox_probe()/flexrm_mbox_remove() to
+ * devices matching flexrm_mbox_of_match.
+ */
+static struct platform_driver flexrm_mbox_driver = {
+       .driver = {
+               .name = "brcm-flexrm-mbox",
+               .of_match_table = flexrm_mbox_of_match,
+       },
+       .probe          = flexrm_mbox_probe,
+       .remove         = flexrm_mbox_remove,
+};
+module_platform_driver(flexrm_mbox_driver);
+
+MODULE_AUTHOR("Anup Patel <anup.patel@broadcom.com>");
+MODULE_DESCRIPTION("Broadcom FlexRM mailbox driver");
+MODULE_LICENSE("GPL v2");
index 2aeb034..4fe7be0 100644 (file)
@@ -18,7 +18,8 @@
  * Broadcom PDC Mailbox Driver
  * The PDC provides a ring based programming interface to one or more hardware
  * offload engines. For example, the PDC driver works with both SPU-M and SPU2
- * cryptographic offload hardware. In some chips the PDC is referred to as MDE.
+ * cryptographic offload hardware. In some chips the PDC is referred to as MDE,
+ * and in others the FA2/FA+ hardware is used with this PDC driver.
  *
  * The PDC driver registers with the Linux mailbox framework as a mailbox
  * controller, once for each PDC instance. Ring 0 for each PDC is registered as
 #define PDC_INTMASK_OFFSET   0x24
 #define PDC_INTSTATUS_OFFSET 0x20
 #define PDC_RCVLAZY0_OFFSET  (0x30 + 4 * PDC_RINGSET)
+#define FA_RCVLAZY0_OFFSET   0x100
 
 /*
  * For SPU2, configure MDE_CKSUM_CONTROL to write 17 bytes of metadata
 /* Maximum size buffer the DMA engine can handle */
 #define PDC_DMA_BUF_MAX 16384
 
+enum pdc_hw {  /* hardware variant; kept in pdc_state.hw_type */
+       FA_HW,          /* FA2/FA+ hardware (i.e. Northstar Plus) */
+       PDC_HW          /* PDC/MDE hardware (i.e. Northstar 2, Pegasus) */
+};
+
 struct pdc_dma_map {
        void *ctx;          /* opaque context associated with frame */
 };
@@ -211,13 +218,13 @@ struct pdc_regs {
        u32  gptimer;                /* 0x028 */
 
        u32  PAD;
-       u32  intrcvlazy_0;           /* 0x030 */
-       u32  intrcvlazy_1;           /* 0x034 */
-       u32  intrcvlazy_2;           /* 0x038 */
-       u32  intrcvlazy_3;           /* 0x03c */
+       u32  intrcvlazy_0;           /* 0x030 (Only in PDC, not FA2) */
+       u32  intrcvlazy_1;           /* 0x034 (Only in PDC, not FA2) */
+       u32  intrcvlazy_2;           /* 0x038 (Only in PDC, not FA2) */
+       u32  intrcvlazy_3;           /* 0x03c (Only in PDC, not FA2) */
 
        u32  PAD[48];
-       u32  removed_intrecvlazy;    /* 0x100 */
+       u32  fa_intrecvlazy;         /* 0x100 (Only in FA2, not PDC) */
        u32  flowctlthresh;          /* 0x104 */
        u32  wrrthresh;              /* 0x108 */
        u32  gmac_idle_cnt_thresh;   /* 0x10c */
@@ -243,7 +250,7 @@ struct pdc_regs {
        u32  serdes_status1;         /* 0x1b0 */
        u32  PAD[11];                /* 0x1b4-1dc */
        u32  clk_ctl_st;             /* 0x1e0 */
-       u32  hw_war;                 /* 0x1e4 */
+       u32  hw_war;                 /* 0x1e4 (Only in PDC, not FA2) */
        u32  pwrctl;                 /* 0x1e8 */
        u32  PAD[5];
 
@@ -410,6 +417,9 @@ struct pdc_state {
        u32  txnobuf;          /* unable to create tx descriptor */
        u32  rxnobuf;          /* unable to create rx descriptor */
        u32  rx_oflow;         /* count of rx overflows */
+
+       /* hardware type - FA2 or PDC/MDE */
+       enum pdc_hw hw_type;
 };
 
 /* Global variables */
@@ -1396,7 +1406,13 @@ static int pdc_interrupts_init(struct pdc_state *pdcs)
 
        /* interrupt configuration */
        iowrite32(PDC_INTMASK, pdcs->pdc_reg_vbase + PDC_INTMASK_OFFSET);
-       iowrite32(PDC_LAZY_INT, pdcs->pdc_reg_vbase + PDC_RCVLAZY0_OFFSET);
+
+       if (pdcs->hw_type == FA_HW)
+               iowrite32(PDC_LAZY_INT, pdcs->pdc_reg_vbase +
+                         FA_RCVLAZY0_OFFSET);
+       else
+               iowrite32(PDC_LAZY_INT, pdcs->pdc_reg_vbase +
+                         PDC_RCVLAZY0_OFFSET);
 
        /* read irq from device tree */
        pdcs->pdc_irq = irq_of_parse_and_map(dn, 0);
@@ -1465,6 +1481,17 @@ static int pdc_mb_init(struct pdc_state *pdcs)
        return 0;
 }
 
+/*
+ * Device tree API
+ *
+ * Each match entry's .data points at one of the two constants below, which
+ * lets pdc_dt_read() recover the hardware type from the matched compatible
+ * string.
+ */
+static const int pdc_hw = PDC_HW;
+static const int fa_hw = FA_HW;
+
+static const struct of_device_id pdc_mbox_of_match[] = {
+       {.compatible = "brcm,iproc-pdc-mbox", .data = &pdc_hw},
+       {.compatible = "brcm,iproc-fa2-mbox", .data = &fa_hw},
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, pdc_mbox_of_match);
+
 /**
  * pdc_dt_read() - Read application-specific data from device tree.
  * @pdev:  Platform device
@@ -1481,6 +1508,8 @@ static int pdc_dt_read(struct platform_device *pdev, struct pdc_state *pdcs)
 {
        struct device *dev = &pdev->dev;
        struct device_node *dn = pdev->dev.of_node;
+       const struct of_device_id *match;
+       const int *hw_type;
        int err;
 
        err = of_property_read_u32(dn, "brcm,rx-status-len",
@@ -1492,6 +1521,14 @@ static int pdc_dt_read(struct platform_device *pdev, struct pdc_state *pdcs)
 
        pdcs->use_bcm_hdr = of_property_read_bool(dn, "brcm,use-bcm-hdr");
 
+       pdcs->hw_type = PDC_HW;
+
+       match = of_match_device(of_match_ptr(pdc_mbox_of_match), dev);
+       if (match != NULL) {
+               hw_type = match->data;
+               pdcs->hw_type = *hw_type;
+       }
+
        return 0;
 }
 
@@ -1525,7 +1562,7 @@ static int pdc_probe(struct platform_device *pdev)
        pdcs->pdc_idx = pdcg.num_spu;
        pdcg.num_spu++;
 
-       err = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
+       err = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(39));
        if (err) {
                dev_warn(dev, "PDC device cannot perform DMA. Error %d.", err);
                goto cleanup;
@@ -1611,12 +1648,6 @@ static int pdc_remove(struct platform_device *pdev)
        return 0;
 }
 
-static const struct of_device_id pdc_mbox_of_match[] = {
-       {.compatible = "brcm,iproc-pdc-mbox"},
-       { /* sentinel */ }
-};
-MODULE_DEVICE_TABLE(of, pdc_mbox_of_match);
-
 static struct platform_driver pdc_mbox_driver = {
        .probe = pdc_probe,
        .remove = pdc_remove,
index 613722d..519376d 100644 (file)
@@ -221,7 +221,7 @@ static void hi6220_mbox_shutdown(struct mbox_chan *chan)
        mbox->irq_map_chan[mchan->ack_irq] = NULL;
 }
 
-static struct mbox_chan_ops hi6220_mbox_ops = {
+static const struct mbox_chan_ops hi6220_mbox_ops = {
        .send_data    = hi6220_mbox_send_data,
        .startup      = hi6220_mbox_startup,
        .shutdown     = hi6220_mbox_shutdown,
index dd2afbc..a704016 100644 (file)
@@ -174,7 +174,7 @@ static void slimpro_mbox_shutdown(struct mbox_chan *chan)
        devm_free_irq(mb_chan->dev, mb_chan->irq, mb_chan);
 }
 
-static struct mbox_chan_ops slimpro_mbox_ops = {
+static const struct mbox_chan_ops slimpro_mbox_ops = {
        .send_data = slimpro_mbox_send_data,
        .startup = slimpro_mbox_startup,
        .shutdown = slimpro_mbox_shutdown,
index 4671f8a..9dfbf7e 100644 (file)
@@ -103,11 +103,14 @@ static void tx_tick(struct mbox_chan *chan, int r)
        /* Submit next message */
        msg_submit(chan);
 
+       if (!mssg)
+               return;
+
        /* Notify the client */
-       if (mssg && chan->cl->tx_done)
+       if (chan->cl->tx_done)
                chan->cl->tx_done(chan->cl, mssg, r);
 
-       if (chan->cl->tx_block)
+       if (r != -ETIME && chan->cl->tx_block)
                complete(&chan->tx_complete);
 }
 
@@ -260,7 +263,7 @@ int mbox_send_message(struct mbox_chan *chan, void *mssg)
 
        msg_submit(chan);
 
-       if (chan->cl->tx_block && chan->active_req) {
+       if (chan->cl->tx_block) {
                unsigned long wait;
                int ret;
 
@@ -271,8 +274,8 @@ int mbox_send_message(struct mbox_chan *chan, void *mssg)
 
                ret = wait_for_completion_timeout(&chan->tx_complete, wait);
                if (ret == 0) {
-                       t = -EIO;
-                       tx_tick(chan, -EIO);
+                       t = -ETIME;
+                       tx_tick(chan, t);
                }
        }
 
@@ -453,6 +456,12 @@ int mbox_controller_register(struct mbox_controller *mbox)
                txdone = TXDONE_BY_ACK;
 
        if (txdone == TXDONE_BY_POLL) {
+
+               if (!mbox->ops->last_tx_done) {
+                       dev_err(mbox->dev, "last_tx_done method is absent\n");
+                       return -EINVAL;
+               }
+
                hrtimer_init(&mbox->poll_hrt, CLOCK_MONOTONIC,
                             HRTIMER_MODE_REL);
                mbox->poll_hrt.function = txdone_hrtimer;
index e4c2c1a..6735c8d 100644 (file)
@@ -932,7 +932,7 @@ static int blocks_are_clean_separate_dirty(struct dm_cache_metadata *cmd,
        *result = true;
 
        r = dm_bitset_cursor_begin(&cmd->dirty_info, cmd->dirty_root,
-                                  from_cblock(begin), &cmd->dirty_cursor);
+                                  from_cblock(cmd->cache_blocks), &cmd->dirty_cursor);
        if (r) {
                DMERR("%s: dm_bitset_cursor_begin for dirty failed", __func__);
                return r;
@@ -959,14 +959,16 @@ static int blocks_are_clean_separate_dirty(struct dm_cache_metadata *cmd,
                        return 0;
                }
 
+               begin = to_cblock(from_cblock(begin) + 1);
+               if (begin == end)
+                       break;
+
                r = dm_bitset_cursor_next(&cmd->dirty_cursor);
                if (r) {
                        DMERR("%s: dm_bitset_cursor_next for dirty failed", __func__);
                        dm_bitset_cursor_end(&cmd->dirty_cursor);
                        return r;
                }
-
-               begin = to_cblock(from_cblock(begin) + 1);
        }
 
        dm_bitset_cursor_end(&cmd->dirty_cursor);
index 9c689b3..975922c 100644 (file)
@@ -2773,7 +2773,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 
        ti->num_discard_bios = 1;
        ti->discards_supported = true;
-       ti->discard_zeroes_data_unsupported = true;
        ti->split_discard_bios = false;
 
        cache->features = ca->features;
index 136fda3..fea5bd5 100644 (file)
@@ -132,6 +132,7 @@ void dm_init_md_queue(struct mapped_device *md);
 void dm_init_normal_md_queue(struct mapped_device *md);
 int md_in_flight(struct mapped_device *md);
 void disable_write_same(struct mapped_device *md);
+void disable_write_zeroes(struct mapped_device *md);
 
 static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
 {
index 389a363..ef1d836 100644 (file)
@@ -2030,7 +2030,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        wake_up_process(cc->write_thread);
 
        ti->num_flush_bios = 1;
-       ti->discard_zeroes_data_unsupported = true;
 
        return 0;
 
index 03940bf..3702e50 100644 (file)
@@ -312,9 +312,12 @@ static void do_region(int op, int op_flags, unsigned region,
         */
        if (op == REQ_OP_DISCARD)
                special_cmd_max_sectors = q->limits.max_discard_sectors;
+       else if (op == REQ_OP_WRITE_ZEROES)
+               special_cmd_max_sectors = q->limits.max_write_zeroes_sectors;
        else if (op == REQ_OP_WRITE_SAME)
                special_cmd_max_sectors = q->limits.max_write_same_sectors;
-       if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_SAME) &&
+       if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES ||
+            op == REQ_OP_WRITE_SAME)  &&
            special_cmd_max_sectors == 0) {
                dec_count(io, region, -EOPNOTSUPP);
                return;
@@ -328,11 +331,18 @@ static void do_region(int op, int op_flags, unsigned region,
                /*
                 * Allocate a suitably sized-bio.
                 */
-               if ((op == REQ_OP_DISCARD) || (op == REQ_OP_WRITE_SAME))
+               switch (op) {
+               case REQ_OP_DISCARD:
+               case REQ_OP_WRITE_ZEROES:
+                       num_bvecs = 0;
+                       break;
+               case REQ_OP_WRITE_SAME:
                        num_bvecs = 1;
-               else
+                       break;
+               default:
                        num_bvecs = min_t(int, BIO_MAX_PAGES,
                                          dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
+               }
 
                bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
                bio->bi_iter.bi_sector = where->sector + (where->count - remaining);
@@ -341,7 +351,7 @@ static void do_region(int op, int op_flags, unsigned region,
                bio_set_op_attrs(bio, op, op_flags);
                store_io_and_region_in_bio(bio, io, region);
 
-               if (op == REQ_OP_DISCARD) {
+               if (op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES) {
                        num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining);
                        bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
                        remaining -= num_sectors;
index 9e9d04c..f858467 100644 (file)
@@ -733,11 +733,11 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
                job->pages = &zero_page_list;
 
                /*
-                * Use WRITE SAME to optimize zeroing if all dests support it.
+                * Use WRITE ZEROES to optimize zeroing if all dests support it.
                 */
-               job->rw = REQ_OP_WRITE_SAME;
+               job->rw = REQ_OP_WRITE_ZEROES;
                for (i = 0; i < job->num_dests; i++)
-                       if (!bdev_write_same(job->dests[i].bdev)) {
+                       if (!bdev_write_zeroes_sectors(job->dests[i].bdev)) {
                                job->rw = WRITE;
                                break;
                        }
index 4788b0b..e17fd44 100644 (file)
@@ -59,6 +59,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        ti->num_flush_bios = 1;
        ti->num_discard_bios = 1;
        ti->num_write_same_bios = 1;
+       ti->num_write_zeroes_bios = 1;
        ti->private = lc;
        return 0;
 
index 7f223db..2950b14 100644 (file)
@@ -1103,6 +1103,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
        ti->num_flush_bios = 1;
        ti->num_discard_bios = 1;
        ti->num_write_same_bios = 1;
+       ti->num_write_zeroes_bios = 1;
        if (m->queue_mode == DM_TYPE_BIO_BASED)
                ti->per_io_data_size = multipath_per_bio_data_size();
        else
@@ -1491,7 +1492,7 @@ static int do_end_io(struct multipath *m, struct request *clone,
         */
        int r = DM_ENDIO_REQUEUE;
 
-       if (!error && !clone->errors)
+       if (!error)
                return 0;       /* I/O complete */
 
        if (noretry_error(error))
index f8564d6..2dae3e5 100644 (file)
@@ -2813,7 +2813,9 @@ static void configure_discard_support(struct raid_set *rs)
        /* Assume discards not supported until after checks below. */
        ti->discards_supported = false;
 
-       /* RAID level 4,5,6 require discard_zeroes_data for data integrity! */
+       /*
+        * XXX: RAID level 4,5,6 require zeroing for safety.
+        */
        raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6);
 
        for (i = 0; i < rs->raid_disks; i++) {
@@ -2827,8 +2829,6 @@ static void configure_discard_support(struct raid_set *rs)
                        return;
 
                if (raid456) {
-                       if (!q->limits.discard_zeroes_data)
-                               return;
                        if (!devices_handle_discard_safely) {
                                DMERR("raid456 discard support disabled due to discard_zeroes_data uncertainty.");
                                DMERR("Set dm-raid.devices_handle_discard_safely=Y to override.");
@@ -3726,7 +3726,7 @@ static int raid_preresume(struct dm_target *ti)
                return r;
 
        /* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) */
-       if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) &&
+       if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap &&
            mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)) {
                r = bitmap_resize(mddev->bitmap, mddev->dev_sectors,
                                  to_bytes(rs->requested_bitmap_chunk_sectors), 0);
index 2ddc2d2..a95cbb8 100644 (file)
@@ -1124,7 +1124,6 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        ti->num_flush_bios = 1;
        ti->num_discard_bios = 1;
        ti->per_io_data_size = sizeof(struct dm_raid1_bio_record);
-       ti->discard_zeroes_data_unsupported = true;
 
        ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0);
        if (!ms->kmirrord_wq) {
index 28955b9..bff7e3b 100644 (file)
@@ -298,9 +298,14 @@ static void dm_done(struct request *clone, int error, bool mapped)
                        r = rq_end_io(tio->ti, clone, error, &tio->info);
        }
 
-       if (unlikely(r == -EREMOTEIO && (req_op(clone) == REQ_OP_WRITE_SAME) &&
-                    !clone->q->limits.max_write_same_sectors))
-               disable_write_same(tio->md);
+       if (unlikely(r == -EREMOTEIO)) {
+               if (req_op(clone) == REQ_OP_WRITE_SAME &&
+                   !clone->q->limits.max_write_same_sectors)
+                       disable_write_same(tio->md);
+               if (req_op(clone) == REQ_OP_WRITE_ZEROES &&
+                   !clone->q->limits.max_write_zeroes_sectors)
+                       disable_write_zeroes(tio->md);
+       }
 
        if (r <= 0)
                /* The target wants to complete the I/O */
@@ -358,7 +363,7 @@ static void dm_complete_request(struct request *rq, int error)
        if (!rq->q->mq_ops)
                blk_complete_request(rq);
        else
-               blk_mq_complete_request(rq, error);
+               blk_mq_complete_request(rq);
 }
 
 /*
@@ -755,13 +760,14 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
                /* Undo dm_start_request() before requeuing */
                rq_end_stats(md, rq);
                rq_completed(md, rq_data_dir(rq), false);
+               blk_mq_delay_run_hw_queue(hctx, 100/*ms*/);
                return BLK_MQ_RQ_QUEUE_BUSY;
        }
 
        return BLK_MQ_RQ_QUEUE_OK;
 }
 
-static struct blk_mq_ops dm_mq_ops = {
+static const struct blk_mq_ops dm_mq_ops = {
        .queue_rq = dm_mq_queue_rq,
        .complete = dm_softirq_done,
        .init_request = dm_mq_init_request,
index 28193a5..5ef49c1 100644 (file)
@@ -169,6 +169,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        ti->num_flush_bios = stripes;
        ti->num_discard_bios = stripes;
        ti->num_write_same_bios = stripes;
+       ti->num_write_zeroes_bios = stripes;
 
        sc->chunk_size = chunk_size;
        if (chunk_size & (chunk_size - 1))
@@ -293,6 +294,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
                return DM_MAPIO_REMAPPED;
        }
        if (unlikely(bio_op(bio) == REQ_OP_DISCARD) ||
+           unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES) ||
            unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) {
                target_bio_nr = dm_bio_get_target_bio_nr(bio);
                BUG_ON(target_bio_nr >= sc->stripes);
index 3ad16d9..958275a 100644 (file)
@@ -1449,22 +1449,6 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
        return false;
 }
 
-static bool dm_table_discard_zeroes_data(struct dm_table *t)
-{
-       struct dm_target *ti;
-       unsigned i = 0;
-
-       /* Ensure that all targets supports discard_zeroes_data. */
-       while (i < dm_table_get_num_targets(t)) {
-               ti = dm_table_get_target(t, i++);
-
-               if (ti->discard_zeroes_data_unsupported)
-                       return false;
-       }
-
-       return true;
-}
-
 static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
                            sector_t start, sector_t len, void *data)
 {
@@ -1533,6 +1517,34 @@ static bool dm_table_supports_write_same(struct dm_table *t)
        return true;
 }
 
+static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev,
+                                          sector_t start, sector_t len, void *data)
+{
+       struct request_queue *q = bdev_get_queue(dev->bdev);
+       /*
+        * iterate_devices callout used by dm_table_supports_write_zeroes():
+        * nonzero when the device has a request queue whose
+        * max_write_zeroes_sectors limit is 0. ti, start, len and data are
+        * not used here.
+        */
+       return q && !q->limits.max_write_zeroes_sectors;
+}
+/*
+ * Return true only if every target in @t sets num_write_zeroes_bios, has an
+ * iterate_devices method, and that method returns zero when called with
+ * device_not_write_zeroes_capable. Any target failing one of these checks
+ * makes the whole table report false.
+ */
+static bool dm_table_supports_write_zeroes(struct dm_table *t)
+{
+       struct dm_target *ti;
+       unsigned i = 0;
+
+       while (i < dm_table_get_num_targets(t)) {
+               ti = dm_table_get_target(t, i++);
+
+               if (!ti->num_write_zeroes_bios)
+                       return false;
+
+               if (!ti->type->iterate_devices ||
+                   ti->type->iterate_devices(ti, device_not_write_zeroes_capable, NULL))
+                       return false;
+       }
+
+       return true;
+}
+
+
 static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
                                  sector_t start, sector_t len, void *data)
 {
@@ -1592,9 +1604,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
        }
        blk_queue_write_cache(q, wc, fua);
 
-       if (!dm_table_discard_zeroes_data(t))
-               q->limits.discard_zeroes_data = 0;
-
        /* Ensure that all underlying devices are non-rotational. */
        if (dm_table_all_devices_attribute(t, device_is_nonrot))
                queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
@@ -1603,6 +1612,8 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 
        if (!dm_table_supports_write_same(t))
                q->limits.max_write_same_sectors = 0;
+       if (!dm_table_supports_write_zeroes(t))
+               q->limits.max_write_zeroes_sectors = 0;
 
        if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
                queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
index 2b266a2..a5f1916 100644 (file)
@@ -3263,7 +3263,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
         * them down to the data device.  The thin device's discard
         * processing will cause mappings to be removed from the btree.
         */
-       ti->discard_zeroes_data_unsupported = true;
        if (pf.discard_enabled && pf.discard_passdown) {
                ti->num_discard_bios = 1;
 
@@ -4119,7 +4118,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
        ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
 
        /* In case the pool supports discards, pass them on. */
-       ti->discard_zeroes_data_unsupported = true;
        if (tc->pool->pf.discard_enabled) {
                ti->discards_supported = true;
                ti->num_discard_bios = 1;
index 0f0eb8a..78f3601 100644 (file)
@@ -146,8 +146,6 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio,
                block = fec_buffer_rs_block(v, fio, n, i);
                res = fec_decode_rs8(v, fio, block, &par[offset], neras);
                if (res < 0) {
-                       dm_bufio_release(buf);
-
                        r = res;
                        goto error;
                }
@@ -172,6 +170,8 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio,
 done:
        r = corrected;
 error:
+       dm_bufio_release(buf);
+
        if (r < 0 && neras)
                DMERR_LIMIT("%s: FEC %llu: failed to correct: %d",
                            v->data_dev->name, (unsigned long long)rsb, r);
@@ -269,7 +269,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io,
                                          &is_zero) == 0) {
                        /* skip known zero blocks entirely */
                        if (is_zero)
-                               continue;
+                               goto done;
 
                        /*
                         * skip if we have already found the theoretical
@@ -439,6 +439,13 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
        if (!verity_fec_is_enabled(v))
                return -EOPNOTSUPP;
 
+       if (fio->level >= DM_VERITY_FEC_MAX_RECURSION) {
+               DMWARN_LIMIT("%s: FEC: recursion too deep", v->data_dev->name);
+               return -EIO;
+       }
+
+       fio->level++;
+
        if (type == DM_VERITY_BLOCK_TYPE_METADATA)
                block += v->data_blocks;
 
@@ -470,7 +477,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
        if (r < 0) {
                r = fec_decode_rsb(v, io, fio, rsb, offset, true);
                if (r < 0)
-                       return r;
+                       goto done;
        }
 
        if (dest)
@@ -480,6 +487,8 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io,
                r = verity_for_bv_block(v, io, iter, fec_bv_copy);
        }
 
+done:
+       fio->level--;
        return r;
 }
 
@@ -520,6 +529,7 @@ void verity_fec_init_io(struct dm_verity_io *io)
        memset(fio->bufs, 0, sizeof(fio->bufs));
        fio->nbufs = 0;
        fio->output = NULL;
+       fio->level = 0;
 }
 
 /*
index 7fa0298..bb31ce8 100644 (file)
@@ -27,6 +27,9 @@
 #define DM_VERITY_FEC_BUF_MAX \
        (1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS))
 
+/* maximum recursion level for verity_fec_decode */
+#define DM_VERITY_FEC_MAX_RECURSION    4
+
 #define DM_VERITY_OPT_FEC_DEV          "use_fec_from_device"
 #define DM_VERITY_OPT_FEC_BLOCKS       "fec_blocks"
 #define DM_VERITY_OPT_FEC_START                "fec_start"
@@ -58,6 +61,7 @@ struct dm_verity_fec_io {
        unsigned nbufs;         /* number of buffers allocated */
        u8 *output;             /* buffer for corrected output */
        size_t output_pos;
+       unsigned level;         /* recursion level */
 };
 
 #ifdef CONFIG_DM_VERITY_FEC
index dfb7597..8bf3977 100644 (file)
@@ -810,7 +810,6 @@ static void dec_pending(struct dm_io *io, int error)
                        queue_io(md, bio);
                } else {
                        /* done with normal IO or empty flush */
-                       trace_block_bio_complete(md->queue, bio, io_error);
                        bio->bi_error = io_error;
                        bio_endio(bio);
                }
@@ -825,6 +824,14 @@ void disable_write_same(struct mapped_device *md)
        limits->max_write_same_sectors = 0;
 }
 
+void disable_write_zeroes(struct mapped_device *md)
+{
+       struct queue_limits *limits = dm_get_queue_limits(md);
+
+       /* device doesn't really support WRITE ZEROES, disable it */
+       limits->max_write_zeroes_sectors = 0;
+}
+
 static void clone_endio(struct bio *bio)
 {
        int error = bio->bi_error;
@@ -851,9 +858,14 @@ static void clone_endio(struct bio *bio)
                }
        }
 
-       if (unlikely(r == -EREMOTEIO && (bio_op(bio) == REQ_OP_WRITE_SAME) &&
-                    !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
-               disable_write_same(md);
+       if (unlikely(r == -EREMOTEIO)) {
+               if (bio_op(bio) == REQ_OP_WRITE_SAME &&
+                   !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
+                       disable_write_same(md);
+               if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
+                   !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
+                       disable_write_zeroes(md);
+       }
 
        free_tio(tio);
        dec_pending(io, error);
@@ -1202,6 +1214,11 @@ static unsigned get_num_write_same_bios(struct dm_target *ti)
        return ti->num_write_same_bios;
 }
 
+static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
+{
+       return ti->num_write_zeroes_bios;
+}
+
 typedef bool (*is_split_required_fn)(struct dm_target *ti);
 
 static bool is_split_required_for_discard(struct dm_target *ti)
@@ -1256,6 +1273,11 @@ static int __send_write_same(struct clone_info *ci)
        return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
 }
 
+static int __send_write_zeroes(struct clone_info *ci)
+{
+       return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL);
+}
+
 /*
  * Select the correct strategy for processing a non-flush bio.
  */
@@ -1270,6 +1292,8 @@ static int __split_and_process_non_flush(struct clone_info *ci)
                return __send_discard(ci);
        else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
                return __send_write_same(ci);
+       else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES))
+               return __send_write_zeroes(ci);
 
        ti = dm_table_find_target(ci->map, ci->sector);
        if (!dm_target_is_valid(ti))
index 3e38e02..377a8a3 100644 (file)
@@ -293,6 +293,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
                                                      split, disk_devt(mddev->gendisk),
                                                      bio_sector);
                        mddev_check_writesame(mddev, split);
+                       mddev_check_write_zeroes(mddev, split);
                        generic_make_request(split);
                }
        } while (split != bio);
index dde8ecb..1e76d64 100644 (file)
@@ -709,4 +709,11 @@ static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
            !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
                mddev->queue->limits.max_write_same_sectors = 0;
 }
+
+static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio)
+{
+       if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
+           !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
+               mddev->queue->limits.max_write_zeroes_sectors = 0;
+}
 #endif /* _MD_MD_H */
index 79a12b5..e95d521 100644 (file)
@@ -139,6 +139,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
        mp_bh->bio.bi_end_io = multipath_end_request;
        mp_bh->bio.bi_private = mp_bh;
        mddev_check_writesame(mddev, &mp_bh->bio);
+       mddev_check_write_zeroes(mddev, &mp_bh->bio);
        generic_make_request(&mp_bh->bio);
        return;
 }
index 93347ca..ce7a6a5 100644 (file)
@@ -383,6 +383,7 @@ static int raid0_run(struct mddev *mddev)
 
                blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
                blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
+               blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
                blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
 
                blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
@@ -504,6 +505,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
                                                      split, disk_devt(mddev->gendisk),
                                                      bio_sector);
                        mddev_check_writesame(mddev, split);
+                       mddev_check_write_zeroes(mddev, split);
                        generic_make_request(split);
                }
        } while (split != bio);
index a34f587..b59cc10 100644 (file)
@@ -3177,8 +3177,10 @@ static int raid1_run(struct mddev *mddev)
        if (IS_ERR(conf))
                return PTR_ERR(conf);
 
-       if (mddev->queue)
+       if (mddev->queue) {
                blk_queue_max_write_same_sectors(mddev->queue, 0);
+               blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
+       }
 
        rdev_for_each(rdev, mddev) {
                if (!mddev->gendisk)
index e89a8d7..28ec3a9 100644 (file)
@@ -3749,6 +3749,7 @@ static int raid10_run(struct mddev *mddev)
                blk_queue_max_discard_sectors(mddev->queue,
                                              mddev->chunk_sectors);
                blk_queue_max_write_same_sectors(mddev->queue, 0);
+               blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
                blk_queue_io_min(mddev->queue, chunk_size);
                if (conf->geo.raid_disks % conf->geo.near_copies)
                        blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
index ed5cd70..2efdb0d 100644 (file)
@@ -5031,8 +5031,6 @@ static void raid5_align_endio(struct bio *bi)
        rdev_dec_pending(rdev, conf->mddev);
 
        if (!error) {
-               trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
-                                        raid_bi, 0);
                bio_endio(raid_bi);
                if (atomic_dec_and_test(&conf->active_aligned_reads))
                        wake_up(&conf->wait_for_quiescent);
@@ -7229,7 +7227,6 @@ static int raid5_run(struct mddev *mddev)
 
        if (mddev->queue) {
                int chunk_size;
-               bool discard_supported = true;
                /* read-ahead size must cover two whole stripes, which
                 * is 2 * (datadisks) * chunksize where 'n' is the
                 * number of raid devices
@@ -7265,48 +7262,32 @@ static int raid5_run(struct mddev *mddev)
                blk_queue_max_discard_sectors(mddev->queue,
                                              0xfffe * STRIPE_SECTORS);
 
-               /*
-                * unaligned part of discard request will be ignored, so can't
-                * guarantee discard_zeroes_data
-                */
-               mddev->queue->limits.discard_zeroes_data = 0;
-
                blk_queue_max_write_same_sectors(mddev->queue, 0);
+               blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
 
                rdev_for_each(rdev, mddev) {
                        disk_stack_limits(mddev->gendisk, rdev->bdev,
                                          rdev->data_offset << 9);
                        disk_stack_limits(mddev->gendisk, rdev->bdev,
                                          rdev->new_data_offset << 9);
-                       /*
-                        * discard_zeroes_data is required, otherwise data
-                        * could be lost. Consider a scenario: discard a stripe
-                        * (the stripe could be inconsistent if
-                        * discard_zeroes_data is 0); write one disk of the
-                        * stripe (the stripe could be inconsistent again
-                        * depending on which disks are used to calculate
-                        * parity); the disk is broken; The stripe data of this
-                        * disk is lost.
-                        */
-                       if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
-                           !bdev_get_queue(rdev->bdev)->
-                                               limits.discard_zeroes_data)
-                               discard_supported = false;
-                       /* Unfortunately, discard_zeroes_data is not currently
-                        * a guarantee - just a hint.  So we only allow DISCARD
-                        * if the sysadmin has confirmed that only safe devices
-                        * are in use by setting a module parameter.
-                        */
-                       if (!devices_handle_discard_safely) {
-                               if (discard_supported) {
-                                       pr_info("md/raid456: discard support disabled due to uncertainty.\n");
-                                       pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n");
-                               }
-                               discard_supported = false;
-                       }
                }
 
-               if (discard_supported &&
+               /*
+                * zeroing is required, otherwise data
+                * could be lost. Consider a scenario: discard a stripe
+                * (the stripe could be inconsistent if
+                * discard_zeroes_data is 0); write one disk of the
+                * stripe (the stripe could be inconsistent again
+                * depending on which disks are used to calculate
+                * parity); the disk is broken; The stripe data of this
+                * disk is lost.
+                *
+                * We only allow DISCARD if the sysadmin has confirmed that
+                * only safe devices are in use by setting a module parameter.
+                * A better idea might be to turn DISCARD into WRITE_ZEROES
+                * requests, as that is required to be safe.
+                */
+               if (devices_handle_discard_safely &&
                    mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
                    mddev->queue->limits.discard_granularity >= stripe)
                        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
index 493eb10..4c54ad3 100644 (file)
@@ -167,8 +167,6 @@ static void mmc_queue_setup_discard(struct request_queue *q,
 
        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
        blk_queue_max_discard_sectors(q, max_discard);
-       if (card->erased_byte == 0 && !mmc_can_discard(card))
-               q->limits.discard_zeroes_data = 1;
        q->limits.discard_granularity = card->pref_erase << 9;
        /* granularity must not be greater than max. discard */
        if (card->pref_erase > max_discard)
index e992a7f..2b32b88 100644 (file)
@@ -267,7 +267,7 @@ static void sdio_release_func(struct device *dev)
        sdio_free_func_cis(func);
 
        kfree(func->info);
-
+       kfree(func->tmpbuf);
        kfree(func);
 }
 
@@ -282,6 +282,16 @@ struct sdio_func *sdio_alloc_func(struct mmc_card *card)
        if (!func)
                return ERR_PTR(-ENOMEM);
 
+       /*
+        * allocate buffer separately to make sure it's properly aligned for
+        * DMA usage (incl. 64 bit DMA)
+        */
+       func->tmpbuf = kmalloc(4, GFP_KERNEL);
+       if (!func->tmpbuf) {
+               kfree(func);
+               return ERR_PTR(-ENOMEM);
+       }
+
        func->card = card;
 
        device_initialize(&func->dev);
index a9ac0b4..8718432 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/ioport.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/stat.h>
@@ -1621,10 +1622,16 @@ static void dw_mci_init_card(struct mmc_host *mmc, struct mmc_card *card)
 
                if (card->type == MMC_TYPE_SDIO ||
                    card->type == MMC_TYPE_SD_COMBO) {
-                       set_bit(DW_MMC_CARD_NO_LOW_PWR, &slot->flags);
+                       if (!test_bit(DW_MMC_CARD_NO_LOW_PWR, &slot->flags)) {
+                               pm_runtime_get_noresume(mmc->parent);
+                               set_bit(DW_MMC_CARD_NO_LOW_PWR, &slot->flags);
+                       }
                        clk_en_a = clk_en_a_old & ~clken_low_pwr;
                } else {
-                       clear_bit(DW_MMC_CARD_NO_LOW_PWR, &slot->flags);
+                       if (test_bit(DW_MMC_CARD_NO_LOW_PWR, &slot->flags)) {
+                               pm_runtime_put_noidle(mmc->parent);
+                               clear_bit(DW_MMC_CARD_NO_LOW_PWR, &slot->flags);
+                       }
                        clk_en_a = clk_en_a_old | clken_low_pwr;
                }
 
index 7123ef9..445fc47 100644 (file)
@@ -830,6 +830,7 @@ static int esdhc_change_pinstate(struct sdhci_host *host,
 
        switch (uhs) {
        case MMC_TIMING_UHS_SDR50:
+       case MMC_TIMING_UHS_DDR50:
                pinctrl = imx_data->pins_100mhz;
                break;
        case MMC_TIMING_UHS_SDR104:
index 66a9ded..1517da3 100644 (file)
@@ -46,7 +46,7 @@
 
 #include "mtdcore.h"
 
-static struct backing_dev_info *mtd_bdi;
+struct backing_dev_info *mtd_bdi;
 
 #ifdef CONFIG_PM_SLEEP
 
@@ -496,11 +496,9 @@ int add_mtd_device(struct mtd_info *mtd)
         * mtd_device_parse_register() multiple times on the same master MTD,
         * especially with CONFIG_MTD_PARTITIONED_MASTER=y.
         */
-       if (WARN_ONCE(mtd->backing_dev_info, "MTD already registered\n"))
+       if (WARN_ONCE(mtd->dev.type, "MTD already registered\n"))
                return -EEXIST;
 
-       mtd->backing_dev_info = mtd_bdi;
-
        BUG_ON(mtd->writesize == 0);
        mutex_lock(&mtd_table_mutex);
 
@@ -1775,13 +1773,18 @@ static struct backing_dev_info * __init mtd_bdi_init(char *name)
        struct backing_dev_info *bdi;
        int ret;
 
-       bdi = kzalloc(sizeof(*bdi), GFP_KERNEL);
+       bdi = bdi_alloc(GFP_KERNEL);
        if (!bdi)
                return ERR_PTR(-ENOMEM);
 
-       ret = bdi_setup_and_register(bdi, name);
+       bdi->name = name;
+       /*
+        * We put '-0' suffix to the name to get the same name format as we
+        * used to get. Since this is called only once, we get a unique name. 
+        */
+       ret = bdi_register(bdi, "%.28s-0", name);
        if (ret)
-               kfree(bdi);
+               bdi_put(bdi);
 
        return ret ? ERR_PTR(ret) : bdi;
 }
@@ -1813,8 +1816,7 @@ static int __init init_mtd(void)
 out_procfs:
        if (proc_mtd)
                remove_proc_entry("mtd", NULL);
-       bdi_destroy(mtd_bdi);
-       kfree(mtd_bdi);
+       bdi_put(mtd_bdi);
 err_bdi:
        class_unregister(&mtd_class);
 err_reg:
@@ -1828,8 +1830,7 @@ static void __exit cleanup_mtd(void)
        if (proc_mtd)
                remove_proc_entry("mtd", NULL);
        class_unregister(&mtd_class);
-       bdi_destroy(mtd_bdi);
-       kfree(mtd_bdi);
+       bdi_put(mtd_bdi);
        idr_destroy(&mtd_idr);
 }
 
index 20c02a3..e43fea8 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/ctype.h>
 #include <linux/slab.h>
 #include <linux/major.h>
+#include <linux/backing-dev.h>
 
 /*
  * compare superblocks to see if they're equivalent
@@ -38,6 +39,8 @@ static int get_sb_mtd_compare(struct super_block *sb, void *_mtd)
        return 0;
 }
 
+extern struct backing_dev_info *mtd_bdi;
+
 /*
  * mark the superblock by the MTD device it is using
  * - set the device number to be the correct MTD block device for pesuperstence
@@ -49,7 +52,8 @@ static int get_sb_mtd_set(struct super_block *sb, void *_mtd)
 
        sb->s_mtd = mtd;
        sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, mtd->index);
-       sb->s_bdi = mtd->backing_dev_info;
+       sb->s_bdi = bdi_get(mtd_bdi);
+
        return 0;
 }
 
index c80869e..51f2be8 100644 (file)
@@ -347,7 +347,7 @@ static int ubiblock_init_request(void *data, struct request *req,
        return 0;
 }
 
-static struct blk_mq_ops ubiblock_mq_ops = {
+static const struct blk_mq_ops ubiblock_mq_ops = {
        .queue_rq       = ubiblock_queue_rq,
        .init_request   = ubiblock_init_request,
 };
index 0134ba3..3971256 100644 (file)
@@ -148,11 +148,11 @@ int ubi_start_update(struct ubi_device *ubi, struct ubi_volume *vol,
                        return err;
        }
 
-       if (bytes == 0) {
-               err = ubi_wl_flush(ubi, UBI_ALL, UBI_ALL);
-               if (err)
-                       return err;
+       err = ubi_wl_flush(ubi, UBI_ALL, UBI_ALL);
+       if (err)
+               return err;
 
+       if (bytes == 0) {
                err = clear_update_marker(ubi, vol, 0);
                if (err)
                        return err;
index 8a4ba8b..34481c9 100644 (file)
@@ -1104,11 +1104,11 @@ static void bond_compute_features(struct bonding *bond)
                gso_max_size = min(gso_max_size, slave->dev->gso_max_size);
                gso_max_segs = min(gso_max_segs, slave->dev->gso_max_segs);
        }
+       bond_dev->hard_header_len = max_hard_header_len;
 
 done:
        bond_dev->vlan_features = vlan_features;
        bond_dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL;
-       bond_dev->hard_header_len = max_hard_header_len;
        bond_dev->gso_max_segs = gso_max_segs;
        netif_set_gso_max_size(bond_dev, gso_max_size);
 
index 138f5ae..4d1fe8d 100644 (file)
@@ -557,7 +557,7 @@ static int ifi_canfd_poll(struct napi_struct *napi, int quota)
        int work_done = 0;
 
        u32 stcmd = readl(priv->base + IFI_CANFD_STCMD);
-       u32 rxstcmd = readl(priv->base + IFI_CANFD_STCMD);
+       u32 rxstcmd = readl(priv->base + IFI_CANFD_RXSTCMD);
        u32 errctr = readl(priv->base + IFI_CANFD_ERROR_CTR);
 
        /* Handle bus state changes */
index caed4e6..11662f4 100644 (file)
@@ -826,8 +826,7 @@ static int rcar_can_probe(struct platform_device *pdev)
 
        devm_can_led_init(ndev);
 
-       dev_info(&pdev->dev, "device registered (regs @ %p, IRQ%d)\n",
-                priv->regs, ndev->irq);
+       dev_info(&pdev->dev, "device registered (IRQ%d)\n", ndev->irq);
 
        return 0;
 fail_candev:
index 8483a40..5f9e0e6 100644 (file)
@@ -72,6 +72,8 @@ config CAN_PEAK_USB
          PCAN-USB Pro         dual CAN 2.0b channels USB adapter
          PCAN-USB FD          single CAN-FD channel USB adapter
          PCAN-USB Pro FD      dual CAN-FD channels USB adapter
+         PCAN-Chip USB        CAN-FD to USB stamp module
+         PCAN-USB X6          6 CAN-FD channels USB adapter
 
          (see also http://www.peak-system.com).
 
index 300349f..eecee7f 100644 (file)
@@ -739,13 +739,18 @@ static const struct net_device_ops gs_usb_netdev_ops = {
 static int gs_usb_set_identify(struct net_device *netdev, bool do_identify)
 {
        struct gs_can *dev = netdev_priv(netdev);
-       struct gs_identify_mode imode;
+       struct gs_identify_mode *imode;
        int rc;
 
+       imode = kmalloc(sizeof(*imode), GFP_KERNEL);
+
+       if (!imode)
+               return -ENOMEM;
+
        if (do_identify)
-               imode.mode = GS_CAN_IDENTIFY_ON;
+               imode->mode = GS_CAN_IDENTIFY_ON;
        else
-               imode.mode = GS_CAN_IDENTIFY_OFF;
+               imode->mode = GS_CAN_IDENTIFY_OFF;
 
        rc = usb_control_msg(interface_to_usbdev(dev->iface),
                             usb_sndctrlpipe(interface_to_usbdev(dev->iface),
@@ -755,10 +760,12 @@ static int gs_usb_set_identify(struct net_device *netdev, bool do_identify)
                             USB_RECIP_INTERFACE,
                             dev->channel,
                             0,
-                            &imode,
-                            sizeof(imode),
+                            imode,
+                            sizeof(*imode),
                             100);
 
+       kfree(imode);
+
        return (rc > 0) ? 0 : rc;
 }
 
index 0b0302a..57913db 100644 (file)
@@ -39,6 +39,7 @@ static struct usb_device_id peak_usb_table[] = {
        {USB_DEVICE(PCAN_USB_VENDOR_ID, PCAN_USBPRO_PRODUCT_ID)},
        {USB_DEVICE(PCAN_USB_VENDOR_ID, PCAN_USBFD_PRODUCT_ID)},
        {USB_DEVICE(PCAN_USB_VENDOR_ID, PCAN_USBPROFD_PRODUCT_ID)},
+       {USB_DEVICE(PCAN_USB_VENDOR_ID, PCAN_USBCHIP_PRODUCT_ID)},
        {USB_DEVICE(PCAN_USB_VENDOR_ID, PCAN_USBX6_PRODUCT_ID)},
        {} /* Terminating entry */
 };
@@ -51,6 +52,7 @@ static const struct peak_usb_adapter *const peak_usb_adapters_list[] = {
        &pcan_usb_pro,
        &pcan_usb_fd,
        &pcan_usb_pro_fd,
+       &pcan_usb_chip,
        &pcan_usb_x6,
 };
 
index 3cbfb06..c01316c 100644 (file)
@@ -27,6 +27,7 @@
 #define PCAN_USBPRO_PRODUCT_ID         0x000d
 #define PCAN_USBPROFD_PRODUCT_ID       0x0011
 #define PCAN_USBFD_PRODUCT_ID          0x0012
+#define PCAN_USBCHIP_PRODUCT_ID                0x0013
 #define PCAN_USBX6_PRODUCT_ID          0x0014
 
 #define PCAN_USB_DRIVER_NAME           "peak_usb"
@@ -90,6 +91,7 @@ struct peak_usb_adapter {
 extern const struct peak_usb_adapter pcan_usb;
 extern const struct peak_usb_adapter pcan_usb_pro;
 extern const struct peak_usb_adapter pcan_usb_fd;
+extern const struct peak_usb_adapter pcan_usb_chip;
 extern const struct peak_usb_adapter pcan_usb_pro_fd;
 extern const struct peak_usb_adapter pcan_usb_x6;
 
index 3047325..528d3bb 100644 (file)
@@ -1061,6 +1061,78 @@ const struct peak_usb_adapter pcan_usb_fd = {
        .do_get_berr_counter = pcan_usb_fd_get_berr_counter,
 };
 
+/* describes the PCAN-CHIP USB */
+static const struct can_bittiming_const pcan_usb_chip_const = {
+       .name = "pcan_chip_usb",
+       .tseg1_min = 1,
+       .tseg1_max = (1 << PUCAN_TSLOW_TSGEG1_BITS),
+       .tseg2_min = 1,
+       .tseg2_max = (1 << PUCAN_TSLOW_TSGEG2_BITS),
+       .sjw_max = (1 << PUCAN_TSLOW_SJW_BITS),
+       .brp_min = 1,
+       .brp_max = (1 << PUCAN_TSLOW_BRP_BITS),
+       .brp_inc = 1,
+};
+
+static const struct can_bittiming_const pcan_usb_chip_data_const = {
+       .name = "pcan_chip_usb",
+       .tseg1_min = 1,
+       .tseg1_max = (1 << PUCAN_TFAST_TSGEG1_BITS),
+       .tseg2_min = 1,
+       .tseg2_max = (1 << PUCAN_TFAST_TSGEG2_BITS),
+       .sjw_max = (1 << PUCAN_TFAST_SJW_BITS),
+       .brp_min = 1,
+       .brp_max = (1 << PUCAN_TFAST_BRP_BITS),
+       .brp_inc = 1,
+};
+
+const struct peak_usb_adapter pcan_usb_chip = {
+       .name = "PCAN-Chip USB",
+       .device_id = PCAN_USBCHIP_PRODUCT_ID,
+       .ctrl_count = PCAN_USBFD_CHANNEL_COUNT,
+       .ctrlmode_supported = CAN_CTRLMODE_FD |
+               CAN_CTRLMODE_3_SAMPLES | CAN_CTRLMODE_LISTENONLY,
+       .clock = {
+               .freq = PCAN_UFD_CRYSTAL_HZ,
+       },
+       .bittiming_const = &pcan_usb_chip_const,
+       .data_bittiming_const = &pcan_usb_chip_data_const,
+
+       /* size of device private data */
+       .sizeof_dev_private = sizeof(struct pcan_usb_fd_device),
+
+       /* timestamps usage */
+       .ts_used_bits = 32,
+       .ts_period = 1000000, /* calibration period in ts. */
+       .us_per_ts_scale = 1, /* us = (ts * scale) >> shift */
+       .us_per_ts_shift = 0,
+
+       /* give here messages in/out endpoints */
+       .ep_msg_in = PCAN_USBPRO_EP_MSGIN,
+       .ep_msg_out = {PCAN_USBPRO_EP_MSGOUT_0},
+
+       /* size of rx/tx usb buffers */
+       .rx_buffer_size = PCAN_UFD_RX_BUFFER_SIZE,
+       .tx_buffer_size = PCAN_UFD_TX_BUFFER_SIZE,
+
+       /* device callbacks */
+       .intf_probe = pcan_usb_pro_probe,       /* same as PCAN-USB Pro */
+       .dev_init = pcan_usb_fd_init,
+
+       .dev_exit = pcan_usb_fd_exit,
+       .dev_free = pcan_usb_fd_free,
+       .dev_set_bus = pcan_usb_fd_set_bus,
+       .dev_set_bittiming = pcan_usb_fd_set_bittiming_slow,
+       .dev_set_data_bittiming = pcan_usb_fd_set_bittiming_fast,
+       .dev_decode_buf = pcan_usb_fd_decode_buf,
+       .dev_start = pcan_usb_fd_start,
+       .dev_stop = pcan_usb_fd_stop,
+       .dev_restart_async = pcan_usb_fd_restart_async,
+       .dev_encode_msg = pcan_usb_fd_encode_msg,
+
+       .do_get_berr_counter = pcan_usb_fd_get_berr_counter,
+};
+
 /* describes the PCAN-USB Pro FD adapter */
 static const struct can_bittiming_const pcan_usb_pro_fd_const = {
        .name = "pcan_usb_pro_fd",
index 8cf4801..fa0eece 100644 (file)
@@ -326,6 +326,7 @@ static void b53_get_vlan_entry(struct b53_device *dev, u16 vid,
 
 static void b53_set_forwarding(struct b53_device *dev, int enable)
 {
+       struct dsa_switch *ds = dev->ds;
        u8 mgmt;
 
        b53_read8(dev, B53_CTRL_PAGE, B53_SWITCH_MODE, &mgmt);
@@ -336,6 +337,15 @@ static void b53_set_forwarding(struct b53_device *dev, int enable)
                mgmt &= ~SM_SW_FWD_EN;
 
        b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_MODE, mgmt);
+
+       /* Include IMP port in dumb forwarding mode when no tagging protocol is
+        * set
+        */
+       if (ds->ops->get_tag_protocol(ds) == DSA_TAG_PROTO_NONE) {
+               b53_read8(dev, B53_CTRL_PAGE, B53_SWITCH_CTRL, &mgmt);
+               mgmt |= B53_MII_DUMB_FWDG_EN;
+               b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_CTRL, mgmt);
+       }
 }
 
 static void b53_enable_vlan(struct b53_device *dev, bool enable)
@@ -598,7 +608,8 @@ static void b53_switch_reset_gpio(struct b53_device *dev)
 
 static int b53_switch_reset(struct b53_device *dev)
 {
-       u8 mgmt;
+       unsigned int timeout = 1000;
+       u8 mgmt, reg;
 
        b53_switch_reset_gpio(dev);
 
@@ -607,6 +618,28 @@ static int b53_switch_reset(struct b53_device *dev)
                b53_write8(dev, B53_CTRL_PAGE, B53_SOFTRESET, 0x00);
        }
 
+       /* This is specific to 58xx devices here, do not use is58xx() which
+        * covers the larger Starfighter 2 family, including 7445/7278 which
+        * still use this driver as a library and need to perform the reset
+        * earlier.
+        */
+       if (dev->chip_id == BCM58XX_DEVICE_ID) {
+               b53_read8(dev, B53_CTRL_PAGE, B53_SOFTRESET, &reg);
+               reg |= SW_RST | EN_SW_RST | EN_CH_RST;
+               b53_write8(dev, B53_CTRL_PAGE, B53_SOFTRESET, reg);
+
+               do {
+                       b53_read8(dev, B53_CTRL_PAGE, B53_SOFTRESET, &reg);
+                       if (!(reg & SW_RST))
+                               break;
+
+                       usleep_range(1000, 2000);
+               } while (timeout-- > 0);
+
+               if (timeout == 0)
+                       return -ETIMEDOUT;
+       }
+
        b53_read8(dev, B53_CTRL_PAGE, B53_SWITCH_MODE, &mgmt);
 
        if (!(mgmt & SM_SW_FWD_EN)) {
@@ -1731,7 +1764,7 @@ static const struct b53_chip_data b53_switch_chips[] = {
                .vlans  = 4096,
                .enabled_ports = 0x1ff,
                .arl_entries = 4,
-               .cpu_port = B53_CPU_PORT_25,
+               .cpu_port = B53_CPU_PORT,
                .vta_regs = B53_VTA_REGS,
                .duplex_reg = B53_DUPLEX_STAT_GE,
                .jumbo_pm_reg = B53_JUMBO_PORT_MASK,
index 9fd24c4..e5c86d4 100644 (file)
 #define  B53_UC_FWD_EN                 BIT(6)
 #define  B53_MC_FWD_EN                 BIT(7)
 
+/* Switch control (8 bit) */
+#define B53_SWITCH_CTRL                        0x22
+#define  B53_MII_DUMB_FWDG_EN          BIT(6)
+
 /* (16 bit) */
 #define B53_UC_FLOOD_MASK              0x32
 #define B53_MC_FLOOD_MASK              0x34
 /* Software reset register (8 bit) */
 #define B53_SOFTRESET                  0x79
 #define   SW_RST                       BIT(7)
+#define   EN_CH_RST                    BIT(6)
 #define   EN_SW_RST                    BIT(4)
 
 /* Fast Aging Control register (8 bit) */
index d05fbfd..5d6c40d 100644 (file)
@@ -100,11 +100,6 @@ static int aq_ndev_change_mtu(struct net_device *ndev, int new_mtu)
                goto err_exit;
        ndev->mtu = new_mtu;
 
-       if (netif_running(ndev)) {
-               aq_ndev_close(ndev);
-               aq_ndev_open(ndev);
-       }
-
 err_exit:
        return err;
 }
index ee78444..cdb0299 100644 (file)
@@ -487,6 +487,9 @@ static unsigned int aq_nic_map_skb(struct aq_nic_s *self,
                dx_buff->mss = skb_shinfo(skb)->gso_size;
                dx_buff->is_txc = 1U;
 
+               dx_buff->is_ipv6 =
+                       (ip_hdr(skb)->version == 6) ? 1U : 0U;
+
                dx = aq_ring_next_dx(ring, dx);
                dx_buff = &ring->buff_ring[dx];
                ++ret;
@@ -510,10 +513,22 @@ static unsigned int aq_nic_map_skb(struct aq_nic_s *self,
        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                dx_buff->is_ip_cso = (htons(ETH_P_IP) == skb->protocol) ?
                        1U : 0U;
-               dx_buff->is_tcp_cso =
-                       (ip_hdr(skb)->protocol == IPPROTO_TCP) ? 1U : 0U;
-               dx_buff->is_udp_cso =
-                       (ip_hdr(skb)->protocol == IPPROTO_UDP) ? 1U : 0U;
+
+               if (ip_hdr(skb)->version == 4) {
+                       dx_buff->is_tcp_cso =
+                               (ip_hdr(skb)->protocol == IPPROTO_TCP) ?
+                                       1U : 0U;
+                       dx_buff->is_udp_cso =
+                               (ip_hdr(skb)->protocol == IPPROTO_UDP) ?
+                                       1U : 0U;
+               } else if (ip_hdr(skb)->version == 6) {
+                       dx_buff->is_tcp_cso =
+                               (ipv6_hdr(skb)->nexthdr == NEXTHDR_TCP) ?
+                                       1U : 0U;
+                       dx_buff->is_udp_cso =
+                               (ipv6_hdr(skb)->nexthdr == NEXTHDR_UDP) ?
+                                       1U : 0U;
+               }
        }
 
        for (; nr_frags--; ++frag_count) {
index 0358e60..3a8a4aa 100644 (file)
@@ -101,6 +101,7 @@ int aq_ring_init(struct aq_ring_s *self)
        self->hw_head = 0;
        self->sw_head = 0;
        self->sw_tail = 0;
+       spin_lock_init(&self->header.lock);
        return 0;
 }
 
index 2572546..eecd6d1 100644 (file)
@@ -58,7 +58,8 @@ struct __packed aq_ring_buff_s {
                        u8 len_l2;
                        u8 len_l3;
                        u8 len_l4;
-                       u8 rsvd2;
+                       u8 is_ipv6:1;
+                       u8 rsvd2:7;
                        u32 len_pkt;
                };
        };
index a2b746a..4ee15ff 100644 (file)
@@ -433,6 +433,9 @@ static int hw_atl_a0_hw_ring_tx_xmit(struct aq_hw_s *self,
                                    buff->len_l3 +
                                    buff->len_l2);
                        is_gso = true;
+
+                       if (buff->is_ipv6)
+                               txd->ctl |= HW_ATL_A0_TXD_CTL_CMD_IPV6;
                } else {
                        buff_pa_len = buff->len;
 
@@ -458,6 +461,7 @@ static int hw_atl_a0_hw_ring_tx_xmit(struct aq_hw_s *self,
                        if (unlikely(buff->is_eop)) {
                                txd->ctl |= HW_ATL_A0_TXD_CTL_EOP;
                                txd->ctl |= HW_ATL_A0_TXD_CTL_CMD_WB;
+                               is_gso = false;
                        }
                }
 
index cab2931..4215070 100644 (file)
@@ -471,6 +471,9 @@ static int hw_atl_b0_hw_ring_tx_xmit(struct aq_hw_s *self,
                                    buff->len_l3 +
                                    buff->len_l2);
                        is_gso = true;
+
+                       if (buff->is_ipv6)
+                               txd->ctl |= HW_ATL_B0_TXD_CTL_CMD_IPV6;
                } else {
                        buff_pa_len = buff->len;
 
@@ -496,6 +499,7 @@ static int hw_atl_b0_hw_ring_tx_xmit(struct aq_hw_s *self,
                        if (unlikely(buff->is_eop)) {
                                txd->ctl |= HW_ATL_B0_TXD_CTL_EOP;
                                txd->ctl |= HW_ATL_B0_TXD_CTL_CMD_WB;
+                               is_gso = false;
                        }
                }
 
index 0a23034..352beff 100644 (file)
@@ -2277,7 +2277,7 @@ void bnx2x_igu_clear_sb_gen(struct bnx2x *bp, u8 func, u8 idu_sb_id,
                                 GENERAL_ATTEN_OFFSET(LATCHED_ATTN_RBCP) | \
                                 GENERAL_ATTEN_OFFSET(LATCHED_ATTN_RSVD_GRC))
 
-#define HW_INTERRUT_ASSERT_SET_0 \
+#define HW_INTERRUPT_ASSERT_SET_0 \
                                (AEU_INPUTS_ATTN_BITS_TSDM_HW_INTERRUPT | \
                                 AEU_INPUTS_ATTN_BITS_TCM_HW_INTERRUPT | \
                                 AEU_INPUTS_ATTN_BITS_TSEMI_HW_INTERRUPT | \
@@ -2290,7 +2290,7 @@ void bnx2x_igu_clear_sb_gen(struct bnx2x *bp, u8 func, u8 idu_sb_id,
                                 AEU_INPUTS_ATTN_BITS_TSEMI_PARITY_ERROR |\
                                 AEU_INPUTS_ATTN_BITS_TCM_PARITY_ERROR |\
                                 AEU_INPUTS_ATTN_BITS_PBCLIENT_PARITY_ERROR)
-#define HW_INTERRUT_ASSERT_SET_1 \
+#define HW_INTERRUPT_ASSERT_SET_1 \
                                (AEU_INPUTS_ATTN_BITS_QM_HW_INTERRUPT | \
                                 AEU_INPUTS_ATTN_BITS_TIMERS_HW_INTERRUPT | \
                                 AEU_INPUTS_ATTN_BITS_XSDM_HW_INTERRUPT | \
@@ -2318,7 +2318,7 @@ void bnx2x_igu_clear_sb_gen(struct bnx2x *bp, u8 func, u8 idu_sb_id,
                                 AEU_INPUTS_ATTN_BITS_UPB_PARITY_ERROR | \
                                 AEU_INPUTS_ATTN_BITS_CSDM_PARITY_ERROR |\
                                 AEU_INPUTS_ATTN_BITS_CCM_PARITY_ERROR)
-#define HW_INTERRUT_ASSERT_SET_2 \
+#define HW_INTERRUPT_ASSERT_SET_2 \
                                (AEU_INPUTS_ATTN_BITS_CSEMI_HW_INTERRUPT | \
                                 AEU_INPUTS_ATTN_BITS_CDU_HW_INTERRUPT | \
                                 AEU_INPUTS_ATTN_BITS_DMAE_HW_INTERRUPT | \
index ac76fc2..a851f95 100644 (file)
@@ -4166,14 +4166,14 @@ static void bnx2x_attn_int_deasserted0(struct bnx2x *bp, u32 attn)
                bnx2x_release_phy_lock(bp);
        }
 
-       if (attn & HW_INTERRUT_ASSERT_SET_0) {
+       if (attn & HW_INTERRUPT_ASSERT_SET_0) {
 
                val = REG_RD(bp, reg_offset);
-               val &= ~(attn & HW_INTERRUT_ASSERT_SET_0);
+               val &= ~(attn & HW_INTERRUPT_ASSERT_SET_0);
                REG_WR(bp, reg_offset, val);
 
                BNX2X_ERR("FATAL HW block attention set0 0x%x\n",
-                         (u32)(attn & HW_INTERRUT_ASSERT_SET_0));
+                         (u32)(attn & HW_INTERRUPT_ASSERT_SET_0));
                bnx2x_panic();
        }
 }
@@ -4191,7 +4191,7 @@ static void bnx2x_attn_int_deasserted1(struct bnx2x *bp, u32 attn)
                        BNX2X_ERR("FATAL error from DORQ\n");
        }
 
-       if (attn & HW_INTERRUT_ASSERT_SET_1) {
+       if (attn & HW_INTERRUPT_ASSERT_SET_1) {
 
                int port = BP_PORT(bp);
                int reg_offset;
@@ -4200,11 +4200,11 @@ static void bnx2x_attn_int_deasserted1(struct bnx2x *bp, u32 attn)
                                     MISC_REG_AEU_ENABLE1_FUNC_0_OUT_1);
 
                val = REG_RD(bp, reg_offset);
-               val &= ~(attn & HW_INTERRUT_ASSERT_SET_1);
+               val &= ~(attn & HW_INTERRUPT_ASSERT_SET_1);
                REG_WR(bp, reg_offset, val);
 
                BNX2X_ERR("FATAL HW block attention set1 0x%x\n",
-                         (u32)(attn & HW_INTERRUT_ASSERT_SET_1));
+                         (u32)(attn & HW_INTERRUPT_ASSERT_SET_1));
                bnx2x_panic();
        }
 }
@@ -4235,7 +4235,7 @@ static void bnx2x_attn_int_deasserted2(struct bnx2x *bp, u32 attn)
                }
        }
 
-       if (attn & HW_INTERRUT_ASSERT_SET_2) {
+       if (attn & HW_INTERRUPT_ASSERT_SET_2) {
 
                int port = BP_PORT(bp);
                int reg_offset;
@@ -4244,11 +4244,11 @@ static void bnx2x_attn_int_deasserted2(struct bnx2x *bp, u32 attn)
                                     MISC_REG_AEU_ENABLE1_FUNC_0_OUT_2);
 
                val = REG_RD(bp, reg_offset);
-               val &= ~(attn & HW_INTERRUT_ASSERT_SET_2);
+               val &= ~(attn & HW_INTERRUPT_ASSERT_SET_2);
                REG_WR(bp, reg_offset, val);
 
                BNX2X_ERR("FATAL HW block attention set2 0x%x\n",
-                         (u32)(attn & HW_INTERRUT_ASSERT_SET_2));
+                         (u32)(attn & HW_INTERRUPT_ASSERT_SET_2));
                bnx2x_panic();
        }
 }
index 32de458..1f1e54b 100644 (file)
@@ -1983,20 +1983,25 @@ static void bnxt_free_rx_skbs(struct bnxt *bp)
 
                for (j = 0; j < max_idx; j++) {
                        struct bnxt_sw_rx_bd *rx_buf = &rxr->rx_buf_ring[j];
+                       dma_addr_t mapping = rx_buf->mapping;
                        void *data = rx_buf->data;
 
                        if (!data)
                                continue;
 
-                       dma_unmap_single(&pdev->dev, rx_buf->mapping,
-                                        bp->rx_buf_use_size, bp->rx_dir);
-
                        rx_buf->data = NULL;
 
-                       if (BNXT_RX_PAGE_MODE(bp))
+                       if (BNXT_RX_PAGE_MODE(bp)) {
+                               mapping -= bp->rx_dma_offset;
+                               dma_unmap_page(&pdev->dev, mapping,
+                                              PAGE_SIZE, bp->rx_dir);
                                __free_page(data);
-                       else
+                       } else {
+                               dma_unmap_single(&pdev->dev, mapping,
+                                                bp->rx_buf_use_size,
+                                                bp->rx_dir);
                                kfree(data);
+                       }
                }
 
                for (j = 0; j < max_agg_idx; j++) {
@@ -2455,6 +2460,18 @@ static int bnxt_init_one_rx_ring(struct bnxt *bp, int ring_nr)
        return 0;
 }
 
+static void bnxt_init_cp_rings(struct bnxt *bp)
+{
+       int i;
+
+       for (i = 0; i < bp->cp_nr_rings; i++) {
+               struct bnxt_cp_ring_info *cpr = &bp->bnapi[i]->cp_ring;
+               struct bnxt_ring_struct *ring = &cpr->cp_ring_struct;
+
+               ring->fw_ring_id = INVALID_HW_RING_ID;
+       }
+}
+
 static int bnxt_init_rx_rings(struct bnxt *bp)
 {
        int i, rc = 0;
@@ -4732,7 +4749,7 @@ static int bnxt_set_tpa(struct bnxt *bp, bool set_tpa)
                rc = bnxt_hwrm_vnic_set_tpa(bp, i, tpa_flags);
                if (rc) {
                        netdev_err(bp->dev, "hwrm vnic set tpa failure rc for vnic %d: %x\n",
-                                  rc, i);
+                                  i, rc);
                        return rc;
                }
        }
@@ -5006,6 +5023,7 @@ static int bnxt_shutdown_nic(struct bnxt *bp, bool irq_re_init)
 
 static int bnxt_init_nic(struct bnxt *bp, bool irq_re_init)
 {
+       bnxt_init_cp_rings(bp);
        bnxt_init_rx_rings(bp);
        bnxt_init_tx_rings(bp);
        bnxt_init_ring_grps(bp, irq_re_init);
index 9e59663..0f68118 100644 (file)
@@ -1930,13 +1930,13 @@ static void
 bfa_ioc_send_enable(struct bfa_ioc *ioc)
 {
        struct bfi_ioc_ctrl_req enable_req;
-       struct timeval tv;
 
        bfi_h2i_set(enable_req.mh, BFI_MC_IOC, BFI_IOC_H2I_ENABLE_REQ,
                    bfa_ioc_portid(ioc));
        enable_req.clscode = htons(ioc->clscode);
-       do_gettimeofday(&tv);
-       enable_req.tv_sec = ntohl(tv.tv_sec);
+       enable_req.rsvd = htons(0);
+       /* overflow in 2106 */
+       enable_req.tv_sec = ntohl(ktime_get_real_seconds());
        bfa_ioc_mbox_send(ioc, &enable_req, sizeof(struct bfi_ioc_ctrl_req));
 }
 
@@ -1947,6 +1947,10 @@ bfa_ioc_send_disable(struct bfa_ioc *ioc)
 
        bfi_h2i_set(disable_req.mh, BFI_MC_IOC, BFI_IOC_H2I_DISABLE_REQ,
                    bfa_ioc_portid(ioc));
+       disable_req.clscode = htons(ioc->clscode);
+       disable_req.rsvd = htons(0);
+       /* overflow in 2106 */
+       disable_req.tv_sec = ntohl(ktime_get_real_seconds());
        bfa_ioc_mbox_send(ioc, &disable_req, sizeof(struct bfi_ioc_ctrl_req));
 }
 
index 64a1095..a0ca68c 100644 (file)
@@ -134,6 +134,7 @@ static void set_max_bgx_per_node(struct pci_dev *pdev)
        pci_read_config_word(pdev, PCI_SUBSYSTEM_ID, &sdevid);
        switch (sdevid) {
        case PCI_SUBSYS_DEVID_81XX_BGX:
+       case PCI_SUBSYS_DEVID_81XX_RGX:
                max_bgx_per_node = MAX_BGX_PER_CN81XX;
                break;
        case PCI_SUBSYS_DEVID_83XX_BGX:
index c5080f2..6b7fe6f 100644 (file)
@@ -16,6 +16,7 @@
 /* Subsystem device IDs */
 #define PCI_SUBSYS_DEVID_88XX_BGX              0xA126
 #define PCI_SUBSYS_DEVID_81XX_BGX              0xA226
+#define PCI_SUBSYS_DEVID_81XX_RGX              0xA254
 #define PCI_SUBSYS_DEVID_83XX_BGX              0xA326
 
 #define    MAX_BGX_THUNDER                     8 /* Max 2 nodes, 4 per node */
index 30e8550..02dd524 100644 (file)
@@ -4939,8 +4939,9 @@ static int
 __be_cmd_set_logical_link_config(struct be_adapter *adapter,
                                 int link_state, int version, u8 domain)
 {
-       struct be_mcc_wrb *wrb;
        struct be_cmd_req_set_ll_link *req;
+       struct be_mcc_wrb *wrb;
+       u32 link_config = 0;
        int status;
 
        mutex_lock(&adapter->mcc_lock);
@@ -4962,10 +4963,12 @@ __be_cmd_set_logical_link_config(struct be_adapter *adapter,
 
        if (link_state == IFLA_VF_LINK_STATE_ENABLE ||
            link_state == IFLA_VF_LINK_STATE_AUTO)
-               req->link_config |= PLINK_ENABLE;
+               link_config |= PLINK_ENABLE;
 
        if (link_state == IFLA_VF_LINK_STATE_AUTO)
-               req->link_config |= PLINK_TRACK;
+               link_config |= PLINK_TRACK;
+
+       req->link_config = cpu_to_le32(link_config);
 
        status = be_mcc_notify_wait(adapter);
 err:
index 992ebe9..f819843 100644 (file)
@@ -189,11 +189,9 @@ static int nps_enet_poll(struct napi_struct *napi, int budget)
 
        nps_enet_tx_handler(ndev);
        work_done = nps_enet_rx_handler(ndev);
-       if (work_done < budget) {
+       if ((work_done < budget) && napi_complete_done(napi, work_done)) {
                u32 buf_int_enable_value = 0;
 
-               napi_complete_done(napi, work_done);
-
                /* set tx_done and rx_rdy bits */
                buf_int_enable_value |= NPS_ENET_ENABLE << RX_RDY_SHIFT;
                buf_int_enable_value |= NPS_ENET_ENABLE << TX_DONE_SHIFT;
index 928b0df..ade6b3e 100644 (file)
 #include <linux/io.h>
 #include <linux/module.h>
 #include <linux/netdevice.h>
+#include <linux/of.h>
 #include <linux/phy.h>
 #include <linux/platform_device.h>
+#include <linux/property.h>
 #include <net/ip.h>
 #include <net/ncsi.h>
 
index 3239d27..bdd8cdd 100644 (file)
@@ -82,9 +82,12 @@ void hns_mac_get_link_status(struct hns_mac_cb *mac_cb, u32 *link_status)
        else
                *link_status = 0;
 
-       ret = mac_cb->dsaf_dev->misc_op->get_sfp_prsnt(mac_cb, &sfp_prsnt);
-       if (!ret)
-               *link_status = *link_status && sfp_prsnt;
+       if (mac_cb->media_type == HNAE_MEDIA_TYPE_FIBER) {
+               ret = mac_cb->dsaf_dev->misc_op->get_sfp_prsnt(mac_cb,
+                                                              &sfp_prsnt);
+               if (!ret)
+                       *link_status = *link_status && sfp_prsnt;
+       }
 
        mac_cb->link = *link_status;
 }
@@ -855,7 +858,7 @@ static int  hns_mac_get_info(struct hns_mac_cb *mac_cb)
                of_node_put(np);
 
                np = of_parse_phandle(to_of_node(mac_cb->fw_port),
-                                       "serdes-syscon", 0);
+                                     "serdes-syscon", 0);
                syscon = syscon_node_to_regmap(np);
                of_node_put(np);
                if (IS_ERR_OR_NULL(syscon)) {
index 90dbda7..403ea9d 100644 (file)
@@ -1519,6 +1519,7 @@ static void hns_dsaf_set_mac_key(
        mac_key->high.bits.mac_3 = addr[3];
        mac_key->low.bits.mac_4 = addr[4];
        mac_key->low.bits.mac_5 = addr[5];
+       mac_key->low.bits.port_vlan = 0;
        dsaf_set_field(mac_key->low.bits.port_vlan, DSAF_TBL_TCAM_KEY_VLAN_M,
                       DSAF_TBL_TCAM_KEY_VLAN_S, vlan_id);
        dsaf_set_field(mac_key->low.bits.port_vlan, DSAF_TBL_TCAM_KEY_PORT_M,
@@ -2924,10 +2925,11 @@ void hns_dsaf_set_promisc_tcam(struct dsaf_device *dsaf_dev,
        /* find the tcam entry index for promisc */
        entry_index = dsaf_promisc_tcam_entry(port);
 
+       memset(&tbl_tcam_data, 0, sizeof(tbl_tcam_data));
+       memset(&tbl_tcam_mask, 0, sizeof(tbl_tcam_mask));
+
        /* config key mask */
        if (enable) {
-               memset(&tbl_tcam_data, 0, sizeof(tbl_tcam_data));
-               memset(&tbl_tcam_mask, 0, sizeof(tbl_tcam_mask));
                dsaf_set_field(tbl_tcam_data.low.bits.port_vlan,
                               DSAF_TBL_TCAM_KEY_PORT_M,
                               DSAF_TBL_TCAM_KEY_PORT_S, port);
index a2c22d0..e13aa06 100644 (file)
@@ -461,6 +461,32 @@ int hns_mac_get_sfp_prsnt(struct hns_mac_cb *mac_cb, int *sfp_prsnt)
        return 0;
 }
 
+int hns_mac_get_sfp_prsnt_acpi(struct hns_mac_cb *mac_cb, int *sfp_prsnt)
+{
+       union acpi_object *obj;
+       union acpi_object obj_args, argv4;
+
+       obj_args.integer.type = ACPI_TYPE_INTEGER;
+       obj_args.integer.value = mac_cb->mac_id;
+
+       argv4.type = ACPI_TYPE_PACKAGE,
+       argv4.package.count = 1,
+       argv4.package.elements = &obj_args,
+
+       obj = acpi_evaluate_dsm(ACPI_HANDLE(mac_cb->dev),
+                               hns_dsaf_acpi_dsm_uuid, 0,
+                               HNS_OP_GET_SFP_STAT_FUNC, &argv4);
+
+       if (!obj || obj->type != ACPI_TYPE_INTEGER)
+               return -ENODEV;
+
+       *sfp_prsnt = obj->integer.value;
+
+       ACPI_FREE(obj);
+
+       return 0;
+}
+
 /**
  * hns_mac_config_sds_loopback - set loop back for serdes
  * @mac_cb: mac control block
@@ -592,7 +618,7 @@ struct dsaf_misc_op *hns_misc_op_get(struct dsaf_device *dsaf_dev)
                misc_op->hns_dsaf_roce_srst = hns_dsaf_roce_srst_acpi;
 
                misc_op->get_phy_if = hns_mac_get_phy_if_acpi;
-               misc_op->get_sfp_prsnt = hns_mac_get_sfp_prsnt;
+               misc_op->get_sfp_prsnt = hns_mac_get_sfp_prsnt_acpi;
 
                misc_op->cfg_serdes_loopback = hns_mac_config_sds_loopback_acpi;
        } else {
index 2175cce..e9af89a 100644 (file)
@@ -6274,8 +6274,8 @@ static int e1000e_pm_freeze(struct device *dev)
                /* Quiesce the device without resetting the hardware */
                e1000e_down(adapter, false);
                e1000_free_irq(adapter);
-               e1000e_reset_interrupt_capability(adapter);
        }
+       e1000e_reset_interrupt_capability(adapter);
 
        /* Allow time for pending master requests to run */
        e1000e_disable_pcie_master(&adapter->hw);
index e8a8351..82a95cc 100644 (file)
@@ -4438,8 +4438,12 @@ static void i40e_napi_enable_all(struct i40e_vsi *vsi)
        if (!vsi->netdev)
                return;
 
-       for (q_idx = 0; q_idx < vsi->num_q_vectors; q_idx++)
-               napi_enable(&vsi->q_vectors[q_idx]->napi);
+       for (q_idx = 0; q_idx < vsi->num_q_vectors; q_idx++) {
+               struct i40e_q_vector *q_vector = vsi->q_vectors[q_idx];
+
+               if (q_vector->rx.ring || q_vector->tx.ring)
+                       napi_enable(&q_vector->napi);
+       }
 }
 
 /**
@@ -4453,8 +4457,12 @@ static void i40e_napi_disable_all(struct i40e_vsi *vsi)
        if (!vsi->netdev)
                return;
 
-       for (q_idx = 0; q_idx < vsi->num_q_vectors; q_idx++)
-               napi_disable(&vsi->q_vectors[q_idx]->napi);
+       for (q_idx = 0; q_idx < vsi->num_q_vectors; q_idx++) {
+               struct i40e_q_vector *q_vector = vsi->q_vectors[q_idx];
+
+               if (q_vector->rx.ring || q_vector->tx.ring)
+                       napi_disable(&q_vector->napi);
+       }
 }
 
 /**
index 9e75768..9394913 100644 (file)
@@ -613,7 +613,7 @@ static int mtk_tx_map(struct sk_buff *skb, struct net_device *dev,
        struct mtk_mac *mac = netdev_priv(dev);
        struct mtk_eth *eth = mac->hw;
        struct mtk_tx_dma *itxd, *txd;
-       struct mtk_tx_buf *tx_buf;
+       struct mtk_tx_buf *itx_buf, *tx_buf;
        dma_addr_t mapped_addr;
        unsigned int nr_frags;
        int i, n_desc = 1;
@@ -627,8 +627,8 @@ static int mtk_tx_map(struct sk_buff *skb, struct net_device *dev,
        fport = (mac->id + 1) << TX_DMA_FPORT_SHIFT;
        txd4 |= fport;
 
-       tx_buf = mtk_desc_to_tx_buf(ring, itxd);
-       memset(tx_buf, 0, sizeof(*tx_buf));
+       itx_buf = mtk_desc_to_tx_buf(ring, itxd);
+       memset(itx_buf, 0, sizeof(*itx_buf));
 
        if (gso)
                txd4 |= TX_DMA_TSO;
@@ -647,9 +647,11 @@ static int mtk_tx_map(struct sk_buff *skb, struct net_device *dev,
                return -ENOMEM;
 
        WRITE_ONCE(itxd->txd1, mapped_addr);
-       tx_buf->flags |= MTK_TX_FLAGS_SINGLE0;
-       dma_unmap_addr_set(tx_buf, dma_addr0, mapped_addr);
-       dma_unmap_len_set(tx_buf, dma_len0, skb_headlen(skb));
+       itx_buf->flags |= MTK_TX_FLAGS_SINGLE0;
+       itx_buf->flags |= (!mac->id) ? MTK_TX_FLAGS_FPORT0 :
+                         MTK_TX_FLAGS_FPORT1;
+       dma_unmap_addr_set(itx_buf, dma_addr0, mapped_addr);
+       dma_unmap_len_set(itx_buf, dma_len0, skb_headlen(skb));
 
        /* TX SG offload */
        txd = itxd;
@@ -685,11 +687,13 @@ static int mtk_tx_map(struct sk_buff *skb, struct net_device *dev,
                                               last_frag * TX_DMA_LS0));
                        WRITE_ONCE(txd->txd4, fport);
 
-                       tx_buf->skb = (struct sk_buff *)MTK_DMA_DUMMY_DESC;
                        tx_buf = mtk_desc_to_tx_buf(ring, txd);
                        memset(tx_buf, 0, sizeof(*tx_buf));
-
+                       tx_buf->skb = (struct sk_buff *)MTK_DMA_DUMMY_DESC;
                        tx_buf->flags |= MTK_TX_FLAGS_PAGE0;
+                       tx_buf->flags |= (!mac->id) ? MTK_TX_FLAGS_FPORT0 :
+                                        MTK_TX_FLAGS_FPORT1;
+
                        dma_unmap_addr_set(tx_buf, dma_addr0, mapped_addr);
                        dma_unmap_len_set(tx_buf, dma_len0, frag_map_size);
                        frag_size -= frag_map_size;
@@ -698,7 +702,7 @@ static int mtk_tx_map(struct sk_buff *skb, struct net_device *dev,
        }
 
        /* store skb to cleanup */
-       tx_buf->skb = skb;
+       itx_buf->skb = skb;
 
        WRITE_ONCE(itxd->txd4, txd4);
        WRITE_ONCE(itxd->txd3, (TX_DMA_SWC | TX_DMA_PLEN0(skb_headlen(skb)) |
@@ -1012,17 +1016,16 @@ static int mtk_poll_tx(struct mtk_eth *eth, int budget)
 
        while ((cpu != dma) && budget) {
                u32 next_cpu = desc->txd2;
-               int mac;
+               int mac = 0;
 
                desc = mtk_qdma_phys_to_virt(ring, desc->txd2);
                if ((desc->txd3 & TX_DMA_OWNER_CPU) == 0)
                        break;
 
-               mac = (desc->txd4 >> TX_DMA_FPORT_SHIFT) &
-                      TX_DMA_FPORT_MASK;
-               mac--;
-
                tx_buf = mtk_desc_to_tx_buf(ring, desc);
+               if (tx_buf->flags & MTK_TX_FLAGS_FPORT1)
+                       mac = 1;
+
                skb = tx_buf->skb;
                if (!skb) {
                        condition = 1;
index 99b1c8e..08285a9 100644 (file)
@@ -406,12 +406,18 @@ struct mtk_hw_stats {
        struct u64_stats_sync   syncp;
 };
 
-/* PDMA descriptor can point at 1-2 segments. This enum allows us to track how
- * memory was allocated so that it can be freed properly
- */
 enum mtk_tx_flags {
+       /* PDMA descriptor can point at 1-2 segments. This enum allows us to
+        * track how memory was allocated so that it can be freed properly.
+        */
        MTK_TX_FLAGS_SINGLE0    = 0x01,
        MTK_TX_FLAGS_PAGE0      = 0x02,
+
+       /* MTK_TX_FLAGS_FPORTx records which port the SKB was transmitted
+        * on, so it need not be looked up in the hardware TX descriptor.
+        */
+       MTK_TX_FLAGS_FPORT0     = 0x04,
+       MTK_TX_FLAGS_FPORT1     = 0x08,
 };
 
 /* This enum allows us to identify how the clock is defined on the array of the
index dc52053..3d9490c 100644 (file)
@@ -90,7 +90,7 @@
 #define MLX5E_VALID_NUM_MTTS(num_mtts) (MLX5_MTT_OCTW(num_mtts) - 1 <= U16_MAX)
 
 #define MLX5_UMR_ALIGN                         (2048)
-#define MLX5_MPWRQ_SMALL_PACKET_THRESHOLD      (128)
+#define MLX5_MPWRQ_SMALL_PACKET_THRESHOLD      (256)
 
 #define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ                 (64 * 1024)
 #define MLX5E_DEFAULT_LRO_TIMEOUT                       32
index d55fff0..26fc77e 100644 (file)
@@ -564,6 +564,7 @@ int mlx5e_ethtool_get_all_flows(struct mlx5e_priv *priv, struct ethtool_rxnfc *i
        int idx = 0;
        int err = 0;
 
+       info->data = MAX_NUM_OF_ETHTOOL_RULES;
        while ((!err || err == -ENOENT) && idx < info->rule_cnt) {
                err = mlx5e_ethtool_get_flow(priv, info, location);
                if (!err)
index 66c1337..15cc7b4 100644 (file)
@@ -174,7 +174,7 @@ unlock:
 
 static void mlx5e_update_sw_counters(struct mlx5e_priv *priv)
 {
-       struct mlx5e_sw_stats *s = &priv->stats.sw;
+       struct mlx5e_sw_stats temp, *s = &temp;
        struct mlx5e_rq_stats *rq_stats;
        struct mlx5e_sq_stats *sq_stats;
        u64 tx_offload_none = 0;
@@ -229,6 +229,7 @@ static void mlx5e_update_sw_counters(struct mlx5e_priv *priv)
        s->link_down_events_phy = MLX5_GET(ppcnt_reg,
                                priv->stats.pport.phy_counters,
                                counter_set.phys_layer_cntrs.link_down_events);
+       memcpy(&priv->stats.sw, s, sizeof(*s));
 }
 
 static void mlx5e_update_vport_counters(struct mlx5e_priv *priv)
@@ -243,7 +244,6 @@ static void mlx5e_update_vport_counters(struct mlx5e_priv *priv)
        MLX5_SET(query_vport_counter_in, in, op_mod, 0);
        MLX5_SET(query_vport_counter_in, in, other_vport, 0);
 
-       memset(out, 0, outlen);
        mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen);
 }
 
index fade723..5436866 100644 (file)
@@ -639,7 +639,8 @@ static int parse_cls_flower(struct mlx5e_priv *priv,
 
        if (!err && (flow->flags & MLX5E_TC_FLOW_ESWITCH) &&
            rep->vport != FDB_UPLINK_VPORT) {
-               if (min_inline > esw->offloads.inline_mode) {
+               if (esw->offloads.inline_mode != MLX5_INLINE_MODE_NONE &&
+                   esw->offloads.inline_mode < min_inline) {
                        netdev_warn(priv->netdev,
                                    "Flow is not offloaded due to min inline setting, required %d actual %d\n",
                                    min_inline, esw->offloads.inline_mode);
@@ -785,16 +786,15 @@ static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv,
        return 0;
 }
 
-static int gen_vxlan_header_ipv4(struct net_device *out_dev,
-                                char buf[],
-                                unsigned char h_dest[ETH_ALEN],
-                                int ttl,
-                                __be32 daddr,
-                                __be32 saddr,
-                                __be16 udp_dst_port,
-                                __be32 vx_vni)
+static void gen_vxlan_header_ipv4(struct net_device *out_dev,
+                                 char buf[], int encap_size,
+                                 unsigned char h_dest[ETH_ALEN],
+                                 int ttl,
+                                 __be32 daddr,
+                                 __be32 saddr,
+                                 __be16 udp_dst_port,
+                                 __be32 vx_vni)
 {
-       int encap_size = VXLAN_HLEN + sizeof(struct iphdr) + ETH_HLEN;
        struct ethhdr *eth = (struct ethhdr *)buf;
        struct iphdr  *ip = (struct iphdr *)((char *)eth + sizeof(struct ethhdr));
        struct udphdr *udp = (struct udphdr *)((char *)ip + sizeof(struct iphdr));
@@ -817,20 +817,17 @@ static int gen_vxlan_header_ipv4(struct net_device *out_dev,
        udp->dest = udp_dst_port;
        vxh->vx_flags = VXLAN_HF_VNI;
        vxh->vx_vni = vxlan_vni_field(vx_vni);
-
-       return encap_size;
 }
 
-static int gen_vxlan_header_ipv6(struct net_device *out_dev,
-                                char buf[],
-                                unsigned char h_dest[ETH_ALEN],
-                                int ttl,
-                                struct in6_addr *daddr,
-                                struct in6_addr *saddr,
-                                __be16 udp_dst_port,
-                                __be32 vx_vni)
+static void gen_vxlan_header_ipv6(struct net_device *out_dev,
+                                 char buf[], int encap_size,
+                                 unsigned char h_dest[ETH_ALEN],
+                                 int ttl,
+                                 struct in6_addr *daddr,
+                                 struct in6_addr *saddr,
+                                 __be16 udp_dst_port,
+                                 __be32 vx_vni)
 {
-       int encap_size = VXLAN_HLEN + sizeof(struct ipv6hdr) + ETH_HLEN;
        struct ethhdr *eth = (struct ethhdr *)buf;
        struct ipv6hdr *ip6h = (struct ipv6hdr *)((char *)eth + sizeof(struct ethhdr));
        struct udphdr *udp = (struct udphdr *)((char *)ip6h + sizeof(struct ipv6hdr));
@@ -852,8 +849,6 @@ static int gen_vxlan_header_ipv6(struct net_device *out_dev,
        udp->dest = udp_dst_port;
        vxh->vx_flags = VXLAN_HF_VNI;
        vxh->vx_vni = vxlan_vni_field(vx_vni);
-
-       return encap_size;
 }
 
 static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
@@ -862,13 +857,20 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
                                          struct net_device **out_dev)
 {
        int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
+       int ipv4_encap_size = ETH_HLEN + sizeof(struct iphdr) + VXLAN_HLEN;
        struct ip_tunnel_key *tun_key = &e->tun_info.key;
-       int encap_size, ttl, err;
        struct neighbour *n = NULL;
        struct flowi4 fl4 = {};
        char *encap_header;
+       int ttl, err;
+
+       if (max_encap_size < ipv4_encap_size) {
+               mlx5_core_warn(priv->mdev, "encap size %d too big, max supported is %d\n",
+                              ipv4_encap_size, max_encap_size);
+               return -EOPNOTSUPP;
+       }
 
-       encap_header = kzalloc(max_encap_size, GFP_KERNEL);
+       encap_header = kzalloc(ipv4_encap_size, GFP_KERNEL);
        if (!encap_header)
                return -ENOMEM;
 
@@ -903,11 +905,11 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
 
        switch (e->tunnel_type) {
        case MLX5_HEADER_TYPE_VXLAN:
-               encap_size = gen_vxlan_header_ipv4(*out_dev, encap_header,
-                                                  e->h_dest, ttl,
-                                                  fl4.daddr,
-                                                  fl4.saddr, tun_key->tp_dst,
-                                                  tunnel_id_to_key32(tun_key->tun_id));
+               gen_vxlan_header_ipv4(*out_dev, encap_header,
+                                     ipv4_encap_size, e->h_dest, ttl,
+                                     fl4.daddr,
+                                     fl4.saddr, tun_key->tp_dst,
+                                     tunnel_id_to_key32(tun_key->tun_id));
                break;
        default:
                err = -EOPNOTSUPP;
@@ -915,7 +917,7 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
        }
 
        err = mlx5_encap_alloc(priv->mdev, e->tunnel_type,
-                              encap_size, encap_header, &e->encap_id);
+                              ipv4_encap_size, encap_header, &e->encap_id);
 out:
        if (err && n)
                neigh_release(n);
@@ -930,13 +932,20 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
 
 {
        int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
+       int ipv6_encap_size = ETH_HLEN + sizeof(struct ipv6hdr) + VXLAN_HLEN;
        struct ip_tunnel_key *tun_key = &e->tun_info.key;
-       int encap_size, err, ttl = 0;
        struct neighbour *n = NULL;
        struct flowi6 fl6 = {};
        char *encap_header;
+       int err, ttl = 0;
+
+       if (max_encap_size < ipv6_encap_size) {
+               mlx5_core_warn(priv->mdev, "encap size %d too big, max supported is %d\n",
+                              ipv6_encap_size, max_encap_size);
+               return -EOPNOTSUPP;
+       }
 
-       encap_header = kzalloc(max_encap_size, GFP_KERNEL);
+       encap_header = kzalloc(ipv6_encap_size, GFP_KERNEL);
        if (!encap_header)
                return -ENOMEM;
 
@@ -972,11 +981,11 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
 
        switch (e->tunnel_type) {
        case MLX5_HEADER_TYPE_VXLAN:
-               encap_size = gen_vxlan_header_ipv6(*out_dev, encap_header,
-                                                  e->h_dest, ttl,
-                                                  &fl6.daddr,
-                                                  &fl6.saddr, tun_key->tp_dst,
-                                                  tunnel_id_to_key32(tun_key->tun_id));
+               gen_vxlan_header_ipv6(*out_dev, encap_header,
+                                     ipv6_encap_size, e->h_dest, ttl,
+                                     &fl6.daddr,
+                                     &fl6.saddr, tun_key->tp_dst,
+                                     tunnel_id_to_key32(tun_key->tun_id));
                break;
        default:
                err = -EOPNOTSUPP;
@@ -984,7 +993,7 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
        }
 
        err = mlx5_encap_alloc(priv->mdev, e->tunnel_type,
-                              encap_size, encap_header, &e->encap_id);
+                              ipv6_encap_size, encap_header, &e->encap_id);
 out:
        if (err && n)
                neigh_release(n);
index 307ec6c..d111ceb 100644 (file)
@@ -911,8 +911,7 @@ int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode)
        struct mlx5_core_dev *dev = devlink_priv(devlink);
        struct mlx5_eswitch *esw = dev->priv.eswitch;
        int num_vports = esw->enabled_vports;
-       int err;
-       int vport;
+       int err, vport;
        u8 mlx5_mode;
 
        if (!MLX5_CAP_GEN(dev, vport_group_manager))
@@ -921,9 +920,17 @@ int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode)
        if (esw->mode == SRIOV_NONE)
                return -EOPNOTSUPP;
 
-       if (MLX5_CAP_ETH(dev, wqe_inline_mode) !=
-           MLX5_CAP_INLINE_MODE_VPORT_CONTEXT)
+       switch (MLX5_CAP_ETH(dev, wqe_inline_mode)) {
+       case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
+               if (mode == DEVLINK_ESWITCH_INLINE_MODE_NONE)
+                       return 0;
+               /* fall through */
+       case MLX5_CAP_INLINE_MODE_L2:
+               esw_warn(dev, "Inline mode can't be set\n");
                return -EOPNOTSUPP;
+       case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
+               break;
+       }
 
        if (esw->offloads.num_flows > 0) {
                esw_warn(dev, "Can't set inline mode when flows are configured\n");
@@ -966,18 +973,14 @@ int mlx5_devlink_eswitch_inline_mode_get(struct devlink *devlink, u8 *mode)
        if (esw->mode == SRIOV_NONE)
                return -EOPNOTSUPP;
 
-       if (MLX5_CAP_ETH(dev, wqe_inline_mode) !=
-           MLX5_CAP_INLINE_MODE_VPORT_CONTEXT)
-               return -EOPNOTSUPP;
-
        return esw_inline_mode_to_devlink(esw->offloads.inline_mode, mode);
 }
 
 int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, int nvfs, u8 *mode)
 {
+       u8 prev_mlx5_mode, mlx5_mode = MLX5_INLINE_MODE_L2;
        struct mlx5_core_dev *dev = esw->dev;
        int vport;
-       u8 prev_mlx5_mode, mlx5_mode = MLX5_INLINE_MODE_L2;
 
        if (!MLX5_CAP_GEN(dev, vport_group_manager))
                return -EOPNOTSUPP;
@@ -985,10 +988,18 @@ int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, int nvfs, u8 *mode)
        if (esw->mode == SRIOV_NONE)
                return -EOPNOTSUPP;
 
-       if (MLX5_CAP_ETH(dev, wqe_inline_mode) !=
-           MLX5_CAP_INLINE_MODE_VPORT_CONTEXT)
-               return -EOPNOTSUPP;
+       switch (MLX5_CAP_ETH(dev, wqe_inline_mode)) {
+       case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
+               mlx5_mode = MLX5_INLINE_MODE_NONE;
+               goto out;
+       case MLX5_CAP_INLINE_MODE_L2:
+               mlx5_mode = MLX5_INLINE_MODE_L2;
+               goto out;
+       case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
+               goto query_vports;
+       }
 
+query_vports:
        for (vport = 1; vport <= nvfs; vport++) {
                mlx5_query_nic_vport_min_inline(dev, vport, &mlx5_mode);
                if (vport > 1 && prev_mlx5_mode != mlx5_mode)
@@ -996,6 +1007,7 @@ int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, int nvfs, u8 *mode)
                prev_mlx5_mode = mlx5_mode;
        }
 
+out:
        *mode = mlx5_mode;
        return 0;
 }
index 5595724..b5d5519 100644 (file)
@@ -294,7 +294,7 @@ static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev,
                                         struct netdev_notifier_changeupper_info *info)
 {
        struct net_device *upper = info->upper_dev, *ndev_tmp;
-       struct netdev_lag_upper_info *lag_upper_info;
+       struct netdev_lag_upper_info *lag_upper_info = NULL;
        bool is_bonded;
        int bond_status = 0;
        int num_slaves = 0;
@@ -303,7 +303,8 @@ static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev,
        if (!netif_is_lag_master(upper))
                return 0;
 
-       lag_upper_info = info->upper_info;
+       if (info->linking)
+               lag_upper_info = info->upper_info;
 
        /* The event may still be of interest if the slave does not belong to
         * us, but is enslaved to a master which has one or more of our netdevs
index 60154a1..0ad6632 100644 (file)
@@ -1029,7 +1029,7 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
        if (err) {
                dev_err(&dev->pdev->dev, "Firmware over %d MS in initializing state, aborting\n",
                        FW_INIT_TIMEOUT_MILI);
-               goto out_err;
+               goto err_cmd_cleanup;
        }
 
        err = mlx5_core_enable_hca(dev, 0);
index 2e6b0f2..222b259 100644 (file)
@@ -87,6 +87,7 @@ static void up_rel_func(struct kref *kref)
        struct mlx5_uars_page *up = container_of(kref, struct mlx5_uars_page, ref_count);
 
        list_del(&up->list);
+       iounmap(up->map);
        if (mlx5_cmd_free_uar(up->mdev, up->index))
                mlx5_core_warn(up->mdev, "failed to free uar index %d\n", up->index);
        kfree(up->reg_bitmap);
index 06c9f41..6ad44be 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/of_irq.h>
 #include <linux/crc32.h>
 #include <linux/crc32c.h>
+#include <linux/circ_buf.h>
 
 #include "moxart_ether.h"
 
@@ -278,6 +279,13 @@ rx_next:
        return rx;
 }
 
+static int moxart_tx_queue_space(struct net_device *ndev)
+{
+       struct moxart_mac_priv_t *priv = netdev_priv(ndev);
+
+       return CIRC_SPACE(priv->tx_head, priv->tx_tail, TX_DESC_NUM);
+}
+
 static void moxart_tx_finished(struct net_device *ndev)
 {
        struct moxart_mac_priv_t *priv = netdev_priv(ndev);
@@ -297,6 +305,9 @@ static void moxart_tx_finished(struct net_device *ndev)
                tx_tail = TX_NEXT(tx_tail);
        }
        priv->tx_tail = tx_tail;
+       if (netif_queue_stopped(ndev) &&
+           moxart_tx_queue_space(ndev) >= TX_WAKE_THRESHOLD)
+               netif_wake_queue(ndev);
 }
 
 static irqreturn_t moxart_mac_interrupt(int irq, void *dev_id)
@@ -324,13 +335,18 @@ static int moxart_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
        struct moxart_mac_priv_t *priv = netdev_priv(ndev);
        void *desc;
        unsigned int len;
-       unsigned int tx_head = priv->tx_head;
+       unsigned int tx_head;
        u32 txdes1;
        int ret = NETDEV_TX_BUSY;
 
+       spin_lock_irq(&priv->txlock);
+
+       tx_head = priv->tx_head;
        desc = priv->tx_desc_base + (TX_REG_DESC_SIZE * tx_head);
 
-       spin_lock_irq(&priv->txlock);
+       if (moxart_tx_queue_space(ndev) == 1)
+               netif_stop_queue(ndev);
+
        if (moxart_desc_read(desc + TX_REG_OFFSET_DESC0) & TX_DESC0_DMA_OWN) {
                net_dbg_ratelimited("no TX space for packet\n");
                priv->stats.tx_dropped++;
index 93a9563..afc32ec 100644 (file)
@@ -59,6 +59,7 @@
 #define TX_NEXT(N)             (((N) + 1) & (TX_DESC_NUM_MASK))
 #define TX_BUF_SIZE            1600
 #define TX_BUF_SIZE_MAX                (TX_DESC1_BUF_SIZE_MASK+1)
+#define TX_WAKE_THRESHOLD      16
 
 #define RX_DESC_NUM            64
 #define RX_DESC_NUM_MASK       (RX_DESC_NUM-1)
index 9179a99..a41377e 100644 (file)
@@ -3275,9 +3275,10 @@ void nfp_net_netdev_clean(struct net_device *netdev)
 {
        struct nfp_net *nn = netdev_priv(netdev);
 
+       unregister_netdev(nn->netdev);
+
        if (nn->xdp_prog)
                bpf_prog_put(nn->xdp_prog);
        if (nn->bpf_offload_xdp)
                nfp_net_xdp_offload(nn, NULL);
-       unregister_netdev(nn->netdev);
 }
index 5bd36a4..cfdadb6 100644 (file)
        ((u32)(prio_tc_tbl >> ((7 - prio) * 4)) & 0x7)
 
 static const struct qed_dcbx_app_metadata qed_dcbx_app_update[] = {
-       {DCBX_PROTOCOL_ISCSI, "ISCSI", QED_PCI_DEFAULT},
-       {DCBX_PROTOCOL_FCOE, "FCOE", QED_PCI_DEFAULT},
-       {DCBX_PROTOCOL_ROCE, "ROCE", QED_PCI_DEFAULT},
-       {DCBX_PROTOCOL_ROCE_V2, "ROCE_V2", QED_PCI_DEFAULT},
-       {DCBX_PROTOCOL_ETH, "ETH", QED_PCI_ETH}
+       {DCBX_PROTOCOL_ISCSI, "ISCSI", QED_PCI_ISCSI},
+       {DCBX_PROTOCOL_FCOE, "FCOE", QED_PCI_FCOE},
+       {DCBX_PROTOCOL_ROCE, "ROCE", QED_PCI_ETH_ROCE},
+       {DCBX_PROTOCOL_ROCE_V2, "ROCE_V2", QED_PCI_ETH_ROCE},
+       {DCBX_PROTOCOL_ETH, "ETH", QED_PCI_ETH},
 };
 
 static bool qed_dcbx_app_ethtype(u32 app_info_bitmap)
@@ -583,6 +583,13 @@ qed_dcbx_get_ets_data(struct qed_hwfn *p_hwfn,
                   p_params->ets_cbs,
                   p_ets->pri_tc_tbl[0], p_params->max_ets_tc);
 
+       if (p_params->ets_enabled && !p_params->max_ets_tc) {
+               p_params->max_ets_tc = QED_MAX_PFC_PRIORITIES;
+               DP_VERBOSE(p_hwfn, QED_MSG_DCB,
+                          "ETS params: max_ets_tc is forced to %d\n",
+               p_params->max_ets_tc);
+       }
+
        /* 8 bit tsa and bw data corresponding to each of the 8 TC's are
         * encoded in a type u32 array of size 2.
         */
@@ -1001,6 +1008,8 @@ qed_dcbx_set_pfc_data(struct qed_hwfn *p_hwfn,
        u8 pfc_map = 0;
        int i;
 
+       *pfc &= ~DCBX_PFC_ERROR_MASK;
+
        if (p_params->pfc.willing)
                *pfc |= DCBX_PFC_WILLING_MASK;
        else
@@ -1255,7 +1264,7 @@ static struct qed_dcbx_get *qed_dcbnl_get_dcbx(struct qed_hwfn *hwfn,
 {
        struct qed_dcbx_get *dcbx_info;
 
-       dcbx_info = kzalloc(sizeof(*dcbx_info), GFP_KERNEL);
+       dcbx_info = kmalloc(sizeof(*dcbx_info), GFP_ATOMIC);
        if (!dcbx_info)
                return NULL;
 
@@ -2073,6 +2082,8 @@ static int qed_dcbnl_ieee_setpfc(struct qed_dev *cdev, struct ieee_pfc *pfc)
        for (i = 0; i < QED_MAX_PFC_PRIORITIES; i++)
                dcbx_set.config.params.pfc.prio[i] = !!(pfc->pfc_en & BIT(i));
 
+       dcbx_set.config.params.pfc.max_tc = pfc->pfc_cap;
+
        ptt = qed_ptt_acquire(hwfn);
        if (!ptt)
                return -EINVAL;
index 8cfc4a5..3cd7989 100644 (file)
@@ -1516,11 +1516,12 @@ static netdev_tx_t ravb_start_xmit(struct sk_buff *skb, struct net_device *ndev)
                spin_unlock_irqrestore(&priv->lock, flags);
                return NETDEV_TX_BUSY;
        }
-       entry = priv->cur_tx[q] % (priv->num_tx_ring[q] * NUM_TX_DESC);
-       priv->tx_skb[q][entry / NUM_TX_DESC] = skb;
 
        if (skb_put_padto(skb, ETH_ZLEN))
-               goto drop;
+               goto exit;
+
+       entry = priv->cur_tx[q] % (priv->num_tx_ring[q] * NUM_TX_DESC);
+       priv->tx_skb[q][entry / NUM_TX_DESC] = skb;
 
        buffer = PTR_ALIGN(priv->tx_align[q], DPTR_ALIGN) +
                 entry / NUM_TX_DESC * DPTR_ALIGN;
index 5424877..f68c4db 100644 (file)
@@ -1127,12 +1127,70 @@ static struct mdiobb_ops bb_ops = {
        .get_mdio_data = sh_get_mdio,
 };
 
+/* free Tx skb function */
+static int sh_eth_tx_free(struct net_device *ndev, bool sent_only)
+{
+       struct sh_eth_private *mdp = netdev_priv(ndev);
+       struct sh_eth_txdesc *txdesc;
+       int free_num = 0;
+       int entry;
+       bool sent;
+
+       for (; mdp->cur_tx - mdp->dirty_tx > 0; mdp->dirty_tx++) {
+               entry = mdp->dirty_tx % mdp->num_tx_ring;
+               txdesc = &mdp->tx_ring[entry];
+               sent = !(txdesc->status & cpu_to_le32(TD_TACT));
+               if (sent_only && !sent)
+                       break;
+               /* TACT bit must be checked before all the following reads */
+               dma_rmb();
+               netif_info(mdp, tx_done, ndev,
+                          "tx entry %d status 0x%08x\n",
+                          entry, le32_to_cpu(txdesc->status));
+               /* Free the original skb. */
+               if (mdp->tx_skbuff[entry]) {
+                       dma_unmap_single(&ndev->dev, le32_to_cpu(txdesc->addr),
+                                        le32_to_cpu(txdesc->len) >> 16,
+                                        DMA_TO_DEVICE);
+                       dev_kfree_skb_irq(mdp->tx_skbuff[entry]);
+                       mdp->tx_skbuff[entry] = NULL;
+                       free_num++;
+               }
+               txdesc->status = cpu_to_le32(TD_TFP);
+               if (entry >= mdp->num_tx_ring - 1)
+                       txdesc->status |= cpu_to_le32(TD_TDLE);
+
+               if (sent) {
+                       ndev->stats.tx_packets++;
+                       ndev->stats.tx_bytes += le32_to_cpu(txdesc->len) >> 16;
+               }
+       }
+       return free_num;
+}
+
 /* free skb and descriptor buffer */
 static void sh_eth_ring_free(struct net_device *ndev)
 {
        struct sh_eth_private *mdp = netdev_priv(ndev);
        int ringsize, i;
 
+       if (mdp->rx_ring) {
+               for (i = 0; i < mdp->num_rx_ring; i++) {
+                       if (mdp->rx_skbuff[i]) {
+                               struct sh_eth_rxdesc *rxdesc = &mdp->rx_ring[i];
+
+                               dma_unmap_single(&ndev->dev,
+                                                le32_to_cpu(rxdesc->addr),
+                                                ALIGN(mdp->rx_buf_sz, 32),
+                                                DMA_FROM_DEVICE);
+                       }
+               }
+               ringsize = sizeof(struct sh_eth_rxdesc) * mdp->num_rx_ring;
+               dma_free_coherent(NULL, ringsize, mdp->rx_ring,
+                                 mdp->rx_desc_dma);
+               mdp->rx_ring = NULL;
+       }
+
        /* Free Rx skb ringbuffer */
        if (mdp->rx_skbuff) {
                for (i = 0; i < mdp->num_rx_ring; i++)
@@ -1141,27 +1199,18 @@ static void sh_eth_ring_free(struct net_device *ndev)
        kfree(mdp->rx_skbuff);
        mdp->rx_skbuff = NULL;
 
-       /* Free Tx skb ringbuffer */
-       if (mdp->tx_skbuff) {
-               for (i = 0; i < mdp->num_tx_ring; i++)
-                       dev_kfree_skb(mdp->tx_skbuff[i]);
-       }
-       kfree(mdp->tx_skbuff);
-       mdp->tx_skbuff = NULL;
-
-       if (mdp->rx_ring) {
-               ringsize = sizeof(struct sh_eth_rxdesc) * mdp->num_rx_ring;
-               dma_free_coherent(NULL, ringsize, mdp->rx_ring,
-                                 mdp->rx_desc_dma);
-               mdp->rx_ring = NULL;
-       }
-
        if (mdp->tx_ring) {
+               sh_eth_tx_free(ndev, false);
+
                ringsize = sizeof(struct sh_eth_txdesc) * mdp->num_tx_ring;
                dma_free_coherent(NULL, ringsize, mdp->tx_ring,
                                  mdp->tx_desc_dma);
                mdp->tx_ring = NULL;
        }
+
+       /* Free Tx skb ringbuffer */
+       kfree(mdp->tx_skbuff);
+       mdp->tx_skbuff = NULL;
 }
 
 /* format skb and descriptor buffer */
@@ -1409,43 +1458,6 @@ static void sh_eth_dev_exit(struct net_device *ndev)
        update_mac_address(ndev);
 }
 
-/* free Tx skb function */
-static int sh_eth_txfree(struct net_device *ndev)
-{
-       struct sh_eth_private *mdp = netdev_priv(ndev);
-       struct sh_eth_txdesc *txdesc;
-       int free_num = 0;
-       int entry;
-
-       for (; mdp->cur_tx - mdp->dirty_tx > 0; mdp->dirty_tx++) {
-               entry = mdp->dirty_tx % mdp->num_tx_ring;
-               txdesc = &mdp->tx_ring[entry];
-               if (txdesc->status & cpu_to_le32(TD_TACT))
-                       break;
-               /* TACT bit must be checked before all the following reads */
-               dma_rmb();
-               netif_info(mdp, tx_done, ndev,
-                          "tx entry %d status 0x%08x\n",
-                          entry, le32_to_cpu(txdesc->status));
-               /* Free the original skb. */
-               if (mdp->tx_skbuff[entry]) {
-                       dma_unmap_single(&ndev->dev, le32_to_cpu(txdesc->addr),
-                                        le32_to_cpu(txdesc->len) >> 16,
-                                        DMA_TO_DEVICE);
-                       dev_kfree_skb_irq(mdp->tx_skbuff[entry]);
-                       mdp->tx_skbuff[entry] = NULL;
-                       free_num++;
-               }
-               txdesc->status = cpu_to_le32(TD_TFP);
-               if (entry >= mdp->num_tx_ring - 1)
-                       txdesc->status |= cpu_to_le32(TD_TDLE);
-
-               ndev->stats.tx_packets++;
-               ndev->stats.tx_bytes += le32_to_cpu(txdesc->len) >> 16;
-       }
-       return free_num;
-}
-
 /* Packet receive function */
 static int sh_eth_rx(struct net_device *ndev, u32 intr_status, int *quota)
 {
@@ -1690,7 +1702,7 @@ static void sh_eth_error(struct net_device *ndev, u32 intr_status)
                           intr_status, mdp->cur_tx, mdp->dirty_tx,
                           (u32)ndev->state, edtrr);
                /* dirty buffer free */
-               sh_eth_txfree(ndev);
+               sh_eth_tx_free(ndev, true);
 
                /* SH7712 BUG */
                if (edtrr ^ sh_eth_get_edtrr_trns(mdp)) {
@@ -1751,7 +1763,7 @@ static irqreturn_t sh_eth_interrupt(int irq, void *netdev)
                /* Clear Tx interrupts */
                sh_eth_write(ndev, intr_status & cd->tx_check, EESR);
 
-               sh_eth_txfree(ndev);
+               sh_eth_tx_free(ndev, true);
                netif_wake_queue(ndev);
        }
 
@@ -2412,7 +2424,7 @@ static int sh_eth_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 
        spin_lock_irqsave(&mdp->lock, flags);
        if ((mdp->cur_tx - mdp->dirty_tx) >= (mdp->num_tx_ring - 4)) {
-               if (!sh_eth_txfree(ndev)) {
+               if (!sh_eth_tx_free(ndev, true)) {
                        netif_warn(mdp, tx_queued, ndev, "TxFD exhausted.\n");
                        netif_stop_queue(ndev);
                        spin_unlock_irqrestore(&mdp->lock, flags);
index 7cd76b6..2ae8524 100644 (file)
@@ -2216,18 +2216,15 @@ static int ofdpa_port_stp_update(struct ofdpa_port *ofdpa_port,
 {
        bool want[OFDPA_CTRL_MAX] = { 0, };
        bool prev_ctrls[OFDPA_CTRL_MAX];
-       u8 uninitialized_var(prev_state);
+       u8 prev_state;
        int err;
        int i;
 
-       if (switchdev_trans_ph_prepare(trans)) {
-               memcpy(prev_ctrls, ofdpa_port->ctrls, sizeof(prev_ctrls));
-               prev_state = ofdpa_port->stp_state;
-       }
-
-       if (ofdpa_port->stp_state == state)
+       prev_state = ofdpa_port->stp_state;
+       if (prev_state == state)
                return 0;
 
+       memcpy(prev_ctrls, ofdpa_port->ctrls, sizeof(prev_ctrls));
        ofdpa_port->stp_state = state;
 
        switch (state) {
index 50d2826..b9cb697 100644 (file)
@@ -1371,6 +1371,13 @@ static unsigned int efx_wanted_parallelism(struct efx_nic *efx)
                free_cpumask_var(thread_mask);
        }
 
+       if (count > EFX_MAX_RX_QUEUES) {
+               netif_cond_dbg(efx, probe, efx->net_dev, !rss_cpus, warn,
+                              "Reducing number of rx queues from %u to %u.\n",
+                              count, EFX_MAX_RX_QUEUES);
+               count = EFX_MAX_RX_QUEUES;
+       }
+
        /* If RSS is requested for the PF *and* VFs then we can't write RSS
         * table entries that are inaccessible to VFs
         */
index ee14662..a0c52e3 100644 (file)
@@ -74,7 +74,10 @@ void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue);
 #define EFX_RXQ_MIN_ENT                128U
 #define EFX_TXQ_MIN_ENT(efx)   (2 * efx_tx_max_skb_descs(efx))
 
-#define EFX_TXQ_MAX_ENT(efx)   (EFX_WORKAROUND_35388(efx) ? \
+/* All EF10 architecture NICs steal one bit of the DMAQ size for various
+ * other purposes when counting TxQ entries, so we halve the queue size.
+ */
+#define EFX_TXQ_MAX_ENT(efx)   (EFX_WORKAROUND_EF10(efx) ? \
                                 EFX_MAX_DMAQ_SIZE / 2 : EFX_MAX_DMAQ_SIZE)
 
 static inline bool efx_rss_enabled(struct efx_nic *efx)
index f5e5cd1..29614da 100644 (file)
@@ -1354,6 +1354,13 @@ static unsigned int ef4_wanted_parallelism(struct ef4_nic *efx)
                free_cpumask_var(thread_mask);
        }
 
+       if (count > EF4_MAX_RX_QUEUES) {
+               netif_cond_dbg(efx, probe, efx->net_dev, !rss_cpus, warn,
+                              "Reducing number of rx queues from %u to %u.\n",
+                              count, EF4_MAX_RX_QUEUES);
+               count = EF4_MAX_RX_QUEUES;
+       }
+
        return count;
 }
 
index 103f827..c67fa18 100644 (file)
@@ -16,6 +16,7 @@
  */
 
 #define EFX_WORKAROUND_SIENA(efx) (efx_nic_rev(efx) == EFX_REV_SIENA_A0)
+#define EFX_WORKAROUND_EF10(efx) (efx_nic_rev(efx) >= EFX_REV_HUNT_A0)
 #define EFX_WORKAROUND_10G(efx) 1
 
 /* Bit-bashed I2C reads cause performance drop */
index 9e63195..48a541e 100644 (file)
@@ -76,7 +76,7 @@ config TI_CPSW
 config TI_CPTS
        bool "TI Common Platform Time Sync (CPTS) Support"
        depends on TI_CPSW || TI_KEYSTONE_NETCP
-       depends on PTP_1588_CLOCK
+       depends on POSIX_TIMERS
        ---help---
          This driver supports the Common Platform Time Sync unit of
          the CPSW Ethernet Switch and Keystone 2 1g/10g Switch Subsystem.
@@ -87,6 +87,8 @@ config TI_CPTS_MOD
        tristate
        depends on TI_CPTS
        default y if TI_CPSW=y || TI_KEYSTONE_NETCP=y
+       select NET_PTP_CLASSIFY
+       imply PTP_1588_CLOCK
        default m
 
 config TI_KEYSTONE_NETCP
index 9f3d9c6..fa674a8 100644 (file)
@@ -1267,6 +1267,7 @@ static void soft_reset_slave(struct cpsw_slave *slave)
 static void cpsw_slave_open(struct cpsw_slave *slave, struct cpsw_priv *priv)
 {
        u32 slave_port;
+       struct phy_device *phy;
        struct cpsw_common *cpsw = priv->cpsw;
 
        soft_reset_slave(slave);
@@ -1300,27 +1301,28 @@ static void cpsw_slave_open(struct cpsw_slave *slave, struct cpsw_priv *priv)
                                   1 << slave_port, 0, 0, ALE_MCAST_FWD_2);
 
        if (slave->data->phy_node) {
-               slave->phy = of_phy_connect(priv->ndev, slave->data->phy_node,
+               phy = of_phy_connect(priv->ndev, slave->data->phy_node,
                                 &cpsw_adjust_link, 0, slave->data->phy_if);
-               if (!slave->phy) {
+               if (!phy) {
                        dev_err(priv->dev, "phy \"%s\" not found on slave %d\n",
                                slave->data->phy_node->full_name,
                                slave->slave_num);
                        return;
                }
        } else {
-               slave->phy = phy_connect(priv->ndev, slave->data->phy_id,
+               phy = phy_connect(priv->ndev, slave->data->phy_id,
                                 &cpsw_adjust_link, slave->data->phy_if);
-               if (IS_ERR(slave->phy)) {
+               if (IS_ERR(phy)) {
                        dev_err(priv->dev,
                                "phy \"%s\" not found on slave %d, err %ld\n",
                                slave->data->phy_id, slave->slave_num,
-                               PTR_ERR(slave->phy));
-                       slave->phy = NULL;
+                               PTR_ERR(phy));
                        return;
                }
        }
 
+       slave->phy = phy;
+
        phy_attached_info(slave->phy);
 
        phy_start(slave->phy);
@@ -1817,6 +1819,8 @@ static void cpsw_ndo_tx_timeout(struct net_device *ndev)
        }
 
        cpsw_intr_enable(cpsw);
+       netif_trans_update(ndev);
+       netif_tx_wake_all_queues(ndev);
 }
 
 static int cpsw_ndo_set_mac_address(struct net_device *ndev, void *p)
index a45f98f..3dadee1 100644 (file)
@@ -1017,8 +1017,8 @@ tc35815_free_queues(struct net_device *dev)
                        BUG_ON(lp->tx_skbs[i].skb != skb);
 #endif
                        if (skb) {
-                               dev_kfree_skb(skb);
                                pci_unmap_single(lp->pci_dev, lp->tx_skbs[i].skb_dma, skb->len, PCI_DMA_TODEVICE);
+                               dev_kfree_skb(skb);
                                lp->tx_skbs[i].skb = NULL;
                                lp->tx_skbs[i].skb_dma = 0;
                        }
index f9f3dba..db23cb3 100644 (file)
@@ -751,7 +751,6 @@ struct netvsc_device {
        u32 send_section_cnt;
        u32 send_section_size;
        unsigned long *send_section_map;
-       int map_words;
 
        /* Used for NetVSP initialization protocol */
        struct completion channel_init_wait;
index 8dd0b87..15ef713 100644 (file)
@@ -236,6 +236,7 @@ static int netvsc_init_buf(struct hv_device *device)
        struct netvsc_device *net_device;
        struct nvsp_message *init_packet;
        struct net_device *ndev;
+       size_t map_words;
        int node;
 
        net_device = get_outbound_net_device(device);
@@ -401,11 +402,9 @@ static int netvsc_init_buf(struct hv_device *device)
                   net_device->send_section_size, net_device->send_section_cnt);
 
        /* Setup state for managing the send buffer. */
-       net_device->map_words = DIV_ROUND_UP(net_device->send_section_cnt,
-                                            BITS_PER_LONG);
+       map_words = DIV_ROUND_UP(net_device->send_section_cnt, BITS_PER_LONG);
 
-       net_device->send_section_map = kcalloc(net_device->map_words,
-                                              sizeof(ulong), GFP_KERNEL);
+       net_device->send_section_map = kcalloc(map_words, sizeof(ulong), GFP_KERNEL);
        if (net_device->send_section_map == NULL) {
                ret = -ENOMEM;
                goto cleanup;
@@ -683,7 +682,7 @@ static u32 netvsc_get_next_send_section(struct netvsc_device *net_device)
        unsigned long *map_addr = net_device->send_section_map;
        unsigned int i;
 
-       for_each_clear_bit(i, map_addr, net_device->map_words) {
+       for_each_clear_bit(i, map_addr, net_device->send_section_cnt) {
                if (sync_test_and_set_bit(i, map_addr) == 0)
                        return i;
        }
index ffedad2..15b9200 100644 (file)
@@ -418,8 +418,9 @@ static struct vlsi_ring *vlsi_alloc_ring(struct pci_dev *pdev, struct ring_descr
                memset(rd, 0, sizeof(*rd));
                rd->hw = hwmap + i;
                rd->buf = kmalloc(len, GFP_KERNEL|GFP_DMA);
-               if (rd->buf == NULL ||
-                   !(busaddr = pci_map_single(pdev, rd->buf, len, dir))) {
+               if (rd->buf)
+                       busaddr = pci_map_single(pdev, rd->buf, len, dir);
+               if (rd->buf == NULL || pci_dma_mapping_error(pdev, busaddr)) {
                        if (rd->buf) {
                                net_err_ratelimited("%s: failed to create PCI-MAP for %p\n",
                                                    __func__, rd->buf);
@@ -430,8 +431,7 @@ static struct vlsi_ring *vlsi_alloc_ring(struct pci_dev *pdev, struct ring_descr
                                rd = r->rd + j;
                                busaddr = rd_get_addr(rd);
                                rd_set_addr_status(rd, 0, 0);
-                               if (busaddr)
-                                       pci_unmap_single(pdev, busaddr, len, dir);
+                               pci_unmap_single(pdev, busaddr, len, dir);
                                kfree(rd->buf);
                                rd->buf = NULL;
                        }
index ff0a5ed..49ce4e9 100644 (file)
@@ -617,7 +617,8 @@ static void macsec_encrypt_done(struct crypto_async_request *base, int err)
 
 static struct aead_request *macsec_alloc_req(struct crypto_aead *tfm,
                                             unsigned char **iv,
-                                            struct scatterlist **sg)
+                                            struct scatterlist **sg,
+                                            int num_frags)
 {
        size_t size, iv_offset, sg_offset;
        struct aead_request *req;
@@ -629,7 +630,7 @@ static struct aead_request *macsec_alloc_req(struct crypto_aead *tfm,
 
        size = ALIGN(size, __alignof__(struct scatterlist));
        sg_offset = size;
-       size += sizeof(struct scatterlist) * (MAX_SKB_FRAGS + 1);
+       size += sizeof(struct scatterlist) * num_frags;
 
        tmp = kmalloc(size, GFP_ATOMIC);
        if (!tmp)
@@ -649,6 +650,7 @@ static struct sk_buff *macsec_encrypt(struct sk_buff *skb,
 {
        int ret;
        struct scatterlist *sg;
+       struct sk_buff *trailer;
        unsigned char *iv;
        struct ethhdr *eth;
        struct macsec_eth_header *hh;
@@ -723,7 +725,14 @@ static struct sk_buff *macsec_encrypt(struct sk_buff *skb,
                return ERR_PTR(-EINVAL);
        }
 
-       req = macsec_alloc_req(tx_sa->key.tfm, &iv, &sg);
+       ret = skb_cow_data(skb, 0, &trailer);
+       if (unlikely(ret < 0)) {
+               macsec_txsa_put(tx_sa);
+               kfree_skb(skb);
+               return ERR_PTR(ret);
+       }
+
+       req = macsec_alloc_req(tx_sa->key.tfm, &iv, &sg, ret);
        if (!req) {
                macsec_txsa_put(tx_sa);
                kfree_skb(skb);
@@ -732,7 +741,7 @@ static struct sk_buff *macsec_encrypt(struct sk_buff *skb,
 
        macsec_fill_iv(iv, secy->sci, pn);
 
-       sg_init_table(sg, MAX_SKB_FRAGS + 1);
+       sg_init_table(sg, ret);
        skb_to_sgvec(skb, sg, 0, skb->len);
 
        if (tx_sc->encrypt) {
@@ -917,6 +926,7 @@ static struct sk_buff *macsec_decrypt(struct sk_buff *skb,
 {
        int ret;
        struct scatterlist *sg;
+       struct sk_buff *trailer;
        unsigned char *iv;
        struct aead_request *req;
        struct macsec_eth_header *hdr;
@@ -927,7 +937,12 @@ static struct sk_buff *macsec_decrypt(struct sk_buff *skb,
        if (!skb)
                return ERR_PTR(-ENOMEM);
 
-       req = macsec_alloc_req(rx_sa->key.tfm, &iv, &sg);
+       ret = skb_cow_data(skb, 0, &trailer);
+       if (unlikely(ret < 0)) {
+               kfree_skb(skb);
+               return ERR_PTR(ret);
+       }
+       req = macsec_alloc_req(rx_sa->key.tfm, &iv, &sg, ret);
        if (!req) {
                kfree_skb(skb);
                return ERR_PTR(-ENOMEM);
@@ -936,7 +951,7 @@ static struct sk_buff *macsec_decrypt(struct sk_buff *skb,
        hdr = (struct macsec_eth_header *)skb->data;
        macsec_fill_iv(iv, sci, ntohl(hdr->packet_number));
 
-       sg_init_table(sg, MAX_SKB_FRAGS + 1);
+       sg_init_table(sg, ret);
        skb_to_sgvec(skb, sg, 0, skb->len);
 
        if (hdr->tci_an & MACSEC_TCI_E) {
index 9261722..b34eaaa 100644 (file)
@@ -1139,6 +1139,7 @@ static int macvlan_port_create(struct net_device *dev)
 static void macvlan_port_destroy(struct net_device *dev)
 {
        struct macvlan_port *port = macvlan_port_get_rtnl(dev);
+       struct sk_buff *skb;
 
        dev->priv_flags &= ~IFF_MACVLAN_PORT;
        netdev_rx_handler_unregister(dev);
@@ -1147,7 +1148,15 @@ static void macvlan_port_destroy(struct net_device *dev)
         * but we need to cancel it and purge left skbs if any.
         */
        cancel_work_sync(&port->bc_work);
-       __skb_queue_purge(&port->bc_queue);
+
+       while ((skb = __skb_dequeue(&port->bc_queue))) {
+               const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src;
+
+               if (src)
+                       dev_put(src->dev);
+
+               kfree_skb(skb);
+       }
 
        kfree(port);
 }
index e2460a5..ed0d10f 100644 (file)
@@ -1438,8 +1438,6 @@ static bool dp83640_rxtstamp(struct phy_device *phydev,
                skb_info->tmo = jiffies + SKB_TIMESTAMP_TIMEOUT;
                skb_queue_tail(&dp83640->rx_queue, skb);
                schedule_delayed_work(&dp83640->ts_work, SKB_TIMESTAMP_TIMEOUT);
-       } else {
-               netif_rx_ni(skb);
        }
 
        return true;
index 6b988f7..61941e2 100644 (file)
@@ -84,3 +84,4 @@ int mdiobus_register_board_info(const struct mdio_board_info *info,
 
        return 0;
 }
+EXPORT_SYMBOL(mdiobus_register_board_info);
index 6742070..da5b392 100644 (file)
@@ -297,17 +297,6 @@ static int kszphy_config_init(struct phy_device *phydev)
        if (priv->led_mode >= 0)
                kszphy_setup_led(phydev, type->led_mode_reg, priv->led_mode);
 
-       if (phy_interrupt_is_valid(phydev)) {
-               int ctl = phy_read(phydev, MII_BMCR);
-
-               if (ctl < 0)
-                       return ctl;
-
-               ret = phy_write(phydev, MII_BMCR, ctl & ~BMCR_ANENABLE);
-               if (ret < 0)
-                       return ret;
-       }
-
        return 0;
 }
 
@@ -798,9 +787,6 @@ static struct phy_driver ksphy_driver[] = {
        .read_status    = genphy_read_status,
        .ack_interrupt  = kszphy_ack_interrupt,
        .config_intr    = kszphy_config_intr,
-       .get_sset_count = kszphy_get_sset_count,
-       .get_strings    = kszphy_get_strings,
-       .get_stats      = kszphy_get_stats,
        .suspend        = genphy_suspend,
        .resume         = genphy_resume,
 }, {
@@ -940,9 +926,6 @@ static struct phy_driver ksphy_driver[] = {
        .read_status    = genphy_read_status,
        .ack_interrupt  = kszphy_ack_interrupt,
        .config_intr    = kszphy_config_intr,
-       .get_sset_count = kszphy_get_sset_count,
-       .get_strings    = kszphy_get_strings,
-       .get_stats      = kszphy_get_stats,
        .suspend        = genphy_suspend,
        .resume         = genphy_resume,
 }, {
@@ -952,6 +935,7 @@ static struct phy_driver ksphy_driver[] = {
        .features       = PHY_GBIT_FEATURES,
        .flags          = PHY_HAS_MAGICANEG | PHY_HAS_INTERRUPT,
        .driver_data    = &ksz9021_type,
+       .probe          = kszphy_probe,
        .config_init    = ksz9021_config_init,
        .config_aneg    = genphy_config_aneg,
        .read_status    = genphy_read_status,
@@ -971,6 +955,7 @@ static struct phy_driver ksphy_driver[] = {
        .features       = PHY_GBIT_FEATURES,
        .flags          = PHY_HAS_MAGICANEG | PHY_HAS_INTERRUPT,
        .driver_data    = &ksz9021_type,
+       .probe          = kszphy_probe,
        .config_init    = ksz9031_config_init,
        .config_aneg    = genphy_config_aneg,
        .read_status    = ksz9031_read_status,
@@ -989,9 +974,6 @@ static struct phy_driver ksphy_driver[] = {
        .config_init    = kszphy_config_init,
        .config_aneg    = ksz8873mll_config_aneg,
        .read_status    = ksz8873mll_read_status,
-       .get_sset_count = kszphy_get_sset_count,
-       .get_strings    = kszphy_get_strings,
-       .get_stats      = kszphy_get_stats,
        .suspend        = genphy_suspend,
        .resume         = genphy_resume,
 }, {
@@ -1003,9 +985,6 @@ static struct phy_driver ksphy_driver[] = {
        .config_init    = kszphy_config_init,
        .config_aneg    = genphy_config_aneg,
        .read_status    = genphy_read_status,
-       .get_sset_count = kszphy_get_sset_count,
-       .get_strings    = kszphy_get_strings,
-       .get_stats      = kszphy_get_stats,
        .suspend        = genphy_suspend,
        .resume         = genphy_resume,
 }, {
@@ -1017,9 +996,6 @@ static struct phy_driver ksphy_driver[] = {
        .config_init    = kszphy_config_init,
        .config_aneg    = ksz8873mll_config_aneg,
        .read_status    = ksz8873mll_read_status,
-       .get_sset_count = kszphy_get_sset_count,
-       .get_strings    = kszphy_get_strings,
-       .get_stats      = kszphy_get_stats,
        .suspend        = genphy_suspend,
        .resume         = genphy_resume,
 } };
index 1be69d8..97ff127 100644 (file)
@@ -591,16 +591,18 @@ int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd)
 EXPORT_SYMBOL(phy_mii_ioctl);
 
 /**
- * phy_start_aneg - start auto-negotiation for this PHY device
+ * phy_start_aneg_priv - start auto-negotiation for this PHY device
  * @phydev: the phy_device struct
+ * @sync: indicate whether we should wait for the workqueue cancelation
  *
  * Description: Sanitizes the settings (if we're not autonegotiating
  *   them), and then calls the driver's config_aneg function.
  *   If the PHYCONTROL Layer is operating, we change the state to
  *   reflect the beginning of Auto-negotiation or forcing.
  */
-int phy_start_aneg(struct phy_device *phydev)
+static int phy_start_aneg_priv(struct phy_device *phydev, bool sync)
 {
+       bool trigger = 0;
        int err;
 
        if (!phydev->drv)
@@ -628,10 +630,40 @@ int phy_start_aneg(struct phy_device *phydev)
                }
        }
 
+       /* Re-schedule a PHY state machine to check PHY status because
+        * negotiation may already be done and aneg interrupt may not be
+        * generated.
+        */
+       if (phy_interrupt_is_valid(phydev) && (phydev->state == PHY_AN)) {
+               err = phy_aneg_done(phydev);
+               if (err > 0) {
+                       trigger = true;
+                       err = 0;
+               }
+       }
+
 out_unlock:
        mutex_unlock(&phydev->lock);
+
+       if (trigger)
+               phy_trigger_machine(phydev, sync);
+
        return err;
 }
+
+/**
+ * phy_start_aneg - start auto-negotiation for this PHY device
+ * @phydev: the phy_device struct
+ *
+ * Description: Sanitizes the settings (if we're not autonegotiating
+ *   them), and then calls the driver's config_aneg function.
+ *   If the PHYCONTROL Layer is operating, we change the state to
+ *   reflect the beginning of Auto-negotiation or forcing.
+ */
+int phy_start_aneg(struct phy_device *phydev)
+{
+       return phy_start_aneg_priv(phydev, true);
+}
 EXPORT_SYMBOL(phy_start_aneg);
 
 /**
@@ -659,7 +691,7 @@ void phy_start_machine(struct phy_device *phydev)
  *   state machine runs.
  */
 
-static void phy_trigger_machine(struct phy_device *phydev, bool sync)
+void phy_trigger_machine(struct phy_device *phydev, bool sync)
 {
        if (sync)
                cancel_delayed_work_sync(&phydev->state_queue);
@@ -681,7 +713,7 @@ void phy_stop_machine(struct phy_device *phydev)
        cancel_delayed_work_sync(&phydev->state_queue);
 
        mutex_lock(&phydev->lock);
-       if (phydev->state > PHY_UP)
+       if (phydev->state > PHY_UP && phydev->state != PHY_HALTED)
                phydev->state = PHY_UP;
        mutex_unlock(&phydev->lock);
 }
@@ -1154,7 +1186,7 @@ void phy_state_machine(struct work_struct *work)
        mutex_unlock(&phydev->lock);
 
        if (needs_aneg)
-               err = phy_start_aneg(phydev);
+               err = phy_start_aneg_priv(phydev, false);
        else if (do_suspend)
                phy_suspend(phydev);
 
index 1b52520..85c0124 100644 (file)
@@ -990,7 +990,7 @@ static void team_port_disable(struct team *team,
 #define TEAM_ENC_FEATURES      (NETIF_F_HW_CSUM | NETIF_F_SG | \
                                 NETIF_F_RXCSUM | NETIF_F_ALL_TSO)
 
-static void ___team_compute_features(struct team *team)
+static void __team_compute_features(struct team *team)
 {
        struct team_port *port;
        u32 vlan_features = TEAM_VLAN_FEATURES & NETIF_F_ALL_FOR_ALL;
@@ -1023,16 +1023,10 @@ static void ___team_compute_features(struct team *team)
                team->dev->priv_flags |= IFF_XMIT_DST_RELEASE;
 }
 
-static void __team_compute_features(struct team *team)
-{
-       ___team_compute_features(team);
-       netdev_change_features(team->dev);
-}
-
 static void team_compute_features(struct team *team)
 {
        mutex_lock(&team->lock);
-       ___team_compute_features(team);
+       __team_compute_features(team);
        mutex_unlock(&team->lock);
        netdev_change_features(team->dev);
 }
@@ -1641,6 +1635,7 @@ static void team_uninit(struct net_device *dev)
        team_notify_peers_fini(team);
        team_queue_override_fini(team);
        mutex_unlock(&team->lock);
+       netdev_change_features(dev);
 }
 
 static void team_destructor(struct net_device *dev)
@@ -1928,6 +1923,10 @@ static int team_add_slave(struct net_device *dev, struct net_device *port_dev)
        mutex_lock(&team->lock);
        err = team_port_add(team, port_dev);
        mutex_unlock(&team->lock);
+
+       if (!err)
+               netdev_change_features(dev);
+
        return err;
 }
 
@@ -1939,6 +1938,10 @@ static int team_del_slave(struct net_device *dev, struct net_device *port_dev)
        mutex_lock(&team->lock);
        err = team_port_del(team, port_dev);
        mutex_unlock(&team->lock);
+
+       if (!err)
+               netdev_change_features(dev);
+
        return err;
 }
 
@@ -2358,8 +2361,10 @@ start_again:
 
        hdr = genlmsg_put(skb, portid, seq, &team_nl_family, flags | NLM_F_MULTI,
                          TEAM_CMD_OPTIONS_GET);
-       if (!hdr)
+       if (!hdr) {
+               nlmsg_free(skb);
                return -EMSGSIZE;
+       }
 
        if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex))
                goto nla_put_failure;
@@ -2631,8 +2636,10 @@ start_again:
 
        hdr = genlmsg_put(skb, portid, seq, &team_nl_family, flags | NLM_F_MULTI,
                          TEAM_CMD_PORT_LIST_GET);
-       if (!hdr)
+       if (!hdr) {
+               nlmsg_free(skb);
                return -EMSGSIZE;
+       }
 
        if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex))
                goto nla_put_failure;
index 3dd490f..f28bd74 100644 (file)
@@ -369,7 +369,7 @@ config USB_NET_NET1080
          optionally with LEDs that indicate traffic
 
 config USB_NET_PLUSB
-       tristate "Prolific PL-2301/2302/25A1 based cables"
+       tristate "Prolific PL-2301/2302/25A1/27A1 based cables"
        # if the handshake/init/reset problems, from original 'plusb',
        # are ever resolved ... then remove "experimental"
        depends on USB_USBNET
index f5552aa..f3ae88f 100644 (file)
@@ -532,6 +532,7 @@ static const struct driver_info wwan_info = {
 #define LENOVO_VENDOR_ID       0x17ef
 #define NVIDIA_VENDOR_ID       0x0955
 #define HP_VENDOR_ID           0x03f0
+#define MICROSOFT_VENDOR_ID    0x045e
 
 static const struct usb_device_id      products[] = {
 /* BLACKLIST !!
@@ -761,6 +762,20 @@ static const struct usb_device_id  products[] = {
        .driver_info = 0,
 },
 
+/* Microsoft Surface 2 dock (based on Realtek RTL8152) */
+{
+       USB_DEVICE_AND_INTERFACE_INFO(MICROSOFT_VENDOR_ID, 0x07ab, USB_CLASS_COMM,
+                       USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE),
+       .driver_info = 0,
+},
+
+/* Microsoft Surface 3 dock (based on Realtek RTL8153) */
+{
+       USB_DEVICE_AND_INTERFACE_INFO(MICROSOFT_VENDOR_ID, 0x07c6, USB_CLASS_COMM,
+                       USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE),
+       .driver_info = 0,
+},
+
 /* WHITELIST!!!
  *
  * CDC Ether uses two interfaces, not necessarily consecutive.
index 8a40202..c4f1c36 100644 (file)
@@ -254,14 +254,9 @@ static struct sk_buff *ch9200_tx_fixup(struct usbnet *dev, struct sk_buff *skb,
        tx_overhead = 0x40;
 
        len = skb->len;
-       if (skb_headroom(skb) < tx_overhead) {
-               struct sk_buff *skb2;
-
-               skb2 = skb_copy_expand(skb, tx_overhead, 0, flags);
+       if (skb_cow_head(skb, tx_overhead)) {
                dev_kfree_skb_any(skb);
-               skb = skb2;
-               if (!skb)
-                       return NULL;
+               return NULL;
        }
 
        __skb_push(skb, tx_overhead);
index e221bfc..947bea8 100644 (file)
@@ -293,12 +293,9 @@ static struct sk_buff *cx82310_tx_fixup(struct usbnet *dev, struct sk_buff *skb,
 {
        int len = skb->len;
 
-       if (skb_headroom(skb) < 2) {
-               struct sk_buff *skb2 = skb_copy_expand(skb, 2, 0, flags);
+       if (skb_cow_head(skb, 2)) {
                dev_kfree_skb_any(skb);
-               skb = skb2;
-               if (!skb)
-                       return NULL;
+               return NULL;
        }
        skb_push(skb, 2);
 
index 4f2e814..00067a0 100644 (file)
@@ -2534,13 +2534,6 @@ static struct hso_device *hso_create_net_device(struct usb_interface *interface,
        SET_NETDEV_DEV(net, &interface->dev);
        SET_NETDEV_DEVTYPE(net, &hso_type);
 
-       /* registering our net device */
-       result = register_netdev(net);
-       if (result) {
-               dev_err(&interface->dev, "Failed to register device\n");
-               goto exit;
-       }
-
        /* start allocating */
        for (i = 0; i < MUX_BULK_RX_BUF_COUNT; i++) {
                hso_net->mux_bulk_rx_urb_pool[i] = usb_alloc_urb(0, GFP_KERNEL);
@@ -2560,6 +2553,13 @@ static struct hso_device *hso_create_net_device(struct usb_interface *interface,
 
        add_net_device(hso_dev);
 
+       /* registering our net device */
+       result = register_netdev(net);
+       if (result) {
+               dev_err(&interface->dev, "Failed to register device\n");
+               goto exit;
+       }
+
        hso_log_port(hso_dev);
 
        hso_create_rfkill(hso_dev, interface);
@@ -3279,9 +3279,9 @@ static void __exit hso_exit(void)
        pr_info("unloaded\n");
 
        tty_unregister_driver(tty_drv);
-       put_tty_driver(tty_drv);
        /* deregister the usb driver */
        usb_deregister(&hso_driver);
+       put_tty_driver(tty_drv);
 }
 
 /* Module definitions */
index 876f02f..2a2c3ed 100644 (file)
@@ -803,18 +803,12 @@ static netdev_tx_t kaweth_start_xmit(struct sk_buff *skb,
        }
 
        /* We now decide whether we can put our special header into the sk_buff */
-       if (skb_cloned(skb) || skb_headroom(skb) < 2) {
-               /* no such luck - we make our own */
-               struct sk_buff *copied_skb;
-               copied_skb = skb_copy_expand(skb, 2, 0, GFP_ATOMIC);
-               dev_kfree_skb_irq(skb);
-               skb = copied_skb;
-               if (!copied_skb) {
-                       kaweth->stats.tx_errors++;
-                       netif_start_queue(net);
-                       spin_unlock_irq(&kaweth->device_lock);
-                       return NETDEV_TX_OK;
-               }
+       if (skb_cow_head(skb, 2)) {
+               kaweth->stats.tx_errors++;
+               netif_start_queue(net);
+               spin_unlock_irq(&kaweth->device_lock);
+               dev_kfree_skb_any(skb);
+               return NETDEV_TX_OK;
        }
 
        private_header = (__le16 *)__skb_push(skb, 2);
index 9889a70..636f48f 100644 (file)
@@ -2607,14 +2607,9 @@ static struct sk_buff *lan78xx_tx_prep(struct lan78xx_net *dev,
 {
        u32 tx_cmd_a, tx_cmd_b;
 
-       if (skb_headroom(skb) < TX_OVERHEAD) {
-               struct sk_buff *skb2;
-
-               skb2 = skb_copy_expand(skb, TX_OVERHEAD, 0, flags);
+       if (skb_cow_head(skb, TX_OVERHEAD)) {
                dev_kfree_skb_any(skb);
-               skb = skb2;
-               if (!skb)
-                       return NULL;
+               return NULL;
        }
 
        if (lan78xx_linearize(skb) < 0)
index 22e1a9a..6fe5937 100644 (file)
@@ -102,7 +102,7 @@ static int pl_reset(struct usbnet *dev)
 }
 
 static const struct driver_info        prolific_info = {
-       .description =  "Prolific PL-2301/PL-2302/PL-25A1",
+       .description =  "Prolific PL-2301/PL-2302/PL-25A1/PL-27A1",
        .flags =        FLAG_POINTTOPOINT | FLAG_NO_SETINT,
                /* some PL-2302 versions seem to fail usb_set_interface() */
        .reset =        pl_reset,
@@ -139,6 +139,17 @@ static const struct usb_device_id  products [] = {
                                         * Host-to-Host Cable
                                         */
        .driver_info =  (unsigned long) &prolific_info,
+
+},
+
+/* super speed cables */
+{
+       USB_DEVICE(0x067b, 0x27a1),     /* PL-27A1, no eeprom
+                                        * also: goobay Active USB 3.0
+                                        * Data Link,
+                                        * Unitek Y-3501
+                                        */
+       .driver_info =  (unsigned long) &prolific_info,
 },
 
        { },            // END
@@ -158,5 +169,5 @@ static struct usb_driver plusb_driver = {
 module_usb_driver(plusb_driver);
 
 MODULE_AUTHOR("David Brownell");
-MODULE_DESCRIPTION("Prolific PL-2301/2302/25A1 USB Host to Host Link Driver");
+MODULE_DESCRIPTION("Prolific PL-2301/2302/25A1/27A1 USB Host to Host Link Driver");
 MODULE_LICENSE("GPL");
index 156f7f8..2474618 100644 (file)
@@ -908,7 +908,7 @@ static const struct usb_device_id products[] = {
        {QMI_FIXED_INTF(0x2357, 0x9000, 4)},    /* TP-LINK MA260 */
        {QMI_QUIRK_SET_DTR(0x1bc7, 0x1040, 2)}, /* Telit LE922A */
        {QMI_FIXED_INTF(0x1bc7, 0x1200, 5)},    /* Telit LE920 */
-       {QMI_FIXED_INTF(0x1bc7, 0x1201, 2)},    /* Telit LE920 */
+       {QMI_QUIRK_SET_DTR(0x1bc7, 0x1201, 2)}, /* Telit LE920, LE920A4 */
        {QMI_FIXED_INTF(0x1c9e, 0x9b01, 3)},    /* XS Stick W100-2 from 4G Systems */
        {QMI_FIXED_INTF(0x0b3c, 0xc000, 4)},    /* Olivetti Olicard 100 */
        {QMI_FIXED_INTF(0x0b3c, 0xc001, 4)},    /* Olivetti Olicard 120 */
index 0b1b918..07f788c 100644 (file)
@@ -517,6 +517,7 @@ enum rtl8152_flags {
 
 /* Define these values to match your device */
 #define VENDOR_ID_REALTEK              0x0bda
+#define VENDOR_ID_MICROSOFT            0x045e
 #define VENDOR_ID_SAMSUNG              0x04e8
 #define VENDOR_ID_LENOVO               0x17ef
 #define VENDOR_ID_NVIDIA               0x0955
@@ -1294,6 +1295,7 @@ static void intr_callback(struct urb *urb)
                }
        } else {
                if (netif_carrier_ok(tp->netdev)) {
+                       netif_stop_queue(tp->netdev);
                        set_bit(RTL8152_LINK_CHG, &tp->flags);
                        schedule_delayed_work(&tp->schedule, 0);
                }
@@ -3169,6 +3171,9 @@ static void set_carrier(struct r8152 *tp)
                        napi_enable(&tp->napi);
                        netif_wake_queue(netdev);
                        netif_info(tp, link, netdev, "carrier on\n");
+               } else if (netif_queue_stopped(netdev) &&
+                          skb_queue_len(&tp->tx_queue) < tp->tx_qlen) {
+                       netif_wake_queue(netdev);
                }
        } else {
                if (netif_carrier_ok(netdev)) {
@@ -3702,8 +3707,18 @@ static int rtl8152_resume(struct usb_interface *intf)
                        tp->rtl_ops.autosuspend_en(tp, false);
                        napi_disable(&tp->napi);
                        set_bit(WORK_ENABLE, &tp->flags);
-                       if (netif_carrier_ok(tp->netdev))
-                               rtl_start_rx(tp);
+
+                       if (netif_carrier_ok(tp->netdev)) {
+                               if (rtl8152_get_speed(tp) & LINK_STATUS) {
+                                       rtl_start_rx(tp);
+                               } else {
+                                       netif_carrier_off(tp->netdev);
+                                       tp->rtl_ops.disable(tp);
+                                       netif_info(tp, link, tp->netdev,
+                                                  "linking down\n");
+                               }
+                       }
+
                        napi_enable(&tp->napi);
                        clear_bit(SELECTIVE_SUSPEND, &tp->flags);
                        smp_mb__after_atomic();
@@ -4507,6 +4522,8 @@ static void rtl8152_disconnect(struct usb_interface *intf)
 static struct usb_device_id rtl8152_table[] = {
        {REALTEK_USB_DEVICE(VENDOR_ID_REALTEK, 0x8152)},
        {REALTEK_USB_DEVICE(VENDOR_ID_REALTEK, 0x8153)},
+       {REALTEK_USB_DEVICE(VENDOR_ID_MICROSOFT, 0x07ab)},
+       {REALTEK_USB_DEVICE(VENDOR_ID_MICROSOFT, 0x07c6)},
        {REALTEK_USB_DEVICE(VENDOR_ID_SAMSUNG, 0xa101)},
        {REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x304f)},
        {REALTEK_USB_DEVICE(VENDOR_ID_LENOVO,  0x3062)},
index 0b17b40..190de9a 100644 (file)
@@ -2203,13 +2203,9 @@ static struct sk_buff *smsc75xx_tx_fixup(struct usbnet *dev,
 {
        u32 tx_cmd_a, tx_cmd_b;
 
-       if (skb_headroom(skb) < SMSC75XX_TX_OVERHEAD) {
-               struct sk_buff *skb2 =
-                       skb_copy_expand(skb, SMSC75XX_TX_OVERHEAD, 0, flags);
+       if (skb_cow_head(skb, SMSC75XX_TX_OVERHEAD)) {
                dev_kfree_skb_any(skb);
-               skb = skb2;
-               if (!skb)
-                       return NULL;
+               return NULL;
        }
 
        tx_cmd_a = (u32)(skb->len & TX_CMD_A_LEN) | TX_CMD_A_FCS;
index 831aa33..5f19fb0 100644 (file)
@@ -2001,13 +2001,13 @@ static struct sk_buff *smsc95xx_tx_fixup(struct usbnet *dev,
        /* We do not advertise SG, so skbs should be already linearized */
        BUG_ON(skb_shinfo(skb)->nr_frags);
 
-       if (skb_headroom(skb) < overhead) {
-               struct sk_buff *skb2 = skb_copy_expand(skb,
-                       overhead, 0, flags);
+       /* Make writable and expand header space by overhead if required */
+       if (skb_cow_head(skb, overhead)) {
+               /* Must deallocate here as returning NULL to indicate error
+                * means the skb won't be deallocated in the caller.
+                */
                dev_kfree_skb_any(skb);
-               skb = skb2;
-               if (!skb)
-                       return NULL;
+               return NULL;
        }
 
        if (csum) {
index 4a1e9c4..aadfe1d 100644 (file)
@@ -456,14 +456,9 @@ static struct sk_buff *sr9700_tx_fixup(struct usbnet *dev, struct sk_buff *skb,
 
        len = skb->len;
 
-       if (skb_headroom(skb) < SR_TX_OVERHEAD) {
-               struct sk_buff *skb2;
-
-               skb2 = skb_copy_expand(skb, SR_TX_OVERHEAD, 0, flags);
+       if (skb_cow_head(skb, SR_TX_OVERHEAD)) {
                dev_kfree_skb_any(skb);
-               skb = skb2;
-               if (!skb)
-                       return NULL;
+               return NULL;
        }
 
        __skb_push(skb, SR_TX_OVERHEAD);
index 3de65ea..4532448 100644 (file)
@@ -1929,7 +1929,7 @@ static int __usbnet_read_cmd(struct usbnet *dev, u8 cmd, u8 reqtype,
                   " value=0x%04x index=0x%04x size=%d\n",
                   cmd, reqtype, value, index, size);
 
-       if (data) {
+       if (size) {
                buf = kmalloc(size, GFP_KERNEL);
                if (!buf)
                        goto out;
@@ -1938,8 +1938,13 @@ static int __usbnet_read_cmd(struct usbnet *dev, u8 cmd, u8 reqtype,
        err = usb_control_msg(dev->udev, usb_rcvctrlpipe(dev->udev, 0),
                              cmd, reqtype, value, index, buf, size,
                              USB_CTRL_GET_TIMEOUT);
-       if (err > 0 && err <= size)
-               memcpy(data, buf, err);
+       if (err > 0 && err <= size) {
+        if (data)
+            memcpy(data, buf, err);
+        else
+            netdev_dbg(dev->net,
+                "Huh? Data requested but thrown away.\n");
+    }
        kfree(buf);
 out:
        return err;
@@ -1960,7 +1965,13 @@ static int __usbnet_write_cmd(struct usbnet *dev, u8 cmd, u8 reqtype,
                buf = kmemdup(data, size, GFP_KERNEL);
                if (!buf)
                        goto out;
-       }
+       } else {
+        if (size) {
+            WARN_ON_ONCE(1);
+            err = -EINVAL;
+            goto out;
+        }
+    }
 
        err = usb_control_msg(dev->udev, usb_sndctrlpipe(dev->udev, 0),
                              cmd, reqtype, value, index, buf, size,
index ea9890d..f365846 100644 (file)
@@ -2230,14 +2230,8 @@ static bool virtnet_validate_features(struct virtio_device *vdev)
 #define MIN_MTU ETH_MIN_MTU
 #define MAX_MTU ETH_MAX_MTU
 
-static int virtnet_probe(struct virtio_device *vdev)
+static int virtnet_validate(struct virtio_device *vdev)
 {
-       int i, err;
-       struct net_device *dev;
-       struct virtnet_info *vi;
-       u16 max_queue_pairs;
-       int mtu;
-
        if (!vdev->config->get) {
                dev_err(&vdev->dev, "%s failure: config access disabled\n",
                        __func__);
@@ -2247,6 +2241,25 @@ static int virtnet_probe(struct virtio_device *vdev)
        if (!virtnet_validate_features(vdev))
                return -EINVAL;
 
+       if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
+               int mtu = virtio_cread16(vdev,
+                                        offsetof(struct virtio_net_config,
+                                                 mtu));
+               if (mtu < MIN_MTU)
+                       __virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
+       }
+
+       return 0;
+}
+
+static int virtnet_probe(struct virtio_device *vdev)
+{
+       int i, err;
+       struct net_device *dev;
+       struct virtnet_info *vi;
+       u16 max_queue_pairs;
+       int mtu;
+
        /* Find if host supports multiqueue virtio_net device */
        err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
                                   struct virtio_net_config,
@@ -2362,11 +2375,20 @@ static int virtnet_probe(struct virtio_device *vdev)
                                     offsetof(struct virtio_net_config,
                                              mtu));
                if (mtu < dev->min_mtu) {
-                       __virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
-               } else {
-                       dev->mtu = mtu;
-                       dev->max_mtu = mtu;
+                       /* Should never trigger: MTU was previously validated
+                        * in virtnet_validate.
+                        */
+                       dev_err(&vdev->dev, "device MTU appears to have changed "
+                               "it is now %d < %d", mtu, dev->min_mtu);
+                       goto free_stats;
                }
+
+               dev->mtu = mtu;
+               dev->max_mtu = mtu;
+
+               /* TODO: size buffers correctly in this case. */
+               if (dev->mtu > ETH_DATA_LEN)
+                       vi->big_packets = true;
        }
 
        if (vi->any_header_sg)
@@ -2544,6 +2566,7 @@ static struct virtio_driver virtio_net_driver = {
        .driver.name =  KBUILD_MODNAME,
        .driver.owner = THIS_MODULE,
        .id_table =     id_table,
+       .validate =     virtnet_validate,
        .probe =        virtnet_probe,
        .remove =       virtnet_remove,
        .config_changed = virtnet_config_changed,
index d6988db..7d909c8 100644 (file)
@@ -1128,7 +1128,7 @@ static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it)
                goto nla_put_failure;
 
        /* rule only needs to appear once */
-       nlh->nlmsg_flags &= NLM_F_EXCL;
+       nlh->nlmsg_flags |= NLM_F_EXCL;
 
        frh = nlmsg_data(nlh);
        memset(frh, 0, sizeof(*frh));
index de19c7c..85d949e 100644 (file)
@@ -2238,14 +2238,16 @@ int brcmf_p2p_del_vif(struct wiphy *wiphy, struct wireless_dev *wdev)
        struct brcmf_cfg80211_info *cfg = wiphy_priv(wiphy);
        struct brcmf_p2p_info *p2p = &cfg->p2p;
        struct brcmf_cfg80211_vif *vif;
+       enum nl80211_iftype iftype;
        bool wait_for_disable = false;
        int err;
 
        brcmf_dbg(TRACE, "delete P2P vif\n");
        vif = container_of(wdev, struct brcmf_cfg80211_vif, wdev);
 
+       iftype = vif->wdev.iftype;
        brcmf_cfg80211_arm_vif_event(cfg, vif);
-       switch (vif->wdev.iftype) {
+       switch (iftype) {
        case NL80211_IFTYPE_P2P_CLIENT:
                if (test_bit(BRCMF_VIF_STATUS_DISCONNECTING, &vif->sme_state))
                        wait_for_disable = true;
@@ -2275,7 +2277,7 @@ int brcmf_p2p_del_vif(struct wiphy *wiphy, struct wireless_dev *wdev)
                                            BRCMF_P2P_DISABLE_TIMEOUT);
 
        err = 0;
-       if (vif->wdev.iftype != NL80211_IFTYPE_P2P_DEVICE) {
+       if (iftype != NL80211_IFTYPE_P2P_DEVICE) {
                brcmf_vif_clear_mgmt_ies(vif);
                err = brcmf_p2p_release_p2p_if(vif);
        }
@@ -2291,7 +2293,7 @@ int brcmf_p2p_del_vif(struct wiphy *wiphy, struct wireless_dev *wdev)
        brcmf_remove_interface(vif->ifp, true);
 
        brcmf_cfg80211_arm_vif_event(cfg, NULL);
-       if (vif->wdev.iftype != NL80211_IFTYPE_P2P_DEVICE)
+       if (iftype != NL80211_IFTYPE_P2P_DEVICE)
                p2p->bss_idx[P2PAPI_BSSCFG_CONNECTION].vif = NULL;
 
        return err;
index a260cd5..077bfd8 100644 (file)
@@ -1056,6 +1056,8 @@ static ssize_t iwl_dbgfs_fw_dbg_collect_write(struct iwl_mvm *mvm,
 
        if (ret)
                return ret;
+       if (count == 0)
+               return 0;
 
        iwl_mvm_fw_dbg_collect(mvm, FW_DBG_TRIGGER_USER, buf,
                               (count - 1), NULL);
index 99132ea..c5734e1 100644 (file)
@@ -216,7 +216,8 @@ u32 iwl_mvm_mac_get_queues_mask(struct ieee80211_vif *vif)
                        qmask |= BIT(vif->hw_queue[ac]);
        }
 
-       if (vif->type == NL80211_IFTYPE_AP)
+       if (vif->type == NL80211_IFTYPE_AP ||
+           vif->type == NL80211_IFTYPE_ADHOC)
                qmask |= BIT(vif->cab_queue);
 
        return qmask;
index 6927cae..486dcce 100644 (file)
@@ -2401,7 +2401,7 @@ void iwl_mvm_sta_pm_notif(struct iwl_mvm *mvm, struct iwl_rx_cmd_buffer *rxb)
                return;
 
        rcu_read_lock();
-       sta = mvm->fw_id_to_mac_id[notif->sta_id];
+       sta = rcu_dereference(mvm->fw_id_to_mac_id[notif->sta_id]);
        if (WARN_ON(IS_ERR_OR_NULL(sta))) {
                rcu_read_unlock();
                return;
index b51a285..9d28db7 100644 (file)
@@ -1806,7 +1806,8 @@ int iwl_mvm_send_add_bcast_sta(struct iwl_mvm *mvm, struct ieee80211_vif *vif)
                        iwl_mvm_get_wd_timeout(mvm, vif, false, false);
                int queue;
 
-               if (vif->type == NL80211_IFTYPE_AP)
+               if (vif->type == NL80211_IFTYPE_AP ||
+                   vif->type == NL80211_IFTYPE_ADHOC)
                        queue = IWL_MVM_DQA_AP_PROBE_RESP_QUEUE;
                else if (vif->type == NL80211_IFTYPE_P2P_DEVICE)
                        queue = IWL_MVM_DQA_P2P_DEVICE_QUEUE;
@@ -1837,7 +1838,8 @@ int iwl_mvm_send_add_bcast_sta(struct iwl_mvm *mvm, struct ieee80211_vif *vif)
         * enabled-cab_queue to the mask)
         */
        if (iwl_mvm_is_dqa_supported(mvm) &&
-           vif->type == NL80211_IFTYPE_AP) {
+           (vif->type == NL80211_IFTYPE_AP ||
+            vif->type == NL80211_IFTYPE_ADHOC)) {
                struct iwl_trans_txq_scd_cfg cfg = {
                        .fifo = IWL_MVM_TX_FIFO_MCAST,
                        .sta_id = mvmvif->bcast_sta.sta_id,
@@ -1862,7 +1864,8 @@ static void iwl_mvm_free_bcast_sta_queues(struct iwl_mvm *mvm,
 
        lockdep_assert_held(&mvm->mutex);
 
-       if (vif->type == NL80211_IFTYPE_AP)
+       if (vif->type == NL80211_IFTYPE_AP ||
+           vif->type == NL80211_IFTYPE_ADHOC)
                iwl_mvm_disable_txq(mvm, vif->cab_queue, vif->cab_queue,
                                    IWL_MAX_TID_COUNT, 0);
 
index 3f37075..1ba0a6f 100644 (file)
@@ -506,6 +506,7 @@ static int iwl_mvm_get_ctrl_vif_queue(struct iwl_mvm *mvm,
 
        switch (info->control.vif->type) {
        case NL80211_IFTYPE_AP:
+       case NL80211_IFTYPE_ADHOC:
                /*
                 * Handle legacy hostapd as well, where station may be added
                 * only after assoc. Take care of the case where we send a
@@ -517,7 +518,8 @@ static int iwl_mvm_get_ctrl_vif_queue(struct iwl_mvm *mvm,
                if (info->hw_queue == info->control.vif->cab_queue)
                        return info->hw_queue;
 
-               WARN_ONCE(1, "fc=0x%02x", le16_to_cpu(fc));
+               WARN_ONCE(info->control.vif->type != NL80211_IFTYPE_ADHOC,
+                         "fc=0x%02x", le16_to_cpu(fc));
                return IWL_MVM_DQA_AP_PROBE_RESP_QUEUE;
        case NL80211_IFTYPE_P2P_DEVICE:
                if (ieee80211_is_mgmt(fc))
@@ -584,7 +586,8 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb)
                        iwl_mvm_vif_from_mac80211(info.control.vif);
 
                if (info.control.vif->type == NL80211_IFTYPE_P2P_DEVICE ||
-                   info.control.vif->type == NL80211_IFTYPE_AP) {
+                   info.control.vif->type == NL80211_IFTYPE_AP ||
+                   info.control.vif->type == NL80211_IFTYPE_ADHOC) {
                        sta_id = mvmvif->bcast_sta.sta_id;
                        queue = iwl_mvm_get_ctrl_vif_queue(mvm, &info,
                                                           hdr->frame_control);
index caea350..bdc3791 100644 (file)
@@ -1742,12 +1742,14 @@ void rtl_c2hcmd_enqueue(struct ieee80211_hw *hw, u8 tag, u8 len, u8 *val)
        unsigned long flags;
        struct rtl_c2hcmd *c2hcmd;
 
-       c2hcmd = kmalloc(sizeof(*c2hcmd), GFP_KERNEL);
+       c2hcmd = kmalloc(sizeof(*c2hcmd),
+                        in_interrupt() ? GFP_ATOMIC : GFP_KERNEL);
 
        if (!c2hcmd)
                goto label_err;
 
-       c2hcmd->val = kmalloc(len, GFP_KERNEL);
+       c2hcmd->val = kmalloc(len,
+                             in_interrupt() ? GFP_ATOMIC : GFP_KERNEL);
 
        if (!c2hcmd->val)
                goto label_err2;
index 23d4a17..351bac8 100644 (file)
@@ -934,8 +934,14 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
        rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, NULL);
        if (rc < 0)
                goto out_unlock;
+       nvdimm_bus_unlock(&nvdimm_bus->dev);
+
        if (copy_to_user(p, buf, buf_len))
                rc = -EFAULT;
+
+       vfree(buf);
+       return rc;
+
  out_unlock:
        nvdimm_bus_unlock(&nvdimm_bus->dev);
  out:
index b3323c0..ca6d572 100644 (file)
@@ -243,7 +243,15 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
        }
 
        if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
-               if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)) {
+               /*
+                * FIXME: nsio_rw_bytes() may be called from atomic
+                * context in the btt case and nvdimm_clear_poison()
+                * takes a sleeping lock. Until the locking can be
+                * reworked this capability requires that the namespace
+                * is not claimed by btt.
+                */
+               if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)
+                               && (!ndns->claim || !is_nd_btt(ndns->claim))) {
                        long cleared;
 
                        cleared = nvdimm_clear_poison(&ndns->dev, offset, size);
index 0eedc49..8b72132 100644 (file)
@@ -395,7 +395,7 @@ EXPORT_SYMBOL_GPL(nvdimm_create);
 
 int alias_dpa_busy(struct device *dev, void *data)
 {
-       resource_size_t map_end, blk_start, new, busy;
+       resource_size_t map_end, blk_start, new;
        struct blk_alloc_info *info = data;
        struct nd_mapping *nd_mapping;
        struct nd_region *nd_region;
@@ -436,29 +436,19 @@ int alias_dpa_busy(struct device *dev, void *data)
  retry:
        /*
         * Find the free dpa from the end of the last pmem allocation to
-        * the end of the interleave-set mapping that is not already
-        * covered by a blk allocation.
+        * the end of the interleave-set mapping.
         */
-       busy = 0;
        for_each_dpa_resource(ndd, res) {
+               if (strncmp(res->name, "pmem", 4) != 0)
+                       continue;
                if ((res->start >= blk_start && res->start < map_end)
                                || (res->end >= blk_start
                                        && res->end <= map_end)) {
-                       if (strncmp(res->name, "pmem", 4) == 0) {
-                               new = max(blk_start, min(map_end + 1,
-                                                       res->end + 1));
-                               if (new != blk_start) {
-                                       blk_start = new;
-                                       goto retry;
-                               }
-                       } else
-                               busy += min(map_end, res->end)
-                                       - max(nd_mapping->start, res->start) + 1;
-               } else if (nd_mapping->start > res->start
-                               && map_end < res->end) {
-                       /* total eclipse of the PMEM region mapping */
-                       busy += nd_mapping->size;
-                       break;
+                       new = max(blk_start, min(map_end + 1, res->end + 1));
+                       if (new != blk_start) {
+                               blk_start = new;
+                               goto retry;
+                       }
                }
        }
 
@@ -470,52 +460,11 @@ int alias_dpa_busy(struct device *dev, void *data)
                return 1;
        }
 
-       info->available -= blk_start - nd_mapping->start + busy;
+       info->available -= blk_start - nd_mapping->start;
 
        return 0;
 }
 
-static int blk_dpa_busy(struct device *dev, void *data)
-{
-       struct blk_alloc_info *info = data;
-       struct nd_mapping *nd_mapping;
-       struct nd_region *nd_region;
-       resource_size_t map_end;
-       int i;
-
-       if (!is_nd_pmem(dev))
-               return 0;
-
-       nd_region = to_nd_region(dev);
-       for (i = 0; i < nd_region->ndr_mappings; i++) {
-               nd_mapping  = &nd_region->mapping[i];
-               if (nd_mapping->nvdimm == info->nd_mapping->nvdimm)
-                       break;
-       }
-
-       if (i >= nd_region->ndr_mappings)
-               return 0;
-
-       map_end = nd_mapping->start + nd_mapping->size - 1;
-       if (info->res->start >= nd_mapping->start
-                       && info->res->start < map_end) {
-               if (info->res->end <= map_end) {
-                       info->busy = 0;
-                       return 1;
-               } else {
-                       info->busy -= info->res->end - map_end;
-                       return 0;
-               }
-       } else if (info->res->end >= nd_mapping->start
-                       && info->res->end <= map_end) {
-               info->busy -= nd_mapping->start - info->res->start;
-               return 0;
-       } else {
-               info->busy -= nd_mapping->size;
-               return 0;
-       }
-}
-
 /**
  * nd_blk_available_dpa - account the unused dpa of BLK region
  * @nd_mapping: container of dpa-resource-root + labels
@@ -545,11 +494,7 @@ resource_size_t nd_blk_available_dpa(struct nd_region *nd_region)
        for_each_dpa_resource(ndd, res) {
                if (strncmp(res->name, "blk", 3) != 0)
                        continue;
-
-               info.res = res;
-               info.busy = resource_size(res);
-               device_for_each_child(&nvdimm_bus->dev, &info, blk_dpa_busy);
-               info.available -= info.busy;
+               info.available -= resource_size(res);
        }
 
        return info.available;
index 9b3b57f..d5e0906 100644 (file)
@@ -49,10 +49,9 @@ unsigned char shutdown_timeout = 5;
 module_param(shutdown_timeout, byte, 0644);
 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
 
-unsigned int nvme_max_retries = 5;
-module_param_named(max_retries, nvme_max_retries, uint, 0644);
+static u8 nvme_max_retries = 5;
+module_param_named(max_retries, nvme_max_retries, byte, 0644);
 MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
-EXPORT_SYMBOL_GPL(nvme_max_retries);
 
 static int nvme_char_major;
 module_param(nvme_char_major, int, 0);
@@ -62,11 +61,66 @@ module_param(default_ps_max_latency_us, ulong, 0644);
 MODULE_PARM_DESC(default_ps_max_latency_us,
                 "max power saving latency for new devices; use PM QOS to change per device");
 
+static bool force_apst;
+module_param(force_apst, bool, 0644);
+MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
+
 static LIST_HEAD(nvme_ctrl_list);
 static DEFINE_SPINLOCK(dev_list_lock);
 
 static struct class *nvme_class;
 
+static int nvme_error_status(struct request *req)
+{
+       switch (nvme_req(req)->status & 0x7ff) {
+       case NVME_SC_SUCCESS:
+               return 0;
+       case NVME_SC_CAP_EXCEEDED:
+               return -ENOSPC;
+       default:
+               return -EIO;
+
+       /*
+        * XXX: these errors are a nasty side-band protocol to
+        * drivers/md/dm-mpath.c:noretry_error() that aren't documented
+        * anywhere..
+        */
+       case NVME_SC_CMD_SEQ_ERROR:
+               return -EILSEQ;
+       case NVME_SC_ONCS_NOT_SUPPORTED:
+               return -EOPNOTSUPP;
+       case NVME_SC_WRITE_FAULT:
+       case NVME_SC_READ_ERROR:
+       case NVME_SC_UNWRITTEN_BLOCK:
+               return -ENODATA;
+       }
+}
+
+static inline bool nvme_req_needs_retry(struct request *req)
+{
+       if (blk_noretry_request(req))
+               return false;
+       if (nvme_req(req)->status & NVME_SC_DNR)
+               return false;
+       if (jiffies - req->start_time >= req->timeout)
+               return false;
+       if (nvme_req(req)->retries >= nvme_max_retries)
+               return false;
+       return true;
+}
+
+void nvme_complete_rq(struct request *req)
+{
+       if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
+               nvme_req(req)->retries++;
+               blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q));
+               return;
+       }
+
+       blk_mq_end_request(req, nvme_error_status(req));
+}
+EXPORT_SYMBOL_GPL(nvme_complete_rq);
+
 void nvme_cancel_request(struct request *req, void *data, bool reserved)
 {
        int status;
@@ -80,7 +134,9 @@ void nvme_cancel_request(struct request *req, void *data, bool reserved)
        status = NVME_SC_ABORT_REQ;
        if (blk_queue_dying(req->q))
                status |= NVME_SC_DNR;
-       blk_mq_complete_request(req, status);
+       nvme_req(req)->status = status;
+       blk_mq_complete_request(req);
+
 }
 EXPORT_SYMBOL_GPL(nvme_cancel_request);
 
@@ -205,12 +261,6 @@ fail:
        return NULL;
 }
 
-void nvme_requeue_req(struct request *req)
-{
-       blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q));
-}
-EXPORT_SYMBOL_GPL(nvme_requeue_req);
-
 struct request *nvme_alloc_request(struct request_queue *q,
                struct nvme_command *cmd, unsigned int flags, int qid)
 {
@@ -270,7 +320,7 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req,
        memset(cmnd, 0, sizeof(*cmnd));
        cmnd->dsm.opcode = nvme_cmd_dsm;
        cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
-       cmnd->dsm.nr = segments - 1;
+       cmnd->dsm.nr = cpu_to_le32(segments - 1);
        cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
 
        req->special_vec.bv_page = virt_to_page(range);
@@ -327,6 +377,12 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 {
        int ret = BLK_MQ_RQ_QUEUE_OK;
 
+       if (!(req->rq_flags & RQF_DONTPREP)) {
+               nvme_req(req)->retries = 0;
+               nvme_req(req)->flags = 0;
+               req->rq_flags |= RQF_DONTPREP;
+       }
+
        switch (req_op(req)) {
        case REQ_OP_DRV_IN:
        case REQ_OP_DRV_OUT:
@@ -335,6 +391,8 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
        case REQ_OP_FLUSH:
                nvme_setup_flush(ns, cmd);
                break;
+       case REQ_OP_WRITE_ZEROES:
+               /* currently only aliased to deallocate for a few ctrls: */
        case REQ_OP_DISCARD:
                ret = nvme_setup_discard(ns, req, cmd);
                break;
@@ -378,7 +436,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
        blk_execute_rq(req->q, NULL, req, at_head);
        if (result)
                *result = nvme_req(req)->result;
-       ret = req->errors;
+       if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+               ret = -EINTR;
+       else
+               ret = nvme_req(req)->status;
  out:
        blk_mq_free_request(req);
        return ret;
@@ -463,7 +524,10 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
        }
  submit:
        blk_execute_rq(req->q, disk, req, 0);
-       ret = req->errors;
+       if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+               ret = -EINTR;
+       else
+               ret = nvme_req(req)->status;
        if (result)
                *result = le32_to_cpu(nvme_req(req)->result.u32);
        if (meta && !ret && !write) {
@@ -900,16 +964,14 @@ static void nvme_config_discard(struct nvme_ns *ns)
        BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
                        NVME_DSM_MAX_RANGES);
 
-       if (ctrl->quirks & NVME_QUIRK_DISCARD_ZEROES)
-               ns->queue->limits.discard_zeroes_data = 1;
-       else
-               ns->queue->limits.discard_zeroes_data = 0;
-
        ns->queue->limits.discard_alignment = logical_block_size;
        ns->queue->limits.discard_granularity = logical_block_size;
        blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
        blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
+
+       if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
+               blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX);
 }
 
 static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id)
@@ -1267,7 +1329,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
         * heuristic: we are willing to spend at most 2% of the time
         * transitioning between power states.  Therefore, when running
         * in any given state, we will enter the next lower-power
-        * non-operational state after waiting 100 * (enlat + exlat)
+        * non-operational state after waiting 50 * (enlat + exlat)
         * microseconds, as long as that state's total latency is under
         * the requested maximum latency.
         *
@@ -1278,6 +1340,8 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
 
        unsigned apste;
        struct nvme_feat_auto_pst *table;
+       u64 max_lat_us = 0;
+       int max_ps = -1;
        int ret;
 
        /*
@@ -1299,6 +1363,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
        if (ctrl->ps_max_latency_us == 0) {
                /* Turn off APST. */
                apste = 0;
+               dev_dbg(ctrl->device, "APST disabled\n");
        } else {
                __le64 target = cpu_to_le64(0);
                int state;
@@ -1316,6 +1381,14 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
                                table->entries[state] = target;
 
                        /*
+                        * Don't allow transitions to the deepest state
+                        * if it's quirked off.
+                        */
+                       if (state == ctrl->npss &&
+                           (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
+                               continue;
+
+                       /*
                         * Is this state a useful non-operational state for
                         * higher-power states to autonomously transition to?
                         */
@@ -1340,9 +1413,22 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
 
                        target = cpu_to_le64((state << 3) |
                                             (transition_ms << 8));
+
+                       if (max_ps == -1)
+                               max_ps = state;
+
+                       if (total_latency_us > max_lat_us)
+                               max_lat_us = total_latency_us;
                }
 
                apste = 1;
+
+               if (max_ps == -1) {
+                       dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
+               } else {
+                       dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
+                               max_ps, max_lat_us, (int)sizeof(*table), table);
+               }
        }
 
        ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
@@ -1387,16 +1473,15 @@ struct nvme_core_quirk_entry {
 };
 
 static const struct nvme_core_quirk_entry core_quirks[] = {
-       /*
-        * Seen on a Samsung "SM951 NVMe SAMSUNG 256GB": using APST causes
-        * the controller to go out to lunch.  It dies when the watchdog
-        * timer reads CSTS and gets 0xffffffff.
-        */
        {
-               .vid = 0x144d,
-               .fr = "BXW75D0Q",
+               /*
+                * This Toshiba device seems to die using any APST states.  See:
+                * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
+                */
+               .vid = 0x1179,
+               .mn = "THNSF5256GPUK TOSHIBA",
                .quirks = NVME_QUIRK_NO_APST,
-       },
+       }
 };
 
 /* match is null-terminated but idstr is space-padded. */
@@ -1481,6 +1566,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
                }
        }
 
+       if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
+               dev_warn(ctrl->dev, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
+               ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
+       }
+
        ctrl->oacs = le16_to_cpu(id->oacs);
        ctrl->vid = le16_to_cpu(id->vid);
        ctrl->oncs = le16_to_cpup(&id->oncs);
@@ -1503,7 +1593,16 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 
        ctrl->npss = id->npss;
        prev_apsta = ctrl->apsta;
-       ctrl->apsta = (ctrl->quirks & NVME_QUIRK_NO_APST) ? 0 : id->apsta;
+       if (ctrl->quirks & NVME_QUIRK_NO_APST) {
+               if (force_apst && id->apsta) {
+                       dev_warn(ctrl->dev, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
+                       ctrl->apsta = 1;
+               } else {
+                       ctrl->apsta = 0;
+               }
+       } else {
+               ctrl->apsta = id->apsta;
+       }
        memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
 
        if (ctrl->ops->is_fabrics) {
@@ -2386,7 +2485,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
 
        mutex_lock(&ctrl->namespaces_mutex);
        list_for_each_entry(ns, &ctrl->namespaces, list)
-               blk_mq_freeze_queue_start(ns->queue);
+               blk_freeze_queue_start(ns->queue);
        mutex_unlock(&ctrl->namespaces_mutex);
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
index 5b7386f..990e6fb 100644 (file)
@@ -471,6 +471,16 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
 }
 EXPORT_SYMBOL_GPL(nvmf_connect_io_queue);
 
+bool nvmf_should_reconnect(struct nvme_ctrl *ctrl)
+{
+       if (ctrl->opts->max_reconnects != -1 &&
+           ctrl->opts->nr_reconnects < ctrl->opts->max_reconnects)
+               return true;
+
+       return false;
+}
+EXPORT_SYMBOL_GPL(nvmf_should_reconnect);
+
 /**
  * nvmf_register_transport() - NVMe Fabrics Library registration function.
  * @ops:       Transport ops instance to be registered to the
@@ -533,6 +543,7 @@ static const match_table_t opt_tokens = {
        { NVMF_OPT_QUEUE_SIZE,          "queue_size=%d"         },
        { NVMF_OPT_NR_IO_QUEUES,        "nr_io_queues=%d"       },
        { NVMF_OPT_RECONNECT_DELAY,     "reconnect_delay=%d"    },
+       { NVMF_OPT_CTRL_LOSS_TMO,       "ctrl_loss_tmo=%d"      },
        { NVMF_OPT_KATO,                "keep_alive_tmo=%d"     },
        { NVMF_OPT_HOSTNQN,             "hostnqn=%s"            },
        { NVMF_OPT_HOST_TRADDR,         "host_traddr=%s"        },
@@ -546,6 +557,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
        char *options, *o, *p;
        int token, ret = 0;
        size_t nqnlen  = 0;
+       int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO;
 
        /* Set defaults */
        opts->queue_size = NVMF_DEF_QUEUE_SIZE;
@@ -655,6 +667,16 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
                        }
                        opts->kato = token;
                        break;
+               case NVMF_OPT_CTRL_LOSS_TMO:
+                       if (match_int(args, &token)) {
+                               ret = -EINVAL;
+                               goto out;
+                       }
+
+                       if (token < 0)
+                               pr_warn("ctrl_loss_tmo < 0 will reconnect forever\n");
+                       ctrl_loss_tmo = token;
+                       break;
                case NVMF_OPT_HOSTNQN:
                        if (opts->host) {
                                pr_err("hostnqn already user-assigned: %s\n",
@@ -710,6 +732,12 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
                }
        }
 
+       if (ctrl_loss_tmo < 0)
+               opts->max_reconnects = -1;
+       else
+               opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
+                                               opts->reconnect_delay);
+
        if (!opts->host) {
                kref_get(&nvmf_default_host->ref);
                opts->host = nvmf_default_host;
index 1560181..f5a9c1f 100644 (file)
@@ -21,6 +21,8 @@
 #define NVMF_MAX_QUEUE_SIZE    1024
 #define NVMF_DEF_QUEUE_SIZE    128
 #define NVMF_DEF_RECONNECT_DELAY       10
+/* default to 600 seconds of reconnect attempts before giving up */
+#define NVMF_DEF_CTRL_LOSS_TMO         600
 
 /*
  * Define a host as seen by the target.  We allocate one at boot, but also
@@ -53,6 +55,7 @@ enum {
        NVMF_OPT_HOSTNQN        = 1 << 8,
        NVMF_OPT_RECONNECT_DELAY = 1 << 9,
        NVMF_OPT_HOST_TRADDR    = 1 << 10,
+       NVMF_OPT_CTRL_LOSS_TMO  = 1 << 11,
 };
 
 /**
@@ -77,6 +80,10 @@ enum {
  * @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN.
  * @kato:      Keep-alive timeout.
  * @host:      Virtual NVMe host, contains the NQN and Host ID.
+ * @nr_reconnects: number of reconnect attempted since the last ctrl failure
+ * @max_reconnects: maximum number of allowed reconnect attempts before removing
+ *              the controller, (-1) means reconnect forever, zero means remove
+ *              immediately;
  */
 struct nvmf_ctrl_options {
        unsigned                mask;
@@ -91,6 +98,8 @@ struct nvmf_ctrl_options {
        bool                    discovery_nqn;
        unsigned int            kato;
        struct nvmf_host        *host;
+       int                     nr_reconnects;
+       int                     max_reconnects;
 };
 
 /*
@@ -133,5 +142,6 @@ void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
 void nvmf_free_options(struct nvmf_ctrl_options *opts);
 const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl);
 int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
+bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
 
 #endif /* _NVME_FABRICS_H */
index 9690beb..4976db5 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/parser.h>
 #include <uapi/scsi/fc/fc_fs.h>
 #include <uapi/scsi/fc/fc_els.h>
+#include <linux/delay.h>
 
 #include "nvme.h"
 #include "fabrics.h"
@@ -44,6 +45,8 @@ enum nvme_fc_queue_flags {
 
 #define NVMEFC_QUEUE_DELAY     3               /* ms units */
 
+#define NVME_FC_MAX_CONNECT_ATTEMPTS   1
+
 struct nvme_fc_queue {
        struct nvme_fc_ctrl     *ctrl;
        struct device           *dev;
@@ -61,16 +64,24 @@ struct nvme_fc_queue {
        unsigned long           flags;
 } __aligned(sizeof(u64));      /* alignment for other things alloc'd with */
 
+enum nvme_fcop_flags {
+       FCOP_FLAGS_TERMIO       = (1 << 0),
+       FCOP_FLAGS_RELEASED     = (1 << 1),
+       FCOP_FLAGS_COMPLETE     = (1 << 2),
+       FCOP_FLAGS_AEN          = (1 << 3),
+};
+
 struct nvmefc_ls_req_op {
        struct nvmefc_ls_req    ls_req;
 
-       struct nvme_fc_ctrl     *ctrl;
+       struct nvme_fc_rport    *rport;
        struct nvme_fc_queue    *queue;
        struct request          *rq;
+       u32                     flags;
 
        int                     ls_error;
        struct completion       ls_done;
-       struct list_head        lsreq_list;     /* ctrl->ls_req_list */
+       struct list_head        lsreq_list;     /* rport->ls_req_list */
        bool                    req_queued;
 };
 
@@ -79,6 +90,7 @@ enum nvme_fcpop_state {
        FCPOP_STATE_IDLE        = 1,
        FCPOP_STATE_ACTIVE      = 2,
        FCPOP_STATE_ABORTED     = 3,
+       FCPOP_STATE_COMPLETE    = 4,
 };
 
 struct nvme_fc_fcp_op {
@@ -97,6 +109,7 @@ struct nvme_fc_fcp_op {
        struct request          *rq;
 
        atomic_t                state;
+       u32                     flags;
        u32                     rqno;
        u32                     nents;
 
@@ -120,23 +133,24 @@ struct nvme_fc_rport {
 
        struct list_head                endp_list; /* for lport->endp_list */
        struct list_head                ctrl_list;
+       struct list_head                ls_req_list;
+       struct device                   *dev;   /* physical device for dma */
+       struct nvme_fc_lport            *lport;
        spinlock_t                      lock;
        struct kref                     ref;
 } __aligned(sizeof(u64));      /* alignment for other things alloc'd with */
 
-enum nvme_fcctrl_state {
-       FCCTRL_INIT             = 0,
-       FCCTRL_ACTIVE           = 1,
+enum nvme_fcctrl_flags {
+       FCCTRL_TERMIO           = (1 << 0),
 };
 
 struct nvme_fc_ctrl {
        spinlock_t              lock;
        struct nvme_fc_queue    *queues;
-       u32                     queue_count;
-
        struct device           *dev;
        struct nvme_fc_lport    *lport;
        struct nvme_fc_rport    *rport;
+       u32                     queue_count;
        u32                     cnum;
 
        u64                     association_id;
@@ -144,14 +158,19 @@ struct nvme_fc_ctrl {
        u64                     cap;
 
        struct list_head        ctrl_list;      /* rport->ctrl_list */
-       struct list_head        ls_req_list;
 
        struct blk_mq_tag_set   admin_tag_set;
        struct blk_mq_tag_set   tag_set;
 
        struct work_struct      delete_work;
+       struct work_struct      reset_work;
+       struct delayed_work     connect_work;
+       int                     reconnect_delay;
+       int                     connect_attempts;
+
        struct kref             ref;
-       int                     state;
+       u32                     flags;
+       u32                     iocnt;
 
        struct nvme_fc_fcp_op   aen_ops[NVME_FC_NR_AEN_COMMANDS];
 
@@ -419,9 +438,12 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport,
 
        INIT_LIST_HEAD(&newrec->endp_list);
        INIT_LIST_HEAD(&newrec->ctrl_list);
+       INIT_LIST_HEAD(&newrec->ls_req_list);
        kref_init(&newrec->ref);
        spin_lock_init(&newrec->lock);
        newrec->remoteport.localport = &lport->localport;
+       newrec->dev = lport->dev;
+       newrec->lport = lport;
        newrec->remoteport.private = &newrec[1];
        newrec->remoteport.port_role = pinfo->port_role;
        newrec->remoteport.node_name = pinfo->node_name;
@@ -444,7 +466,6 @@ out_kfree_rport:
 out_reghost_failed:
        *portptr = NULL;
        return ret;
-
 }
 EXPORT_SYMBOL_GPL(nvme_fc_register_remoteport);
 
@@ -487,6 +508,30 @@ nvme_fc_rport_get(struct nvme_fc_rport *rport)
        return kref_get_unless_zero(&rport->ref);
 }
 
+static int
+nvme_fc_abort_lsops(struct nvme_fc_rport *rport)
+{
+       struct nvmefc_ls_req_op *lsop;
+       unsigned long flags;
+
+restart:
+       spin_lock_irqsave(&rport->lock, flags);
+
+       list_for_each_entry(lsop, &rport->ls_req_list, lsreq_list) {
+               if (!(lsop->flags & FCOP_FLAGS_TERMIO)) {
+                       lsop->flags |= FCOP_FLAGS_TERMIO;
+                       spin_unlock_irqrestore(&rport->lock, flags);
+                       rport->lport->ops->ls_abort(&rport->lport->localport,
+                                               &rport->remoteport,
+                                               &lsop->ls_req);
+                       goto restart;
+               }
+       }
+       spin_unlock_irqrestore(&rport->lock, flags);
+
+       return 0;
+}
+
 /**
  * nvme_fc_unregister_remoteport - transport entry point called by an
  *                              LLDD to deregister/remove a previously
@@ -522,6 +567,8 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr)
 
        spin_unlock_irqrestore(&rport->lock, flags);
 
+       nvme_fc_abort_lsops(rport);
+
        nvme_fc_rport_put(rport);
        return 0;
 }
@@ -624,16 +671,16 @@ static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *);
 
 
 static void
-__nvme_fc_finish_ls_req(struct nvme_fc_ctrl *ctrl,
-               struct nvmefc_ls_req_op *lsop)
+__nvme_fc_finish_ls_req(struct nvmefc_ls_req_op *lsop)
 {
+       struct nvme_fc_rport *rport = lsop->rport;
        struct nvmefc_ls_req *lsreq = &lsop->ls_req;
        unsigned long flags;
 
-       spin_lock_irqsave(&ctrl->lock, flags);
+       spin_lock_irqsave(&rport->lock, flags);
 
        if (!lsop->req_queued) {
-               spin_unlock_irqrestore(&ctrl->lock, flags);
+               spin_unlock_irqrestore(&rport->lock, flags);
                return;
        }
 
@@ -641,56 +688,71 @@ __nvme_fc_finish_ls_req(struct nvme_fc_ctrl *ctrl,
 
        lsop->req_queued = false;
 
-       spin_unlock_irqrestore(&ctrl->lock, flags);
+       spin_unlock_irqrestore(&rport->lock, flags);
 
-       fc_dma_unmap_single(ctrl->dev, lsreq->rqstdma,
+       fc_dma_unmap_single(rport->dev, lsreq->rqstdma,
                                  (lsreq->rqstlen + lsreq->rsplen),
                                  DMA_BIDIRECTIONAL);
 
-       nvme_fc_ctrl_put(ctrl);
+       nvme_fc_rport_put(rport);
 }
 
 static int
-__nvme_fc_send_ls_req(struct nvme_fc_ctrl *ctrl,
+__nvme_fc_send_ls_req(struct nvme_fc_rport *rport,
                struct nvmefc_ls_req_op *lsop,
                void (*done)(struct nvmefc_ls_req *req, int status))
 {
        struct nvmefc_ls_req *lsreq = &lsop->ls_req;
        unsigned long flags;
-       int ret;
+       int ret = 0;
 
-       if (!nvme_fc_ctrl_get(ctrl))
+       if (rport->remoteport.port_state != FC_OBJSTATE_ONLINE)
+               return -ECONNREFUSED;
+
+       if (!nvme_fc_rport_get(rport))
                return -ESHUTDOWN;
 
        lsreq->done = done;
-       lsop->ctrl = ctrl;
+       lsop->rport = rport;
        lsop->req_queued = false;
        INIT_LIST_HEAD(&lsop->lsreq_list);
        init_completion(&lsop->ls_done);
 
-       lsreq->rqstdma = fc_dma_map_single(ctrl->dev, lsreq->rqstaddr,
+       lsreq->rqstdma = fc_dma_map_single(rport->dev, lsreq->rqstaddr,
                                  lsreq->rqstlen + lsreq->rsplen,
                                  DMA_BIDIRECTIONAL);
-       if (fc_dma_mapping_error(ctrl->dev, lsreq->rqstdma)) {
-               nvme_fc_ctrl_put(ctrl);
-               dev_err(ctrl->dev,
-                       "els request command failed EFAULT.\n");
-               return -EFAULT;
+       if (fc_dma_mapping_error(rport->dev, lsreq->rqstdma)) {
+               ret = -EFAULT;
+               goto out_putrport;
        }
        lsreq->rspdma = lsreq->rqstdma + lsreq->rqstlen;
 
-       spin_lock_irqsave(&ctrl->lock, flags);
+       spin_lock_irqsave(&rport->lock, flags);
 
-       list_add_tail(&lsop->lsreq_list, &ctrl->ls_req_list);
+       list_add_tail(&lsop->lsreq_list, &rport->ls_req_list);
 
        lsop->req_queued = true;
 
-       spin_unlock_irqrestore(&ctrl->lock, flags);
+       spin_unlock_irqrestore(&rport->lock, flags);
 
-       ret = ctrl->lport->ops->ls_req(&ctrl->lport->localport,
-                                       &ctrl->rport->remoteport, lsreq);
+       ret = rport->lport->ops->ls_req(&rport->lport->localport,
+                                       &rport->remoteport, lsreq);
        if (ret)
-               lsop->ls_error = ret;
+               goto out_unlink;
+
+       return 0;
+
+out_unlink:
+       lsop->ls_error = ret;
+       spin_lock_irqsave(&rport->lock, flags);
+       lsop->req_queued = false;
+       list_del(&lsop->lsreq_list);
+       spin_unlock_irqrestore(&rport->lock, flags);
+       fc_dma_unmap_single(rport->dev, lsreq->rqstdma,
+                                 (lsreq->rqstlen + lsreq->rsplen),
+                                 DMA_BIDIRECTIONAL);
+out_putrport:
+       nvme_fc_rport_put(rport);
 
        return ret;
 }
@@ -705,15 +767,15 @@ nvme_fc_send_ls_req_done(struct nvmefc_ls_req *lsreq, int status)
 }
 
 static int
-nvme_fc_send_ls_req(struct nvme_fc_ctrl *ctrl, struct nvmefc_ls_req_op *lsop)
+nvme_fc_send_ls_req(struct nvme_fc_rport *rport, struct nvmefc_ls_req_op *lsop)
 {
        struct nvmefc_ls_req *lsreq = &lsop->ls_req;
        struct fcnvme_ls_rjt *rjt = lsreq->rspaddr;
        int ret;
 
-       ret = __nvme_fc_send_ls_req(ctrl, lsop, nvme_fc_send_ls_req_done);
+       ret = __nvme_fc_send_ls_req(rport, lsop, nvme_fc_send_ls_req_done);
 
-       if (!ret)
+       if (!ret) {
                /*
                 * No timeout/not interruptible as we need the struct
                 * to exist until the lldd calls us back. Thus mandate
@@ -722,14 +784,14 @@ nvme_fc_send_ls_req(struct nvme_fc_ctrl *ctrl, struct nvmefc_ls_req_op *lsop)
                 */
                wait_for_completion(&lsop->ls_done);
 
-       __nvme_fc_finish_ls_req(ctrl, lsop);
+               __nvme_fc_finish_ls_req(lsop);
 
-       if (ret) {
-               dev_err(ctrl->dev,
-                       "ls request command failed (%d).\n", ret);
-               return ret;
+               ret = lsop->ls_error;
        }
 
+       if (ret)
+               return ret;
+
        /* ACC or RJT payload ? */
        if (rjt->w0.ls_cmd == FCNVME_LS_RJT)
                return -ENXIO;
@@ -737,19 +799,14 @@ nvme_fc_send_ls_req(struct nvme_fc_ctrl *ctrl, struct nvmefc_ls_req_op *lsop)
        return 0;
 }
 
-static void
-nvme_fc_send_ls_req_async(struct nvme_fc_ctrl *ctrl,
+static int
+nvme_fc_send_ls_req_async(struct nvme_fc_rport *rport,
                struct nvmefc_ls_req_op *lsop,
                void (*done)(struct nvmefc_ls_req *req, int status))
 {
-       int ret;
-
-       ret = __nvme_fc_send_ls_req(ctrl, lsop, done);
-
        /* don't wait for completion */
 
-       if (ret)
-               done(&lsop->ls_req, ret);
+       return __nvme_fc_send_ls_req(rport, lsop, done);
 }
 
 /* Validation Error indexes into the string table below */
@@ -839,7 +896,7 @@ nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl,
        lsreq->rsplen = sizeof(*assoc_acc);
        lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC;
 
-       ret = nvme_fc_send_ls_req(ctrl, lsop);
+       ret = nvme_fc_send_ls_req(ctrl->rport, lsop);
        if (ret)
                goto out_free_buffer;
 
@@ -848,11 +905,12 @@ nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl,
        /* validate the ACC response */
        if (assoc_acc->hdr.w0.ls_cmd != FCNVME_LS_ACC)
                fcret = VERR_LSACC;
-       if (assoc_acc->hdr.desc_list_len !=
+       else if (assoc_acc->hdr.desc_list_len !=
                        fcnvme_lsdesc_len(
                                sizeof(struct fcnvme_ls_cr_assoc_acc)))
                fcret = VERR_CR_ASSOC_ACC_LEN;
-       if (assoc_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST))
+       else if (assoc_acc->hdr.rqst.desc_tag !=
+                       cpu_to_be32(FCNVME_LSDESC_RQST))
                fcret = VERR_LSDESC_RQST;
        else if (assoc_acc->hdr.rqst.desc_len !=
                        fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst)))
@@ -946,7 +1004,7 @@ nvme_fc_connect_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
        lsreq->rsplen = sizeof(*conn_acc);
        lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC;
 
-       ret = nvme_fc_send_ls_req(ctrl, lsop);
+       ret = nvme_fc_send_ls_req(ctrl->rport, lsop);
        if (ret)
                goto out_free_buffer;
 
@@ -955,10 +1013,10 @@ nvme_fc_connect_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
        /* validate the ACC response */
        if (conn_acc->hdr.w0.ls_cmd != FCNVME_LS_ACC)
                fcret = VERR_LSACC;
-       if (conn_acc->hdr.desc_list_len !=
+       else if (conn_acc->hdr.desc_list_len !=
                        fcnvme_lsdesc_len(sizeof(struct fcnvme_ls_cr_conn_acc)))
                fcret = VERR_CR_CONN_ACC_LEN;
-       if (conn_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST))
+       else if (conn_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST))
                fcret = VERR_LSDESC_RQST;
        else if (conn_acc->hdr.rqst.desc_len !=
                        fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst)))
@@ -997,14 +1055,8 @@ static void
 nvme_fc_disconnect_assoc_done(struct nvmefc_ls_req *lsreq, int status)
 {
        struct nvmefc_ls_req_op *lsop = ls_req_to_lsop(lsreq);
-       struct nvme_fc_ctrl *ctrl = lsop->ctrl;
 
-       __nvme_fc_finish_ls_req(ctrl, lsop);
-
-       if (status)
-               dev_err(ctrl->dev,
-                       "disconnect assoc ls request command failed (%d).\n",
-                       status);
+       __nvme_fc_finish_ls_req(lsop);
 
        /* fc-nvme iniator doesn't care about success or failure of cmd */
 
@@ -1035,6 +1087,7 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
        struct fcnvme_ls_disconnect_acc *discon_acc;
        struct nvmefc_ls_req_op *lsop;
        struct nvmefc_ls_req *lsreq;
+       int ret;
 
        lsop = kzalloc((sizeof(*lsop) +
                         ctrl->lport->ops->lsrqst_priv_sz +
@@ -1077,7 +1130,10 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
        lsreq->rsplen = sizeof(*discon_acc);
        lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC;
 
-       nvme_fc_send_ls_req_async(ctrl, lsop, nvme_fc_disconnect_assoc_done);
+       ret = nvme_fc_send_ls_req_async(ctrl->rport, lsop,
+                               nvme_fc_disconnect_assoc_done);
+       if (ret)
+               kfree(lsop);
 
        /* only meaningful part to terminating the association */
        ctrl->association_id = 0;
@@ -1086,6 +1142,7 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
 
 /* *********************** NVME Ctrl Routines **************************** */
 
+static void __nvme_fc_final_op_cleanup(struct request *rq);
 
 static int
 nvme_fc_reinit_request(void *data, struct request *rq)
@@ -1123,21 +1180,84 @@ nvme_fc_exit_request(void *data, struct request *rq,
        return __nvme_fc_exit_request(data, op);
 }
 
+static int
+__nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op)
+{
+       int state;
+
+       state = atomic_xchg(&op->state, FCPOP_STATE_ABORTED);
+       if (state != FCPOP_STATE_ACTIVE) {
+               atomic_set(&op->state, state);
+               return -ECANCELED;
+       }
+
+       ctrl->lport->ops->fcp_abort(&ctrl->lport->localport,
+                                       &ctrl->rport->remoteport,
+                                       op->queue->lldd_handle,
+                                       &op->fcp_req);
+
+       return 0;
+}
+
 static void
-nvme_fc_exit_aen_ops(struct nvme_fc_ctrl *ctrl)
+nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl)
 {
        struct nvme_fc_fcp_op *aen_op = ctrl->aen_ops;
-       int i;
+       unsigned long flags;
+       int i, ret;
 
        for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) {
-               if (atomic_read(&aen_op->state) == FCPOP_STATE_UNINIT)
+               if (atomic_read(&aen_op->state) != FCPOP_STATE_ACTIVE)
                        continue;
-               __nvme_fc_exit_request(ctrl, aen_op);
-               nvme_fc_ctrl_put(ctrl);
+
+               spin_lock_irqsave(&ctrl->lock, flags);
+               if (ctrl->flags & FCCTRL_TERMIO) {
+                       ctrl->iocnt++;
+                       aen_op->flags |= FCOP_FLAGS_TERMIO;
+               }
+               spin_unlock_irqrestore(&ctrl->lock, flags);
+
+               ret = __nvme_fc_abort_op(ctrl, aen_op);
+               if (ret) {
+                       /*
+                        * if __nvme_fc_abort_op failed the io wasn't
+                        * active. Thus this call path is running in
+                        * parallel to the io complete. Treat as non-error.
+                        */
+
+                       /* back out the flags/counters */
+                       spin_lock_irqsave(&ctrl->lock, flags);
+                       if (ctrl->flags & FCCTRL_TERMIO)
+                               ctrl->iocnt--;
+                       aen_op->flags &= ~FCOP_FLAGS_TERMIO;
+                       spin_unlock_irqrestore(&ctrl->lock, flags);
+                       return;
+               }
+       }
+}
+
+static inline int
+__nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl,
+               struct nvme_fc_fcp_op *op)
+{
+       unsigned long flags;
+       bool complete_rq = false;
+
+       spin_lock_irqsave(&ctrl->lock, flags);
+       if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) {
+               if (ctrl->flags & FCCTRL_TERMIO)
+                       ctrl->iocnt--;
        }
+       if (op->flags & FCOP_FLAGS_RELEASED)
+               complete_rq = true;
+       else
+               op->flags |= FCOP_FLAGS_COMPLETE;
+       spin_unlock_irqrestore(&ctrl->lock, flags);
+
+       return complete_rq;
 }
 
-void
+static void
 nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 {
        struct nvme_fc_fcp_op *op = fcp_req_to_fcp_op(req);
@@ -1146,7 +1266,10 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
        struct nvme_fc_ctrl *ctrl = op->ctrl;
        struct nvme_fc_queue *queue = op->queue;
        struct nvme_completion *cqe = &op->rsp_iu.cqe;
-       u16 status;
+       struct nvme_command *sqe = &op->cmd_iu.sqe;
+       __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1);
+       union nvme_result result;
+       bool complete_rq;
 
        /*
         * WARNING:
@@ -1181,9 +1304,9 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
                                sizeof(op->rsp_iu), DMA_FROM_DEVICE);
 
        if (atomic_read(&op->state) == FCPOP_STATE_ABORTED)
-               status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
-       else
-               status = freq->status;
+               status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1);
+       else if (freq->status)
+               status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1);
 
        /*
         * For the linux implementation, if we have an unsuccesful
@@ -1211,10 +1334,10 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
                 */
                if (freq->transferred_length !=
                        be32_to_cpu(op->cmd_iu.data_len)) {
-                       status = -EIO;
+                       status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1);
                        goto done;
                }
-               op->nreq.result.u64 = 0;
+               result.u64 = 0;
                break;
 
        case sizeof(struct nvme_fc_ersp_iu):
@@ -1226,28 +1349,40 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
                                        (freq->rcv_rsplen / 4) ||
                             be32_to_cpu(op->rsp_iu.xfrd_len) !=
                                        freq->transferred_length ||
-                            op->rqno != le16_to_cpu(cqe->command_id))) {
-                       status = -EIO;
+                            op->rsp_iu.status_code ||
+                            sqe->common.command_id != cqe->command_id)) {
+                       status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1);
                        goto done;
                }
-               op->nreq.result = cqe->result;
-               status = le16_to_cpu(cqe->status) >> 1;
+               result = cqe->result;
+               status = cqe->status;
                break;
 
        default:
-               status = -EIO;
+               status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1);
                goto done;
        }
 
 done:
-       if (!queue->qnum && op->rqno >= AEN_CMDID_BASE) {
-               nvme_complete_async_event(&queue->ctrl->ctrl, status,
-                                       &op->nreq.result);
+       if (op->flags & FCOP_FLAGS_AEN) {
+               nvme_complete_async_event(&queue->ctrl->ctrl, status, &result);
+               complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op);
+               atomic_set(&op->state, FCPOP_STATE_IDLE);
+               op->flags = FCOP_FLAGS_AEN;     /* clear other flags */
                nvme_fc_ctrl_put(ctrl);
                return;
        }
 
-       blk_mq_complete_request(rq, status);
+       complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op);
+       if (!complete_rq) {
+               if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) {
+                       status = cpu_to_le16(NVME_SC_ABORT_REQ);
+                       if (blk_queue_dying(rq->q))
+                               status |= cpu_to_le16(NVME_SC_DNR);
+               }
+               nvme_end_request(rq, status, result);
+       } else
+               __nvme_fc_final_op_cleanup(rq);
 }
 
 static int
@@ -1328,25 +1463,55 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl)
        struct nvme_fc_fcp_op *aen_op;
        struct nvme_fc_cmd_iu *cmdiu;
        struct nvme_command *sqe;
+       void *private;
        int i, ret;
 
        aen_op = ctrl->aen_ops;
        for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) {
+               private = kzalloc(ctrl->lport->ops->fcprqst_priv_sz,
+                                               GFP_KERNEL);
+               if (!private)
+                       return -ENOMEM;
+
                cmdiu = &aen_op->cmd_iu;
                sqe = &cmdiu->sqe;
                ret = __nvme_fc_init_request(ctrl, &ctrl->queues[0],
                                aen_op, (struct request *)NULL,
                                (AEN_CMDID_BASE + i));
-               if (ret)
+               if (ret) {
+                       kfree(private);
                        return ret;
+               }
+
+               aen_op->flags = FCOP_FLAGS_AEN;
+               aen_op->fcp_req.first_sgl = NULL; /* no sg list */
+               aen_op->fcp_req.private = private;
 
                memset(sqe, 0, sizeof(*sqe));
                sqe->common.opcode = nvme_admin_async_event;
+               /* Note: core layer may overwrite the sqe.command_id value */
                sqe->common.command_id = AEN_CMDID_BASE + i;
        }
        return 0;
 }
 
+static void
+nvme_fc_term_aen_ops(struct nvme_fc_ctrl *ctrl)
+{
+       struct nvme_fc_fcp_op *aen_op;
+       int i;
+
+       aen_op = ctrl->aen_ops;
+       for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) {
+               if (!aen_op->fcp_req.private)
+                       continue;
+
+               __nvme_fc_exit_request(ctrl, aen_op);
+
+               kfree(aen_op->fcp_req.private);
+               aen_op->fcp_req.private = NULL;
+       }
+}
 
 static inline void
 __nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, struct nvme_fc_ctrl *ctrl,
@@ -1446,15 +1611,6 @@ __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *ctrl,
 }
 
 static void
-nvme_fc_destroy_admin_queue(struct nvme_fc_ctrl *ctrl)
-{
-       __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
-       blk_cleanup_queue(ctrl->ctrl.admin_q);
-       blk_mq_free_tag_set(&ctrl->admin_tag_set);
-       nvme_fc_free_queue(&ctrl->queues[0]);
-}
-
-static void
 nvme_fc_free_io_queues(struct nvme_fc_ctrl *ctrl)
 {
        int i;
@@ -1541,19 +1697,27 @@ nvme_fc_ctrl_free(struct kref *ref)
                container_of(ref, struct nvme_fc_ctrl, ref);
        unsigned long flags;
 
-       if (ctrl->state != FCCTRL_INIT) {
-               /* remove from rport list */
-               spin_lock_irqsave(&ctrl->rport->lock, flags);
-               list_del(&ctrl->ctrl_list);
-               spin_unlock_irqrestore(&ctrl->rport->lock, flags);
+       if (ctrl->ctrl.tagset) {
+               blk_cleanup_queue(ctrl->ctrl.connect_q);
+               blk_mq_free_tag_set(&ctrl->tag_set);
        }
 
+       /* remove from rport list */
+       spin_lock_irqsave(&ctrl->rport->lock, flags);
+       list_del(&ctrl->ctrl_list);
+       spin_unlock_irqrestore(&ctrl->rport->lock, flags);
+
+       blk_cleanup_queue(ctrl->ctrl.admin_q);
+       blk_mq_free_tag_set(&ctrl->admin_tag_set);
+
+       kfree(ctrl->queues);
+
        put_device(ctrl->dev);
        nvme_fc_rport_put(ctrl->rport);
 
-       kfree(ctrl->queues);
        ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum);
-       nvmf_free_options(ctrl->ctrl.opts);
+       if (ctrl->ctrl.opts)
+               nvmf_free_options(ctrl->ctrl.opts);
        kfree(ctrl);
 }
 
@@ -1574,57 +1738,38 @@ nvme_fc_ctrl_get(struct nvme_fc_ctrl *ctrl)
  * controller. Called after last nvme_put_ctrl() call
  */
 static void
-nvme_fc_free_nvme_ctrl(struct nvme_ctrl *nctrl)
+nvme_fc_nvme_ctrl_freed(struct nvme_ctrl *nctrl)
 {
        struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
 
        WARN_ON(nctrl != &ctrl->ctrl);
 
-       /*
-        * Tear down the association, which will generate link
-        * traffic to terminate connections
-        */
-
-       if (ctrl->state != FCCTRL_INIT) {
-               /* send a Disconnect(association) LS to fc-nvme target */
-               nvme_fc_xmt_disconnect_assoc(ctrl);
-
-               if (ctrl->ctrl.tagset) {
-                       blk_cleanup_queue(ctrl->ctrl.connect_q);
-                       blk_mq_free_tag_set(&ctrl->tag_set);
-                       nvme_fc_delete_hw_io_queues(ctrl);
-                       nvme_fc_free_io_queues(ctrl);
-               }
-
-               nvme_fc_exit_aen_ops(ctrl);
-
-               nvme_fc_destroy_admin_queue(ctrl);
-       }
-
        nvme_fc_ctrl_put(ctrl);
 }
 
-
-static int
-__nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op)
+static void
+nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
 {
-       int state;
+       dev_warn(ctrl->ctrl.device,
+               "NVME-FC{%d}: transport association error detected: %s\n",
+               ctrl->cnum, errmsg);
+       dev_info(ctrl->ctrl.device,
+               "NVME-FC{%d}: resetting controller\n", ctrl->cnum);
 
-       state = atomic_xchg(&op->state, FCPOP_STATE_ABORTED);
-       if (state != FCPOP_STATE_ACTIVE) {
-               atomic_set(&op->state, state);
-               return -ECANCELED; /* fail */
+       if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
+               dev_err(ctrl->ctrl.device,
+                       "NVME-FC{%d}: error_recovery: Couldn't change state "
+                       "to RECONNECTING\n", ctrl->cnum);
+               return;
        }
 
-       ctrl->lport->ops->fcp_abort(&ctrl->lport->localport,
-                                       &ctrl->rport->remoteport,
-                                       op->queue->lldd_handle,
-                                       &op->fcp_req);
-
-       return 0;
+       if (!queue_work(nvme_fc_wq, &ctrl->reset_work))
+               dev_err(ctrl->ctrl.device,
+                       "NVME-FC{%d}: error_recovery: Failed to schedule "
+                       "reset work\n", ctrl->cnum);
 }
 
-enum blk_eh_timer_return
+static enum blk_eh_timer_return
 nvme_fc_timeout(struct request *rq, bool reserved)
 {
        struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
@@ -1640,11 +1785,13 @@ nvme_fc_timeout(struct request *rq, bool reserved)
                return BLK_EH_HANDLED;
 
        /*
-        * TODO: force a controller reset
-        *   when that happens, queues will be torn down and outstanding
-        *   ios will be terminated, and the above abort, on a single io
-        *   will no longer be needed.
+        * we can't individually ABTS an io without affecting the queue,
+        * thus killing the queue, and thus the association.
+        * So resolve by performing a controller reset, which will stop
+        * the host/io stack, terminate the association on the link,
+        * and recreate an association on the link.
         */
+       nvme_fc_error_recovery(ctrl, "io timeout error");
 
        return BLK_EH_HANDLED;
 }
@@ -1738,6 +1885,13 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
        u32 csn;
        int ret;
 
+       /*
+        * before attempting to send the io, check to see if we believe
+        * the target device is present
+        */
+       if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE)
+               return BLK_MQ_RQ_QUEUE_ERROR;
+
        if (!nvme_fc_ctrl_get(ctrl))
                return BLK_MQ_RQ_QUEUE_ERROR;
 
@@ -1761,7 +1915,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
        op->fcp_req.io_dir = io_dir;
        op->fcp_req.transferred_length = 0;
        op->fcp_req.rcv_rsplen = 0;
-       op->fcp_req.status = 0;
+       op->fcp_req.status = NVME_SC_SUCCESS;
        op->fcp_req.sqid = cpu_to_le16(queue->qnum);
 
        /*
@@ -1782,14 +1936,9 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
        sqe->rw.dptr.sgl.length = cpu_to_le32(data_len);
        sqe->rw.dptr.sgl.addr = 0;
 
-       /* odd that we set the command_id - should come from nvme-fabrics */
-       WARN_ON_ONCE(sqe->common.command_id != cpu_to_le16(op->rqno));
-
-       if (op->rq) {                           /* skipped on aens */
+       if (!(op->flags & FCOP_FLAGS_AEN)) {
                ret = nvme_fc_map_data(ctrl, op->rq, op);
                if (ret < 0) {
-                       dev_err(queue->ctrl->ctrl.device,
-                            "Failed to map data (%d)\n", ret);
                        nvme_cleanup_cmd(op->rq);
                        nvme_fc_ctrl_put(ctrl);
                        return (ret == -ENOMEM || ret == -EAGAIN) ?
@@ -1802,7 +1951,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
 
        atomic_set(&op->state, FCPOP_STATE_ACTIVE);
 
-       if (op->rq)
+       if (!(op->flags & FCOP_FLAGS_AEN))
                blk_mq_start_request(op->rq);
 
        ret = ctrl->lport->ops->fcp_io(&ctrl->lport->localport,
@@ -1810,9 +1959,6 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
                                        queue->lldd_handle, &op->fcp_req);
 
        if (ret) {
-               dev_err(ctrl->dev,
-                       "Send nvme command failed - lldd returned %d.\n", ret);
-
                if (op->rq) {                   /* normal request */
                        nvme_fc_unmap_data(ctrl, op->rq, op);
                        nvme_cleanup_cmd(op->rq);
@@ -1882,12 +2028,8 @@ nvme_fc_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
        struct nvme_fc_fcp_op *op;
 
        req = blk_mq_tag_to_rq(nvme_fc_tagset(queue), tag);
-       if (!req) {
-               dev_err(queue->ctrl->ctrl.device,
-                        "tag 0x%x on QNum %#x not found\n",
-                       tag, queue->qnum);
+       if (!req)
                return 0;
-       }
 
        op = blk_mq_rq_to_pdu(req);
 
@@ -1904,11 +2046,21 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
 {
        struct nvme_fc_ctrl *ctrl = to_fc_ctrl(arg);
        struct nvme_fc_fcp_op *aen_op;
+       unsigned long flags;
+       bool terminating = false;
        int ret;
 
        if (aer_idx > NVME_FC_NR_AEN_COMMANDS)
                return;
 
+       spin_lock_irqsave(&ctrl->lock, flags);
+       if (ctrl->flags & FCCTRL_TERMIO)
+               terminating = true;
+       spin_unlock_irqrestore(&ctrl->lock, flags);
+
+       if (terminating)
+               return;
+
        aen_op = &ctrl->aen_ops[aer_idx];
 
        ret = nvme_fc_start_fcp_op(ctrl, aen_op->queue, aen_op, 0,
@@ -1919,36 +2071,101 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
 }
 
 static void
-nvme_fc_complete_rq(struct request *rq)
+__nvme_fc_final_op_cleanup(struct request *rq)
 {
        struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
        struct nvme_fc_ctrl *ctrl = op->ctrl;
-       int error = 0, state;
 
-       state = atomic_xchg(&op->state, FCPOP_STATE_IDLE);
+       atomic_set(&op->state, FCPOP_STATE_IDLE);
+       op->flags &= ~(FCOP_FLAGS_TERMIO | FCOP_FLAGS_RELEASED |
+                       FCOP_FLAGS_COMPLETE);
 
        nvme_cleanup_cmd(rq);
-
        nvme_fc_unmap_data(ctrl, rq, op);
+       nvme_complete_rq(rq);
+       nvme_fc_ctrl_put(ctrl);
 
-       if (unlikely(rq->errors)) {
-               if (nvme_req_needs_retry(rq, rq->errors)) {
-                       nvme_requeue_req(rq);
-                       return;
-               }
+}
 
-               if (blk_rq_is_passthrough(rq))
-                       error = rq->errors;
-               else
-                       error = nvme_error_status(rq->errors);
+static void
+nvme_fc_complete_rq(struct request *rq)
+{
+       struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
+       struct nvme_fc_ctrl *ctrl = op->ctrl;
+       unsigned long flags;
+       bool completed = false;
+
+       /*
+        * the core layer, on controller resets after calling
+        * nvme_shutdown_ctrl(), calls complete_rq without our
+        * calling blk_mq_complete_request(), thus there may still
+        * be live i/o outstanding with the LLDD. Means transport has
+        * to track complete calls vs fcpio_done calls to know what
+        * path to take on completes and dones.
+        */
+       spin_lock_irqsave(&ctrl->lock, flags);
+       if (op->flags & FCOP_FLAGS_COMPLETE)
+               completed = true;
+       else
+               op->flags |= FCOP_FLAGS_RELEASED;
+       spin_unlock_irqrestore(&ctrl->lock, flags);
+
+       if (completed)
+               __nvme_fc_final_op_cleanup(rq);
+}
+
+/*
+ * This routine is used by the transport when it needs to find active
+ * io on a queue that is to be terminated. The transport uses
+ * blk_mq_tagset_busy_iter() to find the busy requests, which then invokes
+ * this routine to kill them on a 1 by 1 basis.
+ *
+ * As FC allocates an FC exchange for each io, the transport must contact
+ * the LLDD to terminate the exchange, thus releasing the FC exchange.
+ * After terminating the exchange the LLDD will call the transport's
+ * normal io done path for the request, but it will have an aborted
+ * status. The done path will return the io request back to the block
+ * layer with an error status.
+ */
+static void
+nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved)
+{
+       struct nvme_ctrl *nctrl = data;
+       struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
+       struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req);
+       unsigned long flags;
+       int status;
+
+       if (!blk_mq_request_started(req))
+               return;
+
+       spin_lock_irqsave(&ctrl->lock, flags);
+       if (ctrl->flags & FCCTRL_TERMIO) {
+               ctrl->iocnt++;
+               op->flags |= FCOP_FLAGS_TERMIO;
        }
+       spin_unlock_irqrestore(&ctrl->lock, flags);
 
-       nvme_fc_ctrl_put(ctrl);
+       status = __nvme_fc_abort_op(ctrl, op);
+       if (status) {
+               /*
+                * if __nvme_fc_abort_op failed the io wasn't
+                * active. Thus this call path is running in
+                * parallel to the io complete. Treat as non-error.
+                */
 
-       blk_mq_end_request(rq, error);
+               /* back out the flags/counters */
+               spin_lock_irqsave(&ctrl->lock, flags);
+               if (ctrl->flags & FCCTRL_TERMIO)
+                       ctrl->iocnt--;
+               op->flags &= ~FCOP_FLAGS_TERMIO;
+               spin_unlock_irqrestore(&ctrl->lock, flags);
+               return;
+       }
 }
 
-static struct blk_mq_ops nvme_fc_mq_ops = {
+
+static const struct blk_mq_ops nvme_fc_mq_ops = {
        .queue_rq       = nvme_fc_queue_rq,
        .complete       = nvme_fc_complete_rq,
        .init_request   = nvme_fc_init_request,
@@ -1959,145 +2176,275 @@ static struct blk_mq_ops nvme_fc_mq_ops = {
        .timeout        = nvme_fc_timeout,
 };
 
-static struct blk_mq_ops nvme_fc_admin_mq_ops = {
-       .queue_rq       = nvme_fc_queue_rq,
-       .complete       = nvme_fc_complete_rq,
-       .init_request   = nvme_fc_init_admin_request,
-       .exit_request   = nvme_fc_exit_request,
-       .reinit_request = nvme_fc_reinit_request,
-       .init_hctx      = nvme_fc_init_admin_hctx,
-       .timeout        = nvme_fc_timeout,
-};
-
 static int
-nvme_fc_configure_admin_queue(struct nvme_fc_ctrl *ctrl)
+nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
 {
-       u32 segs;
-       int error;
+       struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
+       int ret;
 
-       nvme_fc_init_queue(ctrl, 0, NVME_FC_AQ_BLKMQ_DEPTH);
+       ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues);
+       if (ret) {
+               dev_info(ctrl->ctrl.device,
+                       "set_queue_count failed: %d\n", ret);
+               return ret;
+       }
 
-       error = nvme_fc_connect_admin_queue(ctrl, &ctrl->queues[0],
-                               NVME_FC_AQ_BLKMQ_DEPTH,
-                               (NVME_FC_AQ_BLKMQ_DEPTH / 4));
-       if (error)
-               return error;
+       ctrl->queue_count = opts->nr_io_queues + 1;
+       if (!opts->nr_io_queues)
+               return 0;
 
-       memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
-       ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
-       ctrl->admin_tag_set.queue_depth = NVME_FC_AQ_BLKMQ_DEPTH;
-       ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */
-       ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
-       ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
+       dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n",
+                       opts->nr_io_queues);
+
+       nvme_fc_init_io_queues(ctrl);
+
+       memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
+       ctrl->tag_set.ops = &nvme_fc_mq_ops;
+       ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
+       ctrl->tag_set.reserved_tags = 1; /* fabric connect */
+       ctrl->tag_set.numa_node = NUMA_NO_NODE;
+       ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+       ctrl->tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
                                        (SG_CHUNK_SIZE *
                                                sizeof(struct scatterlist)) +
                                        ctrl->lport->ops->fcprqst_priv_sz;
-       ctrl->admin_tag_set.driver_data = ctrl;
-       ctrl->admin_tag_set.nr_hw_queues = 1;
-       ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
+       ctrl->tag_set.driver_data = ctrl;
+       ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1;
+       ctrl->tag_set.timeout = NVME_IO_TIMEOUT;
 
-       error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);
-       if (error)
-               goto out_free_queue;
+       ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
+       if (ret)
+               return ret;
 
-       ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
-       if (IS_ERR(ctrl->ctrl.admin_q)) {
-               error = PTR_ERR(ctrl->ctrl.admin_q);
-               goto out_free_tagset;
+       ctrl->ctrl.tagset = &ctrl->tag_set;
+
+       ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
+       if (IS_ERR(ctrl->ctrl.connect_q)) {
+               ret = PTR_ERR(ctrl->ctrl.connect_q);
+               goto out_free_tag_set;
        }
 
-       error = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0,
+       ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.opts->queue_size);
+       if (ret)
+               goto out_cleanup_blk_queue;
+
+       ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.opts->queue_size);
+       if (ret)
+               goto out_delete_hw_queues;
+
+       return 0;
+
+out_delete_hw_queues:
+       nvme_fc_delete_hw_io_queues(ctrl);
+out_cleanup_blk_queue:
+       nvme_stop_keep_alive(&ctrl->ctrl);
+       blk_cleanup_queue(ctrl->ctrl.connect_q);
+out_free_tag_set:
+       blk_mq_free_tag_set(&ctrl->tag_set);
+       nvme_fc_free_io_queues(ctrl);
+
+       /* force put free routine to ignore io queues */
+       ctrl->ctrl.tagset = NULL;
+
+       return ret;
+}
+
+static int
+nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl)
+{
+       struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
+       int ret;
+
+       ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues);
+       if (ret) {
+               dev_info(ctrl->ctrl.device,
+                       "set_queue_count failed: %d\n", ret);
+               return ret;
+       }
+
+       /* check for io queues existing */
+       if (ctrl->queue_count == 1)
+               return 0;
+
+       dev_info(ctrl->ctrl.device, "Recreating %d I/O queues.\n",
+                       opts->nr_io_queues);
+
+       nvme_fc_init_io_queues(ctrl);
+
+       ret = blk_mq_reinit_tagset(&ctrl->tag_set);
+       if (ret)
+               goto out_free_io_queues;
+
+       ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.opts->queue_size);
+       if (ret)
+               goto out_free_io_queues;
+
+       ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.opts->queue_size);
+       if (ret)
+               goto out_delete_hw_queues;
+
+       return 0;
+
+out_delete_hw_queues:
+       nvme_fc_delete_hw_io_queues(ctrl);
+out_free_io_queues:
+       nvme_fc_free_io_queues(ctrl);
+       return ret;
+}
+
+/*
+ * This routine restarts the controller on the host side, and
+ * on the link side, recreates the controller association.
+ */
+static int
+nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
+{
+       struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
+       u32 segs;
+       int ret;
+       bool changed;
+
+       ctrl->connect_attempts++;
+
+       /*
+        * Create the admin queue
+        */
+
+       nvme_fc_init_queue(ctrl, 0, NVME_FC_AQ_BLKMQ_DEPTH);
+
+       ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0,
                                NVME_FC_AQ_BLKMQ_DEPTH);
-       if (error)
-               goto out_cleanup_queue;
+       if (ret)
+               goto out_free_queue;
 
-       error = nvmf_connect_admin_queue(&ctrl->ctrl);
-       if (error)
+       ret = nvme_fc_connect_admin_queue(ctrl, &ctrl->queues[0],
+                               NVME_FC_AQ_BLKMQ_DEPTH,
+                               (NVME_FC_AQ_BLKMQ_DEPTH / 4));
+       if (ret)
                goto out_delete_hw_queue;
 
-       error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap);
-       if (error) {
+       if (ctrl->ctrl.state != NVME_CTRL_NEW)
+               blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
+
+       ret = nvmf_connect_admin_queue(&ctrl->ctrl);
+       if (ret)
+               goto out_disconnect_admin_queue;
+
+       /*
+        * Check controller capabilities
+        *
+        * todo:- add code to check if ctrl attributes changed from
+        * prior connection values
+        */
+
+       ret = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap);
+       if (ret) {
                dev_err(ctrl->ctrl.device,
                        "prop_get NVME_REG_CAP failed\n");
-               goto out_delete_hw_queue;
+               goto out_disconnect_admin_queue;
        }
 
        ctrl->ctrl.sqsize =
                min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize);
 
-       error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
-       if (error)
-               goto out_delete_hw_queue;
+       ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
+       if (ret)
+               goto out_disconnect_admin_queue;
 
        segs = min_t(u32, NVME_FC_MAX_SEGMENTS,
                        ctrl->lport->ops->max_sgl_segments);
        ctrl->ctrl.max_hw_sectors = (segs - 1) << (PAGE_SHIFT - 9);
 
-       error = nvme_init_identify(&ctrl->ctrl);
-       if (error)
-               goto out_delete_hw_queue;
+       ret = nvme_init_identify(&ctrl->ctrl);
+       if (ret)
+               goto out_disconnect_admin_queue;
+
+       /* sanity checks */
+
+       /* FC-NVME does not have other data in the capsule */
+       if (ctrl->ctrl.icdoff) {
+               dev_err(ctrl->ctrl.device, "icdoff %d is not supported!\n",
+                               ctrl->ctrl.icdoff);
+               goto out_disconnect_admin_queue;
+       }
 
        nvme_start_keep_alive(&ctrl->ctrl);
 
-       return 0;
+       /* FC-NVME supports normal SGL Data Block Descriptors */
+
+       if (opts->queue_size > ctrl->ctrl.maxcmd) {
+               /* warn if maxcmd is lower than queue_size */
+               dev_warn(ctrl->ctrl.device,
+                       "queue_size %zu > ctrl maxcmd %u, reducing "
+                       "to queue_size\n",
+                       opts->queue_size, ctrl->ctrl.maxcmd);
+               opts->queue_size = ctrl->ctrl.maxcmd;
+       }
+
+       ret = nvme_fc_init_aen_ops(ctrl);
+       if (ret)
+               goto out_term_aen_ops;
+
+       /*
+        * Create the io queues
+        */
+
+       if (ctrl->queue_count > 1) {
+               if (ctrl->ctrl.state == NVME_CTRL_NEW)
+                       ret = nvme_fc_create_io_queues(ctrl);
+               else
+                       ret = nvme_fc_reinit_io_queues(ctrl);
+               if (ret)
+                       goto out_term_aen_ops;
+       }
+
+       changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
+       WARN_ON_ONCE(!changed);
 
+       ctrl->connect_attempts = 0;
+
+       kref_get(&ctrl->ctrl.kref);
+
+       if (ctrl->queue_count > 1) {
+               nvme_start_queues(&ctrl->ctrl);
+               nvme_queue_scan(&ctrl->ctrl);
+               nvme_queue_async_events(&ctrl->ctrl);
+       }
+
+       return 0;       /* Success */
+
+out_term_aen_ops:
+       nvme_fc_term_aen_ops(ctrl);
+       nvme_stop_keep_alive(&ctrl->ctrl);
+out_disconnect_admin_queue:
+       /* send a Disconnect(association) LS to fc-nvme target */
+       nvme_fc_xmt_disconnect_assoc(ctrl);
 out_delete_hw_queue:
        __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
-out_cleanup_queue:
-       blk_cleanup_queue(ctrl->ctrl.admin_q);
-out_free_tagset:
-       blk_mq_free_tag_set(&ctrl->admin_tag_set);
 out_free_queue:
        nvme_fc_free_queue(&ctrl->queues[0]);
-       return error;
+
+       return ret;
 }
 
 /*
- * This routine is used by the transport when it needs to find active
- * io on a queue that is to be terminated. The transport uses
- * blk_mq_tagset_busy_itr() to find the busy requests, which then invoke
- * this routine to kill them on a 1 by 1 basis.
- *
- * As FC allocates FC exchange for each io, the transport must contact
- * the LLDD to terminate the exchange, thus releasing the FC exchange.
- * After terminating the exchange the LLDD will call the transport's
- * normal io done path for the request, but it will have an aborted
- * status. The done path will return the io request back to the block
- * layer with an error status.
+ * This routine stops operation of the controller on the host side.
+ * On the host os stack side: Admin and IO queues are stopped,
+ *   outstanding ios on them terminated via FC ABTS.
+ * On the link side: the association is terminated.
  */
 static void
-nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved)
+nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
 {
-       struct nvme_ctrl *nctrl = data;
-       struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
-       struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req);
-int status;
-
-       if (!blk_mq_request_started(req))
-               return;
+       unsigned long flags;
 
-       /* this performs an ABTS-LS on the FC exchange for the io */
-       status = __nvme_fc_abort_op(ctrl, op);
-       /*
-        * if __nvme_fc_abort_op failed: io wasn't active to abort
-        * consider it done. Assume completion path already completing
-        * in parallel
-        */
-       if (status)
-               /* io wasn't active to abort consider it done */
-               /* assume completion path already completing in parallel */
-               return;
-}
+       nvme_stop_keep_alive(&ctrl->ctrl);
 
+       spin_lock_irqsave(&ctrl->lock, flags);
+       ctrl->flags |= FCCTRL_TERMIO;
+       ctrl->iocnt = 0;
+       spin_unlock_irqrestore(&ctrl->lock, flags);
 
-/*
- * This routine stops operation of the controller. Admin and IO queues
- * are stopped, outstanding ios on them terminated, and the nvme ctrl
- * is shutdown.
- */
-static void
-nvme_fc_shutdown_ctrl(struct nvme_fc_ctrl *ctrl)
-{
        /*
         * If io queues are present, stop them and terminate all outstanding
         * ios on them. As FC allocates FC exchange for each io, the
@@ -2116,35 +2463,79 @@ nvme_fc_shutdown_ctrl(struct nvme_fc_ctrl *ctrl)
                                nvme_fc_terminate_exchange, &ctrl->ctrl);
        }
 
-       if (ctrl->ctrl.state == NVME_CTRL_LIVE)
-               nvme_shutdown_ctrl(&ctrl->ctrl);
+       /*
+        * Other transports, which don't have link-level contexts bound
+        * to sqe's, would try to gracefully shutdown the controller by
+        * writing the registers for shutdown and polling (call
+        * nvme_shutdown_ctrl()). Given a bunch of i/o was potentially
+        * just aborted and we will wait on those contexts, and given
+        * there was no indication of how live the controller is on the
+        * link, don't send more io to create more contexts for the
+        * shutdown. Let the controller fail via keepalive failure if
+        * it's still present.
+        */
 
        /*
-        * now clean up the admin queue. Same thing as above.
+        * clean up the admin queue. Same thing as above.
         * use blk_mq_tagset_busy_itr() and the transport routine to
         * terminate the exchanges.
         */
        blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
        blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
                                nvme_fc_terminate_exchange, &ctrl->ctrl);
+
+       /* kill the aens as they are a separate path */
+       nvme_fc_abort_aen_ops(ctrl);
+
+       /* wait for all io that had to be aborted */
+       spin_lock_irqsave(&ctrl->lock, flags);
+       while (ctrl->iocnt) {
+               spin_unlock_irqrestore(&ctrl->lock, flags);
+               msleep(1000);
+               spin_lock_irqsave(&ctrl->lock, flags);
+       }
+       ctrl->flags &= ~FCCTRL_TERMIO;
+       spin_unlock_irqrestore(&ctrl->lock, flags);
+
+       nvme_fc_term_aen_ops(ctrl);
+
+       /*
+        * send a Disconnect(association) LS to fc-nvme target
+        * Note: could have been sent at top of process, but
+        * cleaner on link traffic if after the aborts complete.
+        * Note: if association doesn't exist, association_id will be 0
+        */
+       if (ctrl->association_id)
+               nvme_fc_xmt_disconnect_assoc(ctrl);
+
+       if (ctrl->ctrl.tagset) {
+               nvme_fc_delete_hw_io_queues(ctrl);
+               nvme_fc_free_io_queues(ctrl);
+       }
+
+       __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
+       nvme_fc_free_queue(&ctrl->queues[0]);
 }
 
-/*
- * Called to teardown an association.
- * May be called with association fully in place or partially in place.
- */
 static void
-__nvme_fc_remove_ctrl(struct nvme_fc_ctrl *ctrl)
+nvme_fc_delete_ctrl_work(struct work_struct *work)
 {
-       nvme_stop_keep_alive(&ctrl->ctrl);
+       struct nvme_fc_ctrl *ctrl =
+               container_of(work, struct nvme_fc_ctrl, delete_work);
 
-       /* stop and terminate ios on admin and io queues */
-       nvme_fc_shutdown_ctrl(ctrl);
+       cancel_work_sync(&ctrl->reset_work);
+       cancel_delayed_work_sync(&ctrl->connect_work);
+
+       /*
+        * kill the association on the link side.  this will block
+        * waiting for io to terminate
+        */
+       nvme_fc_delete_association(ctrl);
 
        /*
         * tear down the controller
         * This will result in the last reference on the nvme ctrl to
-        * expire, calling the transport nvme_fc_free_nvme_ctrl() callback.
+        * expire, calling the transport nvme_fc_nvme_ctrl_freed() callback.
         * From there, the transport will tear down it's logical queues and
         * association.
         */
@@ -2153,15 +2544,6 @@ __nvme_fc_remove_ctrl(struct nvme_fc_ctrl *ctrl)
        nvme_put_ctrl(&ctrl->ctrl);
 }
 
-static void
-nvme_fc_del_ctrl_work(struct work_struct *work)
-{
-       struct nvme_fc_ctrl *ctrl =
-                       container_of(work, struct nvme_fc_ctrl, delete_work);
-
-       __nvme_fc_remove_ctrl(ctrl);
-}
-
 static int
 __nvme_fc_del_ctrl(struct nvme_fc_ctrl *ctrl)
 {
@@ -2181,25 +2563,85 @@ static int
 nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl)
 {
        struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
-       struct nvme_fc_rport *rport = ctrl->rport;
-       unsigned long flags;
        int ret;
 
-       spin_lock_irqsave(&rport->lock, flags);
+       if (!kref_get_unless_zero(&ctrl->ctrl.kref))
+               return -EBUSY;
+
        ret = __nvme_fc_del_ctrl(ctrl);
-       spin_unlock_irqrestore(&rport->lock, flags);
-       if (ret)
-               return ret;
 
-       flush_work(&ctrl->delete_work);
+       if (!ret)
+               flush_workqueue(nvme_fc_wq);
 
-       return 0;
+       nvme_put_ctrl(&ctrl->ctrl);
+
+       return ret;
 }
 
+static void
+nvme_fc_reset_ctrl_work(struct work_struct *work)
+{
+       struct nvme_fc_ctrl *ctrl =
+                       container_of(work, struct nvme_fc_ctrl, reset_work);
+       int ret;
+
+       /* will block while waiting for io to terminate */
+       nvme_fc_delete_association(ctrl);
+
+       ret = nvme_fc_create_association(ctrl);
+       if (ret) {
+               dev_warn(ctrl->ctrl.device,
+                       "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n",
+                       ctrl->cnum, ret);
+               if (ctrl->connect_attempts >= NVME_FC_MAX_CONNECT_ATTEMPTS) {
+                       dev_warn(ctrl->ctrl.device,
+                               "NVME-FC{%d}: Max reconnect attempts (%d) "
+                               "reached. Removing controller\n",
+                               ctrl->cnum, ctrl->connect_attempts);
+
+                       if (!nvme_change_ctrl_state(&ctrl->ctrl,
+                               NVME_CTRL_DELETING)) {
+                               dev_err(ctrl->ctrl.device,
+                                       "NVME-FC{%d}: failed to change state "
+                                       "to DELETING\n", ctrl->cnum);
+                               return;
+                       }
+
+                       WARN_ON(!queue_work(nvme_fc_wq, &ctrl->delete_work));
+                       return;
+               }
+
+               dev_warn(ctrl->ctrl.device,
+                       "NVME-FC{%d}: Reconnect attempt in %d seconds.\n",
+                       ctrl->cnum, ctrl->reconnect_delay);
+               queue_delayed_work(nvme_fc_wq, &ctrl->connect_work,
+                               ctrl->reconnect_delay * HZ);
+       } else
+               dev_info(ctrl->ctrl.device,
+                       "NVME-FC{%d}: controller reset complete\n", ctrl->cnum);
+}
+
+/*
+ * called by the nvme core layer, for sysfs interface that requests
+ * a reset of the nvme controller
+ */
 static int
 nvme_fc_reset_nvme_ctrl(struct nvme_ctrl *nctrl)
 {
-       return -EIO;
+       struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
+
+       dev_warn(ctrl->ctrl.device,
+               "NVME-FC{%d}: admin requested controller reset\n", ctrl->cnum);
+
+       if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
+               return -EBUSY;
+
+       if (!queue_work(nvme_fc_wq, &ctrl->reset_work))
+               return -EBUSY;
+
+       flush_work(&ctrl->reset_work);
+
+       return 0;
 }
 
 static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
@@ -2210,95 +2652,75 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
        .reg_read64             = nvmf_reg_read64,
        .reg_write32            = nvmf_reg_write32,
        .reset_ctrl             = nvme_fc_reset_nvme_ctrl,
-       .free_ctrl              = nvme_fc_free_nvme_ctrl,
+       .free_ctrl              = nvme_fc_nvme_ctrl_freed,
        .submit_async_event     = nvme_fc_submit_async_event,
        .delete_ctrl            = nvme_fc_del_nvme_ctrl,
        .get_subsysnqn          = nvmf_get_subsysnqn,
        .get_address            = nvmf_get_address,
 };
 
-static int
-nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
+static void
+nvme_fc_connect_ctrl_work(struct work_struct *work)
 {
-       struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
        int ret;
 
-       ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues);
-       if (ret) {
-               dev_info(ctrl->ctrl.device,
-                       "set_queue_count failed: %d\n", ret);
-               return ret;
-       }
-
-       ctrl->queue_count = opts->nr_io_queues + 1;
-       if (!opts->nr_io_queues)
-               return 0;
-
-       dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n",
-                       opts->nr_io_queues);
-
-       nvme_fc_init_io_queues(ctrl);
-
-       memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
-       ctrl->tag_set.ops = &nvme_fc_mq_ops;
-       ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
-       ctrl->tag_set.reserved_tags = 1; /* fabric connect */
-       ctrl->tag_set.numa_node = NUMA_NO_NODE;
-       ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
-       ctrl->tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
-                                       (SG_CHUNK_SIZE *
-                                               sizeof(struct scatterlist)) +
-                                       ctrl->lport->ops->fcprqst_priv_sz;
-       ctrl->tag_set.driver_data = ctrl;
-       ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1;
-       ctrl->tag_set.timeout = NVME_IO_TIMEOUT;
-
-       ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
-       if (ret)
-               return ret;
-
-       ctrl->ctrl.tagset = &ctrl->tag_set;
-
-       ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
-       if (IS_ERR(ctrl->ctrl.connect_q)) {
-               ret = PTR_ERR(ctrl->ctrl.connect_q);
-               goto out_free_tag_set;
-       }
-
-       ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.opts->queue_size);
-       if (ret)
-               goto out_cleanup_blk_queue;
+       struct nvme_fc_ctrl *ctrl =
+                       container_of(to_delayed_work(work),
+                               struct nvme_fc_ctrl, connect_work);
 
-       ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.opts->queue_size);
-       if (ret)
-               goto out_delete_hw_queues;
+       ret = nvme_fc_create_association(ctrl);
+       if (ret) {
+               dev_warn(ctrl->ctrl.device,
+                       "NVME-FC{%d}: Reconnect attempt failed (%d)\n",
+                       ctrl->cnum, ret);
+               if (ctrl->connect_attempts >= NVME_FC_MAX_CONNECT_ATTEMPTS) {
+                       dev_warn(ctrl->ctrl.device,
+                               "NVME-FC{%d}: Max reconnect attempts (%d) "
+                               "reached. Removing controller\n",
+                               ctrl->cnum, ctrl->connect_attempts);
+
+                       if (!nvme_change_ctrl_state(&ctrl->ctrl,
+                               NVME_CTRL_DELETING)) {
+                               dev_err(ctrl->ctrl.device,
+                                       "NVME-FC{%d}: failed to change state "
+                                       "to DELETING\n", ctrl->cnum);
+                               return;
+                       }
 
-       return 0;
+                       WARN_ON(!queue_work(nvme_fc_wq, &ctrl->delete_work));
+                       return;
+               }
 
-out_delete_hw_queues:
-       nvme_fc_delete_hw_io_queues(ctrl);
-out_cleanup_blk_queue:
-       nvme_stop_keep_alive(&ctrl->ctrl);
-       blk_cleanup_queue(ctrl->ctrl.connect_q);
-out_free_tag_set:
-       blk_mq_free_tag_set(&ctrl->tag_set);
-       nvme_fc_free_io_queues(ctrl);
+               dev_warn(ctrl->ctrl.device,
+                       "NVME-FC{%d}: Reconnect attempt in %d seconds.\n",
+                       ctrl->cnum, ctrl->reconnect_delay);
+               queue_delayed_work(nvme_fc_wq, &ctrl->connect_work,
+                               ctrl->reconnect_delay * HZ);
+       } else
+               dev_info(ctrl->ctrl.device,
+                       "NVME-FC{%d}: controller reconnect complete\n",
+                       ctrl->cnum);
+}
 
-       /* force put free routine to ignore io queues */
-       ctrl->ctrl.tagset = NULL;
 
-       return ret;
-}
+static const struct blk_mq_ops nvme_fc_admin_mq_ops = {
+       .queue_rq       = nvme_fc_queue_rq,
+       .complete       = nvme_fc_complete_rq,
+       .init_request   = nvme_fc_init_admin_request,
+       .exit_request   = nvme_fc_exit_request,
+       .reinit_request = nvme_fc_reinit_request,
+       .init_hctx      = nvme_fc_init_admin_hctx,
+       .timeout        = nvme_fc_timeout,
+};
 
 
 static struct nvme_ctrl *
-__nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
+nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
        struct nvme_fc_lport *lport, struct nvme_fc_rport *rport)
 {
        struct nvme_fc_ctrl *ctrl;
        unsigned long flags;
        int ret, idx;
-       bool changed;
 
        ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
        if (!ctrl) {
@@ -2314,21 +2736,18 @@ __nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 
        ctrl->ctrl.opts = opts;
        INIT_LIST_HEAD(&ctrl->ctrl_list);
-       INIT_LIST_HEAD(&ctrl->ls_req_list);
        ctrl->lport = lport;
        ctrl->rport = rport;
        ctrl->dev = lport->dev;
-       ctrl->state = FCCTRL_INIT;
        ctrl->cnum = idx;
 
-       ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_fc_ctrl_ops, 0);
-       if (ret)
-               goto out_free_ida;
-
        get_device(ctrl->dev);
        kref_init(&ctrl->ref);
 
-       INIT_WORK(&ctrl->delete_work, nvme_fc_del_ctrl_work);
+       INIT_WORK(&ctrl->delete_work, nvme_fc_delete_ctrl_work);
+       INIT_WORK(&ctrl->reset_work, nvme_fc_reset_ctrl_work);
+       INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work);
+       ctrl->reconnect_delay = opts->reconnect_delay;
        spin_lock_init(&ctrl->lock);
 
        /* io queue count */
@@ -2345,87 +2764,87 @@ __nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
        ctrl->queues = kcalloc(ctrl->queue_count, sizeof(struct nvme_fc_queue),
                                GFP_KERNEL);
        if (!ctrl->queues)
-               goto out_uninit_ctrl;
-
-       ret = nvme_fc_configure_admin_queue(ctrl);
-       if (ret)
-               goto out_uninit_ctrl;
-
-       /* sanity checks */
-
-       /* FC-NVME does not have other data in the capsule */
-       if (ctrl->ctrl.icdoff) {
-               dev_err(ctrl->ctrl.device, "icdoff %d is not supported!\n",
-                               ctrl->ctrl.icdoff);
-               goto out_remove_admin_queue;
-       }
-
-       /* FC-NVME supports normal SGL Data Block Descriptors */
+               goto out_free_ida;
 
-       if (opts->queue_size > ctrl->ctrl.maxcmd) {
-               /* warn if maxcmd is lower than queue_size */
-               dev_warn(ctrl->ctrl.device,
-                       "queue_size %zu > ctrl maxcmd %u, reducing "
-                       "to queue_size\n",
-                       opts->queue_size, ctrl->ctrl.maxcmd);
-               opts->queue_size = ctrl->ctrl.maxcmd;
-       }
+       memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
+       ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
+       ctrl->admin_tag_set.queue_depth = NVME_FC_AQ_BLKMQ_DEPTH;
+       ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */
+       ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
+       ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
+                                       (SG_CHUNK_SIZE *
+                                               sizeof(struct scatterlist)) +
+                                       ctrl->lport->ops->fcprqst_priv_sz;
+       ctrl->admin_tag_set.driver_data = ctrl;
+       ctrl->admin_tag_set.nr_hw_queues = 1;
+       ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
 
-       ret = nvme_fc_init_aen_ops(ctrl);
+       ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);
        if (ret)
-               goto out_exit_aen_ops;
+               goto out_free_queues;
 
-       if (ctrl->queue_count > 1) {
-               ret = nvme_fc_create_io_queues(ctrl);
-               if (ret)
-                       goto out_exit_aen_ops;
+       ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
+       if (IS_ERR(ctrl->ctrl.admin_q)) {
+               ret = PTR_ERR(ctrl->ctrl.admin_q);
+               goto out_free_admin_tag_set;
        }
 
-       spin_lock_irqsave(&ctrl->lock, flags);
-       ctrl->state = FCCTRL_ACTIVE;
-       spin_unlock_irqrestore(&ctrl->lock, flags);
-
-       changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
-       WARN_ON_ONCE(!changed);
+       /*
+        * Would have been nice to init io queues tag set as well.
+        * However, we require interaction from the controller
+        * for max io queue count before we can do so.
+        * Defer this to the connect path.
+        */
 
-       dev_info(ctrl->ctrl.device,
-               "NVME-FC{%d}: new ctrl: NQN \"%s\"\n",
-               ctrl->cnum, ctrl->ctrl.opts->subsysnqn);
+       ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_fc_ctrl_ops, 0);
+       if (ret)
+               goto out_cleanup_admin_q;
 
-       kref_get(&ctrl->ctrl.kref);
+       /* at this point, teardown path changes to ref counting on nvme ctrl */
 
        spin_lock_irqsave(&rport->lock, flags);
        list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list);
        spin_unlock_irqrestore(&rport->lock, flags);
 
-       if (opts->nr_io_queues) {
-               nvme_queue_scan(&ctrl->ctrl);
-               nvme_queue_async_events(&ctrl->ctrl);
+       ret = nvme_fc_create_association(ctrl);
+       if (ret) {
+               ctrl->ctrl.opts = NULL;
+               /* initiate nvme ctrl ref counting teardown */
+               nvme_uninit_ctrl(&ctrl->ctrl);
+               nvme_put_ctrl(&ctrl->ctrl);
+
+               /* as we're past the point where we transition to the ref
+                * counting teardown path, if we return a bad pointer here,
+                * the calling routine, thinking it's prior to the
+                * transition, will do an rport put. Since the teardown
+                * path also does a rport put, we do an extra get here
+                * so that proper order/teardown happens.
+                */
+               nvme_fc_rport_get(rport);
+
+               if (ret > 0)
+                       ret = -EIO;
+               return ERR_PTR(ret);
        }
 
-       return &ctrl->ctrl;
+       dev_info(ctrl->ctrl.device,
+               "NVME-FC{%d}: new ctrl: NQN \"%s\"\n",
+               ctrl->cnum, ctrl->ctrl.opts->subsysnqn);
 
-out_exit_aen_ops:
-       nvme_fc_exit_aen_ops(ctrl);
-out_remove_admin_queue:
-       /* send a Disconnect(association) LS to fc-nvme target */
-       nvme_fc_xmt_disconnect_assoc(ctrl);
-       nvme_stop_keep_alive(&ctrl->ctrl);
-       nvme_fc_destroy_admin_queue(ctrl);
-out_uninit_ctrl:
-       nvme_uninit_ctrl(&ctrl->ctrl);
-       nvme_put_ctrl(&ctrl->ctrl);
-       if (ret > 0)
-               ret = -EIO;
-       /* exit via here will follow ctlr ref point callbacks to free */
-       return ERR_PTR(ret);
+       return &ctrl->ctrl;
 
+out_cleanup_admin_q:
+       blk_cleanup_queue(ctrl->ctrl.admin_q);
+out_free_admin_tag_set:
+       blk_mq_free_tag_set(&ctrl->admin_tag_set);
+out_free_queues:
+       kfree(ctrl->queues);
 out_free_ida:
+       put_device(ctrl->dev);
        ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum);
 out_free_ctrl:
        kfree(ctrl);
 out_fail:
-       nvme_fc_rport_put(rport);
        /* exit via here doesn't follow ctlr ref points */
        return ERR_PTR(ret);
 }
@@ -2497,6 +2916,7 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
 {
        struct nvme_fc_lport *lport;
        struct nvme_fc_rport *rport;
+       struct nvme_ctrl *ctrl;
        struct nvmet_fc_traddr laddr = { 0L, 0L };
        struct nvmet_fc_traddr raddr = { 0L, 0L };
        unsigned long flags;
@@ -2528,7 +2948,10 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
 
                        spin_unlock_irqrestore(&nvme_fc_lock, flags);
 
-                       return __nvme_fc_create_ctrl(dev, opts, lport, rport);
+                       ctrl = nvme_fc_init_ctrl(dev, opts, lport, rport);
+                       if (IS_ERR(ctrl))
+                               nvme_fc_rport_put(rport);
+                       return ctrl;
                }
        }
        spin_unlock_irqrestore(&nvme_fc_lock, flags);
@@ -2546,11 +2969,20 @@ static struct nvmf_transport_ops nvme_fc_transport = {
 
 static int __init nvme_fc_init_module(void)
 {
+       int ret;
+
        nvme_fc_wq = create_workqueue("nvme_fc_wq");
        if (!nvme_fc_wq)
                return -ENOMEM;
 
-       return nvmf_register_transport(&nvme_fc_transport);
+       ret = nvmf_register_transport(&nvme_fc_transport);
+       if (ret)
+               goto err;
+
+       return 0;
+err:
+       destroy_workqueue(nvme_fc_wq);
+       return ret;
 }
 
 static void __exit nvme_fc_exit_module(void)
index 21cac85..e4e4e60 100644 (file)
@@ -241,9 +241,9 @@ static inline void _nvme_nvm_check_size(void)
        BUILD_BUG_ON(sizeof(struct nvme_nvm_l2ptbl) != 64);
        BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64);
        BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960);
-       BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 128);
+       BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16);
        BUILD_BUG_ON(sizeof(struct nvme_nvm_id) != 4096);
-       BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 512);
+       BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 64);
 }
 
 static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id)
@@ -324,7 +324,7 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id)
        nvm_id->cap = le32_to_cpu(nvme_nvm_id->cap);
        nvm_id->dom = le32_to_cpu(nvme_nvm_id->dom);
        memcpy(&nvm_id->ppaf, &nvme_nvm_id->ppaf,
-                                       sizeof(struct nvme_nvm_addr_format));
+                                       sizeof(struct nvm_addr_format));
 
        ret = init_grps(nvm_id, nvme_nvm_id);
 out:
@@ -483,8 +483,8 @@ static void nvme_nvm_end_io(struct request *rq, int error)
 {
        struct nvm_rq *rqd = rq->end_io_data;
 
-       rqd->ppa_status = nvme_req(rq)->result.u64;
-       rqd->error = error;
+       rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64);
+       rqd->error = nvme_req(rq)->status;
        nvm_end_io(rqd);
 
        kfree(nvme_req(rq)->cmd);
@@ -510,12 +510,12 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
        }
        rq->cmd_flags &= ~REQ_FAILFAST_DRIVER;
 
-       rq->ioprio = bio_prio(bio);
-       if (bio_has_data(bio))
-               rq->nr_phys_segments = bio_phys_segments(q, bio);
-
-       rq->__data_len = bio->bi_iter.bi_size;
-       rq->bio = rq->biotail = bio;
+       if (bio) {
+               blk_init_request_from_bio(rq, bio);
+       } else {
+               rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
+               rq->__data_len = 0;
+       }
 
        nvme_nvm_rqtocmd(rq, rqd, ns, cmd);
 
@@ -526,21 +526,6 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
        return 0;
 }
 
-static int nvme_nvm_erase_block(struct nvm_dev *dev, struct nvm_rq *rqd)
-{
-       struct request_queue *q = dev->q;
-       struct nvme_ns *ns = q->queuedata;
-       struct nvme_nvm_command c = {};
-
-       c.erase.opcode = NVM_OP_ERASE;
-       c.erase.nsid = cpu_to_le32(ns->ns_id);
-       c.erase.spba = cpu_to_le64(rqd->ppa_addr.ppa);
-       c.erase.length = cpu_to_le16(rqd->nr_ppas - 1);
-       c.erase.control = cpu_to_le16(rqd->flags);
-
-       return nvme_submit_sync_cmd(q, (struct nvme_command *)&c, NULL, 0);
-}
-
 static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name)
 {
        struct nvme_ns *ns = nvmdev->q->queuedata;
@@ -576,7 +561,6 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = {
        .set_bb_tbl             = nvme_nvm_set_bb_tbl,
 
        .submit_io              = nvme_nvm_submit_io,
-       .erase_block            = nvme_nvm_erase_block,
 
        .create_dma_pool        = nvme_nvm_create_dma_pool,
        .destroy_dma_pool       = nvme_nvm_destroy_dma_pool,
@@ -611,7 +595,7 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q,
        __le64 *metadata = NULL;
        dma_addr_t metadata_dma;
        DECLARE_COMPLETION_ONSTACK(wait);
-       int ret;
+       int ret = 0;
 
        rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0,
                        NVME_QID_ANY);
@@ -681,9 +665,12 @@ submit:
 
        wait_for_completion_io(&wait);
 
-       ret = nvme_error_status(rq->errors);
+       if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
+               ret = -EINTR;
+       else if (nvme_req(rq)->status & 0x7ff)
+               ret = -EIO;
        if (result)
-               *result = rq->errors & 0x7ff;
+               *result = nvme_req(rq)->status & 0x7ff;
        if (status)
                *status = le64_to_cpu(nvme_req(rq)->result.u64);
 
@@ -766,7 +753,7 @@ static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin,
        c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3);
        /* cdw11-12 */
        c.ph_rw.length = cpu_to_le16(vcmd.nppas);
-       c.ph_rw.control  = cpu_to_le32(vcmd.control);
+       c.ph_rw.control  = cpu_to_le16(vcmd.control);
        c.common.cdw10[3] = cpu_to_le32(vcmd.cdw13);
        c.common.cdw10[4] = cpu_to_le32(vcmd.cdw14);
        c.common.cdw10[5] = cpu_to_le32(vcmd.cdw15);
@@ -809,6 +796,8 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
        struct request_queue *q = ns->queue;
        struct nvm_dev *dev;
 
+       _nvme_nvm_check_size();
+
        dev = nvm_alloc_dev(node);
        if (!dev)
                return -ENOMEM;
index 2aa20e3..29c708c 100644 (file)
 #include <linux/lightnvm.h>
 #include <linux/sed-opal.h>
 
-enum {
-       /*
-        * Driver internal status code for commands that were cancelled due
-        * to timeouts or controller shutdown.  The value is negative so
-        * that it a) doesn't overlap with the unsigned hardware error codes,
-        * and b) can easily be tested for.
-        */
-       NVME_SC_CANCELLED               = -EINTR,
-};
-
 extern unsigned char nvme_io_timeout;
 #define NVME_IO_TIMEOUT        (nvme_io_timeout * HZ)
 
@@ -43,8 +33,6 @@ extern unsigned char shutdown_timeout;
 #define NVME_DEFAULT_KATO      5
 #define NVME_KATO_GRACE                10
 
-extern unsigned int nvme_max_retries;
-
 enum {
        NVME_NS_LBA             = 0,
        NVME_NS_LIGHTNVM        = 1,
@@ -68,10 +56,10 @@ enum nvme_quirks {
        NVME_QUIRK_IDENTIFY_CNS                 = (1 << 1),
 
        /*
-        * The controller deterministically returns O's on reads to discarded
-        * logical blocks.
+        * The controller deterministically returns O's on reads to
+        * logical blocks that deallocate was called on.
         */
-       NVME_QUIRK_DISCARD_ZEROES               = (1 << 2),
+       NVME_QUIRK_DEALLOCATE_ZEROES            = (1 << 2),
 
        /*
         * The controller needs a delay before starts checking the device
@@ -83,6 +71,11 @@ enum nvme_quirks {
         * APST should not be used.
         */
        NVME_QUIRK_NO_APST                      = (1 << 4),
+
+       /*
+        * The deepest sleep state should not be used.
+        */
+       NVME_QUIRK_NO_DEEPEST_PS                = (1 << 5),
 };
 
 /*
@@ -92,6 +85,13 @@ enum nvme_quirks {
 struct nvme_request {
        struct nvme_command     *cmd;
        union nvme_result       result;
+       u8                      retries;
+       u8                      flags;
+       u16                     status;
+};
+
+enum {
+       NVME_REQ_CANCELLED              = (1 << 0),
 };
 
 static inline struct nvme_request *nvme_req(struct request *req)
@@ -249,25 +249,17 @@ static inline void nvme_cleanup_cmd(struct request *req)
        }
 }
 
-static inline int nvme_error_status(u16 status)
+static inline void nvme_end_request(struct request *req, __le16 status,
+               union nvme_result result)
 {
-       switch (status & 0x7ff) {
-       case NVME_SC_SUCCESS:
-               return 0;
-       case NVME_SC_CAP_EXCEEDED:
-               return -ENOSPC;
-       default:
-               return -EIO;
-       }
-}
+       struct nvme_request *rq = nvme_req(req);
 
-static inline bool nvme_req_needs_retry(struct request *req, u16 status)
-{
-       return !(status & NVME_SC_DNR || blk_noretry_request(req)) &&
-               (jiffies - req->start_time) < req->timeout &&
-               req->retries < nvme_max_retries;
+       rq->status = le16_to_cpu(status) >> 1;
+       rq->result = result;
+       blk_mq_complete_request(req);
 }
 
+void nvme_complete_rq(struct request *req);
 void nvme_cancel_request(struct request *req, void *data, bool reserved);
 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
                enum nvme_ctrl_state new_state);
@@ -302,7 +294,6 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl);
 #define NVME_QID_ANY -1
 struct request *nvme_alloc_request(struct request_queue *q,
                struct nvme_command *cmd, unsigned int flags, int qid);
-void nvme_requeue_req(struct request *req);
 int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
                struct nvme_command *cmd);
 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
index 26a5fd0..c8541c3 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/blk-mq-pci.h>
 #include <linux/cpu.h>
 #include <linux/delay.h>
+#include <linux/dmi.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/genhd.h>
@@ -103,8 +104,22 @@ struct nvme_dev {
        u32 cmbloc;
        struct nvme_ctrl ctrl;
        struct completion ioq_wait;
+       u32 *dbbuf_dbs;
+       dma_addr_t dbbuf_dbs_dma_addr;
+       u32 *dbbuf_eis;
+       dma_addr_t dbbuf_eis_dma_addr;
 };
 
+static inline unsigned int sq_idx(unsigned int qid, u32 stride)
+{
+       return qid * 2 * stride;
+}
+
+static inline unsigned int cq_idx(unsigned int qid, u32 stride)
+{
+       return (qid * 2 + 1) * stride;
+}
+
 static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
 {
        return container_of(ctrl, struct nvme_dev, ctrl);
@@ -133,6 +148,10 @@ struct nvme_queue {
        u16 qid;
        u8 cq_phase;
        u8 cqe_seen;
+       u32 *dbbuf_sq_db;
+       u32 *dbbuf_cq_db;
+       u32 *dbbuf_sq_ei;
+       u32 *dbbuf_cq_ei;
 };
 
 /*
@@ -171,6 +190,112 @@ static inline void _nvme_check_size(void)
        BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
        BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
        BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
+       BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
+}
+
+static inline unsigned int nvme_dbbuf_size(u32 stride)
+{
+       return ((num_possible_cpus() + 1) * 8 * stride);
+}
+
+static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
+{
+       unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+
+       if (dev->dbbuf_dbs)
+               return 0;
+
+       dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
+                                           &dev->dbbuf_dbs_dma_addr,
+                                           GFP_KERNEL);
+       if (!dev->dbbuf_dbs)
+               return -ENOMEM;
+       dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
+                                           &dev->dbbuf_eis_dma_addr,
+                                           GFP_KERNEL);
+       if (!dev->dbbuf_eis) {
+               dma_free_coherent(dev->dev, mem_size,
+                                 dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
+               dev->dbbuf_dbs = NULL;
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
+{
+       unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+
+       if (dev->dbbuf_dbs) {
+               dma_free_coherent(dev->dev, mem_size,
+                                 dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
+               dev->dbbuf_dbs = NULL;
+       }
+       if (dev->dbbuf_eis) {
+               dma_free_coherent(dev->dev, mem_size,
+                                 dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
+               dev->dbbuf_eis = NULL;
+       }
+}
+
+static void nvme_dbbuf_init(struct nvme_dev *dev,
+                           struct nvme_queue *nvmeq, int qid)
+{
+       if (!dev->dbbuf_dbs || !qid)
+               return;
+
+       nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
+       nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
+       nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
+       nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
+}
+
+static void nvme_dbbuf_set(struct nvme_dev *dev)
+{
+       struct nvme_command c;
+
+       if (!dev->dbbuf_dbs)
+               return;
+
+       memset(&c, 0, sizeof(c));
+       c.dbbuf.opcode = nvme_admin_dbbuf;
+       c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
+       c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);
+
+       if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
+               dev_warn(dev->dev, "unable to set dbbuf\n");
+               /* Free memory and continue on */
+               nvme_dbbuf_dma_free(dev);
+       }
+}
+
+static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
+{
+       return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
+}
+
+/* Update dbbuf and return true if an MMIO is required */
+static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
+                                             volatile u32 *dbbuf_ei)
+{
+       if (dbbuf_db) {
+               u16 old_value;
+
+               /*
+                * Ensure that the queue is written before updating
+                * the doorbell in memory
+                */
+               wmb();
+
+               old_value = *dbbuf_db;
+               *dbbuf_db = value;
+
+               if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value))
+                       return false;
+       }
+
+       return true;
 }
 
 /*
@@ -297,7 +422,9 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
 
        if (++tail == nvmeq->q_depth)
                tail = 0;
-       writel(tail, nvmeq->q_db);
+       if (nvme_dbbuf_update_and_check_event(tail, nvmeq->dbbuf_sq_db,
+                                             nvmeq->dbbuf_sq_ei))
+               writel(tail, nvmeq->q_db);
        nvmeq->sq_tail = tail;
 }
 
@@ -326,10 +453,6 @@ static int nvme_init_iod(struct request *rq, struct nvme_dev *dev)
        iod->nents = 0;
        iod->length = size;
 
-       if (!(rq->rq_flags & RQF_DONTPREP)) {
-               rq->retries = 0;
-               rq->rq_flags |= RQF_DONTPREP;
-       }
        return BLK_MQ_RQ_QUEUE_OK;
 }
 
@@ -628,34 +751,12 @@ out_free_cmd:
        return ret;
 }
 
-static void nvme_complete_rq(struct request *req)
+static void nvme_pci_complete_rq(struct request *req)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-       struct nvme_dev *dev = iod->nvmeq->dev;
-       int error = 0;
-
-       nvme_unmap_data(dev, req);
 
-       if (unlikely(req->errors)) {
-               if (nvme_req_needs_retry(req, req->errors)) {
-                       req->retries++;
-                       nvme_requeue_req(req);
-                       return;
-               }
-
-               if (blk_rq_is_passthrough(req))
-                       error = req->errors;
-               else
-                       error = nvme_error_status(req->errors);
-       }
-
-       if (unlikely(iod->aborted)) {
-               dev_warn(dev->ctrl.device,
-                       "completing aborted command with status: %04x\n",
-                       req->errors);
-       }
-
-       blk_mq_end_request(req, error);
+       nvme_unmap_data(iod->nvmeq->dev, req);
+       nvme_complete_rq(req);
 }
 
 /* We read the CQE phase first to check if the rest of the entry is valid */
@@ -705,15 +806,16 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
                }
 
                req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id);
-               nvme_req(req)->result = cqe.result;
-               blk_mq_complete_request(req, le16_to_cpu(cqe.status) >> 1);
+               nvme_end_request(req, cqe.status, cqe.result);
        }
 
        if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
                return;
 
        if (likely(nvmeq->cq_vector >= 0))
-               writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
+               if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
+                                                     nvmeq->dbbuf_cq_ei))
+                       writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
        nvmeq->cq_head = head;
        nvmeq->cq_phase = phase;
 
@@ -745,10 +847,8 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
        return IRQ_NONE;
 }
 
-static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
 {
-       struct nvme_queue *nvmeq = hctx->driver_data;
-
        if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
                spin_lock_irq(&nvmeq->q_lock);
                __nvme_process_cq(nvmeq, &tag);
@@ -761,6 +861,13 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
        return 0;
 }
 
+static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+{
+       struct nvme_queue *nvmeq = hctx->driver_data;
+
+       return __nvme_poll(nvmeq, tag);
+}
+
 static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
 {
        struct nvme_dev *dev = to_nvme_dev(ctrl);
@@ -812,7 +919,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
                                                struct nvme_queue *nvmeq)
 {
        struct nvme_command c;
-       int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
+       int flags = NVME_QUEUE_PHYS_CONTIG;
 
        /*
         * Note: we (ab)use the fact the the prp fields survive if no data
@@ -843,9 +950,9 @@ static void abort_endio(struct request *req, int error)
 {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct nvme_queue *nvmeq = iod->nvmeq;
-       u16 status = req->errors;
 
-       dev_warn(nvmeq->dev->ctrl.device, "Abort status: 0x%x", status);
+       dev_warn(nvmeq->dev->ctrl.device,
+                "Abort status: 0x%x", nvme_req(req)->status);
        atomic_inc(&nvmeq->dev->ctrl.abort_limit);
        blk_mq_free_request(req);
 }
@@ -859,6 +966,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
        struct nvme_command cmd;
 
        /*
+        * Did we miss an interrupt?
+        */
+       if (__nvme_poll(nvmeq, req->tag)) {
+               dev_warn(dev->ctrl.device,
+                        "I/O %d QID %d timeout, completion polled\n",
+                        req->tag, nvmeq->qid);
+               return BLK_EH_HANDLED;
+       }
+
+       /*
         * Shutdown immediately if controller times out while starting. The
         * reset work will see the pci device disabled when it gets the forced
         * cancellation error. All outstanding requests are completed on
@@ -869,7 +986,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
                         "I/O %d QID %d timeout, disable controller\n",
                         req->tag, nvmeq->qid);
                nvme_dev_disable(dev, false);
-               req->errors = NVME_SC_CANCELLED;
+               nvme_req(req)->flags |= NVME_REQ_CANCELLED;
                return BLK_EH_HANDLED;
        }
 
@@ -889,7 +1006,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
                 * Mark the request as handled, since the inline shutdown
                 * forces all outstanding requests to complete.
                 */
-               req->errors = NVME_SC_CANCELLED;
+               nvme_req(req)->flags |= NVME_REQ_CANCELLED;
                return BLK_EH_HANDLED;
        }
 
@@ -1097,6 +1214,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
        nvmeq->cq_phase = 1;
        nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
        memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
+       nvme_dbbuf_init(dev, nvmeq, qid);
        dev->online_queues++;
        spin_unlock_irq(&nvmeq->q_lock);
 }
@@ -1129,18 +1247,18 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
        return result;
 }
 
-static struct blk_mq_ops nvme_mq_admin_ops = {
+static const struct blk_mq_ops nvme_mq_admin_ops = {
        .queue_rq       = nvme_queue_rq,
-       .complete       = nvme_complete_rq,
+       .complete       = nvme_pci_complete_rq,
        .init_hctx      = nvme_admin_init_hctx,
        .exit_hctx      = nvme_admin_exit_hctx,
        .init_request   = nvme_admin_init_request,
        .timeout        = nvme_timeout,
 };
 
-static struct blk_mq_ops nvme_mq_ops = {
+static const struct blk_mq_ops nvme_mq_ops = {
        .queue_rq       = nvme_queue_rq,
-       .complete       = nvme_complete_rq,
+       .complete       = nvme_pci_complete_rq,
        .init_hctx      = nvme_init_hctx,
        .init_request   = nvme_init_request,
        .map_queues     = nvme_pci_map_queues,
@@ -1569,6 +1687,8 @@ static int nvme_dev_add(struct nvme_dev *dev)
                if (blk_mq_alloc_tag_set(&dev->tagset))
                        return 0;
                dev->ctrl.tagset = &dev->tagset;
+
+               nvme_dbbuf_set(dev);
        } else {
                blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
 
@@ -1755,6 +1875,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 {
        struct nvme_dev *dev = to_nvme_dev(ctrl);
 
+       nvme_dbbuf_dma_free(dev);
        put_device(dev->dev);
        if (dev->tagset.tags)
                blk_mq_free_tag_set(&dev->tagset);
@@ -1822,6 +1943,13 @@ static void nvme_reset_work(struct work_struct *work)
                dev->ctrl.opal_dev = NULL;
        }
 
+       if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
+               result = nvme_dbbuf_dma_alloc(dev);
+               if (result)
+                       dev_warn(dev->dev,
+                                "unable to allocate dma for dbbuf\n");
+       }
+
        result = nvme_setup_io_queues(dev);
        if (result)
                goto out;
@@ -1943,10 +2071,31 @@ static int nvme_dev_map(struct nvme_dev *dev)
        return -ENODEV;
 }
 
+static unsigned long check_dell_samsung_bug(struct pci_dev *pdev)
+{
+       if (pdev->vendor == 0x144d && pdev->device == 0xa802) {
+               /*
+                * Several Samsung devices seem to drop off the PCIe bus
+                * randomly when APST is on and uses the deepest sleep state.
+                * This has been observed on a Samsung "SM951 NVMe SAMSUNG
+                * 256GB", a "PM951 NVMe SAMSUNG 512GB", and a "Samsung SSD
+                * 950 PRO 256GB", but it seems to be restricted to two Dell
+                * laptops.
+                */
+               if (dmi_match(DMI_SYS_VENDOR, "Dell Inc.") &&
+                   (dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") ||
+                    dmi_match(DMI_PRODUCT_NAME, "Precision 5510")))
+                       return NVME_QUIRK_NO_DEEPEST_PS;
+       }
+
+       return 0;
+}
+
 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
        int node, result = -ENOMEM;
        struct nvme_dev *dev;
+       unsigned long quirks = id->driver_data;
 
        node = dev_to_node(&pdev->dev);
        if (node == NUMA_NO_NODE)
@@ -1978,8 +2127,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        if (result)
                goto put_pci;
 
+       quirks |= check_dell_samsung_bug(pdev);
+
        result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
-                       id->driver_data);
+                       quirks);
        if (result)
                goto release_pools;
 
@@ -2135,13 +2286,13 @@ static const struct pci_error_handlers nvme_err_handler = {
 static const struct pci_device_id nvme_id_table[] = {
        { PCI_VDEVICE(INTEL, 0x0953),
                .driver_data = NVME_QUIRK_STRIPE_SIZE |
-                               NVME_QUIRK_DISCARD_ZEROES, },
+                               NVME_QUIRK_DEALLOCATE_ZEROES, },
        { PCI_VDEVICE(INTEL, 0x0a53),
                .driver_data = NVME_QUIRK_STRIPE_SIZE |
-                               NVME_QUIRK_DISCARD_ZEROES, },
+                               NVME_QUIRK_DEALLOCATE_ZEROES, },
        { PCI_VDEVICE(INTEL, 0x0a54),
                .driver_data = NVME_QUIRK_STRIPE_SIZE |
-                               NVME_QUIRK_DISCARD_ZEROES, },
+                               NVME_QUIRK_DEALLOCATE_ZEROES, },
        { PCI_VDEVICE(INTEL, 0x5845),   /* Qemu emulated controller */
                .driver_data = NVME_QUIRK_IDENTIFY_CNS, },
        { PCI_DEVICE(0x1c58, 0x0003),   /* HGST adapter */
index 47a479f..29cf88a 100644 (file)
@@ -34,7 +34,7 @@
 #include "fabrics.h"
 
 
-#define NVME_RDMA_CONNECT_TIMEOUT_MS   1000            /* 1 second */
+#define NVME_RDMA_CONNECT_TIMEOUT_MS   3000            /* 3 second */
 
 #define NVME_RDMA_MAX_SEGMENT_SIZE     0xffffff        /* 24-bit SGL field */
 
@@ -118,7 +118,6 @@ struct nvme_rdma_ctrl {
 
        struct nvme_rdma_qe     async_event_sqe;
 
-       int                     reconnect_delay;
        struct delayed_work     reconnect_work;
 
        struct list_head        list;
@@ -129,14 +128,8 @@ struct nvme_rdma_ctrl {
        u64                     cap;
        u32                     max_fr_pages;
 
-       union {
-               struct sockaddr addr;
-               struct sockaddr_in addr_in;
-       };
-       union {
-               struct sockaddr src_addr;
-               struct sockaddr_in src_addr_in;
-       };
+       struct sockaddr_storage addr;
+       struct sockaddr_storage src_addr;
 
        struct nvme_ctrl        ctrl;
 };
@@ -569,11 +562,12 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
                return PTR_ERR(queue->cm_id);
        }
 
-       queue->cm_error = -ETIMEDOUT;
        if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
-               src_addr = &ctrl->src_addr;
+               src_addr = (struct sockaddr *)&ctrl->src_addr;
 
-       ret = rdma_resolve_addr(queue->cm_id, src_addr, &ctrl->addr,
+       queue->cm_error = -ETIMEDOUT;
+       ret = rdma_resolve_addr(queue->cm_id, src_addr,
+                       (struct sockaddr *)&ctrl->addr,
                        NVME_RDMA_CONNECT_TIMEOUT_MS);
        if (ret) {
                dev_info(ctrl->ctrl.device,
@@ -712,6 +706,26 @@ free_ctrl:
        kfree(ctrl);
 }
 
+static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
+{
+       /* If we are resetting/deleting then do nothing */
+       if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) {
+               WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW ||
+                       ctrl->ctrl.state == NVME_CTRL_LIVE);
+               return;
+       }
+
+       if (nvmf_should_reconnect(&ctrl->ctrl)) {
+               dev_info(ctrl->ctrl.device, "Reconnecting in %d seconds...\n",
+                       ctrl->ctrl.opts->reconnect_delay);
+               queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
+                               ctrl->ctrl.opts->reconnect_delay * HZ);
+       } else {
+               dev_info(ctrl->ctrl.device, "Removing controller...\n");
+               queue_work(nvme_rdma_wq, &ctrl->delete_work);
+       }
+}
+
 static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 {
        struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
@@ -719,6 +733,8 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
        bool changed;
        int ret;
 
+       ++ctrl->ctrl.opts->nr_reconnects;
+
        if (ctrl->queue_count > 1) {
                nvme_rdma_free_io_queues(ctrl);
 
@@ -763,6 +779,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 
        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
        WARN_ON_ONCE(!changed);
+       ctrl->ctrl.opts->nr_reconnects = 0;
 
        if (ctrl->queue_count > 1) {
                nvme_start_queues(&ctrl->ctrl);
@@ -777,13 +794,9 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 stop_admin_q:
        blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
 requeue:
-       /* Make sure we are not resetting/deleting */
-       if (ctrl->ctrl.state == NVME_CTRL_RECONNECTING) {
-               dev_info(ctrl->ctrl.device,
-                       "Failed reconnect attempt, requeueing...\n");
-               queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
-                                       ctrl->reconnect_delay * HZ);
-       }
+       dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
+                       ctrl->ctrl.opts->nr_reconnects);
+       nvme_rdma_reconnect_or_remove(ctrl);
 }
 
 static void nvme_rdma_error_recovery_work(struct work_struct *work)
@@ -810,11 +823,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
        blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
                                nvme_cancel_request, &ctrl->ctrl);
 
-       dev_info(ctrl->ctrl.device, "reconnecting in %d seconds\n",
-               ctrl->reconnect_delay);
-
-       queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
-                               ctrl->reconnect_delay * HZ);
+       nvme_rdma_reconnect_or_remove(ctrl);
 }
 
 static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
@@ -1169,8 +1178,7 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
            wc->ex.invalidate_rkey == req->mr->rkey)
                req->mr->need_inval = false;
 
-       req->req.result = cqe->result;
-       blk_mq_complete_request(rq, le16_to_cpu(cqe->status) >> 1);
+       nvme_end_request(rq, cqe->status, cqe->result);
        return ret;
 }
 
@@ -1407,7 +1415,7 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
        nvme_rdma_error_recovery(req->queue->ctrl);
 
        /* fail with DNR on cmd timeout */
-       rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
+       nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
 
        return BLK_EH_HANDLED;
 }
@@ -1509,27 +1517,12 @@ static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
 static void nvme_rdma_complete_rq(struct request *rq)
 {
        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
-       struct nvme_rdma_queue *queue = req->queue;
-       int error = 0;
-
-       nvme_rdma_unmap_data(queue, rq);
 
-       if (unlikely(rq->errors)) {
-               if (nvme_req_needs_retry(rq, rq->errors)) {
-                       nvme_requeue_req(rq);
-                       return;
-               }
-
-               if (blk_rq_is_passthrough(rq))
-                       error = rq->errors;
-               else
-                       error = nvme_error_status(rq->errors);
-       }
-
-       blk_mq_end_request(rq, error);
+       nvme_rdma_unmap_data(req->queue, rq);
+       nvme_complete_rq(rq);
 }
 
-static struct blk_mq_ops nvme_rdma_mq_ops = {
+static const struct blk_mq_ops nvme_rdma_mq_ops = {
        .queue_rq       = nvme_rdma_queue_rq,
        .complete       = nvme_rdma_complete_rq,
        .init_request   = nvme_rdma_init_request,
@@ -1540,7 +1533,7 @@ static struct blk_mq_ops nvme_rdma_mq_ops = {
        .timeout        = nvme_rdma_timeout,
 };
 
-static struct blk_mq_ops nvme_rdma_admin_mq_ops = {
+static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
        .queue_rq       = nvme_rdma_queue_rq,
        .complete       = nvme_rdma_complete_rq,
        .init_request   = nvme_rdma_init_admin_request,
@@ -1606,7 +1599,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl)
        }
 
        ctrl->ctrl.sqsize =
-               min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize);
+               min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->ctrl.sqsize);
 
        error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
        if (error)
@@ -1857,27 +1850,13 @@ out_free_io_queues:
        return ret;
 }
 
-static int nvme_rdma_parse_ipaddr(struct sockaddr_in *in_addr, char *p)
-{
-       u8 *addr = (u8 *)&in_addr->sin_addr.s_addr;
-       size_t buflen = strlen(p);
-
-       /* XXX: handle IPv6 addresses */
-
-       if (buflen > INET_ADDRSTRLEN)
-               return -EINVAL;
-       if (in4_pton(p, buflen, addr, '\0', NULL) == 0)
-               return -EINVAL;
-       in_addr->sin_family = AF_INET;
-       return 0;
-}
-
 static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
                struct nvmf_ctrl_options *opts)
 {
        struct nvme_rdma_ctrl *ctrl;
        int ret;
        bool changed;
+       char *port;
 
        ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
        if (!ctrl)
@@ -1885,40 +1864,33 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
        ctrl->ctrl.opts = opts;
        INIT_LIST_HEAD(&ctrl->list);
 
-       ret = nvme_rdma_parse_ipaddr(&ctrl->addr_in, opts->traddr);
+       if (opts->mask & NVMF_OPT_TRSVCID)
+               port = opts->trsvcid;
+       else
+               port = __stringify(NVME_RDMA_IP_PORT);
+
+       ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
+                       opts->traddr, port, &ctrl->addr);
        if (ret) {
-               pr_err("malformed IP address passed: %s\n", opts->traddr);
+               pr_err("malformed address passed: %s:%s\n", opts->traddr, port);
                goto out_free_ctrl;
        }
 
        if (opts->mask & NVMF_OPT_HOST_TRADDR) {
-               ret = nvme_rdma_parse_ipaddr(&ctrl->src_addr_in,
-                               opts->host_traddr);
+               ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
+                       opts->host_traddr, NULL, &ctrl->src_addr);
                if (ret) {
-                       pr_err("malformed src IP address passed: %s\n",
+                       pr_err("malformed src address passed: %s\n",
                               opts->host_traddr);
                        goto out_free_ctrl;
                }
        }
 
-       if (opts->mask & NVMF_OPT_TRSVCID) {
-               u16 port;
-
-               ret = kstrtou16(opts->trsvcid, 0, &port);
-               if (ret)
-                       goto out_free_ctrl;
-
-               ctrl->addr_in.sin_port = cpu_to_be16(port);
-       } else {
-               ctrl->addr_in.sin_port = cpu_to_be16(NVME_RDMA_IP_PORT);
-       }
-
        ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
                                0 /* no quirks, we're perfect! */);
        if (ret)
                goto out_free_ctrl;
 
-       ctrl->reconnect_delay = opts->reconnect_delay;
        INIT_DELAYED_WORK(&ctrl->reconnect_work,
                        nvme_rdma_reconnect_ctrl_work);
        INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
@@ -1977,7 +1949,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
        WARN_ON_ONCE(!changed);
 
-       dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
+       dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
                ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
 
        kref_get(&ctrl->ctrl.kref);
@@ -2013,7 +1985,7 @@ static struct nvmf_transport_ops nvme_rdma_transport = {
        .name           = "rdma",
        .required_opts  = NVMF_OPT_TRADDR,
        .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
-                         NVMF_OPT_HOST_TRADDR,
+                         NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO,
        .create_ctrl    = nvme_rdma_create_ctrl,
 };
 
@@ -2055,12 +2027,20 @@ static int __init nvme_rdma_init_module(void)
                return -ENOMEM;
 
        ret = ib_register_client(&nvme_rdma_ib_client);
-       if (ret) {
-               destroy_workqueue(nvme_rdma_wq);
-               return ret;
-       }
+       if (ret)
+               goto err_destroy_wq;
+
+       ret = nvmf_register_transport(&nvme_rdma_transport);
+       if (ret)
+               goto err_unreg_client;
+
+       return 0;
 
-       return nvmf_register_transport(&nvme_rdma_transport);
+err_unreg_client:
+       ib_unregister_client(&nvme_rdma_ib_client);
+err_destroy_wq:
+       destroy_workqueue(nvme_rdma_wq);
+       return ret;
 }
 
 static void __exit nvme_rdma_cleanup_module(void)
index f49ae27..1f7671e 100644 (file)
@@ -1609,7 +1609,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
        struct nvme_command c;
        u8 opcode = (is_write ? nvme_cmd_write : nvme_cmd_read);
        u16 control;
-       u32 max_blocks = queue_max_hw_sectors(ns->queue);
+       u32 max_blocks = queue_max_hw_sectors(ns->queue) >> (ns->lba_shift - 9);
 
        num_cmds = nvme_trans_io_get_num_cmds(hdr, cdb_info, max_blocks);
 
@@ -2138,15 +2138,6 @@ static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr,
        return res;
 }
 
-static int nvme_trans_security_protocol(struct nvme_ns *ns,
-                                       struct sg_io_hdr *hdr,
-                                       u8 *cmd)
-{
-       return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
-                               ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND,
-                               SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
-}
-
 static int nvme_trans_synchronize_cache(struct nvme_ns *ns,
                                        struct sg_io_hdr *hdr)
 {
@@ -2414,10 +2405,6 @@ static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr)
        case REQUEST_SENSE:
                retcode = nvme_trans_request_sense(ns, hdr, cmd);
                break;
-       case SECURITY_PROTOCOL_IN:
-       case SECURITY_PROTOCOL_OUT:
-               retcode = nvme_trans_security_protocol(ns, hdr, cmd);
-               break;
        case SYNCHRONIZE_CACHE:
                retcode = nvme_trans_synchronize_cache(ns, hdr);
                break;
index a7bcff4..ff1f970 100644 (file)
@@ -100,7 +100,7 @@ static u16 nvmet_get_smart_log(struct nvmet_req *req,
        u16 status;
 
        WARN_ON(req == NULL || slog == NULL);
-       if (req->cmd->get_log_page.nsid == 0xFFFFFFFF)
+       if (req->cmd->get_log_page.nsid == cpu_to_le32(0xFFFFFFFF))
                status = nvmet_get_smart_log_all(req, slog);
        else
                status = nvmet_get_smart_log_nsid(req, slog);
@@ -121,7 +121,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
        }
 
        switch (req->cmd->get_log_page.lid) {
-       case 0x01:
+       case NVME_LOG_ERROR:
                /*
                 * We currently never set the More bit in the status field,
                 * so all error log entries are invalid and can be zeroed out.
@@ -129,7 +129,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
                 * mandatory log page.
                 */
                break;
-       case 0x02:
+       case NVME_LOG_SMART:
                /*
                 * XXX: fill out actual smart log
                 *
@@ -149,7 +149,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
                        goto err;
                }
                break;
-       case 0x03:
+       case NVME_LOG_FW_SLOT:
                /*
                 * We only support a single firmware slot which always is
                 * active, so we can zero out the whole firmware slot log and
@@ -480,31 +480,25 @@ static void nvmet_execute_keep_alive(struct nvmet_req *req)
        nvmet_req_complete(req, 0);
 }
 
-int nvmet_parse_admin_cmd(struct nvmet_req *req)
+u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
 {
        struct nvme_command *cmd = req->cmd;
+       u16 ret;
 
        req->ns = NULL;
 
-       if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
-               pr_err("nvmet: got admin cmd %d while CC.EN == 0\n",
-                               cmd->common.opcode);
-               return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
-       }
-       if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
-               pr_err("nvmet: got admin cmd %d while CSTS.RDY == 0\n",
-                               cmd->common.opcode);
-               return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
-       }
+       ret = nvmet_check_ctrl_status(req, cmd);
+       if (unlikely(ret))
+               return ret;
 
        switch (cmd->common.opcode) {
        case nvme_admin_get_log_page:
                req->data_len = nvmet_get_log_page_len(cmd);
 
                switch (cmd->get_log_page.lid) {
-               case 0x01:
-               case 0x02:
-               case 0x03:
+               case NVME_LOG_ERROR:
+               case NVME_LOG_SMART:
+               case NVME_LOG_FW_SLOT:
                        req->execute = nvmet_execute_get_log_page;
                        return 0;
                }
@@ -545,6 +539,7 @@ int nvmet_parse_admin_cmd(struct nvmet_req *req)
                return 0;
        }
 
-       pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
+       pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
+              req->sq->qid);
        return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
 }
index 798653b..cf90713 100644 (file)
@@ -273,8 +273,8 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
        ns->bdev = blkdev_get_by_path(ns->device_path, FMODE_READ | FMODE_WRITE,
                        NULL);
        if (IS_ERR(ns->bdev)) {
-               pr_err("nvmet: failed to open block device %s: (%ld)\n",
-                       ns->device_path, PTR_ERR(ns->bdev));
+               pr_err("failed to open block device %s: (%ld)\n",
+                      ns->device_path, PTR_ERR(ns->bdev));
                ret = PTR_ERR(ns->bdev);
                ns->bdev = NULL;
                goto out_unlock;
@@ -661,6 +661,23 @@ out:
        return status;
 }
 
+u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd)
+{
+       if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
+               pr_err("got io cmd %d while CC.EN == 0 on qid = %d\n",
+                      cmd->common.opcode, req->sq->qid);
+               return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
+       }
+
+       if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
+               pr_err("got io cmd %d while CSTS.RDY == 0 on qid = %d\n",
+                      cmd->common.opcode, req->sq->qid);
+               req->ns = NULL;
+               return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
+       }
+       return 0;
+}
+
 static bool __nvmet_host_allowed(struct nvmet_subsys *subsys,
                const char *hostnqn)
 {
index af8aabf..1aaf597 100644 (file)
@@ -159,15 +159,15 @@ out:
        nvmet_req_complete(req, status);
 }
 
-int nvmet_parse_discovery_cmd(struct nvmet_req *req)
+u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
 {
        struct nvme_command *cmd = req->cmd;
 
        req->ns = NULL;
 
        if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
-               pr_err("nvmet: got cmd %d while not ready\n",
-                               cmd->common.opcode);
+               pr_err("got cmd %d while not ready\n",
+                      cmd->common.opcode);
                return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
        }
 
@@ -180,8 +180,8 @@ int nvmet_parse_discovery_cmd(struct nvmet_req *req)
                        req->execute = nvmet_execute_get_disc_log_page;
                        return 0;
                default:
-                       pr_err("nvmet: unsupported get_log_page lid %d\n",
-                               cmd->get_log_page.lid);
+                       pr_err("unsupported get_log_page lid %d\n",
+                              cmd->get_log_page.lid);
                return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
                }
        case nvme_admin_identify:
@@ -192,17 +192,16 @@ int nvmet_parse_discovery_cmd(struct nvmet_req *req)
                                nvmet_execute_identify_disc_ctrl;
                        return 0;
                default:
-                       pr_err("nvmet: unsupported identify cns %d\n",
-                               cmd->identify.cns);
+                       pr_err("unsupported identify cns %d\n",
+                              cmd->identify.cns);
                        return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
                }
        default:
-               pr_err("nvmet: unsupported cmd %d\n",
-                               cmd->common.opcode);
+               pr_err("unsupported cmd %d\n", cmd->common.opcode);
                return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
        }
 
-       pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
+       pr_err("unhandled cmd %d\n", cmd->common.opcode);
        return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
 }
 
index 8bd022a..3cc1726 100644 (file)
@@ -73,7 +73,7 @@ static void nvmet_execute_prop_get(struct nvmet_req *req)
        nvmet_req_complete(req, status);
 }
 
-int nvmet_parse_fabrics_cmd(struct nvmet_req *req)
+u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req)
 {
        struct nvme_command *cmd = req->cmd;
 
@@ -122,7 +122,15 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
        struct nvmet_ctrl *ctrl = NULL;
        u16 status = 0;
 
-       d = kmap(sg_page(req->sg)) + req->sg->offset;
+       d = kmalloc(sizeof(*d), GFP_KERNEL);
+       if (!d) {
+               status = NVME_SC_INTERNAL;
+               goto complete;
+       }
+
+       status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d));
+       if (status)
+               goto out;
 
        /* zero out initial completion result, assign values as needed */
        req->rsp->result.u32 = 0;
@@ -143,7 +151,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
        }
 
        status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req,
-                       le32_to_cpu(c->kato), &ctrl);
+                                 le32_to_cpu(c->kato), &ctrl);
        if (status)
                goto out;
 
@@ -158,7 +166,8 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
        req->rsp->result.u16 = cpu_to_le16(ctrl->cntlid);
 
 out:
-       kunmap(sg_page(req->sg));
+       kfree(d);
+complete:
        nvmet_req_complete(req, status);
 }
 
@@ -170,7 +179,15 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
        u16 qid = le16_to_cpu(c->qid);
        u16 status = 0;
 
-       d = kmap(sg_page(req->sg)) + req->sg->offset;
+       d = kmalloc(sizeof(*d), GFP_KERNEL);
+       if (!d) {
+               status = NVME_SC_INTERNAL;
+               goto complete;
+       }
+
+       status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d));
+       if (status)
+               goto out;
 
        /* zero out initial completion result, assign values as needed */
        req->rsp->result.u32 = 0;
@@ -183,8 +200,8 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
        }
 
        status = nvmet_ctrl_find_get(d->subsysnqn, d->hostnqn,
-                       le16_to_cpu(d->cntlid),
-                       req, &ctrl);
+                                    le16_to_cpu(d->cntlid),
+                                    req, &ctrl);
        if (status)
                goto out;
 
@@ -205,7 +222,8 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
        pr_info("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
 
 out:
-       kunmap(sg_page(req->sg));
+       kfree(d);
+complete:
        nvmet_req_complete(req, status);
        return;
 
@@ -214,7 +232,7 @@ out_ctrl_put:
        goto out;
 }
 
-int nvmet_parse_connect_cmd(struct nvmet_req *req)
+u16 nvmet_parse_connect_cmd(struct nvmet_req *req)
 {
        struct nvme_command *cmd = req->cmd;
 
index 8f483ee..62eba29 100644 (file)
@@ -82,10 +82,13 @@ struct nvmet_fc_fcp_iod {
        enum nvmet_fcp_datadir          io_dir;
        bool                            active;
        bool                            abort;
+       bool                            aborted;
+       bool                            writedataactive;
        spinlock_t                      flock;
 
        struct nvmet_req                req;
        struct work_struct              work;
+       struct work_struct              done_work;
 
        struct nvmet_fc_tgtport         *tgtport;
        struct nvmet_fc_tgt_queue       *queue;
@@ -116,7 +119,7 @@ struct nvmet_fc_tgt_queue {
        u16                             qid;
        u16                             sqsize;
        u16                             ersp_ratio;
-       u16                             sqhd;
+       __le16                          sqhd;
        int                             cpu;
        atomic_t                        connected;
        atomic_t                        sqtail;
@@ -213,6 +216,7 @@ static DEFINE_IDA(nvmet_fc_tgtport_cnt);
 
 static void nvmet_fc_handle_ls_rqst_work(struct work_struct *work);
 static void nvmet_fc_handle_fcp_rqst_work(struct work_struct *work);
+static void nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work);
 static void nvmet_fc_tgt_a_put(struct nvmet_fc_tgt_assoc *assoc);
 static int nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc);
 static void nvmet_fc_tgt_q_put(struct nvmet_fc_tgt_queue *queue);
@@ -414,9 +418,13 @@ nvmet_fc_prep_fcp_iodlist(struct nvmet_fc_tgtport *tgtport,
 
        for (i = 0; i < queue->sqsize; fod++, i++) {
                INIT_WORK(&fod->work, nvmet_fc_handle_fcp_rqst_work);
+               INIT_WORK(&fod->done_work, nvmet_fc_fcp_rqst_op_done_work);
                fod->tgtport = tgtport;
                fod->queue = queue;
                fod->active = false;
+               fod->abort = false;
+               fod->aborted = false;
+               fod->fcpreq = NULL;
                list_add_tail(&fod->fcp_list, &queue->fod_list);
                spin_lock_init(&fod->flock);
 
@@ -463,7 +471,6 @@ nvmet_fc_alloc_fcp_iod(struct nvmet_fc_tgt_queue *queue)
        if (fod) {
                list_del(&fod->fcp_list);
                fod->active = true;
-               fod->abort = false;
                /*
                 * no queue reference is taken, as it was taken by the
                 * queue lookup just prior to the allocation. The iod
@@ -479,17 +486,30 @@ static void
 nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue,
                        struct nvmet_fc_fcp_iod *fod)
 {
+       struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
+       struct nvmet_fc_tgtport *tgtport = fod->tgtport;
        unsigned long flags;
 
+       fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma,
+                               sizeof(fod->rspiubuf), DMA_TO_DEVICE);
+
+       fcpreq->nvmet_fc_private = NULL;
+
        spin_lock_irqsave(&queue->qlock, flags);
        list_add_tail(&fod->fcp_list, &fod->queue->fod_list);
        fod->active = false;
+       fod->abort = false;
+       fod->aborted = false;
+       fod->writedataactive = false;
+       fod->fcpreq = NULL;
        spin_unlock_irqrestore(&queue->qlock, flags);
 
        /*
         * release the reference taken at queue lookup and fod allocation
         */
        nvmet_fc_tgt_q_put(queue);
+
+       tgtport->ops->fcp_req_release(&tgtport->fc_target_port, fcpreq);
 }
 
 static int
@@ -616,32 +636,12 @@ nvmet_fc_tgt_q_get(struct nvmet_fc_tgt_queue *queue)
 
 
 static void
-nvmet_fc_abort_op(struct nvmet_fc_tgtport *tgtport,
-                               struct nvmefc_tgt_fcp_req *fcpreq)
-{
-       int ret;
-
-       fcpreq->op = NVMET_FCOP_ABORT;
-       fcpreq->offset = 0;
-       fcpreq->timeout = 0;
-       fcpreq->transfer_length = 0;
-       fcpreq->transferred_length = 0;
-       fcpreq->fcp_error = 0;
-       fcpreq->sg_cnt = 0;
-
-       ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fcpreq);
-       if (ret)
-               /* should never reach here !! */
-               WARN_ON(1);
-}
-
-
-static void
 nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue)
 {
+       struct nvmet_fc_tgtport *tgtport = queue->assoc->tgtport;
        struct nvmet_fc_fcp_iod *fod = queue->fod;
        unsigned long flags;
-       int i;
+       int i, writedataactive;
        bool disconnect;
 
        disconnect = atomic_xchg(&queue->connected, 0);
@@ -652,7 +652,20 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue)
                if (fod->active) {
                        spin_lock(&fod->flock);
                        fod->abort = true;
+                       writedataactive = fod->writedataactive;
                        spin_unlock(&fod->flock);
+                       /*
+                        * only call lldd abort routine if waiting for
+                        * writedata. other outstanding ops should finish
+                        * on their own.
+                        */
+                       if (writedataactive) {
+                               spin_lock(&fod->flock);
+                               fod->aborted = true;
+                               spin_unlock(&fod->flock);
+                               tgtport->ops->fcp_abort(
+                                       &tgtport->fc_target_port, fod->fcpreq);
+                       }
                }
        }
        spin_unlock_irqrestore(&queue->qlock, flags);
@@ -846,7 +859,8 @@ nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo,
        int ret, idx;
 
        if (!template->xmt_ls_rsp || !template->fcp_op ||
-           !template->targetport_delete ||
+           !template->fcp_abort ||
+           !template->fcp_req_release || !template->targetport_delete ||
            !template->max_hw_queues || !template->max_sgl_segments ||
            !template->max_dif_sgl_segments || !template->dma_boundary) {
                ret = -EINVAL;
@@ -1044,7 +1058,7 @@ EXPORT_SYMBOL_GPL(nvmet_fc_unregister_targetport);
 
 
 static void
-nvmet_fc_format_rsp_hdr(void *buf, u8 ls_cmd, u32 desc_len, u8 rqst_ls_cmd)
+nvmet_fc_format_rsp_hdr(void *buf, u8 ls_cmd, __be32 desc_len, u8 rqst_ls_cmd)
 {
        struct fcnvme_ls_acc_hdr *acc = buf;
 
@@ -1189,8 +1203,8 @@ nvmet_fc_ls_create_association(struct nvmet_fc_tgtport *tgtport,
                        validation_errors[ret]);
                iod->lsreq->rsplen = nvmet_fc_format_rjt(acc,
                                NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd,
-                               ELS_RJT_LOGIC,
-                               ELS_EXPL_NONE, 0);
+                               FCNVME_RJT_RC_LOGIC,
+                               FCNVME_RJT_EXP_NONE, 0);
                return;
        }
 
@@ -1281,8 +1295,9 @@ nvmet_fc_ls_create_connection(struct nvmet_fc_tgtport *tgtport,
                iod->lsreq->rsplen = nvmet_fc_format_rjt(acc,
                                NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd,
                                (ret == VERR_NO_ASSOC) ?
-                                               ELS_RJT_PROT : ELS_RJT_LOGIC,
-                               ELS_EXPL_NONE, 0);
+                                       FCNVME_RJT_RC_INV_ASSOC :
+                                       FCNVME_RJT_RC_LOGIC,
+                               FCNVME_RJT_EXP_NONE, 0);
                return;
        }
 
@@ -1369,8 +1384,12 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport,
                        validation_errors[ret]);
                iod->lsreq->rsplen = nvmet_fc_format_rjt(acc,
                                NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd,
-                               (ret == 8) ? ELS_RJT_PROT : ELS_RJT_LOGIC,
-                               ELS_EXPL_NONE, 0);
+                               (ret == VERR_NO_ASSOC) ?
+                                       FCNVME_RJT_RC_INV_ASSOC :
+                                       (ret == VERR_NO_CONN) ?
+                                               FCNVME_RJT_RC_INV_CONN :
+                                               FCNVME_RJT_RC_LOGIC,
+                               FCNVME_RJT_EXP_NONE, 0);
                return;
        }
 
@@ -1479,7 +1498,7 @@ nvmet_fc_handle_ls_rqst(struct nvmet_fc_tgtport *tgtport,
        default:
                iod->lsreq->rsplen = nvmet_fc_format_rjt(iod->rspbuf,
                                NVME_FC_MAX_LS_BUFFER_SIZE, w0->ls_cmd,
-                               ELS_RJT_INVAL, ELS_EXPL_NONE, 0);
+                               FCNVME_RJT_RC_INVAL, FCNVME_RJT_EXP_NONE, 0);
        }
 
        nvmet_fc_xmt_ls_rsp(tgtport, iod);
@@ -1619,6 +1638,8 @@ nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
        for_each_sg(fod->data_sg, sg, fod->data_sg_cnt, count)
                __free_page(sg_page(sg));
        kfree(fod->data_sg);
+       fod->data_sg = NULL;
+       fod->data_sg_cnt = 0;
 }
 
 
@@ -1679,7 +1700,7 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
            xfr_length != fod->total_length ||
            (le16_to_cpu(cqe->status) & 0xFFFE) || cqewd[0] || cqewd[1] ||
            (sqe->flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND)) ||
-           queue_90percent_full(fod->queue, cqe->sq_head))
+           queue_90percent_full(fod->queue, le16_to_cpu(cqe->sq_head)))
                send_ersp = true;
 
        /* re-set the fields */
@@ -1704,6 +1725,26 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
 static void nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq);
 
 static void
+nvmet_fc_abort_op(struct nvmet_fc_tgtport *tgtport,
+                               struct nvmet_fc_fcp_iod *fod)
+{
+       struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
+
+       /* data no longer needed */
+       nvmet_fc_free_tgt_pgs(fod);
+
+       /*
+        * if an ABTS was received or we issued the fcp_abort early
+        * don't call abort routine again.
+        */
+       /* no need to take lock - lock was taken earlier to get here */
+       if (!fod->aborted)
+               tgtport->ops->fcp_abort(&tgtport->fc_target_port, fcpreq);
+
+       nvmet_fc_free_fcp_iod(fod->queue, fod);
+}
+
+static void
 nvmet_fc_xmt_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
                                struct nvmet_fc_fcp_iod *fod)
 {
@@ -1716,7 +1757,7 @@ nvmet_fc_xmt_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
 
        ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fod->fcpreq);
        if (ret)
-               nvmet_fc_abort_op(tgtport, fod->fcpreq);
+               nvmet_fc_abort_op(tgtport, fod);
 }
 
 static void
@@ -1725,6 +1766,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
 {
        struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
        struct scatterlist *sg, *datasg;
+       unsigned long flags;
        u32 tlen, sg_off;
        int ret;
 
@@ -1789,10 +1831,13 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
                 */
                fod->abort = true;
 
-               if (op == NVMET_FCOP_WRITEDATA)
+               if (op == NVMET_FCOP_WRITEDATA) {
+                       spin_lock_irqsave(&fod->flock, flags);
+                       fod->writedataactive = false;
+                       spin_unlock_irqrestore(&fod->flock, flags);
                        nvmet_req_complete(&fod->req,
                                        NVME_SC_FC_TRANSPORT_ERROR);
-               else /* NVMET_FCOP_READDATA or NVMET_FCOP_READDATA_RSP */ {
+               else /* NVMET_FCOP_READDATA or NVMET_FCOP_READDATA_RSP */ {
                        fcpreq->fcp_error = ret;
                        fcpreq->transferred_length = 0;
                        nvmet_fc_xmt_fcp_op_done(fod->fcpreq);
@@ -1800,32 +1845,54 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
        }
 }
 
+static inline bool
+__nvmet_fc_fod_op_abort(struct nvmet_fc_fcp_iod *fod, bool abort)
+{
+       struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
+       struct nvmet_fc_tgtport *tgtport = fod->tgtport;
+
+       /* if in the middle of an io and we need to tear down */
+       if (abort) {
+               if (fcpreq->op == NVMET_FCOP_WRITEDATA) {
+                       nvmet_req_complete(&fod->req,
+                                       NVME_SC_FC_TRANSPORT_ERROR);
+                       return true;
+               }
+
+               nvmet_fc_abort_op(tgtport, fod);
+               return true;
+       }
+
+       return false;
+}
+
+/*
+ * actual done handler for FCP operations when completed by the lldd
+ */
 static void
-nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
+nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
 {
-       struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private;
+       struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
        struct nvmet_fc_tgtport *tgtport = fod->tgtport;
        unsigned long flags;
        bool abort;
 
        spin_lock_irqsave(&fod->flock, flags);
        abort = fod->abort;
+       fod->writedataactive = false;
        spin_unlock_irqrestore(&fod->flock, flags);
 
-       /* if in the middle of an io and we need to tear down */
-       if (abort && fcpreq->op != NVMET_FCOP_ABORT) {
-               /* data no longer needed */
-               nvmet_fc_free_tgt_pgs(fod);
-
-               nvmet_req_complete(&fod->req, fcpreq->fcp_error);
-               return;
-       }
-
        switch (fcpreq->op) {
 
        case NVMET_FCOP_WRITEDATA:
+               if (__nvmet_fc_fod_op_abort(fod, abort))
+                       return;
                if (fcpreq->fcp_error ||
                    fcpreq->transferred_length != fcpreq->transfer_length) {
+                       spin_lock(&fod->flock);
+                       fod->abort = true;
+                       spin_unlock(&fod->flock);
+
                        nvmet_req_complete(&fod->req,
                                        NVME_SC_FC_TRANSPORT_ERROR);
                        return;
@@ -1833,6 +1900,10 @@ nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
 
                fod->offset += fcpreq->transferred_length;
                if (fod->offset != fod->total_length) {
+                       spin_lock_irqsave(&fod->flock, flags);
+                       fod->writedataactive = true;
+                       spin_unlock_irqrestore(&fod->flock, flags);
+
                        /* transfer the next chunk */
                        nvmet_fc_transfer_fcp_data(tgtport, fod,
                                                NVMET_FCOP_WRITEDATA);
@@ -1847,12 +1918,11 @@ nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
 
        case NVMET_FCOP_READDATA:
        case NVMET_FCOP_READDATA_RSP:
+               if (__nvmet_fc_fod_op_abort(fod, abort))
+                       return;
                if (fcpreq->fcp_error ||
                    fcpreq->transferred_length != fcpreq->transfer_length) {
-                       /* data no longer needed */
-                       nvmet_fc_free_tgt_pgs(fod);
-
-                       nvmet_fc_abort_op(tgtport, fod->fcpreq);
+                       nvmet_fc_abort_op(tgtport, fod);
                        return;
                }
 
@@ -1861,8 +1931,6 @@ nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
                if (fcpreq->op == NVMET_FCOP_READDATA_RSP) {
                        /* data no longer needed */
                        nvmet_fc_free_tgt_pgs(fod);
-                       fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma,
-                                       sizeof(fod->rspiubuf), DMA_TO_DEVICE);
                        nvmet_fc_free_fcp_iod(fod->queue, fod);
                        return;
                }
@@ -1885,19 +1953,38 @@ nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
                break;
 
        case NVMET_FCOP_RSP:
-       case NVMET_FCOP_ABORT:
-               fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma,
-                               sizeof(fod->rspiubuf), DMA_TO_DEVICE);
+               if (__nvmet_fc_fod_op_abort(fod, abort))
+                       return;
                nvmet_fc_free_fcp_iod(fod->queue, fod);
                break;
 
        default:
-               nvmet_fc_free_tgt_pgs(fod);
-               nvmet_fc_abort_op(tgtport, fod->fcpreq);
                break;
        }
 }
 
+static void
+nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work)
+{
+       struct nvmet_fc_fcp_iod *fod =
+               container_of(work, struct nvmet_fc_fcp_iod, done_work);
+
+       nvmet_fc_fod_op_done(fod);
+}
+
+static void
+nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
+{
+       struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private;
+       struct nvmet_fc_tgt_queue *queue = fod->queue;
+
+       if (fod->tgtport->ops->target_features & NVMET_FCTGTFEAT_OPDONE_IN_ISR)
+               /* context switch so completion is not in ISR context */
+               queue_work_on(queue->cpu, queue->work_q, &fod->done_work);
+       else
+               nvmet_fc_fod_op_done(fod);
+}
+
 /*
  * actual completion handler after execution by the nvmet layer
  */
@@ -1919,10 +2006,7 @@ __nvmet_fc_fcp_nvme_cmd_done(struct nvmet_fc_tgtport *tgtport,
                fod->queue->sqhd = cqe->sq_head;
 
        if (abort) {
-               /* data no longer needed */
-               nvmet_fc_free_tgt_pgs(fod);
-
-               nvmet_fc_abort_op(tgtport, fod->fcpreq);
+               nvmet_fc_abort_op(tgtport, fod);
                return;
        }
 
@@ -1971,7 +2055,7 @@ nvmet_fc_fcp_nvme_cmd_done(struct nvmet_req *nvme_req)
 /*
  * Actual processing routine for received FC-NVME FCP Requests from the LLD
  */
-void
+static void
 nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
                        struct nvmet_fc_fcp_iod *fod)
 {
@@ -2018,8 +2102,8 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
                                &fod->queue->nvme_cq,
                                &fod->queue->nvme_sq,
                                &nvmet_fc_tgt_fcp_ops);
-       if (!ret) {     /* bad SQE content */
-               nvmet_fc_abort_op(tgtport, fod->fcpreq);
+       if (!ret) {     /* bad SQE content or invalid ctrl state */
+               nvmet_fc_abort_op(tgtport, fod);
                return;
        }
 
@@ -2059,7 +2143,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
        return;
 
 transport_error:
-       nvmet_fc_abort_op(tgtport, fod->fcpreq);
+       nvmet_fc_abort_op(tgtport, fod);
 }
 
 /*
@@ -2089,7 +2173,7 @@ nvmet_fc_handle_fcp_rqst_work(struct work_struct *work)
  * If this routine returns error, the lldd should abort the exchange.
  *
  * @target_port: pointer to the (registered) target port the FCP CMD IU
- *              was receive on.
+ *              was received on.
  * @fcpreq:     pointer to a fcpreq request structure to be used to reference
  *              the exchange corresponding to the FCP Exchange.
  * @cmdiubuf:   pointer to the buffer containing the FCP CMD IU
@@ -2112,7 +2196,6 @@ nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *target_port,
                        (be16_to_cpu(cmdiu->iu_len) != (sizeof(*cmdiu)/4)))
                return -EIO;
 
-
        queue = nvmet_fc_find_target_queue(tgtport,
                                be64_to_cpu(cmdiu->connection_id));
        if (!queue)
@@ -2142,12 +2225,68 @@ nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *target_port,
                        ((queue->qid - 1) % tgtport->ops->max_hw_queues) : 0;
        memcpy(&fod->cmdiubuf, cmdiubuf, cmdiubuf_len);
 
-       queue_work_on(queue->cpu, queue->work_q, &fod->work);
+       if (tgtport->ops->target_features & NVMET_FCTGTFEAT_CMD_IN_ISR)
+               queue_work_on(queue->cpu, queue->work_q, &fod->work);
+       else
+               nvmet_fc_handle_fcp_rqst(tgtport, fod);
 
        return 0;
 }
 EXPORT_SYMBOL_GPL(nvmet_fc_rcv_fcp_req);
 
+/**
+ * nvmet_fc_rcv_fcp_abort - transport entry point called by an LLDD
+ *                       upon the reception of an ABTS for a FCP command
+ *
+ * Notify the transport that an ABTS has been received for a FCP command
+ * that had been given to the transport via nvmet_fc_rcv_fcp_req(). The
+ * LLDD believes the command is still being worked on
+ * (template_ops->fcp_req_release() has not been called).
+ *
+ * The transport will wait for any outstanding work (an op to the LLDD,
+ * which the lldd should complete with error due to the ABTS; or the
+ * completion from the nvmet layer of the nvme command), then will
+ * stop processing and call the fcp_req_release() callback to
+ * return the i/o context to the LLDD.  The LLDD may send the BA_ACC
+ * to the ABTS either after return from this function (assuming any
+ * outstanding op work has been terminated) or upon the callback being
+ * called.
+ *
+ * @target_port: pointer to the (registered) target port the FCP CMD IU
+ *              was received on.
+ * @fcpreq:     pointer to the fcpreq request structure that corresponds
+ *              to the exchange that received the ABTS.
+ */
+void
+nvmet_fc_rcv_fcp_abort(struct nvmet_fc_target_port *target_port,
+                       struct nvmefc_tgt_fcp_req *fcpreq)
+{
+       struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private;
+       struct nvmet_fc_tgt_queue *queue;
+       unsigned long flags;
+
+       if (!fod || fod->fcpreq != fcpreq)
+               /* job appears to have already completed, ignore abort */
+               return;
+
+       queue = fod->queue;
+
+       spin_lock_irqsave(&queue->qlock, flags);
+       if (fod->active) {
+               /*
+                * mark as abort. The abort handler, invoked upon completion
+                * of any work, will detect the aborted status and do the
+                * callback.
+                */
+               spin_lock(&fod->flock);
+               fod->abort = true;
+               fod->aborted = true;
+               spin_unlock(&fod->flock);
+       }
+       spin_unlock_irqrestore(&queue->qlock, flags);
+}
+EXPORT_SYMBOL_GPL(nvmet_fc_rcv_fcp_abort);
+
 enum {
        FCT_TRADDR_ERR          = 0,
        FCT_TRADDR_WWNN         = 1 << 0,
@@ -2177,7 +2316,7 @@ nvmet_fc_parse_traddr(struct nvmet_fc_traddr *traddr, char *buf)
        if (!options)
                return -ENOMEM;
 
-       while ((p = strsep(&o, ",\n")) != NULL) {
+       while ((p = strsep(&o, ":\n")) != NULL) {
                if (!*p)
                        continue;
 
@@ -2238,6 +2377,7 @@ nvmet_fc_add_port(struct nvmet_port *port)
                        if (!tgtport->port) {
                                tgtport->port = port;
                                port->priv = tgtport;
+                               nvmet_fc_tgtport_get(tgtport);
                                ret = 0;
                        } else
                                ret = -EALREADY;
index 4e8e6a2..15551ef 100644 (file)
@@ -246,11 +246,19 @@ struct fcloop_lsreq {
 struct fcloop_fcpreq {
        struct fcloop_tport             *tport;
        struct nvmefc_fcp_req           *fcpreq;
+       spinlock_t                      reqlock;
        u16                             status;
+       bool                            active;
+       bool                            aborted;
        struct work_struct              work;
        struct nvmefc_tgt_fcp_req       tgt_fcp_req;
 };
 
+struct fcloop_ini_fcpreq {
+       struct nvmefc_fcp_req           *fcpreq;
+       struct fcloop_fcpreq            *tfcp_req;
+       struct work_struct              iniwork;
+};
 
 static inline struct fcloop_lsreq *
 tgt_ls_req_to_lsreq(struct nvmefc_tgt_ls_req *tgt_lsreq)
@@ -341,7 +349,21 @@ fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport,
 }
 
 /*
- * FCP IO operation done. call back up initiator "done" flows.
+ * FCP IO operation done by initiator abort.
+ * call back up initiator "done" flows.
+ */
+static void
+fcloop_tgt_fcprqst_ini_done_work(struct work_struct *work)
+{
+       struct fcloop_ini_fcpreq *inireq =
+               container_of(work, struct fcloop_ini_fcpreq, iniwork);
+
+       inireq->fcpreq->done(inireq->fcpreq);
+}
+
+/*
+ * FCP IO operation done by target completion.
+ * call back up initiator "done" flows.
  */
 static void
 fcloop_tgt_fcprqst_done_work(struct work_struct *work)
@@ -349,12 +371,18 @@ fcloop_tgt_fcprqst_done_work(struct work_struct *work)
        struct fcloop_fcpreq *tfcp_req =
                container_of(work, struct fcloop_fcpreq, work);
        struct fcloop_tport *tport = tfcp_req->tport;
-       struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+       struct nvmefc_fcp_req *fcpreq;
 
-       if (tport->remoteport) {
+       spin_lock(&tfcp_req->reqlock);
+       fcpreq = tfcp_req->fcpreq;
+       spin_unlock(&tfcp_req->reqlock);
+
+       if (tport->remoteport && fcpreq) {
                fcpreq->status = tfcp_req->status;
                fcpreq->done(fcpreq);
        }
+
+       kfree(tfcp_req);
 }
 
 
@@ -364,20 +392,25 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport,
                        void *hw_queue_handle,
                        struct nvmefc_fcp_req *fcpreq)
 {
-       struct fcloop_fcpreq *tfcp_req = fcpreq->private;
        struct fcloop_rport *rport = remoteport->private;
+       struct fcloop_ini_fcpreq *inireq = fcpreq->private;
+       struct fcloop_fcpreq *tfcp_req;
        int ret = 0;
 
-       INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work);
+       if (!rport->targetport)
+               return -ECONNREFUSED;
 
-       if (!rport->targetport) {
-               tfcp_req->status = NVME_SC_FC_TRANSPORT_ERROR;
-               schedule_work(&tfcp_req->work);
-               return ret;
-       }
+       tfcp_req = kzalloc(sizeof(*tfcp_req), GFP_KERNEL);
+       if (!tfcp_req)
+               return -ENOMEM;
 
+       inireq->fcpreq = fcpreq;
+       inireq->tfcp_req = tfcp_req;
+       INIT_WORK(&inireq->iniwork, fcloop_tgt_fcprqst_ini_done_work);
        tfcp_req->fcpreq = fcpreq;
        tfcp_req->tport = rport->targetport->private;
+       spin_lock_init(&tfcp_req->reqlock);
+       INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work);
 
        ret = nvmet_fc_rcv_fcp_req(rport->targetport, &tfcp_req->tgt_fcp_req,
                                 fcpreq->cmdaddr, fcpreq->cmdlen);
@@ -444,63 +477,129 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport,
                        struct nvmefc_tgt_fcp_req *tgt_fcpreq)
 {
        struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
-       struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+       struct nvmefc_fcp_req *fcpreq;
        u32 rsplen = 0, xfrlen = 0;
-       int fcp_err = 0;
+       int fcp_err = 0, active, aborted;
        u8 op = tgt_fcpreq->op;
 
+       spin_lock(&tfcp_req->reqlock);
+       fcpreq = tfcp_req->fcpreq;
+       active = tfcp_req->active;
+       aborted = tfcp_req->aborted;
+       tfcp_req->active = true;
+       spin_unlock(&tfcp_req->reqlock);
+
+       if (unlikely(active))
+               /* illegal - call while i/o active */
+               return -EALREADY;
+
+       if (unlikely(aborted)) {
+               /* target transport has aborted i/o prior */
+               spin_lock(&tfcp_req->reqlock);
+               tfcp_req->active = false;
+               spin_unlock(&tfcp_req->reqlock);
+               tgt_fcpreq->transferred_length = 0;
+               tgt_fcpreq->fcp_error = -ECANCELED;
+               tgt_fcpreq->done(tgt_fcpreq);
+               return 0;
+       }
+
+       /*
+        * if fcpreq is NULL, the I/O has been aborted (from
+        * initiator side). For the target side, act as if all is well
+        * but don't actually move data.
+        */
+
        switch (op) {
        case NVMET_FCOP_WRITEDATA:
                xfrlen = tgt_fcpreq->transfer_length;
-               fcloop_fcp_copy_data(op, tgt_fcpreq->sg, fcpreq->first_sgl,
-                                       tgt_fcpreq->offset, xfrlen);
-               fcpreq->transferred_length += xfrlen;
+               if (fcpreq) {
+                       fcloop_fcp_copy_data(op, tgt_fcpreq->sg,
+                                       fcpreq->first_sgl, tgt_fcpreq->offset,
+                                       xfrlen);
+                       fcpreq->transferred_length += xfrlen;
+               }
                break;
 
        case NVMET_FCOP_READDATA:
        case NVMET_FCOP_READDATA_RSP:
                xfrlen = tgt_fcpreq->transfer_length;
-               fcloop_fcp_copy_data(op, tgt_fcpreq->sg, fcpreq->first_sgl,
-                                       tgt_fcpreq->offset, xfrlen);
-               fcpreq->transferred_length += xfrlen;
+               if (fcpreq) {
+                       fcloop_fcp_copy_data(op, tgt_fcpreq->sg,
+                                       fcpreq->first_sgl, tgt_fcpreq->offset,
+                                       xfrlen);
+                       fcpreq->transferred_length += xfrlen;
+               }
                if (op == NVMET_FCOP_READDATA)
                        break;
 
                /* Fall-Thru to RSP handling */
 
        case NVMET_FCOP_RSP:
-               rsplen = ((fcpreq->rsplen < tgt_fcpreq->rsplen) ?
-                               fcpreq->rsplen : tgt_fcpreq->rsplen);
-               memcpy(fcpreq->rspaddr, tgt_fcpreq->rspaddr, rsplen);
-               if (rsplen < tgt_fcpreq->rsplen)
-                       fcp_err = -E2BIG;
-               fcpreq->rcv_rsplen = rsplen;
-               fcpreq->status = 0;
+               if (fcpreq) {
+                       rsplen = ((fcpreq->rsplen < tgt_fcpreq->rsplen) ?
+                                       fcpreq->rsplen : tgt_fcpreq->rsplen);
+                       memcpy(fcpreq->rspaddr, tgt_fcpreq->rspaddr, rsplen);
+                       if (rsplen < tgt_fcpreq->rsplen)
+                               fcp_err = -E2BIG;
+                       fcpreq->rcv_rsplen = rsplen;
+                       fcpreq->status = 0;
+               }
                tfcp_req->status = 0;
                break;
 
-       case NVMET_FCOP_ABORT:
-               tfcp_req->status = NVME_SC_FC_TRANSPORT_ABORTED;
-               break;
-
        default:
                fcp_err = -EINVAL;
                break;
        }
 
+       spin_lock(&tfcp_req->reqlock);
+       tfcp_req->active = false;
+       spin_unlock(&tfcp_req->reqlock);
+
        tgt_fcpreq->transferred_length = xfrlen;
        tgt_fcpreq->fcp_error = fcp_err;
        tgt_fcpreq->done(tgt_fcpreq);
 
-       if ((!fcp_err) && (op == NVMET_FCOP_RSP ||
-                       op == NVMET_FCOP_READDATA_RSP ||
-                       op == NVMET_FCOP_ABORT))
-               schedule_work(&tfcp_req->work);
-
        return 0;
 }
 
 static void
+fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport,
+                       struct nvmefc_tgt_fcp_req *tgt_fcpreq)
+{
+       struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
+       int active;
+
+       /*
+        * mark as aborted, in case there are 2 threads in the transport
+        * (one doing io, the other doing abort). This only kills ops
+        * posted after the abort request.
+        */
+       spin_lock(&tfcp_req->reqlock);
+       active = tfcp_req->active;
+       tfcp_req->aborted = true;
+       spin_unlock(&tfcp_req->reqlock);
+
+       tfcp_req->status = NVME_SC_FC_TRANSPORT_ABORTED;
+
+       /*
+        * nothing more to do. If io wasn't active, the transport should
+        * immediately call the req_release. If it was active, the op
+        * will complete, and the lldd should call req_release.
+        */
+}
+
+static void
+fcloop_fcp_req_release(struct nvmet_fc_target_port *tgtport,
+                       struct nvmefc_tgt_fcp_req *tgt_fcpreq)
+{
+       struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
+
+       schedule_work(&tfcp_req->work);
+}
+
+static void
 fcloop_ls_abort(struct nvme_fc_local_port *localport,
                        struct nvme_fc_remote_port *remoteport,
                                struct nvmefc_ls_req *lsreq)
@@ -513,6 +612,27 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
                        void *hw_queue_handle,
                        struct nvmefc_fcp_req *fcpreq)
 {
+       struct fcloop_rport *rport = remoteport->private;
+       struct fcloop_ini_fcpreq *inireq = fcpreq->private;
+       struct fcloop_fcpreq *tfcp_req = inireq->tfcp_req;
+
+       if (!tfcp_req)
+               /* abort has already been called */
+               return;
+
+       if (rport->targetport)
+               nvmet_fc_rcv_fcp_abort(rport->targetport,
+                                       &tfcp_req->tgt_fcp_req);
+
+       /* break initiator/target relationship for io */
+       spin_lock(&tfcp_req->reqlock);
+       inireq->tfcp_req = NULL;
+       tfcp_req->fcpreq = NULL;
+       spin_unlock(&tfcp_req->reqlock);
+
+       /* post the aborted io completion */
+       fcpreq->status = -ECANCELED;
+       schedule_work(&inireq->iniwork);
 }
 
 static void
@@ -546,7 +666,7 @@ fcloop_targetport_delete(struct nvmet_fc_target_port *targetport)
 #define        FCLOOP_SGL_SEGS                 256
 #define FCLOOP_DMABOUND_4G             0xFFFFFFFF
 
-struct nvme_fc_port_template fctemplate = {
+static struct nvme_fc_port_template fctemplate = {
        .localport_delete       = fcloop_localport_delete,
        .remoteport_delete      = fcloop_remoteport_delete,
        .create_queue           = fcloop_create_queue,
@@ -563,20 +683,23 @@ struct nvme_fc_port_template fctemplate = {
        .local_priv_sz          = sizeof(struct fcloop_lport),
        .remote_priv_sz         = sizeof(struct fcloop_rport),
        .lsrqst_priv_sz         = sizeof(struct fcloop_lsreq),
-       .fcprqst_priv_sz        = sizeof(struct fcloop_fcpreq),
+       .fcprqst_priv_sz        = sizeof(struct fcloop_ini_fcpreq),
 };
 
-struct nvmet_fc_target_template tgttemplate = {
+static struct nvmet_fc_target_template tgttemplate = {
        .targetport_delete      = fcloop_targetport_delete,
        .xmt_ls_rsp             = fcloop_xmt_ls_rsp,
        .fcp_op                 = fcloop_fcp_op,
+       .fcp_abort              = fcloop_tgt_fcp_abort,
+       .fcp_req_release        = fcloop_fcp_req_release,
        .max_hw_queues          = FCLOOP_HW_QUEUES,
        .max_sgl_segments       = FCLOOP_SGL_SEGS,
        .max_dif_sgl_segments   = FCLOOP_SGL_SEGS,
        .dma_boundary           = FCLOOP_DMABOUND_4G,
        /* optional features */
-       .target_features        = NVMET_FCTGTFEAT_READDATA_RSP |
-                                 NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED,
+       .target_features        = NVMET_FCTGTFEAT_CMD_IN_ISR |
+                                 NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED |
+                                 NVMET_FCTGTFEAT_OPDONE_IN_ISR,
        /* sizes of additional private data for data structures */
        .target_priv_sz         = sizeof(struct fcloop_tport),
 };
index 4195115..c77940d 100644 (file)
@@ -180,11 +180,11 @@ static void nvmet_execute_write_zeroes(struct nvmet_req *req)
 
        sector = le64_to_cpu(write_zeroes->slba) <<
                (req->ns->blksize_shift - 9);
-       nr_sector = (((sector_t)le32_to_cpu(write_zeroes->length)) <<
+       nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length)) <<
                (req->ns->blksize_shift - 9)) + 1;
 
        if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector,
-                               GFP_KERNEL, &bio, true))
+                               GFP_KERNEL, &bio, 0))
                status = NVME_SC_INTERNAL | NVME_SC_DNR;
 
        if (bio) {
@@ -196,26 +196,19 @@ static void nvmet_execute_write_zeroes(struct nvmet_req *req)
        }
 }
 
-int nvmet_parse_io_cmd(struct nvmet_req *req)
+u16 nvmet_parse_io_cmd(struct nvmet_req *req)
 {
        struct nvme_command *cmd = req->cmd;
+       u16 ret;
 
-       if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
-               pr_err("nvmet: got io cmd %d while CC.EN == 0\n",
-                               cmd->common.opcode);
+       ret = nvmet_check_ctrl_status(req, cmd);
+       if (unlikely(ret)) {
                req->ns = NULL;
-               return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
-       }
-
-       if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
-               pr_err("nvmet: got io cmd %d while CSTS.RDY == 0\n",
-                               cmd->common.opcode);
-               req->ns = NULL;
-               return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
+               return ret;
        }
 
        req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
-       if (!req->ns)
+       if (unlikely(!req->ns))
                return NVME_SC_INVALID_NS | NVME_SC_DNR;
 
        switch (cmd->common.opcode) {
@@ -230,14 +223,15 @@ int nvmet_parse_io_cmd(struct nvmet_req *req)
                return 0;
        case nvme_cmd_dsm:
                req->execute = nvmet_execute_dsm;
-               req->data_len = le32_to_cpu(cmd->dsm.nr + 1) *
+               req->data_len = (le32_to_cpu(cmd->dsm.nr) + 1) *
                        sizeof(struct nvme_dsm_range);
                return 0;
        case nvme_cmd_write_zeroes:
                req->execute = nvmet_execute_write_zeroes;
                return 0;
        default:
-               pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
+               pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
+                      req->sq->qid);
                return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
        }
 }
index 22f7bc6..304f1c8 100644 (file)
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/scatterlist.h>
-#include <linux/delay.h>
 #include <linux/blk-mq.h>
 #include <linux/nvme.h>
 #include <linux/module.h>
 #include <linux/parser.h>
-#include <linux/t10-pi.h>
 #include "nvmet.h"
 #include "../host/nvme.h"
 #include "../host/fabrics.h"
@@ -93,31 +91,26 @@ static inline int nvme_loop_queue_idx(struct nvme_loop_queue *queue)
 static void nvme_loop_complete_rq(struct request *req)
 {
        struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req);
-       int error = 0;
 
        nvme_cleanup_cmd(req);
        sg_free_table_chained(&iod->sg_table, true);
+       nvme_complete_rq(req);
+}
 
-       if (unlikely(req->errors)) {
-               if (nvme_req_needs_retry(req, req->errors)) {
-                       nvme_requeue_req(req);
-                       return;
-               }
-
-               if (blk_rq_is_passthrough(req))
-                       error = req->errors;
-               else
-                       error = nvme_error_status(req->errors);
-       }
+static struct blk_mq_tags *nvme_loop_tagset(struct nvme_loop_queue *queue)
+{
+       u32 queue_idx = nvme_loop_queue_idx(queue);
 
-       blk_mq_end_request(req, error);
+       if (queue_idx == 0)
+               return queue->ctrl->admin_tag_set.tags[queue_idx];
+       return queue->ctrl->tag_set.tags[queue_idx - 1];
 }
 
 static void nvme_loop_queue_response(struct nvmet_req *req)
 {
-       struct nvme_loop_iod *iod =
-               container_of(req, struct nvme_loop_iod, req);
-       struct nvme_completion *cqe = &iod->rsp;
+       struct nvme_loop_queue *queue =
+               container_of(req->sq, struct nvme_loop_queue, nvme_sq);
+       struct nvme_completion *cqe = req->rsp;
 
        /*
         * AEN requests are special as they don't time out and can
@@ -125,15 +118,22 @@ static void nvme_loop_queue_response(struct nvmet_req *req)
         * aborts.  We don't even bother to allocate a struct request
         * for them but rather special case them here.
         */
-       if (unlikely(nvme_loop_queue_idx(iod->queue) == 0 &&
+       if (unlikely(nvme_loop_queue_idx(queue) == 0 &&
                        cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) {
-               nvme_complete_async_event(&iod->queue->ctrl->ctrl, cqe->status,
+               nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
                                &cqe->result);
        } else {
-               struct request *rq = blk_mq_rq_from_pdu(iod);
+               struct request *rq;
+
+               rq = blk_mq_tag_to_rq(nvme_loop_tagset(queue), cqe->command_id);
+               if (!rq) {
+                       dev_err(queue->ctrl->ctrl.device,
+                               "tag 0x%x on queue %d not found\n",
+                               cqe->command_id, nvme_loop_queue_idx(queue));
+                       return;
+               }
 
-               iod->nvme_req.result = cqe->result;
-               blk_mq_complete_request(rq, le16_to_cpu(cqe->status) >> 1);
+               nvme_end_request(rq, cqe->status, cqe->result);
        }
 }
 
@@ -154,7 +154,7 @@ nvme_loop_timeout(struct request *rq, bool reserved)
        schedule_work(&iod->queue->ctrl->reset_work);
 
        /* fail with DNR on admin cmd timeout */
-       rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
+       nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
 
        return BLK_EH_HANDLED;
 }
@@ -268,7 +268,7 @@ static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
        return 0;
 }
 
-static struct blk_mq_ops nvme_loop_mq_ops = {
+static const struct blk_mq_ops nvme_loop_mq_ops = {
        .queue_rq       = nvme_loop_queue_rq,
        .complete       = nvme_loop_complete_rq,
        .init_request   = nvme_loop_init_request,
@@ -276,7 +276,7 @@ static struct blk_mq_ops nvme_loop_mq_ops = {
        .timeout        = nvme_loop_timeout,
 };
 
-static struct blk_mq_ops nvme_loop_admin_mq_ops = {
+static const struct blk_mq_ops nvme_loop_admin_mq_ops = {
        .queue_rq       = nvme_loop_queue_rq,
        .complete       = nvme_loop_complete_rq,
        .init_request   = nvme_loop_init_admin_request,
@@ -349,6 +349,19 @@ out_destroy_queues:
        return ret;
 }
 
+static int nvme_loop_connect_io_queues(struct nvme_loop_ctrl *ctrl)
+{
+       int i, ret;
+
+       for (i = 1; i < ctrl->queue_count; i++) {
+               ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
 static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
 {
        int error;
@@ -392,7 +405,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
        }
 
        ctrl->ctrl.sqsize =
-               min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize);
+               min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->ctrl.sqsize);
 
        error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
        if (error)
@@ -490,7 +503,7 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work)
        struct nvme_loop_ctrl *ctrl = container_of(work,
                                        struct nvme_loop_ctrl, reset_work);
        bool changed;
-       int i, ret;
+       int ret;
 
        nvme_loop_shutdown_ctrl(ctrl);
 
@@ -502,11 +515,9 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work)
        if (ret)
                goto out_destroy_admin;
 
-       for (i = 1; i < ctrl->queue_count; i++) {
-               ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
-               if (ret)
-                       goto out_destroy_io;
-       }
+       ret = nvme_loop_connect_io_queues(ctrl);
+       if (ret)
+               goto out_destroy_io;
 
        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
        WARN_ON_ONCE(!changed);
@@ -559,7 +570,7 @@ static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = {
 
 static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
 {
-       int ret, i;
+       int ret;
 
        ret = nvme_loop_init_io_queues(ctrl);
        if (ret)
@@ -588,11 +599,9 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
                goto out_free_tagset;
        }
 
-       for (i = 1; i < ctrl->queue_count; i++) {
-               ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
-               if (ret)
-                       goto out_cleanup_connect_q;
-       }
+       ret = nvme_loop_connect_io_queues(ctrl);
+       if (ret)
+               goto out_cleanup_connect_q;
 
        return 0;
 
@@ -736,7 +745,12 @@ static int __init nvme_loop_init_module(void)
        ret = nvmet_register_transport(&nvme_loop_ops);
        if (ret)
                return ret;
-       return nvmf_register_transport(&nvme_loop_transport);
+
+       ret = nvmf_register_transport(&nvme_loop_transport);
+       if (ret)
+               nvmet_unregister_transport(&nvme_loop_ops);
+
+       return ret;
 }
 
 static void __exit nvme_loop_cleanup_module(void)
index f7ff15f..7cb77ba 100644 (file)
@@ -253,11 +253,11 @@ struct nvmet_async_event {
        u8                      log_page;
 };
 
-int nvmet_parse_connect_cmd(struct nvmet_req *req);
-int nvmet_parse_io_cmd(struct nvmet_req *req);
-int nvmet_parse_admin_cmd(struct nvmet_req *req);
-int nvmet_parse_discovery_cmd(struct nvmet_req *req);
-int nvmet_parse_fabrics_cmd(struct nvmet_req *req);
+u16 nvmet_parse_connect_cmd(struct nvmet_req *req);
+u16 nvmet_parse_io_cmd(struct nvmet_req *req);
+u16 nvmet_parse_admin_cmd(struct nvmet_req *req);
+u16 nvmet_parse_discovery_cmd(struct nvmet_req *req);
+u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req);
 
 bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
                struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops);
@@ -278,6 +278,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
                struct nvmet_req *req, struct nvmet_ctrl **ret);
 void nvmet_ctrl_put(struct nvmet_ctrl *ctrl);
+u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd);
 
 struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
                enum nvme_subsys_type type);
index ecc4fe8..99c6901 100644 (file)
@@ -1199,6 +1199,11 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
        }
        queue->port = cm_id->context;
 
+       if (queue->host_qid == 0) {
+               /* Let inflight controller teardown complete */
+               flush_scheduled_work();
+       }
+
        ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
        if (ret)
                goto release_queue;
@@ -1427,12 +1432,16 @@ restart:
 static int nvmet_rdma_add_port(struct nvmet_port *port)
 {
        struct rdma_cm_id *cm_id;
-       struct sockaddr_in addr_in;
-       u16 port_in;
+       struct sockaddr_storage addr = { };
+       __kernel_sa_family_t af;
        int ret;
 
        switch (port->disc_addr.adrfam) {
        case NVMF_ADDR_FAMILY_IP4:
+               af = AF_INET;
+               break;
+       case NVMF_ADDR_FAMILY_IP6:
+               af = AF_INET6;
                break;
        default:
                pr_err("address family %d not supported\n",
@@ -1440,13 +1449,13 @@ static int nvmet_rdma_add_port(struct nvmet_port *port)
                return -EINVAL;
        }
 
-       ret = kstrtou16(port->disc_addr.trsvcid, 0, &port_in);
-       if (ret)
+       ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
+                       port->disc_addr.trsvcid, &addr);
+       if (ret) {
+               pr_err("malformed ip/port passed: %s:%s\n",
+                       port->disc_addr.traddr, port->disc_addr.trsvcid);
                return ret;
-
-       addr_in.sin_family = AF_INET;
-       addr_in.sin_addr.s_addr = in_aton(port->disc_addr.traddr);
-       addr_in.sin_port = htons(port_in);
+       }
 
        cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
                        RDMA_PS_TCP, IB_QPT_RC);
@@ -1455,20 +1464,32 @@ static int nvmet_rdma_add_port(struct nvmet_port *port)
                return PTR_ERR(cm_id);
        }
 
-       ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr_in);
+       /*
+        * Allow both IPv4 and IPv6 sockets to bind a single port
+        * at the same time.
+        */
+       ret = rdma_set_afonly(cm_id, 1);
+       if (ret) {
+               pr_err("rdma_set_afonly failed (%d)\n", ret);
+               goto out_destroy_id;
+       }
+
+       ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
        if (ret) {
-               pr_err("binding CM ID to %pISpc failed (%d)\n", &addr_in, ret);
+               pr_err("binding CM ID to %pISpcs failed (%d)\n",
+                       (struct sockaddr *)&addr, ret);
                goto out_destroy_id;
        }
 
        ret = rdma_listen(cm_id, 128);
        if (ret) {
-               pr_err("listening to %pISpc failed (%d)\n", &addr_in, ret);
+               pr_err("listening to %pISpcs failed (%d)\n",
+                       (struct sockaddr *)&addr, ret);
                goto out_destroy_id;
        }
 
-       pr_info("enabling port %d (%pISpc)\n",
-               le16_to_cpu(port->disc_addr.portid), &addr_in);
+       pr_info("enabling port %d (%pISpcs)\n",
+               le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
        port->priv = cm_id;
        return 0;
 
index dfb8a69..d2d2ba5 100644 (file)
@@ -89,6 +89,7 @@ config PCI_HISI
        depends on PCI_MSI_IRQ_DOMAIN
        select PCIEPORTBUS
        select PCIE_DW_HOST
+       select PCI_HOST_COMMON
        help
          Say Y here if you want PCIe controller support on HiSilicon
          Hip05 and Hip06 SoCs
index fcd3ef8..6d23683 100644 (file)
@@ -234,6 +234,9 @@ static int artpec6_add_pcie_port(struct artpec6_pcie *artpec6_pcie,
        return 0;
 }
 
+static const struct dw_pcie_ops dw_pcie_ops = {
+};
+
 static int artpec6_pcie_probe(struct platform_device *pdev)
 {
        struct device *dev = &pdev->dev;
@@ -252,6 +255,7 @@ static int artpec6_pcie_probe(struct platform_device *pdev)
                return -ENOMEM;
 
        pci->dev = dev;
+       pci->ops = &dw_pcie_ops;
 
        artpec6_pcie->pci = pci;
 
index b6c832b..f20d494 100644 (file)
@@ -86,6 +86,9 @@ static int dw_plat_add_pcie_port(struct pcie_port *pp,
        return 0;
 }
 
+static const struct dw_pcie_ops dw_pcie_ops = {
+};
+
 static int dw_plat_pcie_probe(struct platform_device *pdev)
 {
        struct device *dev = &pdev->dev;
@@ -103,6 +106,7 @@ static int dw_plat_pcie_probe(struct platform_device *pdev)
                return -ENOMEM;
 
        pci->dev = dev;
+       pci->ops = &dw_pcie_ops;
 
        dw_plat_pcie->pci = pci;
 
index fd66a31..cf9d6a9 100644 (file)
@@ -380,9 +380,13 @@ struct pci_ecam_ops hisi_pcie_platform_ops = {
 
 static const struct of_device_id hisi_pcie_almost_ecam_of_match[] = {
        {
-               .compatible = "hisilicon,pcie-almost-ecam",
+               .compatible =  "hisilicon,hip06-pcie-ecam",
                .data       = (void *) &hisi_pcie_platform_ops,
        },
+       {
+               .compatible =  "hisilicon,hip07-pcie-ecam",
+               .data       = (void *) &hisi_pcie_platform_ops,
+       },
        {},
 };
 
index b89c373..6e031b5 100644 (file)
@@ -375,7 +375,6 @@ static void thunder_pem_legacy_fw(struct acpi_pci_root *root,
        index -= node * PEM_MAX_DOM_IN_NODE;
        res_pem->start = PEM_RES_BASE | FIELD_PREP(PEM_NODE_MASK, node) |
                                        FIELD_PREP(PEM_INDX_MASK, index);
-       res_pem->end = res_pem->start + SZ_16M - 1;
        res_pem->flags = IORESOURCE_MEM;
 }
 
@@ -399,8 +398,15 @@ static int thunder_pem_acpi_init(struct pci_config_window *cfg)
         */
        if (ret) {
                thunder_pem_legacy_fw(root, res_pem);
-               /* Reserve PEM-specific resources and PCI configuration space */
+               /*
+                * Reserve only 64K for the PEM-specific resources. The full 16M
+                * range is restored afterwards, as thunder_pem_init() requires it.
+                */
+               res_pem->end = res_pem->start + SZ_64K - 1;
                thunder_pem_reserve_range(dev, root->segment, res_pem);
+               res_pem->end = res_pem->start + SZ_16M - 1;
+
+               /* Reserve PCI configuration space as well. */
                thunder_pem_reserve_range(dev, root->segment, &cfg->res);
        }
 
index d690465..32822b0 100644 (file)
@@ -2010,29 +2010,57 @@ out_err:
        return ERR_PTR(ret);
 }
 
-static int pinctrl_create_and_start(struct pinctrl_dev *pctldev)
+static int pinctrl_claim_hogs(struct pinctrl_dev *pctldev)
 {
        pctldev->p = create_pinctrl(pctldev->dev, pctldev);
-       if (!IS_ERR(pctldev->p)) {
-               kref_get(&pctldev->p->users);
-               pctldev->hog_default =
-                       pinctrl_lookup_state(pctldev->p, PINCTRL_STATE_DEFAULT);
-               if (IS_ERR(pctldev->hog_default)) {
-                       dev_dbg(pctldev->dev,
-                               "failed to lookup the default state\n");
-               } else {
-                       if (pinctrl_select_state(pctldev->p,
-                                               pctldev->hog_default))
-                               dev_err(pctldev->dev,
-                                       "failed to select default state\n");
-               }
+       if (PTR_ERR(pctldev->p) == -ENODEV) {
+               dev_dbg(pctldev->dev, "no hogs found\n");
 
-               pctldev->hog_sleep =
-                       pinctrl_lookup_state(pctldev->p,
-                                                   PINCTRL_STATE_SLEEP);
-               if (IS_ERR(pctldev->hog_sleep))
-                       dev_dbg(pctldev->dev,
-                               "failed to lookup the sleep state\n");
+               return 0;
+       }
+
+       if (IS_ERR(pctldev->p)) {
+               dev_err(pctldev->dev, "error claiming hogs: %li\n",
+                       PTR_ERR(pctldev->p));
+
+               return PTR_ERR(pctldev->p);
+       }
+
+       kref_get(&pctldev->p->users);
+       pctldev->hog_default =
+               pinctrl_lookup_state(pctldev->p, PINCTRL_STATE_DEFAULT);
+       if (IS_ERR(pctldev->hog_default)) {
+               dev_dbg(pctldev->dev,
+                       "failed to lookup the default state\n");
+       } else {
+               if (pinctrl_select_state(pctldev->p,
+                                        pctldev->hog_default))
+                       dev_err(pctldev->dev,
+                               "failed to select default state\n");
+       }
+
+       pctldev->hog_sleep =
+               pinctrl_lookup_state(pctldev->p,
+                                    PINCTRL_STATE_SLEEP);
+       if (IS_ERR(pctldev->hog_sleep))
+               dev_dbg(pctldev->dev,
+                       "failed to lookup the sleep state\n");
+
+       return 0;
+}
+
+int pinctrl_enable(struct pinctrl_dev *pctldev)
+{
+       int error;
+
+       error = pinctrl_claim_hogs(pctldev);
+       if (error) {
+               dev_err(pctldev->dev, "could not claim hogs: %i\n",
+                       error);
+               mutex_destroy(&pctldev->mutex);
+               kfree(pctldev);
+
+               return error;
        }
 
        mutex_lock(&pinctrldev_list_mutex);
@@ -2043,6 +2071,7 @@ static int pinctrl_create_and_start(struct pinctrl_dev *pctldev)
 
        return 0;
 }
+EXPORT_SYMBOL_GPL(pinctrl_enable);
 
 /**
  * pinctrl_register() - register a pin controller device
@@ -2065,25 +2094,30 @@ struct pinctrl_dev *pinctrl_register(struct pinctrl_desc *pctldesc,
        if (IS_ERR(pctldev))
                return pctldev;
 
-       error = pinctrl_create_and_start(pctldev);
-       if (error) {
-               mutex_destroy(&pctldev->mutex);
-               kfree(pctldev);
-
+       error = pinctrl_enable(pctldev);
+       if (error)
                return ERR_PTR(error);
-       }
 
        return pctldev;
 
 }
 EXPORT_SYMBOL_GPL(pinctrl_register);
 
+/**
+ * pinctrl_register_and_init() - register and init pin controller device
+ * @pctldesc: descriptor for this pin controller
+ * @dev: parent device for this pin controller
+ * @driver_data: private pin controller data for this pin controller
+ * @pctldev: pin controller device
+ *
+ * Note that pinctrl_enable() still needs to be manually called after
+ * this once the driver is ready.
+ */
 int pinctrl_register_and_init(struct pinctrl_desc *pctldesc,
                              struct device *dev, void *driver_data,
                              struct pinctrl_dev **pctldev)
 {
        struct pinctrl_dev *p;
-       int error;
 
        p = pinctrl_init_controller(pctldesc, dev, driver_data);
        if (IS_ERR(p))
@@ -2097,15 +2131,6 @@ int pinctrl_register_and_init(struct pinctrl_desc *pctldesc,
         */
        *pctldev = p;
 
-       error = pinctrl_create_and_start(p);
-       if (error) {
-               mutex_destroy(&p->mutex);
-               kfree(p);
-               *pctldev = NULL;
-
-               return error;
-       }
-
        return 0;
 }
 EXPORT_SYMBOL_GPL(pinctrl_register_and_init);
index a7ace9e..74bd90d 100644 (file)
@@ -790,7 +790,7 @@ int imx_pinctrl_probe(struct platform_device *pdev,
 
        dev_info(&pdev->dev, "initialized IMX pinctrl driver\n");
 
-       return 0;
+       return pinctrl_enable(ipctl->pctl);
 
 free:
        imx_free_resources(ipctl);
index f80134e..9ff7901 100644 (file)
@@ -13,6 +13,7 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/dmi.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
@@ -1524,10 +1525,31 @@ static void chv_gpio_irq_handler(struct irq_desc *desc)
        chained_irq_exit(chip, desc);
 }
 
+/*
+ * Certain machines seem to hardcode Linux IRQ numbers in their ACPI
+ * tables. Since we leave GPIOs that are not capable of generating
+ * interrupts out of the irqdomain the numbering will be different and
+ * cause devices using the hardcoded IRQ numbers to fail. In order not to
+ * break such machines we will only mask pins from irqdomain if the machine
+ * is not listed below.
+ */
+static const struct dmi_system_id chv_no_valid_mask[] = {
+       {
+               /* See https://bugzilla.kernel.org/show_bug.cgi?id=194945 */
+               .ident = "Acer Chromebook (CYAN)",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "GOOGLE"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Edgar"),
+                       DMI_MATCH(DMI_BIOS_DATE, "05/21/2016"),
+               },
+       }
+};
+
 static int chv_gpio_probe(struct chv_pinctrl *pctrl, int irq)
 {
        const struct chv_gpio_pinrange *range;
        struct gpio_chip *chip = &pctrl->chip;
+       bool need_valid_mask = !dmi_check_system(chv_no_valid_mask);
        int ret, i, offset;
 
        *chip = chv_gpio_chip;
@@ -1536,7 +1558,7 @@ static int chv_gpio_probe(struct chv_pinctrl *pctrl, int irq)
        chip->label = dev_name(pctrl->dev);
        chip->parent = pctrl->dev;
        chip->base = -1;
-       chip->irq_need_valid_mask = true;
+       chip->irq_need_valid_mask = need_valid_mask;
 
        ret = devm_gpiochip_add_data(pctrl->dev, chip, pctrl);
        if (ret) {
@@ -1567,7 +1589,7 @@ static int chv_gpio_probe(struct chv_pinctrl *pctrl, int irq)
                intsel &= CHV_PADCTRL0_INTSEL_MASK;
                intsel >>= CHV_PADCTRL0_INTSEL_SHIFT;
 
-               if (intsel >= pctrl->community->nirqs)
+               if (need_valid_mask && intsel >= pctrl->community->nirqs)
                        clear_bit(i, chip->irq_valid_mask);
        }
 
index 8b2d45e..9c267dc 100644 (file)
@@ -1781,7 +1781,7 @@ static int pcs_probe(struct platform_device *pdev)
        dev_info(pcs->dev, "%i pins at pa %p size %u\n",
                 pcs->desc.npins, pcs->base, pcs->size);
 
-       return 0;
+       return pinctrl_enable(pcs->pctl);
 
 free:
        pcs_free_resources(pcs);
index f9b4996..63e51b5 100644 (file)
@@ -1468,82 +1468,82 @@ const struct samsung_pin_ctrl exynos5420_pin_ctrl[] __initconst = {
 
 /* pin banks of exynos5433 pin-controller - ALIVE */
 static const struct samsung_pin_bank_data exynos5433_pin_banks0[] __initconst = {
-       EXYNOS_PIN_BANK_EINTW(8, 0x000, "gpa0", 0x00),
-       EXYNOS_PIN_BANK_EINTW(8, 0x020, "gpa1", 0x04),
-       EXYNOS_PIN_BANK_EINTW(8, 0x040, "gpa2", 0x08),
-       EXYNOS_PIN_BANK_EINTW(8, 0x060, "gpa3", 0x0c),
-       EXYNOS_PIN_BANK_EINTW_EXT(8, 0x020, "gpf1", 0x1004, 1),
-       EXYNOS_PIN_BANK_EINTW_EXT(4, 0x040, "gpf2", 0x1008, 1),
-       EXYNOS_PIN_BANK_EINTW_EXT(4, 0x060, "gpf3", 0x100c, 1),
-       EXYNOS_PIN_BANK_EINTW_EXT(8, 0x080, "gpf4", 0x1010, 1),
-       EXYNOS_PIN_BANK_EINTW_EXT(8, 0x0a0, "gpf5", 0x1014, 1),
+       EXYNOS5433_PIN_BANK_EINTW(8, 0x000, "gpa0", 0x00),
+       EXYNOS5433_PIN_BANK_EINTW(8, 0x020, "gpa1", 0x04),
+       EXYNOS5433_PIN_BANK_EINTW(8, 0x040, "gpa2", 0x08),
+       EXYNOS5433_PIN_BANK_EINTW(8, 0x060, "gpa3", 0x0c),
+       EXYNOS5433_PIN_BANK_EINTW_EXT(8, 0x020, "gpf1", 0x1004, 1),
+       EXYNOS5433_PIN_BANK_EINTW_EXT(4, 0x040, "gpf2", 0x1008, 1),
+       EXYNOS5433_PIN_BANK_EINTW_EXT(4, 0x060, "gpf3", 0x100c, 1),
+       EXYNOS5433_PIN_BANK_EINTW_EXT(8, 0x080, "gpf4", 0x1010, 1),
+       EXYNOS5433_PIN_BANK_EINTW_EXT(8, 0x0a0, "gpf5", 0x1014, 1),
 };
 
 /* pin banks of exynos5433 pin-controller - AUD */
 static const struct samsung_pin_bank_data exynos5433_pin_banks1[] __initconst = {
-       EXYNOS_PIN_BANK_EINTG(7, 0x000, "gpz0", 0x00),
-       EXYNOS_PIN_BANK_EINTG(4, 0x020, "gpz1", 0x04),
+       EXYNOS5433_PIN_BANK_EINTG(7, 0x000, "gpz0", 0x00),
+       EXYNOS5433_PIN_BANK_EINTG(4, 0x020, "gpz1", 0x04),
 };
 
 /* pin banks of exynos5433 pin-controller - CPIF */
 static const struct samsung_pin_bank_data exynos5433_pin_banks2[] __initconst = {
-       EXYNOS_PIN_BANK_EINTG(2, 0x000, "gpv6", 0x00),
+       EXYNOS5433_PIN_BANK_EINTG(2, 0x000, "gpv6", 0x00),
 };
 
 /* pin banks of exynos5433 pin-controller - eSE */
 static const struct samsung_pin_bank_data exynos5433_pin_banks3[] __initconst = {
-       EXYNOS_PIN_BANK_EINTG(3, 0x000, "gpj2", 0x00),
+       EXYNOS5433_PIN_BANK_EINTG(3, 0x000, "gpj2", 0x00),
 };
 
 /* pin banks of exynos5433 pin-controller - FINGER */
 static const struct samsung_pin_bank_data exynos5433_pin_banks4[] __initconst = {
-       EXYNOS_PIN_BANK_EINTG(4, 0x000, "gpd5", 0x00),
+       EXYNOS5433_PIN_BANK_EINTG(4, 0x000, "gpd5", 0x00),
 };
 
 /* pin banks of exynos5433 pin-controller - FSYS */
 static const struct samsung_pin_bank_data exynos5433_pin_banks5[] __initconst = {
-       EXYNOS_PIN_BANK_EINTG(6, 0x000, "gph1", 0x00),
-       EXYNOS_PIN_BANK_EINTG(7, 0x020, "gpr4", 0x04),
-       EXYNOS_PIN_BANK_EINTG(5, 0x040, "gpr0", 0x08),
-       EXYNOS_PIN_BANK_EINTG(8, 0x060, "gpr1", 0x0c),
-       EXYNOS_PIN_BANK_EINTG(2, 0x080, "gpr2", 0x10),
-       EXYNOS_PIN_BANK_EINTG(8, 0x0a0, "gpr3", 0x14),
+       EXYNOS5433_PIN_BANK_EINTG(6, 0x000, "gph1", 0x00),
+       EXYNOS5433_PIN_BANK_EINTG(7, 0x020, "gpr4", 0x04),
+       EXYNOS5433_PIN_BANK_EINTG(5, 0x040, "gpr0", 0x08),
+       EXYNOS5433_PIN_BANK_EINTG(8, 0x060, "gpr1", 0x0c),
+       EXYNOS5433_PIN_BANK_EINTG(2, 0x080, "gpr2", 0x10),
+       EXYNOS5433_PIN_BANK_EINTG(8, 0x0a0, "gpr3", 0x14),
 };
 
 /* pin banks of exynos5433 pin-controller - IMEM */
 static const struct samsung_pin_bank_data exynos5433_pin_banks6[] __initconst = {
-       EXYNOS_PIN_BANK_EINTG(8, 0x000, "gpf0", 0x00),
+       EXYNOS5433_PIN_BANK_EINTG(8, 0x000, "gpf0", 0x00),
 };
 
 /* pin banks of exynos5433 pin-controller - NFC */
 static const struct samsung_pin_bank_data exynos5433_pin_banks7[] __initconst = {
-       EXYNOS_PIN_BANK_EINTG(3, 0x000, "gpj0", 0x00),
+       EXYNOS5433_PIN_BANK_EINTG(3, 0x000, "gpj0", 0x00),
 };
 
 /* pin banks of exynos5433 pin-controller - PERIC */
 static const struct samsung_pin_bank_data exynos5433_pin_banks8[] __initconst = {
-       EXYNOS_PIN_BANK_EINTG(6, 0x000, "gpv7", 0x00),
-       EXYNOS_PIN_BANK_EINTG(5, 0x020, "gpb0", 0x04),
-       EXYNOS_PIN_BANK_EINTG(8, 0x040, "gpc0", 0x08),
-       EXYNOS_PIN_BANK_EINTG(2, 0x060, "gpc1", 0x0c),
-       EXYNOS_PIN_BANK_EINTG(6, 0x080, "gpc2", 0x10),
-       EXYNOS_PIN_BANK_EINTG(8, 0x0a0, "gpc3", 0x14),
-       EXYNOS_PIN_BANK_EINTG(2, 0x0c0, "gpg0", 0x18),
-       EXYNOS_PIN_BANK_EINTG(4, 0x0e0, "gpd0", 0x1c),
-       EXYNOS_PIN_BANK_EINTG(6, 0x100, "gpd1", 0x20),
-       EXYNOS_PIN_BANK_EINTG(8, 0x120, "gpd2", 0x24),
-       EXYNOS_PIN_BANK_EINTG(5, 0x140, "gpd4", 0x28),
-       EXYNOS_PIN_BANK_EINTG(2, 0x160, "gpd8", 0x2c),
-       EXYNOS_PIN_BANK_EINTG(7, 0x180, "gpd6", 0x30),
-       EXYNOS_PIN_BANK_EINTG(3, 0x1a0, "gpd7", 0x34),
-       EXYNOS_PIN_BANK_EINTG(5, 0x1c0, "gpg1", 0x38),
-       EXYNOS_PIN_BANK_EINTG(2, 0x1e0, "gpg2", 0x3c),
-       EXYNOS_PIN_BANK_EINTG(8, 0x200, "gpg3", 0x40),
+       EXYNOS5433_PIN_BANK_EINTG(6, 0x000, "gpv7", 0x00),
+       EXYNOS5433_PIN_BANK_EINTG(5, 0x020, "gpb0", 0x04),
+       EXYNOS5433_PIN_BANK_EINTG(8, 0x040, "gpc0", 0x08),
+       EXYNOS5433_PIN_BANK_EINTG(2, 0x060, "gpc1", 0x0c),
+       EXYNOS5433_PIN_BANK_EINTG(6, 0x080, "gpc2", 0x10),
+       EXYNOS5433_PIN_BANK_EINTG(8, 0x0a0, "gpc3", 0x14),
+       EXYNOS5433_PIN_BANK_EINTG(2, 0x0c0, "gpg0", 0x18),
+       EXYNOS5433_PIN_BANK_EINTG(4, 0x0e0, "gpd0", 0x1c),
+       EXYNOS5433_PIN_BANK_EINTG(6, 0x100, "gpd1", 0x20),
+       EXYNOS5433_PIN_BANK_EINTG(8, 0x120, "gpd2", 0x24),
+       EXYNOS5433_PIN_BANK_EINTG(5, 0x140, "gpd4", 0x28),
+       EXYNOS5433_PIN_BANK_EINTG(2, 0x160, "gpd8", 0x2c),
+       EXYNOS5433_PIN_BANK_EINTG(7, 0x180, "gpd6", 0x30),
+       EXYNOS5433_PIN_BANK_EINTG(3, 0x1a0, "gpd7", 0x34),
+       EXYNOS5433_PIN_BANK_EINTG(5, 0x1c0, "gpg1", 0x38),
+       EXYNOS5433_PIN_BANK_EINTG(2, 0x1e0, "gpg2", 0x3c),
+       EXYNOS5433_PIN_BANK_EINTG(8, 0x200, "gpg3", 0x40),
 };
 
 /* pin banks of exynos5433 pin-controller - TOUCH */
 static const struct samsung_pin_bank_data exynos5433_pin_banks9[] __initconst = {
-       EXYNOS_PIN_BANK_EINTG(3, 0x000, "gpj1", 0x00),
+       EXYNOS5433_PIN_BANK_EINTG(3, 0x000, "gpj1", 0x00),
 };
 
 /*
index a473092..cd046eb 100644 (file)
                .name           = id                    \
        }
 
-#define EXYNOS_PIN_BANK_EINTW_EXT(pins, reg, id, offs, pctl_idx) \
-       {                                               \
-               .type           = &bank_type_alive,     \
-               .pctl_offset    = reg,                  \
-               .nr_pins        = pins,                 \
-               .eint_type      = EINT_TYPE_WKUP,       \
-               .eint_offset    = offs,                 \
-               .name           = id,                   \
-               .pctl_res_idx   = pctl_idx,             \
-       }                                               \
-
 #define EXYNOS5433_PIN_BANK_EINTG(pins, reg, id, offs)         \
        {                                                       \
                .type           = &exynos5433_bank_type_off,    \
index 08150a3..a70157f 100644 (file)
@@ -816,6 +816,13 @@ int sh_pfc_register_pinctrl(struct sh_pfc *pfc)
        pmx->pctl_desc.pins = pmx->pins;
        pmx->pctl_desc.npins = pfc->info->nr_pins;
 
-       return devm_pinctrl_register_and_init(pfc->dev, &pmx->pctl_desc, pmx,
-                                             &pmx->pctl);
+       ret = devm_pinctrl_register_and_init(pfc->dev, &pmx->pctl_desc, pmx,
+                                            &pmx->pctl);
+       if (ret) {
+               dev_err(pfc->dev, "could not register: %i\n", ret);
+
+               return ret;
+       }
+
+       return pinctrl_enable(pmx->pctl);
 }
index 717e340..362c509 100644 (file)
@@ -893,6 +893,8 @@ static int ti_iodelay_probe(struct platform_device *pdev)
 
        platform_set_drvdata(pdev, iod);
 
+       return pinctrl_enable(iod->pctl);
+
 exit_out:
        of_node_put(np);
        return ret;
index 4bc88eb..e1bffc9 100644 (file)
@@ -141,6 +141,14 @@ config DELL_WMI_AIO
          To compile this driver as a module, choose M here: the module will
          be called dell-wmi-aio.
 
+config DELL_WMI_LED
+       tristate "External LED on Dell Business Netbooks"
+       depends on LEDS_CLASS
+       depends on ACPI_WMI
+       help
+         This adds support for the Latitude 2100 and similar
+         notebooks that have an external LED.
+
 config DELL_SMO8800
        tristate "Dell Latitude freefall driver (ACPI SMO88XX)"
        depends on ACPI
index 299d0f9..776b3a7 100644 (file)
@@ -15,6 +15,7 @@ obj-$(CONFIG_DELL_SMBIOS)     += dell-smbios.o
 obj-$(CONFIG_DELL_LAPTOP)      += dell-laptop.o
 obj-$(CONFIG_DELL_WMI)         += dell-wmi.o
 obj-$(CONFIG_DELL_WMI_AIO)     += dell-wmi-aio.o
+obj-$(CONFIG_DELL_WMI_LED)     += dell-wmi-led.o
 obj-$(CONFIG_DELL_SMO8800)     += dell-smo8800.o
 obj-$(CONFIG_DELL_RBTN)                += dell-rbtn.o
 obj-$(CONFIG_ACER_WMI)         += acer-wmi.o
index f57dd28..2e237ba 100644 (file)
@@ -29,6 +29,7 @@
 #include <linux/mm.h>
 #include <linux/i8042.h>
 #include <linux/debugfs.h>
+#include <linux/dell-led.h>
 #include <linux/seq_file.h>
 #include <acpi/video.h>
 #include "dell-rbtn.h"
@@ -42,6 +43,8 @@
 #define KBD_LED_AUTO_50_TOKEN 0x02EB
 #define KBD_LED_AUTO_75_TOKEN 0x02EC
 #define KBD_LED_AUTO_100_TOKEN 0x02F6
+#define GLOBAL_MIC_MUTE_ENABLE 0x0364
+#define GLOBAL_MIC_MUTE_DISABLE 0x0365
 
 struct quirk_entry {
        u8 touchpad_led;
@@ -1978,6 +1981,31 @@ static void kbd_led_exit(void)
        led_classdev_unregister(&kbd_led);
 }
 
+int dell_micmute_led_set(int state)
+{
+       struct calling_interface_buffer *buffer;
+       struct calling_interface_token *token;
+
+       if (state == 0)
+               token = dell_smbios_find_token(GLOBAL_MIC_MUTE_DISABLE);
+       else if (state == 1)
+               token = dell_smbios_find_token(GLOBAL_MIC_MUTE_ENABLE);
+       else
+               return -EINVAL;
+
+       if (!token)
+               return -ENODEV;
+
+       buffer = dell_smbios_get_buffer();
+       buffer->input[0] = token->location;
+       buffer->input[1] = token->value;
+       dell_smbios_send_request(1, 0);
+       dell_smbios_release_buffer();
+
+       return state;
+}
+EXPORT_SYMBOL_GPL(dell_micmute_led_set);
+
 static int __init dell_init(void)
 {
        struct calling_interface_buffer *buffer;
similarity index 56%
rename from drivers/leds/dell-led.c
rename to drivers/platform/x86/dell-wmi-led.c
index b3d6e9c..a0c7e99 100644 (file)
@@ -1,6 +1,4 @@
 /*
- * dell_led.c - Dell LED Driver
- *
  * Copyright (C) 2010 Dell Inc.
  * Louis Davis <louis_davis@dell.com>
  * Jim Dailey <jim_dailey@dell.com>
 #include <linux/leds.h>
 #include <linux/slab.h>
 #include <linux/module.h>
-#include <linux/dmi.h>
-#include <linux/dell-led.h>
-#include "../platform/x86/dell-smbios.h"
 
 MODULE_AUTHOR("Louis Davis/Jim Dailey");
 MODULE_DESCRIPTION("Dell LED Control Driver");
 MODULE_LICENSE("GPL");
 
 #define DELL_LED_BIOS_GUID "F6E4FE6E-909D-47cb-8BAB-C9F6F2F8D396"
-#define DELL_APP_GUID "A80593CE-A997-11DA-B012-B622A1EF5492"
 MODULE_ALIAS("wmi:" DELL_LED_BIOS_GUID);
 
 /* Error Result Codes: */
@@ -43,53 +37,6 @@ MODULE_ALIAS("wmi:" DELL_LED_BIOS_GUID);
 #define CMD_LED_OFF    17
 #define CMD_LED_BLINK  18
 
-#define GLOBAL_MIC_MUTE_ENABLE 0x364
-#define GLOBAL_MIC_MUTE_DISABLE        0x365
-
-static int dell_micmute_led_set(int state)
-{
-       struct calling_interface_buffer *buffer;
-       struct calling_interface_token *token;
-
-       if (!wmi_has_guid(DELL_APP_GUID))
-               return -ENODEV;
-
-       if (state == 0)
-               token = dell_smbios_find_token(GLOBAL_MIC_MUTE_DISABLE);
-       else if (state == 1)
-               token = dell_smbios_find_token(GLOBAL_MIC_MUTE_ENABLE);
-       else
-               return -EINVAL;
-
-       if (!token)
-               return -ENODEV;
-
-       buffer = dell_smbios_get_buffer();
-       buffer->input[0] = token->location;
-       buffer->input[1] = token->value;
-       dell_smbios_send_request(1, 0);
-       dell_smbios_release_buffer();
-
-       return state;
-}
-
-int dell_app_wmi_led_set(int whichled, int on)
-{
-       int state = 0;
-
-       switch (whichled) {
-       case DELL_LED_MICMUTE:
-               state = dell_micmute_led_set(on);
-               break;
-       default:
-               pr_warn("led type %x is not supported\n", whichled);
-               break;
-       }
-
-       return state;
-}
-EXPORT_SYMBOL_GPL(dell_app_wmi_led_set);
-
 struct bios_args {
        unsigned char length;
        unsigned char result_code;
@@ -99,37 +46,29 @@ struct bios_args {
        unsigned char off_time;
 };
 
-static int dell_led_perform_fn(u8 length,
-               u8 result_code,
-               u8 device_id,
-               u8 command,
-               u8 on_time,
-               u8 off_time)
+static int dell_led_perform_fn(u8 length, u8 result_code, u8 device_id,
+                              u8 command, u8 on_time, u8 off_time)
 {
-       struct bios_args *bios_return;
-       u8 return_code;
-       union acpi_object *obj;
        struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER, NULL };
+       struct bios_args *bios_return;
        struct acpi_buffer input;
+       union acpi_object *obj;
        acpi_status status;
+       u8 return_code;
 
-       struct bios_args args;
-       args.length = length;
-       args.result_code = result_code;
-       args.device_id = device_id;
-       args.command = command;
-       args.on_time = on_time;
-       args.off_time = off_time;
+       struct bios_args args = {
+               .length = length,
+               .result_code = result_code,
+               .device_id = device_id,
+               .command = command,
+               .on_time = on_time,
+               .off_time = off_time
+       };
 
        input.length = sizeof(struct bios_args);
        input.pointer = &args;
 
-       status = wmi_evaluate_method(DELL_LED_BIOS_GUID,
-               1,
-               1,
-               &input,
-               &output);
-
+       status = wmi_evaluate_method(DELL_LED_BIOS_GUID, 1, 1, &input, &output);
        if (ACPI_FAILURE(status))
                return status;
 
@@ -137,7 +76,7 @@ static int dell_led_perform_fn(u8 length,
 
        if (!obj)
                return -EINVAL;
-       else if (obj->type != ACPI_TYPE_BUFFER) {
+       if (obj->type != ACPI_TYPE_BUFFER) {
                kfree(obj);
                return -EINVAL;
        }
@@ -170,8 +109,7 @@ static int led_off(void)
                0);                     /* not used */
 }
 
-static int led_blink(unsigned char on_eighths,
-               unsigned char off_eighths)
+static int led_blink(unsigned char on_eighths, unsigned char off_eighths)
 {
        return dell_led_perform_fn(5,   /* Length of command */
                INTERFACE_ERROR,        /* Init to  INTERFACE_ERROR */
@@ -182,7 +120,7 @@ static int led_blink(unsigned char on_eighths,
 }
 
 static void dell_led_set(struct led_classdev *led_cdev,
-               enum led_brightness value)
+                        enum led_brightness value)
 {
        if (value == LED_OFF)
                led_off();
@@ -191,27 +129,22 @@ static void dell_led_set(struct led_classdev *led_cdev,
 }
 
 static int dell_led_blink(struct led_classdev *led_cdev,
-               unsigned long *delay_on,
-               unsigned long *delay_off)
+                         unsigned long *delay_on, unsigned long *delay_off)
 {
        unsigned long on_eighths;
        unsigned long off_eighths;
 
-       /* The Dell LED delay is based on 125ms intervals.
-          Need to round up to next interval. */
+       /*
+        * The Dell LED delay is based on 125ms intervals.
+        * Need to round up to next interval.
+        */
 
-       on_eighths = (*delay_on + 124) / 125;
-       if (0 == on_eighths)
-               on_eighths = 1;
-       if (on_eighths > 255)
-               on_eighths = 255;
+       on_eighths = DIV_ROUND_UP(*delay_on, 125);
+       on_eighths = clamp_t(unsigned long, on_eighths, 1, 255);
        *delay_on = on_eighths * 125;
 
-       off_eighths = (*delay_off + 124) / 125;
-       if (0 == off_eighths)
-               off_eighths = 1;
-       if (off_eighths > 255)
-               off_eighths = 255;
+       off_eighths = DIV_ROUND_UP(*delay_off, 125);
+       off_eighths = clamp_t(unsigned long, off_eighths, 1, 255);
        *delay_off = off_eighths * 125;
 
        led_blink(on_eighths, off_eighths);
@@ -232,29 +165,21 @@ static int __init dell_led_init(void)
 {
        int error = 0;
 
-       if (!wmi_has_guid(DELL_LED_BIOS_GUID) && !wmi_has_guid(DELL_APP_GUID))
+       if (!wmi_has_guid(DELL_LED_BIOS_GUID))
                return -ENODEV;
 
-       if (wmi_has_guid(DELL_LED_BIOS_GUID)) {
-               error = led_off();
-               if (error != 0)
-                       return -ENODEV;
-
-               error = led_classdev_register(NULL, &dell_led);
-       }
+       error = led_off();
+       if (error != 0)
+               return -ENODEV;
 
-       return error;
+       return led_classdev_register(NULL, &dell_led);
 }
 
 static void __exit dell_led_exit(void)
 {
-       int error = 0;
+       led_classdev_unregister(&dell_led);
 
-       if (wmi_has_guid(DELL_LED_BIOS_GUID)) {
-               error = led_off();
-               if (error == 0)
-                       led_classdev_unregister(&dell_led);
-       }
+       led_off();
 }
 
 module_init(dell_led_init);
index 56bce19..8581252 100644 (file)
 #define RK3288_SOC_CON2_FLASH0         BIT(7)
 #define RK3288_SOC_FLASH_SUPPLY_NUM    2
 
+#define RK3328_SOC_CON4                        0x410
+#define RK3328_SOC_CON4_VCCIO2         BIT(7)
+#define RK3328_SOC_VCCIO2_SUPPLY_NUM   1
+
 #define RK3368_SOC_CON15               0x43c
 #define RK3368_SOC_CON15_FLASH0                BIT(14)
 #define RK3368_SOC_FLASH_SUPPLY_NUM    2
@@ -166,6 +170,25 @@ static void rk3288_iodomain_init(struct rockchip_iodomain *iod)
                dev_warn(iod->dev, "couldn't update flash0 ctrl\n");
 }
 
+static void rk3328_iodomain_init(struct rockchip_iodomain *iod)
+{
+       int ret;
+       u32 val;
+
+       /* if no vccio2 supply we should leave things alone */
+       if (!iod->supplies[RK3328_SOC_VCCIO2_SUPPLY_NUM].reg)
+               return;
+
+       /*
+        * set vccio2 iodomain to also use this framework
+        * instead of a special gpio.
+        */
+       val = RK3328_SOC_CON4_VCCIO2 | (RK3328_SOC_CON4_VCCIO2 << 16);
+       ret = regmap_write(iod->grf, RK3328_SOC_CON4, val);
+       if (ret < 0)
+               dev_warn(iod->dev, "couldn't update vccio2 vsel ctrl\n");
+}
+
 static void rk3368_iodomain_init(struct rockchip_iodomain *iod)
 {
        int ret;
@@ -247,6 +270,20 @@ static const struct rockchip_iodomain_soc_data soc_data_rk3288 = {
        .init = rk3288_iodomain_init,
 };
 
+/*
+ * RK3328 description. grf_offset has the same value as RK3328_SOC_CON4, and
+ * "vccio2" sits at index RK3328_SOC_VCCIO2_SUPPLY_NUM, the slot that
+ * rk3328_iodomain_init() inspects.
+ */
+static const struct rockchip_iodomain_soc_data soc_data_rk3328 = {
+       .grf_offset = 0x410,
+       .supply_names = {
+               "vccio1",
+               "vccio2",
+               "vccio3",
+               "vccio4",
+               "vccio5",
+               "vccio6",
+               "pmuio",
+       },
+       .init = rk3328_iodomain_init,
+};
+
 static const struct rockchip_iodomain_soc_data soc_data_rk3368 = {
        .grf_offset = 0x900,
        .supply_names = {
@@ -312,6 +349,10 @@ static const struct of_device_id rockchip_iodomain_match[] = {
                .data = (void *)&soc_data_rk3288
        },
        {
+               .compatible = "rockchip,rk3328-io-voltage-domain",
+               .data = (void *)&soc_data_rk3328
+       },
+       {
                .compatible = "rockchip,rk3368-io-voltage-domain",
                .data = (void *)&soc_data_rk3368
        },
index b8caccc..13f1714 100644 (file)
@@ -67,6 +67,15 @@ config POWER_RESET_BRCMSTB
          Say Y here if you have a Broadcom STB board and you wish
          to have restart support.
 
+config POWER_RESET_GEMINI_POWEROFF
+       bool "Cortina Gemini power-off driver"
+       depends on ARCH_GEMINI || COMPILE_TEST
+       depends on OF && HAS_IOMEM
+       default ARCH_GEMINI
+       help
+         This driver supports turning off the Cortina Gemini SoC.
+         Select this if you're building a kernel with Gemini SoC support.
+
 config POWER_RESET_GPIO
        bool "GPIO power-off driver"
        depends on OF_GPIO
index 11dae3b..58cf5b3 100644 (file)
@@ -5,6 +5,7 @@ obj-$(CONFIG_POWER_RESET_AT91_SAMA5D2_SHDWC) += at91-sama5d2_shdwc.o
 obj-$(CONFIG_POWER_RESET_AXXIA) += axxia-reset.o
 obj-$(CONFIG_POWER_RESET_BRCMKONA) += brcm-kona-reset.o
 obj-$(CONFIG_POWER_RESET_BRCMSTB) += brcmstb-reboot.o
+obj-$(CONFIG_POWER_RESET_GEMINI_POWEROFF) += gemini-poweroff.o
 obj-$(CONFIG_POWER_RESET_GPIO) += gpio-poweroff.o
 obj-$(CONFIG_POWER_RESET_GPIO_RESTART) += gpio-restart.o
 obj-$(CONFIG_POWER_RESET_HISI) += hisi-reboot.o
diff --git a/drivers/power/reset/gemini-poweroff.c b/drivers/power/reset/gemini-poweroff.c
new file mode 100644 (file)
index 0000000..de878fd
--- /dev/null
@@ -0,0 +1,160 @@
+/*
+ * Gemini power management controller
+ * Copyright (C) 2017 Linus Walleij <linus.walleij@linaro.org>
+ *
+ * Inspired by code from the SL3516 board support by Jason Lee
+ * Inspired by code from Janos Laube <janos.dev@gmail.com>
+ */
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/bitops.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/reboot.h>
+
+#define GEMINI_PWC_ID          0x00010500
+#define        GEMINI_PWC_IDREG        0x00
+#define        GEMINI_PWC_CTRLREG      0x04
+#define        GEMINI_PWC_STATREG      0x08
+
+#define GEMINI_CTRL_SHUTDOWN   BIT(0)
+#define GEMINI_CTRL_ENABLE     BIT(1)
+#define GEMINI_CTRL_IRQ_CLR    BIT(2)
+
+#define GEMINI_STAT_CIR                BIT(4)
+#define        GEMINI_STAT_RTC         BIT(5)
+#define        GEMINI_STAT_POWERBUTTON BIT(6)
+
+/**
+ * struct gemini_powercon - state of one Gemini power controller
+ * @dev: device used for log messages
+ * @base: mapped base address of the controller registers
+ */
+struct gemini_powercon {
+        struct device           *dev;
+        void __iomem            *base;
+};
+
+/*
+ * Interrupt handler for the power controller: acknowledges the interrupt,
+ * then requests an orderly poweroff when the status register reports exactly
+ * one of the infrared, RTC or power button sources. Always returns
+ * IRQ_HANDLED.
+ */
+static irqreturn_t gemini_powerbutton_interrupt(int irq, void *data)
+{
+       struct gemini_powercon *gpw = data;
+       void __iomem *ctrl = gpw->base + GEMINI_PWC_CTRLREG;
+       u32 status;
+
+       /* ACK the IRQ */
+       writel(readl(ctrl) | GEMINI_CTRL_IRQ_CLR, ctrl);
+
+       /* Keep bits 4-6 only: the CIR, RTC and power button status bits */
+       status = readl(gpw->base + GEMINI_PWC_STATREG) & 0x70U;
+
+       if (status == GEMINI_STAT_CIR) {
+               dev_info(gpw->dev, "infrared poweroff\n");
+               orderly_poweroff(true);
+       } else if (status == GEMINI_STAT_RTC) {
+               dev_info(gpw->dev, "RTC poweroff\n");
+               orderly_poweroff(true);
+       } else if (status == GEMINI_STAT_POWERBUTTON) {
+               dev_info(gpw->dev, "poweroff button pressed\n");
+               orderly_poweroff(true);
+       } else {
+               /* No bit, or more than one bit, is set */
+               dev_info(gpw->dev, "other power management IRQ\n");
+       }
+
+       return IRQ_HANDLED;
+}
+
+/* pm_power_off() takes no argument, hence this file-scope pointer */
+static struct gemini_powercon *gpw_poweroff;
+
+/*
+ * Power-off hook installed as pm_power_off by probe. Works on the controller
+ * stored in gpw_poweroff and issues two writes to the control register.
+ */
+static void gemini_poweroff(void)
+{
+       struct gemini_powercon *gpw = gpw_poweroff;
+       void __iomem *ctrl = gpw->base + GEMINI_PWC_CTRLREG;
+       u32 val;
+
+       dev_crit(gpw->dev, "Gemini power off\n");
+
+       /* First write: ENABLE set and the pending IRQ cleared */
+       val = readl(ctrl) | GEMINI_CTRL_ENABLE | GEMINI_CTRL_IRQ_CLR;
+       writel(val, ctrl);
+
+       /* Second write: ENABLE dropped, SHUTDOWN raised */
+       val = (val & ~GEMINI_CTRL_ENABLE) | GEMINI_CTRL_SHUTDOWN;
+       writel(val, ctrl);
+}
+
+/*
+ * Probe: map the controller registers, verify the ID register, install the
+ * power management interrupt handler and the pm_power_off hook, then enable
+ * the controller.
+ *
+ * Returns 0 on success or a negative errno: -ENOMEM, the error from mapping
+ * the registers or from looking up/requesting the IRQ, -EINVAL for IRQ
+ * number 0, or -ENODEV when the ID does not match GEMINI_PWC_ID.
+ */
+static int gemini_poweroff_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct resource *res;
+       struct gemini_powercon *gpw;
+       u32 val;
+       int irq;
+       int ret;
+
+       gpw = devm_kzalloc(dev, sizeof(*gpw), GFP_KERNEL);
+       if (!gpw)
+               return -ENOMEM;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       gpw->base = devm_ioremap_resource(dev, res);
+       if (IS_ERR(gpw->base))
+               return PTR_ERR(gpw->base);
+
+       /*
+        * platform_get_irq() reports failure as a negative errno (this
+        * includes -EPROBE_DEFER), so hand that back to the driver core
+        * instead of passing it to devm_request_irq(). Zero is still
+        * rejected as before.
+        */
+       irq = platform_get_irq(pdev, 0);
+       if (irq < 0)
+               return irq;
+       if (!irq)
+               return -EINVAL;
+
+       gpw->dev = dev;
+
+       val = readl(gpw->base + GEMINI_PWC_IDREG);
+       val &= 0xFFFFFF00U;
+       if (val != GEMINI_PWC_ID) {
+               dev_err(dev, "wrong power controller ID: %08x\n",
+                       val);
+               return -ENODEV;
+       }
+
+       /* Clear the power management IRQ */
+       val = readl(gpw->base + GEMINI_PWC_CTRLREG);
+       val |= GEMINI_CTRL_IRQ_CLR;
+       writel(val, gpw->base + GEMINI_PWC_CTRLREG);
+
+       ret = devm_request_irq(dev, irq, gemini_powerbutton_interrupt, 0,
+                              "poweroff", gpw);
+       if (ret)
+               return ret;
+
+       /* Store the controller before installing the hook that reads it */
+       gpw_poweroff = gpw;
+       pm_power_off = gemini_poweroff;
+
+       /*
+        * Enable the power controller. This is crucial on Gemini
+        * systems: if this is not done, pressing the power button
+        * will result in unconditional poweroff without any warning.
+        * This makes the kernel handle the poweroff.
+        */
+       val = readl(gpw->base + GEMINI_PWC_CTRLREG);
+       val |= GEMINI_CTRL_ENABLE;
+       writel(val, gpw->base + GEMINI_PWC_CTRLREG);
+
+       dev_info(dev, "Gemini poweroff driver registered\n");
+
+       return 0;
+}
+
+/* Device tree match table; the empty entry terminates the list */
+static const struct of_device_id gemini_poweroff_of_match[] = {
+       {
+               .compatible = "cortina,gemini-power-controller",
+       },
+       {}
+};
+
+/*
+ * Platform driver definition. Only .probe is set (no .remove callback), and
+ * the driver is registered through builtin_platform_driver().
+ */
+static struct platform_driver gemini_poweroff_driver = {
+       .probe = gemini_poweroff_probe,
+       .driver = {
+               .name = "gemini-poweroff",
+               .of_match_table = gemini_poweroff_of_match,
+       },
+};
+builtin_platform_driver(gemini_poweroff_driver);
index b683383..f9f1cb5 100644 (file)
 
 static struct regmap *map;
 static u32 offset;
+static u32 value;
 static u32 mask;
 
 static void syscon_poweroff(void)
 {
        /* Issue the poweroff */
-       regmap_write(map, offset, mask);
+       regmap_update_bits(map, offset, mask, value);
 
        mdelay(1000);
 
@@ -43,6 +44,7 @@ static void syscon_poweroff(void)
 static int syscon_poweroff_probe(struct platform_device *pdev)
 {
        char symname[KSYM_NAME_LEN];
+       int mask_err, value_err;
 
        map = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, "regmap");
        if (IS_ERR(map)) {
@@ -55,11 +57,22 @@ static int syscon_poweroff_probe(struct platform_device *pdev)
                return -EINVAL;
        }
 
-       if (of_property_read_u32(pdev->dev.of_node, "mask", &mask)) {
-               dev_err(&pdev->dev, "unable to read 'mask'");
+       value_err = of_property_read_u32(pdev->dev.of_node, "value", &value);
+       mask_err = of_property_read_u32(pdev->dev.of_node, "mask", &mask);
+       if (value_err && mask_err) {
+               dev_err(&pdev->dev, "unable to read 'value' and 'mask'");
                return -EINVAL;
        }
 
+       if (value_err) {
+               /* support old binding */
+               value = mask;
+               mask = 0xFFFFFFFF;
+       } else if (mask_err) {
+               /* support value without mask*/
+               mask = 0xFFFFFFFF;
+       }
+
        if (pm_power_off) {
                lookup_symbol_name((ulong)pm_power_off, symname);
                dev_err(&pdev->dev,
index da54ac8..da92275 100644 (file)
@@ -117,6 +117,12 @@ config BATTERY_DS2782
          Say Y here to enable support for the DS2782/DS2786 standalone battery
          gas-gauge.
 
+config BATTERY_LEGO_EV3
+       tristate "LEGO MINDSTORMS EV3 battery"
+       depends on OF && IIO && GPIOLIB
+       help
+         Say Y here to enable support for the LEGO MINDSTORMS EV3 battery.
+
 config BATTERY_PMU
        tristate "Apple PMU battery"
        depends on PPC32 && ADB_PMU
@@ -317,6 +323,14 @@ config BATTERY_RX51
          Say Y here to enable support for battery information on Nokia
          RX-51, also known as N900 tablet.
 
+config CHARGER_CPCAP
+       tristate "CPCAP PMIC Charger Driver"
+       depends on MFD_CPCAP && IIO
+       default MFD_CPCAP
+       help
+         Say Y to enable support for CPCAP PMIC charger driver for Motorola
+         mobile devices such as Droid 4.
+
 config CHARGER_ISP1704
        tristate "ISP1704 USB Charger Detection"
        depends on USB_PHY
@@ -438,6 +452,7 @@ config CHARGER_BQ2415X
 config CHARGER_BQ24190
        tristate "TI BQ24190 battery charger driver"
        depends on I2C
+       depends on EXTCON
        depends on GPIOLIB || COMPILE_TEST
        help
          Say Y to enable support for the TI BQ24190 battery charger.
index 3789a2c..39fc733 100644 (file)
@@ -25,6 +25,7 @@ obj-$(CONFIG_BATTERY_DS2781)  += ds2781_battery.o
 obj-$(CONFIG_BATTERY_DS2782)   += ds2782_battery.o
 obj-$(CONFIG_BATTERY_GAUGE_LTC2941)    += ltc2941-battery-gauge.o
 obj-$(CONFIG_BATTERY_GOLDFISH) += goldfish_battery.o
+obj-$(CONFIG_BATTERY_LEGO_EV3) += lego_ev3_battery.o
 obj-$(CONFIG_BATTERY_PMU)      += pmu_battery.o
 obj-$(CONFIG_BATTERY_OLPC)     += olpc_battery.o
 obj-$(CONFIG_BATTERY_TOSA)     += tosa_battery.o
@@ -51,6 +52,7 @@ obj-$(CONFIG_CHARGER_PCF50633)        += pcf50633-charger.o
 obj-$(CONFIG_BATTERY_JZ4740)   += jz4740-battery.o
 obj-$(CONFIG_BATTERY_RX51)     += rx51_battery.o
 obj-$(CONFIG_AB8500_BM)                += ab8500_bmdata.o ab8500_charger.o ab8500_fg.o ab8500_btemp.o abx500_chargalg.o pm2301_charger.o
+obj-$(CONFIG_CHARGER_CPCAP)    += cpcap-charger.o
 obj-$(CONFIG_CHARGER_ISP1704)  += isp1704_charger.o
 obj-$(CONFIG_CHARGER_MAX8903)  += max8903_charger.o
 obj-$(CONFIG_CHARGER_TWL4030)  += twl4030_charger.o
index d298645..8c49586 100644 (file)
@@ -430,10 +430,10 @@ static const struct abx500_maxim_parameters ab8500_maxi_params = {
 };
 
 static const struct abx500_maxim_parameters abx540_maxi_params = {
-        .ena_maxi = true,
-        .chg_curr = 3000,
-        .wait_cycles = 10,
-        .charger_curr_step = 200,
+       .ena_maxi = true,
+       .chg_curr = 3000,
+       .wait_cycles = 10,
+       .charger_curr_step = 200,
 };
 
 static const struct abx500_bm_charger_parameters chg = {
index 6be2fe2..d51ebd1 100644 (file)
@@ -14,6 +14,7 @@
  * GNU General Public License for more details.
  */
 
+#include <linux/acpi.h>
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/regmap.h>
 #define ILIM_3000MA                    3000    /* 3000mA */
 
 #define AXP288_EXTCON_DEV_NAME         "axp288_extcon"
-#define USB_HOST_EXTCON_DEV_NAME       "INT3496:00"
+#define USB_HOST_EXTCON_HID            "INT3496"
+#define USB_HOST_EXTCON_NAME           "INT3496:00"
 
 static const unsigned int cable_ids[] =
        { EXTCON_CHG_USB_SDP, EXTCON_CHG_USB_CDP, EXTCON_CHG_USB_DCP };
@@ -807,10 +809,14 @@ static int axp288_charger_probe(struct platform_device *pdev)
                return -EPROBE_DEFER;
        }
 
-       info->otg.cable = extcon_get_extcon_dev(USB_HOST_EXTCON_DEV_NAME);
-       if (info->otg.cable == NULL) {
-               dev_dbg(dev, "EXTCON_USB_HOST is not ready, probe deferred\n");
-               return -EPROBE_DEFER;
+       if (acpi_dev_present(USB_HOST_EXTCON_HID, NULL, -1)) {
+               info->otg.cable = extcon_get_extcon_dev(USB_HOST_EXTCON_NAME);
+               if (info->otg.cable == NULL) {
+                       dev_dbg(dev, "EXTCON_USB_HOST is not ready, probe deferred\n");
+                       return -EPROBE_DEFER;
+               }
+               dev_info(&pdev->dev,
+                        "Using " USB_HOST_EXTCON_HID " extcon for usb-id\n");
        }
 
        platform_set_drvdata(pdev, info);
@@ -849,13 +855,15 @@ static int axp288_charger_probe(struct platform_device *pdev)
        /* Register for OTG notification */
        INIT_WORK(&info->otg.work, axp288_charger_otg_evt_worker);
        info->otg.id_nb.notifier_call = axp288_charger_handle_otg_evt;
-       ret = devm_extcon_register_notifier(&pdev->dev, info->otg.cable,
+       if (info->otg.cable) {
+               ret = devm_extcon_register_notifier(&pdev->dev, info->otg.cable,
                                        EXTCON_USB_HOST, &info->otg.id_nb);
-       if (ret) {
-               dev_err(dev, "failed to register EXTCON_USB_HOST notifier\n");
-               return ret;
+               if (ret) {
+                       dev_err(dev, "failed to register EXTCON_USB_HOST notifier\n");
+                       return ret;
+               }
+               schedule_work(&info->otg.work);
        }
-       schedule_work(&info->otg.work);
 
        /* Register charger interrupts */
        for (i = 0; i < CHRG_INTR_END; i++) {
index a4f0849..bd9e5c3 100644 (file)
 #include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
+#include <linux/extcon.h>
 #include <linux/of_irq.h>
 #include <linux/of_device.h>
 #include <linux/pm_runtime.h>
 #include <linux/power_supply.h>
+#include <linux/workqueue.h>
 #include <linux/gpio.h>
 #include <linux/i2c.h>
 
-#include <linux/power/bq24190_charger.h>
-
-
 #define        BQ24190_MANUFACTURER    "Texas Instruments"
 
 #define BQ24190_REG_ISC                0x00 /* Input Source Control */
@@ -39,6 +38,9 @@
 #define BQ24190_REG_POC_WDT_RESET_SHIFT                6
 #define BQ24190_REG_POC_CHG_CONFIG_MASK                (BIT(5) | BIT(4))
 #define BQ24190_REG_POC_CHG_CONFIG_SHIFT       4
+#define BQ24190_REG_POC_CHG_CONFIG_DISABLE             0x0
+#define BQ24190_REG_POC_CHG_CONFIG_CHARGE              0x1
+#define BQ24190_REG_POC_CHG_CONFIG_OTG                 0x2
 #define BQ24190_REG_POC_SYS_MIN_MASK           (BIT(3) | BIT(2) | BIT(1))
 #define BQ24190_REG_POC_SYS_MIN_SHIFT          1
 #define BQ24190_REG_POC_BOOST_LIM_MASK         BIT(0)
@@ -151,10 +153,12 @@ struct bq24190_dev_info {
        struct device                   *dev;
        struct power_supply             *charger;
        struct power_supply             *battery;
+       struct extcon_dev               *extcon;
+       struct notifier_block           extcon_nb;
+       struct delayed_work             extcon_work;
        char                            model_name[I2C_NAME_SIZE];
-       kernel_ulong_t                  model;
-       unsigned int                    gpio_int;
-       unsigned int                    irq;
+       bool                            initialized;
+       bool                            irq_event;
        struct mutex                    f_reg_lock;
        u8                              f_reg;
        u8                              ss_reg;
@@ -168,6 +172,12 @@ struct bq24190_dev_info {
  * number at that index in the array is the real-world value that it
  * represents.
  */
+
+/* REG00[2:0] (IINLIM) in uA */
+static const int bq24190_isc_iinlim_values[] = {
+        100000,  150000,  500000,  900000, 1200000, 1500000, 2000000, 3000000
+};
+
 /* REG02[7:2] (ICHG) in uAh */
 static const int bq24190_ccc_ichg_values[] = {
         512000,  576000,  640000,  704000,  768000,  832000,  896000,  960000,
@@ -418,6 +428,7 @@ static ssize_t bq24190_sysfs_show(struct device *dev,
        struct power_supply *psy = dev_get_drvdata(dev);
        struct bq24190_dev_info *bdi = power_supply_get_drvdata(psy);
        struct bq24190_sysfs_field_info *info;
+       ssize_t count;
        int ret;
        u8 v;
 
@@ -425,11 +436,20 @@ static ssize_t bq24190_sysfs_show(struct device *dev,
        if (!info)
                return -EINVAL;
 
+       ret = pm_runtime_get_sync(bdi->dev);
+       if (ret < 0)
+               return ret;
+
        ret = bq24190_read_mask(bdi, info->reg, info->mask, info->shift, &v);
        if (ret)
-               return ret;
+               count = ret;
+       else
+               count = scnprintf(buf, PAGE_SIZE, "%hhx\n", v);
+
+       pm_runtime_mark_last_busy(bdi->dev);
+       pm_runtime_put_autosuspend(bdi->dev);
 
-       return scnprintf(buf, PAGE_SIZE, "%hhx\n", v);
+       return count;
 }
 
 static ssize_t bq24190_sysfs_store(struct device *dev,
@@ -449,9 +469,16 @@ static ssize_t bq24190_sysfs_store(struct device *dev,
        if (ret < 0)
                return ret;
 
+       ret = pm_runtime_get_sync(bdi->dev);
+       if (ret < 0)
+               return ret;
+
        ret = bq24190_write_mask(bdi, info->reg, info->mask, info->shift, v);
        if (ret)
-               return ret;
+               count = ret;
+
+       pm_runtime_mark_last_busy(bdi->dev);
+       pm_runtime_put_autosuspend(bdi->dev);
 
        return count;
 }
@@ -523,16 +550,13 @@ static int bq24190_register_reset(struct bq24190_dev_info *bdi)
                if (ret < 0)
                        return ret;
 
-               if (!v)
-                       break;
+               if (v == 0)
+                       return 0;
 
-               udelay(10);
+               usleep_range(100, 200);
        } while (--limit);
 
-       if (!limit)
-               return -EIO;
-
-       return 0;
+       return -EIO;
 }
 
 /* Charger power supply property routines */
@@ -793,7 +817,9 @@ static int bq24190_charger_get_property(struct power_supply *psy,
 
        dev_dbg(bdi->dev, "prop: %d\n", psp);
 
-       pm_runtime_get_sync(bdi->dev);
+       ret = pm_runtime_get_sync(bdi->dev);
+       if (ret < 0)
+               return ret;
 
        switch (psp) {
        case POWER_SUPPLY_PROP_CHARGE_TYPE:
@@ -833,7 +859,9 @@ static int bq24190_charger_get_property(struct power_supply *psy,
                ret = -ENODATA;
        }
 
-       pm_runtime_put_sync(bdi->dev);
+       pm_runtime_mark_last_busy(bdi->dev);
+       pm_runtime_put_autosuspend(bdi->dev);
+
        return ret;
 }
 
@@ -846,7 +874,9 @@ static int bq24190_charger_set_property(struct power_supply *psy,
 
        dev_dbg(bdi->dev, "prop: %d\n", psp);
 
-       pm_runtime_get_sync(bdi->dev);
+       ret = pm_runtime_get_sync(bdi->dev);
+       if (ret < 0)
+               return ret;
 
        switch (psp) {
        case POWER_SUPPLY_PROP_CHARGE_TYPE:
@@ -862,7 +892,9 @@ static int bq24190_charger_set_property(struct power_supply *psy,
                ret = -EINVAL;
        }
 
-       pm_runtime_put_sync(bdi->dev);
+       pm_runtime_mark_last_busy(bdi->dev);
+       pm_runtime_put_autosuspend(bdi->dev);
+
        return ret;
 }
 
@@ -1063,7 +1095,9 @@ static int bq24190_battery_get_property(struct power_supply *psy,
 
        dev_dbg(bdi->dev, "prop: %d\n", psp);
 
-       pm_runtime_get_sync(bdi->dev);
+       ret = pm_runtime_get_sync(bdi->dev);
+       if (ret < 0)
+               return ret;
 
        switch (psp) {
        case POWER_SUPPLY_PROP_STATUS:
@@ -1091,7 +1125,9 @@ static int bq24190_battery_get_property(struct power_supply *psy,
                ret = -ENODATA;
        }
 
-       pm_runtime_put_sync(bdi->dev);
+       pm_runtime_mark_last_busy(bdi->dev);
+       pm_runtime_put_autosuspend(bdi->dev);
+
        return ret;
 }
 
@@ -1104,7 +1140,9 @@ static int bq24190_battery_set_property(struct power_supply *psy,
 
        dev_dbg(bdi->dev, "prop: %d\n", psp);
 
-       pm_runtime_get_sync(bdi->dev);
+       ret = pm_runtime_get_sync(bdi->dev);
+       if (ret < 0)
+               return ret;
 
        switch (psp) {
        case POWER_SUPPLY_PROP_ONLINE:
@@ -1117,7 +1155,9 @@ static int bq24190_battery_set_property(struct power_supply *psy,
                ret = -EINVAL;
        }
 
-       pm_runtime_put_sync(bdi->dev);
+       pm_runtime_mark_last_busy(bdi->dev);
+       pm_runtime_put_autosuspend(bdi->dev);
+
        return ret;
 }
 
@@ -1157,9 +1197,8 @@ static const struct power_supply_desc bq24190_battery_desc = {
        .property_is_writeable  = bq24190_battery_property_is_writeable,
 };
 
-static irqreturn_t bq24190_irq_handler_thread(int irq, void *data)
+static void bq24190_check_status(struct bq24190_dev_info *bdi)
 {
-       struct bq24190_dev_info *bdi = data;
        const u8 battery_mask_ss = BQ24190_REG_SS_CHRG_STAT_MASK;
        const u8 battery_mask_f = BQ24190_REG_F_BAT_FAULT_MASK
                                | BQ24190_REG_F_NTC_FAULT_MASK;
@@ -1167,12 +1206,10 @@ static irqreturn_t bq24190_irq_handler_thread(int irq, void *data)
        u8 ss_reg = 0, f_reg = 0;
        int i, ret;
 
-       pm_runtime_get_sync(bdi->dev);
-
        ret = bq24190_read(bdi, BQ24190_REG_SS, &ss_reg);
        if (ret < 0) {
                dev_err(bdi->dev, "Can't read SS reg: %d\n", ret);
-               goto out;
+               return;
        }
 
        i = 0;
@@ -1180,12 +1217,17 @@ static irqreturn_t bq24190_irq_handler_thread(int irq, void *data)
                ret = bq24190_read(bdi, BQ24190_REG_F, &f_reg);
                if (ret < 0) {
                        dev_err(bdi->dev, "Can't read F reg: %d\n", ret);
-                       goto out;
+                       return;
                }
        } while (f_reg && ++i < 2);
 
+       /* ignore over/under voltage fault after disconnect */
+       if (f_reg == (1 << BQ24190_REG_F_CHRG_FAULT_SHIFT) &&
+           !(ss_reg & BQ24190_REG_SS_PG_STAT_MASK))
+               f_reg = 0;
+
        if (f_reg != bdi->f_reg) {
-               dev_info(bdi->dev,
+               dev_warn(bdi->dev,
                        "Fault: boost %d, charge %d, battery %d, ntc %d\n",
                        !!(f_reg & BQ24190_REG_F_BOOST_FAULT_MASK),
                        !!(f_reg & BQ24190_REG_F_CHRG_FAULT_MASK),
@@ -1229,90 +1271,126 @@ static irqreturn_t bq24190_irq_handler_thread(int irq, void *data)
        if (alert_battery)
                power_supply_changed(bdi->battery);
 
-out:
-       pm_runtime_put_sync(bdi->dev);
-
        dev_dbg(bdi->dev, "ss_reg: 0x%02x, f_reg: 0x%02x\n", ss_reg, f_reg);
+}
+
+/*
+ * Threaded IRQ handler: resumes the device, runs bq24190_check_status() and
+ * then lets the device autosuspend again. irq_event is set on entry and is
+ * cleared only after a successful pass. If the runtime resume fails, a
+ * warning is logged, pm_runtime_put_noidle() is called and IRQ_NONE is
+ * returned without checking the status.
+ */
+static irqreturn_t bq24190_irq_handler_thread(int irq, void *data)
+{
+       struct bq24190_dev_info *bdi = data;
+       int error;
+
+       /* NOTE(review): the readers of irq_event are outside this hunk - confirm its purpose */
+       bdi->irq_event = true;
+       error = pm_runtime_get_sync(bdi->dev);
+       if (error < 0) {
+               dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", error);
+               pm_runtime_put_noidle(bdi->dev);
+               return IRQ_NONE;
+       }
+       bq24190_check_status(bdi);
+       pm_runtime_mark_last_busy(bdi->dev);
+       pm_runtime_put_autosuspend(bdi->dev);
+       bdi->irq_event = false;
 
        return IRQ_HANDLED;
 }
 
-static int bq24190_hw_init(struct bq24190_dev_info *bdi)
+static void bq24190_extcon_work(struct work_struct *work)
 {
+       struct bq24190_dev_info *bdi =
+               container_of(work, struct bq24190_dev_info, extcon_work.work);
+       int error, iinlim = 0;
        u8 v;
-       int ret;
-
-       pm_runtime_get_sync(bdi->dev);
 
-       /* First check that the device really is what its supposed to be */
-       ret = bq24190_read_mask(bdi, BQ24190_REG_VPRS,
-                       BQ24190_REG_VPRS_PN_MASK,
-                       BQ24190_REG_VPRS_PN_SHIFT,
-                       &v);
-       if (ret < 0)
-               goto out;
+       error = pm_runtime_get_sync(bdi->dev);
+       if (error < 0) {
+               dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", error);
+               pm_runtime_put_noidle(bdi->dev);
+               return;
+       }
 
-       if (v != bdi->model) {
-               ret = -ENODEV;
-               goto out;
+       if      (extcon_get_state(bdi->extcon, EXTCON_CHG_USB_SDP) == 1)
+               iinlim =  500000;
+       else if (extcon_get_state(bdi->extcon, EXTCON_CHG_USB_CDP) == 1 ||
+                extcon_get_state(bdi->extcon, EXTCON_CHG_USB_ACA) == 1)
+               iinlim = 1500000;
+       else if (extcon_get_state(bdi->extcon, EXTCON_CHG_USB_DCP) == 1)
+               iinlim = 2000000;
+
+       if (iinlim) {
+               error = bq24190_set_field_val(bdi, BQ24190_REG_ISC,
+                                             BQ24190_REG_ISC_IINLIM_MASK,
+                                             BQ24190_REG_ISC_IINLIM_SHIFT,
+                                             bq24190_isc_iinlim_values,
+                                             ARRAY_SIZE(bq24190_isc_iinlim_values),
+                                             iinlim);
+               if (error < 0)
+                       dev_err(bdi->dev, "Can't set IINLIM: %d\n", error);
        }
 
-       ret = bq24190_register_reset(bdi);
-       if (ret < 0)
-               goto out;
+       /* if no charger found and in USB host mode, set OTG 5V boost, else normal */
+       if (!iinlim && extcon_get_state(bdi->extcon, EXTCON_USB_HOST) == 1)
+               v = BQ24190_REG_POC_CHG_CONFIG_OTG;
+       else
+               v = BQ24190_REG_POC_CHG_CONFIG_CHARGE;
 
-       ret = bq24190_set_mode_host(bdi);
-       if (ret < 0)
-               goto out;
+       error = bq24190_write_mask(bdi, BQ24190_REG_POC,
+                                  BQ24190_REG_POC_CHG_CONFIG_MASK,
+                                  BQ24190_REG_POC_CHG_CONFIG_SHIFT,
+                                  v);
+       if (error < 0)
+               dev_err(bdi->dev, "Can't set CHG_CONFIG: %d\n", error);
 
-       ret = bq24190_read(bdi, BQ24190_REG_SS, &bdi->ss_reg);
-out:
-       pm_runtime_put_sync(bdi->dev);
-       return ret;
+       pm_runtime_mark_last_busy(bdi->dev);
+       pm_runtime_put_autosuspend(bdi->dev);
 }
 
-#ifdef CONFIG_OF
-static int bq24190_setup_dt(struct bq24190_dev_info *bdi)
+static int bq24190_extcon_event(struct notifier_block *nb, unsigned long event,
+                               void *param)
 {
-       bdi->irq = irq_of_parse_and_map(bdi->dev->of_node, 0);
-       if (bdi->irq <= 0)
-               return -1;
+       struct bq24190_dev_info *bdi =
+               container_of(nb, struct bq24190_dev_info, extcon_nb);
 
-       return 0;
-}
-#else
-static int bq24190_setup_dt(struct bq24190_dev_info *bdi)
-{
-       return -1;
+       /*
+        * The Power-Good detection may take up to 220ms, sometimes
+        * the external charger detection is quicker, and the bq24190 will
+        * reset to iinlim based on its own charger detection (which is not
+        * hooked up when using external charger detection) resulting in
+        * a too low default 500mA iinlim. Delay applying the extcon value
+        * for 300ms to avoid this.
+        */
+       queue_delayed_work(system_wq, &bdi->extcon_work, msecs_to_jiffies(300));
+
+       return NOTIFY_OK;
 }
-#endif
 
-static int bq24190_setup_pdata(struct bq24190_dev_info *bdi,
-               struct bq24190_platform_data *pdata)
+static int bq24190_hw_init(struct bq24190_dev_info *bdi)
 {
+       u8 v;
        int ret;
 
-       if (!gpio_is_valid(pdata->gpio_int))
-               return -1;
-
-       ret = gpio_request(pdata->gpio_int, dev_name(bdi->dev));
+       /* First check that the device really is what its supposed to be */
+       ret = bq24190_read_mask(bdi, BQ24190_REG_VPRS,
+                       BQ24190_REG_VPRS_PN_MASK,
+                       BQ24190_REG_VPRS_PN_SHIFT,
+                       &v);
        if (ret < 0)
-               return -1;
+               return ret;
 
-       ret = gpio_direction_input(pdata->gpio_int);
-       if (ret < 0)
-               goto out;
+       if (v != BQ24190_REG_VPRS_PN_24190 &&
+           v != BQ24190_REG_VPRS_PN_24192I) {
+               dev_err(bdi->dev, "Error unknown model: 0x%02x\n", v);
+               return -ENODEV;
+       }
 
-       bdi->irq = gpio_to_irq(pdata->gpio_int);
-       if (!bdi->irq)
-               goto out;
+       ret = bq24190_register_reset(bdi);
+       if (ret < 0)
+               return ret;
 
-       bdi->gpio_int = pdata->gpio_int;
-       return 0;
+       ret = bq24190_set_mode_host(bdi);
+       if (ret < 0)
+               return ret;
 
-out:
-       gpio_free(pdata->gpio_int);
-       return -1;
+       return bq24190_read(bdi, BQ24190_REG_SS, &bdi->ss_reg);
 }
 
 static int bq24190_probe(struct i2c_client *client,
@@ -1320,9 +1398,9 @@ static int bq24190_probe(struct i2c_client *client,
 {
        struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent);
        struct device *dev = &client->dev;
-       struct bq24190_platform_data *pdata = client->dev.platform_data;
        struct power_supply_config charger_cfg = {}, battery_cfg = {};
        struct bq24190_dev_info *bdi;
+       const char *name;
        int ret;
 
        if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA)) {
@@ -1338,7 +1416,6 @@ static int bq24190_probe(struct i2c_client *client,
 
        bdi->client = client;
        bdi->dev = dev;
-       bdi->model = id->driver_data;
        strncpy(bdi->model_name, id->name, I2C_NAME_SIZE);
        mutex_init(&bdi->f_reg_lock);
        bdi->f_reg = 0;
@@ -1346,23 +1423,43 @@ static int bq24190_probe(struct i2c_client *client,
 
        i2c_set_clientdata(client, bdi);
 
-       if (dev->of_node)
-               ret = bq24190_setup_dt(bdi);
-       else
-               ret = bq24190_setup_pdata(bdi, pdata);
-
-       if (ret) {
+       if (!client->irq) {
                dev_err(dev, "Can't get irq info\n");
                return -EINVAL;
        }
 
+       /*
+        * Devicetree platforms should get extcon via phandle (not yet supported).
+        * On ACPI platforms, extcon clients may invoke us with:
+        * struct property_entry pe[] =
+        *   { PROPERTY_ENTRY_STRING("extcon-name", client_name), ... };
+        * struct i2c_board_info bi =
+        *   { .type = "bq24190", .addr = 0x6b, .properties = pe, .irq = irq };
+        * struct i2c_adapter ad = { ... };
+        * i2c_add_adapter(&ad);
+        * i2c_new_device(&ad, &bi);
+        */
+       if (device_property_read_string(dev, "extcon-name", &name) == 0) {
+               bdi->extcon = extcon_get_extcon_dev(name);
+               if (!bdi->extcon)
+                       return -EPROBE_DEFER;
+
+               dev_info(bdi->dev, "using extcon device %s\n", name);
+       }
+
        pm_runtime_enable(dev);
-       pm_runtime_resume(dev);
+       pm_runtime_use_autosuspend(dev);
+       pm_runtime_set_autosuspend_delay(dev, 600);
+       ret = pm_runtime_get_sync(dev);
+       if (ret < 0) {
+               dev_err(dev, "pm_runtime_get failed: %i\n", ret);
+               goto out_pmrt;
+       }
 
        ret = bq24190_hw_init(bdi);
        if (ret < 0) {
                dev_err(dev, "Hardware init failed\n");
-               goto out1;
+               goto out_pmrt;
        }
 
        charger_cfg.drv_data = bdi;
@@ -1373,7 +1470,7 @@ static int bq24190_probe(struct i2c_client *client,
        if (IS_ERR(bdi->charger)) {
                dev_err(dev, "Can't register charger\n");
                ret = PTR_ERR(bdi->charger);
-               goto out1;
+               goto out_pmrt;
        }
 
        battery_cfg.drv_data = bdi;
@@ -1382,87 +1479,160 @@ static int bq24190_probe(struct i2c_client *client,
        if (IS_ERR(bdi->battery)) {
                dev_err(dev, "Can't register battery\n");
                ret = PTR_ERR(bdi->battery);
-               goto out2;
+               goto out_charger;
        }
 
        ret = bq24190_sysfs_create_group(bdi);
        if (ret) {
                dev_err(dev, "Can't create sysfs entries\n");
-               goto out3;
+               goto out_battery;
        }
 
-       ret = devm_request_threaded_irq(dev, bdi->irq, NULL,
+       bdi->initialized = true;
+
+       ret = devm_request_threaded_irq(dev, client->irq, NULL,
                        bq24190_irq_handler_thread,
                        IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
                        "bq24190-charger", bdi);
        if (ret < 0) {
                dev_err(dev, "Can't set up irq handler\n");
-               goto out4;
+               goto out_sysfs;
+       }
+
+       if (bdi->extcon) {
+               INIT_DELAYED_WORK(&bdi->extcon_work, bq24190_extcon_work);
+               bdi->extcon_nb.notifier_call = bq24190_extcon_event;
+               ret = devm_extcon_register_notifier_all(dev, bdi->extcon,
+                                                       &bdi->extcon_nb);
+               if (ret) {
+                       dev_err(dev, "Can't register extcon\n");
+                       goto out_sysfs;
+               }
+
+               /* Sync initial cable state */
+               queue_delayed_work(system_wq, &bdi->extcon_work, 0);
        }
 
+       enable_irq_wake(client->irq);
+
+       pm_runtime_mark_last_busy(dev);
+       pm_runtime_put_autosuspend(dev);
+
        return 0;
 
-out4:
+out_sysfs:
        bq24190_sysfs_remove_group(bdi);
 
-out3:
+out_battery:
        power_supply_unregister(bdi->battery);
 
-out2:
+out_charger:
        power_supply_unregister(bdi->charger);
 
-out1:
+out_pmrt:
+       pm_runtime_put_sync(dev);
+       pm_runtime_dont_use_autosuspend(dev);
        pm_runtime_disable(dev);
-       if (bdi->gpio_int)
-               gpio_free(bdi->gpio_int);
        return ret;
 }
 
 static int bq24190_remove(struct i2c_client *client)
 {
        struct bq24190_dev_info *bdi = i2c_get_clientdata(client);
+       int error;
 
-       pm_runtime_get_sync(bdi->dev);
-       bq24190_register_reset(bdi);
-       pm_runtime_put_sync(bdi->dev);
+       error = pm_runtime_get_sync(bdi->dev);
+       if (error < 0) {
+               dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", error);
+               pm_runtime_put_noidle(bdi->dev);
+       }
 
+       bq24190_register_reset(bdi);
        bq24190_sysfs_remove_group(bdi);
        power_supply_unregister(bdi->battery);
        power_supply_unregister(bdi->charger);
+       if (error >= 0)
+               pm_runtime_put_sync(bdi->dev);
+       pm_runtime_dont_use_autosuspend(bdi->dev);
        pm_runtime_disable(bdi->dev);
 
-       if (bdi->gpio_int)
-               gpio_free(bdi->gpio_int);
+       return 0;
+}
+
+static __maybe_unused int bq24190_runtime_suspend(struct device *dev)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       struct bq24190_dev_info *bdi = i2c_get_clientdata(client);
+
+       if (!bdi->initialized)
+               return 0;
+
+       dev_dbg(bdi->dev, "%s\n", __func__);
+
+       return 0;
+}
+
+static __maybe_unused int bq24190_runtime_resume(struct device *dev)
+{
+       struct i2c_client *client = to_i2c_client(dev);
+       struct bq24190_dev_info *bdi = i2c_get_clientdata(client);
+
+       if (!bdi->initialized)
+               return 0;
+
+       if (!bdi->irq_event) {
+               dev_dbg(bdi->dev, "checking events on possible wakeirq\n");
+               bq24190_check_status(bdi);
+       }
 
        return 0;
 }
 
-#ifdef CONFIG_PM_SLEEP
-static int bq24190_pm_suspend(struct device *dev)
+static __maybe_unused int bq24190_pm_suspend(struct device *dev)
 {
        struct i2c_client *client = to_i2c_client(dev);
        struct bq24190_dev_info *bdi = i2c_get_clientdata(client);
+       int error;
+
+       error = pm_runtime_get_sync(bdi->dev);
+       if (error < 0) {
+               dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", error);
+               pm_runtime_put_noidle(bdi->dev);
+       }
 
-       pm_runtime_get_sync(bdi->dev);
        bq24190_register_reset(bdi);
-       pm_runtime_put_sync(bdi->dev);
+
+       if (error >= 0) {
+               pm_runtime_mark_last_busy(bdi->dev);
+               pm_runtime_put_autosuspend(bdi->dev);
+       }
 
        return 0;
 }
 
-static int bq24190_pm_resume(struct device *dev)
+static __maybe_unused int bq24190_pm_resume(struct device *dev)
 {
        struct i2c_client *client = to_i2c_client(dev);
        struct bq24190_dev_info *bdi = i2c_get_clientdata(client);
+       int error;
 
        bdi->f_reg = 0;
        bdi->ss_reg = BQ24190_REG_SS_VBUS_STAT_MASK; /* impossible state */
 
-       pm_runtime_get_sync(bdi->dev);
+       error = pm_runtime_get_sync(bdi->dev);
+       if (error < 0) {
+               dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", error);
+               pm_runtime_put_noidle(bdi->dev);
+       }
+
        bq24190_register_reset(bdi);
        bq24190_set_mode_host(bdi);
        bq24190_read(bdi, BQ24190_REG_SS, &bdi->ss_reg);
-       pm_runtime_put_sync(bdi->dev);
+
+       if (error >= 0) {
+               pm_runtime_mark_last_busy(bdi->dev);
+               pm_runtime_put_autosuspend(bdi->dev);
+       }
 
        /* Things may have changed while suspended so alert upper layer */
        power_supply_changed(bdi->charger);
@@ -1470,17 +1640,16 @@ static int bq24190_pm_resume(struct device *dev)
 
        return 0;
 }
-#endif
 
-static SIMPLE_DEV_PM_OPS(bq24190_pm_ops, bq24190_pm_suspend, bq24190_pm_resume);
+static const struct dev_pm_ops bq24190_pm_ops = {
+       SET_RUNTIME_PM_OPS(bq24190_runtime_suspend, bq24190_runtime_resume,
+                          NULL)
+       SET_SYSTEM_SLEEP_PM_OPS(bq24190_pm_suspend, bq24190_pm_resume)
+};
 
-/*
- * Only support the bq24190 right now.  The bq24192, bq24192i, and bq24193
- * are similar but not identical so the driver needs to be extended to
- * support them.
- */
 static const struct i2c_device_id bq24190_i2c_ids[] = {
-       { "bq24190", BQ24190_REG_VPRS_PN_24190 },
+       { "bq24190" },
+       { "bq24192i" },
        { },
 };
 MODULE_DEVICE_TABLE(i2c, bq24190_i2c_ids);
index f993a55..8e2c41d 100644 (file)
@@ -723,7 +723,7 @@ static int bq25890_irq_probe(struct bq25890_device *bq)
 {
        struct gpio_desc *irq;
 
-       irq = devm_gpiod_get_index(bq->dev, BQ25890_IRQ_PIN, 0, GPIOD_IN);
+       irq = devm_gpiod_get(bq->dev, BQ25890_IRQ_PIN, GPIOD_IN);
        if (IS_ERR(irq)) {
                dev_err(bq->dev, "Could not probe irq pin.\n");
                return PTR_ERR(irq);
index e664ca7..adc3761 100644 (file)
@@ -1198,7 +1198,7 @@ static int charger_extcon_notifier(struct notifier_block *self,
 static int charger_extcon_init(struct charger_manager *cm,
                struct charger_cable *cable)
 {
-       int ret = 0;
+       int ret;
 
        /*
         * Charger manager use Extcon framework to identify
@@ -1232,7 +1232,7 @@ static int charger_manager_register_extcon(struct charger_manager *cm)
 {
        struct charger_desc *desc = cm->desc;
        struct charger_regulator *charger;
-       int ret = 0;
+       int ret;
        int i;
        int j;
 
@@ -1255,15 +1255,14 @@ static int charger_manager_register_extcon(struct charger_manager *cm)
                        if (ret < 0) {
                                dev_err(cm->dev, "Cannot initialize charger(%s)\n",
                                        charger->regulator_name);
-                               goto err;
+                               return ret;
                        }
                        cable->charger = charger;
                        cable->cm = cm;
                }
        }
 
-err:
-       return ret;
+       return 0;
 }
 
 /* help function of sysfs node to control charger(regulator) */
@@ -1372,7 +1371,7 @@ static int charger_manager_register_sysfs(struct charger_manager *cm)
        int chargers_externally_control = 1;
        char buf[11];
        char *str;
-       int ret = 0;
+       int ret;
        int i;
 
        /* Create sysfs entry to control charger(regulator) */
@@ -1382,10 +1381,9 @@ static int charger_manager_register_sysfs(struct charger_manager *cm)
                snprintf(buf, 10, "charger.%d", i);
                str = devm_kzalloc(cm->dev,
                                sizeof(char) * (strlen(buf) + 1), GFP_KERNEL);
-               if (!str) {
-                       ret = -ENOMEM;
-                       goto err;
-               }
+               if (!str)
+                       return -ENOMEM;
+
                strcpy(str, buf);
 
                charger->attrs[0] = &charger->attr_name.attr;
@@ -1426,19 +1424,16 @@ static int charger_manager_register_sysfs(struct charger_manager *cm)
                if (ret < 0) {
                        dev_err(cm->dev, "Cannot create sysfs entry of %s regulator\n",
                                charger->regulator_name);
-                       ret = -EINVAL;
-                       goto err;
+                       return ret;
                }
        }
 
        if (chargers_externally_control) {
                dev_err(cm->dev, "Cannot register regulator because charger-manager must need at least one charger for charging battery\n");
-               ret = -EINVAL;
-               goto err;
+               return -EINVAL;
        }
 
-err:
-       return ret;
+       return 0;
 }
 
 static int cm_init_thermal_data(struct charger_manager *cm,
@@ -1626,7 +1621,7 @@ static int charger_manager_probe(struct platform_device *pdev)
 {
        struct charger_desc *desc = cm_get_drv_data(pdev);
        struct charger_manager *cm;
-       int ret = 0, i = 0;
+       int ret, i = 0;
        int j = 0;
        union power_supply_propval val;
        struct power_supply *fuel_gauge;
@@ -1887,14 +1882,12 @@ MODULE_DEVICE_TABLE(platform, charger_manager_id);
 
 static int cm_suspend_noirq(struct device *dev)
 {
-       int ret = 0;
-
        if (device_may_wakeup(dev)) {
                device_set_wakeup_capable(dev, false);
-               ret = -EAGAIN;
+               return -EAGAIN;
        }
 
-       return ret;
+       return 0;
 }
 
 static bool cm_need_to_awake(void)
diff --git a/drivers/power/supply/cpcap-charger.c b/drivers/power/supply/cpcap-charger.c
new file mode 100644 (file)
index 0000000..543a1bd
--- /dev/null
@@ -0,0 +1,681 @@
+/*
+ * Motorola CPCAP PMIC battery charger driver
+ *
+ * Copyright (C) 2017 Tony Lindgren <tony@atomide.com>
+ *
+ * Rewritten for Linux power framework with some parts based on
+ * an earlier driver found in the Motorola Linux kernel:
+ *
+ * Copyright (C) 2009-2010 Motorola, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/atomic.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/power_supply.h>
+#include <linux/regmap.h>
+
+#include <linux/gpio/consumer.h>
+#include <linux/usb/phy_companion.h>
+#include <linux/phy/omap_usb.h>
+#include <linux/usb/otg.h>
+#include <linux/iio/consumer.h>
+#include <linux/mfd/motorola-cpcap.h>
+
+/* CPCAP_REG_CRM register bits */
+#define CPCAP_REG_CRM_UNUSED_641_15    BIT(15) /* 641 = register number */
+#define CPCAP_REG_CRM_UNUSED_641_14    BIT(14) /* 641 = register number */
+#define CPCAP_REG_CRM_CHRG_LED_EN      BIT(13)
+#define CPCAP_REG_CRM_RVRSMODE         BIT(12)
+#define CPCAP_REG_CRM_ICHRG_TR1                BIT(11)
+#define CPCAP_REG_CRM_ICHRG_TR0                BIT(10)
+#define CPCAP_REG_CRM_FET_OVRD         BIT(9)
+#define CPCAP_REG_CRM_FET_CTRL         BIT(8)
+#define CPCAP_REG_CRM_VCHRG3           BIT(7)
+#define CPCAP_REG_CRM_VCHRG2           BIT(6)
+#define CPCAP_REG_CRM_VCHRG1           BIT(5)
+#define CPCAP_REG_CRM_VCHRG0           BIT(4)
+#define CPCAP_REG_CRM_ICHRG3           BIT(3)
+#define CPCAP_REG_CRM_ICHRG2           BIT(2)
+#define CPCAP_REG_CRM_ICHRG1           BIT(1)
+#define CPCAP_REG_CRM_ICHRG0           BIT(0)
+
+/* CPCAP_REG_CRM trickle charge currents */
+#define CPCAP_REG_CRM_TR(val)          (((val) & 0x3) << 10)
+#define CPCAP_REG_CRM_TR_0A00          CPCAP_REG_CRM_TR(0x0)
+#define CPCAP_REG_CRM_TR_0A24          CPCAP_REG_CRM_TR(0x1)
+#define CPCAP_REG_CRM_TR_0A48          CPCAP_REG_CRM_TR(0x2)
+#define CPCAP_REG_CRM_TR_0A72          CPCAP_REG_CRM_TR(0x4)
+
+/* CPCAP_REG_CRM charge voltages */
+#define CPCAP_REG_CRM_VCHRG(val)       (((val) & 0xf) << 4)
+#define CPCAP_REG_CRM_VCHRG_3V80       CPCAP_REG_CRM_VCHRG(0x0)
+#define CPCAP_REG_CRM_VCHRG_4V10       CPCAP_REG_CRM_VCHRG(0x1)
+#define CPCAP_REG_CRM_VCHRG_4V15       CPCAP_REG_CRM_VCHRG(0x2)
+#define CPCAP_REG_CRM_VCHRG_4V20       CPCAP_REG_CRM_VCHRG(0x3)
+#define CPCAP_REG_CRM_VCHRG_4V22       CPCAP_REG_CRM_VCHRG(0x4)
+#define CPCAP_REG_CRM_VCHRG_4V24       CPCAP_REG_CRM_VCHRG(0x5)
+#define CPCAP_REG_CRM_VCHRG_4V26       CPCAP_REG_CRM_VCHRG(0x6)
+#define CPCAP_REG_CRM_VCHRG_4V28       CPCAP_REG_CRM_VCHRG(0x7)
+#define CPCAP_REG_CRM_VCHRG_4V30       CPCAP_REG_CRM_VCHRG(0x8)
+#define CPCAP_REG_CRM_VCHRG_4V32       CPCAP_REG_CRM_VCHRG(0x9)
+#define CPCAP_REG_CRM_VCHRG_4V34       CPCAP_REG_CRM_VCHRG(0xa)
+#define CPCAP_REG_CRM_VCHRG_4V36       CPCAP_REG_CRM_VCHRG(0xb)
+#define CPCAP_REG_CRM_VCHRG_4V38       CPCAP_REG_CRM_VCHRG(0xc)
+#define CPCAP_REG_CRM_VCHRG_4V40       CPCAP_REG_CRM_VCHRG(0xd)
+#define CPCAP_REG_CRM_VCHRG_4V42       CPCAP_REG_CRM_VCHRG(0xe)
+#define CPCAP_REG_CRM_VCHRG_4V44       CPCAP_REG_CRM_VCHRG(0xf)
+
+/* CPCAP_REG_CRM charge currents */
+#define CPCAP_REG_CRM_ICHRG(val)       (((val) & 0xf) << 0)
+#define CPCAP_REG_CRM_ICHRG_0A000      CPCAP_REG_CRM_ICHRG(0x0)
+#define CPCAP_REG_CRM_ICHRG_0A070      CPCAP_REG_CRM_ICHRG(0x1)
+#define CPCAP_REG_CRM_ICHRG_0A176      CPCAP_REG_CRM_ICHRG(0x2)
+#define CPCAP_REG_CRM_ICHRG_0A264      CPCAP_REG_CRM_ICHRG(0x3)
+#define CPCAP_REG_CRM_ICHRG_0A352      CPCAP_REG_CRM_ICHRG(0x4)
+#define CPCAP_REG_CRM_ICHRG_0A440      CPCAP_REG_CRM_ICHRG(0x5)
+#define CPCAP_REG_CRM_ICHRG_0A528      CPCAP_REG_CRM_ICHRG(0x6)
+#define CPCAP_REG_CRM_ICHRG_0A616      CPCAP_REG_CRM_ICHRG(0x7)
+#define CPCAP_REG_CRM_ICHRG_0A704      CPCAP_REG_CRM_ICHRG(0x8)
+#define CPCAP_REG_CRM_ICHRG_0A792      CPCAP_REG_CRM_ICHRG(0x9)
+#define CPCAP_REG_CRM_ICHRG_0A880      CPCAP_REG_CRM_ICHRG(0xa)
+#define CPCAP_REG_CRM_ICHRG_0A968      CPCAP_REG_CRM_ICHRG(0xb)
+#define CPCAP_REG_CRM_ICHRG_1A056      CPCAP_REG_CRM_ICHRG(0xc)
+#define CPCAP_REG_CRM_ICHRG_1A144      CPCAP_REG_CRM_ICHRG(0xd)
+#define CPCAP_REG_CRM_ICHRG_1A584      CPCAP_REG_CRM_ICHRG(0xe)
+#define CPCAP_REG_CRM_ICHRG_NO_LIMIT   CPCAP_REG_CRM_ICHRG(0xf)
+
+enum {
+       CPCAP_CHARGER_IIO_BATTDET,
+       CPCAP_CHARGER_IIO_VOLTAGE,
+       CPCAP_CHARGER_IIO_VBUS,
+       CPCAP_CHARGER_IIO_CHRG_CURRENT,
+       CPCAP_CHARGER_IIO_BATT_CURRENT,
+       CPCAP_CHARGER_IIO_NR,
+};
+
+struct cpcap_charger_ddata {
+       struct device *dev;
+       struct regmap *reg;
+       struct list_head irq_list;
+       struct delayed_work detect_work;
+       struct delayed_work vbus_work;
+       struct gpio_desc *gpio[2];              /* gpio_reven0 & 1 */
+
+       struct iio_channel *channels[CPCAP_CHARGER_IIO_NR];
+
+       struct power_supply *usb;
+
+       struct phy_companion comparator;        /* For USB VBUS */
+       bool vbus_enabled;
+       atomic_t active;
+
+       int status;
+};
+
+struct cpcap_interrupt_desc {
+       int irq;
+       struct list_head node;
+       const char *name;
+};
+
+struct cpcap_charger_ints_state {
+       bool chrg_det;
+       bool rvrs_chrg;
+       bool vbusov;
+
+       bool chrg_se1b;
+       bool rvrs_mode;
+       bool chrgcurr1;
+       bool vbusvld;
+
+       bool battdetb;
+};
+
+static enum power_supply_property cpcap_charger_props[] = {
+       POWER_SUPPLY_PROP_STATUS,
+       POWER_SUPPLY_PROP_ONLINE,
+       POWER_SUPPLY_PROP_VOLTAGE_NOW,
+       POWER_SUPPLY_PROP_CURRENT_NOW,
+};
+
+static bool cpcap_charger_battery_found(struct cpcap_charger_ddata *ddata)
+{
+       struct iio_channel *channel;
+       int error, value;
+
+       channel = ddata->channels[CPCAP_CHARGER_IIO_BATTDET];
+       error = iio_read_channel_raw(channel, &value);
+       if (error < 0) {
+               dev_warn(ddata->dev, "%s failed: %i\n", __func__, error);
+
+               return false;
+       }
+
+       return value == 1;
+}
+
+static int cpcap_charger_get_charge_voltage(struct cpcap_charger_ddata *ddata)
+{
+       struct iio_channel *channel;
+       int error, value = 0;
+
+       channel = ddata->channels[CPCAP_CHARGER_IIO_VOLTAGE];
+       error = iio_read_channel_processed(channel, &value);
+       if (error < 0) {
+               dev_warn(ddata->dev, "%s failed: %i\n", __func__, error);
+
+               return 0;
+       }
+
+       return value;
+}
+
+static int cpcap_charger_get_charge_current(struct cpcap_charger_ddata *ddata)
+{
+       struct iio_channel *channel;
+       int error, value = 0;
+
+       channel = ddata->channels[CPCAP_CHARGER_IIO_CHRG_CURRENT];
+       error = iio_read_channel_processed(channel, &value);
+       if (error < 0) {
+               dev_warn(ddata->dev, "%s failed: %i\n", __func__, error);
+
+               return 0;
+       }
+
+       return value;
+}
+
+static int cpcap_charger_get_property(struct power_supply *psy,
+                                     enum power_supply_property psp,
+                                     union power_supply_propval *val)
+{
+       struct cpcap_charger_ddata *ddata = dev_get_drvdata(psy->dev.parent);
+
+       switch (psp) {
+       case POWER_SUPPLY_PROP_STATUS:
+               val->intval = ddata->status;
+               break;
+       case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+               if (ddata->status == POWER_SUPPLY_STATUS_CHARGING)
+                       val->intval = cpcap_charger_get_charge_voltage(ddata) *
+                               1000;
+               else
+                       val->intval = 0;
+               break;
+       case POWER_SUPPLY_PROP_CURRENT_NOW:
+               if (ddata->status == POWER_SUPPLY_STATUS_CHARGING)
+                       val->intval = cpcap_charger_get_charge_current(ddata) *
+                               1000;
+               else
+                       val->intval = 0;
+               break;
+       case POWER_SUPPLY_PROP_ONLINE:
+               val->intval = ddata->status == POWER_SUPPLY_STATUS_CHARGING;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static void cpcap_charger_set_cable_path(struct cpcap_charger_ddata *ddata,
+                                        bool enabled)
+{
+       if (!ddata->gpio[0])
+               return;
+
+       gpiod_set_value(ddata->gpio[0], enabled);
+}
+
+static void cpcap_charger_set_inductive_path(struct cpcap_charger_ddata *ddata,
+                                            bool enabled)
+{
+       if (!ddata->gpio[1])
+               return;
+
+       gpiod_set_value(ddata->gpio[1], enabled);
+}
+
+static int cpcap_charger_set_state(struct cpcap_charger_ddata *ddata,
+                                  int max_voltage, int charge_current,
+                                  int trickle_current)
+{
+       bool enable;
+       int error;
+
+       enable = max_voltage && (charge_current || trickle_current);
+       dev_dbg(ddata->dev, "%s enable: %i\n", __func__, enable);
+
+       if (!enable) {
+               error = regmap_update_bits(ddata->reg, CPCAP_REG_CRM,
+                                          0x3fff,
+                                          CPCAP_REG_CRM_FET_OVRD |
+                                          CPCAP_REG_CRM_FET_CTRL);
+               if (error) {
+                       ddata->status = POWER_SUPPLY_STATUS_UNKNOWN;
+                       goto out_err;
+               }
+
+               ddata->status = POWER_SUPPLY_STATUS_DISCHARGING;
+
+               return 0;
+       }
+
+       error = regmap_update_bits(ddata->reg, CPCAP_REG_CRM, 0x3fff,
+                                  CPCAP_REG_CRM_CHRG_LED_EN |
+                                  trickle_current |
+                                  CPCAP_REG_CRM_FET_OVRD |
+                                  CPCAP_REG_CRM_FET_CTRL |
+                                  max_voltage |
+                                  charge_current);
+       if (error) {
+               ddata->status = POWER_SUPPLY_STATUS_UNKNOWN;
+               goto out_err;
+       }
+
+       ddata->status = POWER_SUPPLY_STATUS_CHARGING;
+
+       return 0;
+
+out_err:
+       dev_err(ddata->dev, "%s failed with %i\n", __func__, error);
+
+       return error;
+}
+
+static bool cpcap_charger_vbus_valid(struct cpcap_charger_ddata *ddata)
+{
+       int error, value = 0;
+       struct iio_channel *channel =
+               ddata->channels[CPCAP_CHARGER_IIO_VBUS];
+
+       error = iio_read_channel_processed(channel, &value);
+       if (error >= 0)
+               return value > 3900 ? true : false;
+
+       dev_err(ddata->dev, "error reading VBUS: %i\n", error);
+
+       return false;
+}
+
+/* VBUS control functions for the USB PHY companion */
+
+static void cpcap_charger_vbus_work(struct work_struct *work)
+{
+       struct cpcap_charger_ddata *ddata;
+       bool vbus = false;
+       int error;
+
+       ddata = container_of(work, struct cpcap_charger_ddata,
+                            vbus_work.work);
+
+       if (ddata->vbus_enabled) {
+               vbus = cpcap_charger_vbus_valid(ddata);
+               if (vbus) {
+                       dev_info(ddata->dev, "VBUS already provided\n");
+
+                       return;
+               }
+
+               cpcap_charger_set_cable_path(ddata, false);
+               cpcap_charger_set_inductive_path(ddata, false);
+
+               error = cpcap_charger_set_state(ddata, 0, 0, 0);
+               if (error)
+                       goto out_err;
+
+               error = regmap_update_bits(ddata->reg, CPCAP_REG_CRM,
+                                          CPCAP_REG_CRM_RVRSMODE,
+                                          CPCAP_REG_CRM_RVRSMODE);
+               if (error)
+                       goto out_err;
+       } else {
+               error = regmap_update_bits(ddata->reg, CPCAP_REG_CRM,
+                                          CPCAP_REG_CRM_RVRSMODE, 0);
+               if (error)
+                       goto out_err;
+
+               cpcap_charger_set_cable_path(ddata, true);
+               cpcap_charger_set_inductive_path(ddata, true);
+       }
+
+       return;
+
+out_err:
+       dev_err(ddata->dev, "%s could not %s vbus: %i\n", __func__,
+               ddata->vbus_enabled ? "enable" : "disable", error);
+}
+
+static int cpcap_charger_set_vbus(struct phy_companion *comparator,
+                                 bool enabled)
+{
+       struct cpcap_charger_ddata *ddata =
+               container_of(comparator, struct cpcap_charger_ddata,
+                            comparator);
+
+       ddata->vbus_enabled = enabled;
+       schedule_delayed_work(&ddata->vbus_work, 0);
+
+       return 0;
+}
+
+/* Charger interrupt handling functions */
+
+static int cpcap_charger_get_ints_state(struct cpcap_charger_ddata *ddata,
+                                       struct cpcap_charger_ints_state *s)
+{
+       int val, error;
+
+       error = regmap_read(ddata->reg, CPCAP_REG_INTS1, &val);
+       if (error)
+               return error;
+
+       s->chrg_det = val & BIT(13);
+       s->rvrs_chrg = val & BIT(12);
+       s->vbusov = val & BIT(11);
+
+       error = regmap_read(ddata->reg, CPCAP_REG_INTS2, &val);
+       if (error)
+               return error;
+
+       s->chrg_se1b = val & BIT(13);
+       s->rvrs_mode = val & BIT(6);
+       s->chrgcurr1 = val & BIT(4);
+       s->vbusvld = val & BIT(3);
+
+       error = regmap_read(ddata->reg, CPCAP_REG_INTS4, &val);
+       if (error)
+               return error;
+
+       s->battdetb = val & BIT(6);
+
+       return 0;
+}
+
+static void cpcap_usb_detect(struct work_struct *work)
+{
+       struct cpcap_charger_ddata *ddata;
+       struct cpcap_charger_ints_state s;
+       int error;
+
+       ddata = container_of(work, struct cpcap_charger_ddata,
+                            detect_work.work);
+
+       error = cpcap_charger_get_ints_state(ddata, &s);
+       if (error)
+               return;
+
+       if (cpcap_charger_vbus_valid(ddata) && s.chrgcurr1) {
+               int max_current;
+
+               if (cpcap_charger_battery_found(ddata))
+                       max_current = CPCAP_REG_CRM_ICHRG_1A584;
+               else
+                       max_current = CPCAP_REG_CRM_ICHRG_0A528;
+
+               error = cpcap_charger_set_state(ddata,
+                                               CPCAP_REG_CRM_VCHRG_4V20,
+                                               max_current,
+                                               CPCAP_REG_CRM_TR_0A72);
+               if (error)
+                       goto out_err;
+       } else {
+               error = cpcap_charger_set_state(ddata, 0, 0, 0);
+               if (error)
+                       goto out_err;
+       }
+
+       return;
+
+out_err:
+       dev_err(ddata->dev, "%s failed with %i\n", __func__, error);
+}
+
+static irqreturn_t cpcap_charger_irq_thread(int irq, void *data)
+{
+       struct cpcap_charger_ddata *ddata = data;
+
+       if (!atomic_read(&ddata->active))
+               return IRQ_NONE;
+
+       schedule_delayed_work(&ddata->detect_work, 0);
+
+       return IRQ_HANDLED;
+}
+
+static int cpcap_usb_init_irq(struct platform_device *pdev,
+                             struct cpcap_charger_ddata *ddata,
+                             const char *name)
+{
+       struct cpcap_interrupt_desc *d;
+       int irq, error;
+
+       irq = platform_get_irq_byname(pdev, name);
+       if (!irq)
+               return -ENODEV;
+
+       error = devm_request_threaded_irq(ddata->dev, irq, NULL,
+                                         cpcap_charger_irq_thread,
+                                         IRQF_SHARED,
+                                         name, ddata);
+       if (error) {
+               dev_err(ddata->dev, "could not get irq %s: %i\n",
+                       name, error);
+
+               return error;
+       }
+
+       d = devm_kzalloc(ddata->dev, sizeof(*d), GFP_KERNEL);
+       if (!d)
+               return -ENOMEM;
+
+       d->name = name;
+       d->irq = irq;
+       list_add(&d->node, &ddata->irq_list);
+
+       return 0;
+}
+
+static const char * const cpcap_charger_irqs[] = {
+       /* REG_INT_0 */
+       "chrg_det", "rvrs_chrg",
+
+       /* REG_INT_1 */
+       "chrg_se1b", "se0conn", "rvrs_mode", "chrgcurr1", "vbusvld",
+
+       /* REG_INT_3 */
+       "battdetb",
+};
+
+static int cpcap_usb_init_interrupts(struct platform_device *pdev,
+                                    struct cpcap_charger_ddata *ddata)
+{
+       int i, error;
+
+       for (i = 0; i < ARRAY_SIZE(cpcap_charger_irqs); i++) {
+               error = cpcap_usb_init_irq(pdev, ddata, cpcap_charger_irqs[i]);
+               if (error)
+                       return error;
+       }
+
+       return 0;
+}
+
+static void cpcap_charger_init_optional_gpios(struct cpcap_charger_ddata *ddata)
+{
+       int i;
+
+       for (i = 0; i < 2; i++) {
+               ddata->gpio[i] = devm_gpiod_get_index(ddata->dev, "mode",
+                                                     i, GPIOD_OUT_HIGH);
+               if (IS_ERR(ddata->gpio[i])) {
+                       dev_info(ddata->dev, "no mode change GPIO%i: %li\n",
+                                i, PTR_ERR(ddata->gpio[i]));
+                                ddata->gpio[i] = NULL;
+               }
+       }
+}
+
+static int cpcap_charger_init_iio(struct cpcap_charger_ddata *ddata)
+{
+       const char * const names[CPCAP_CHARGER_IIO_NR] = {
+               "battdetb", "battp", "vbus", "chg_isense", "batti",
+       };
+       int error, i;
+
+       for (i = 0; i < CPCAP_CHARGER_IIO_NR; i++) {
+               ddata->channels[i] = devm_iio_channel_get(ddata->dev,
+                                                         names[i]);
+               if (IS_ERR(ddata->channels[i])) {
+                       error = PTR_ERR(ddata->channels[i]);
+                       goto out_err;
+               }
+
+               if (!ddata->channels[i]->indio_dev) {
+                       error = -ENXIO;
+                       goto out_err;
+               }
+       }
+
+       return 0;
+
+out_err:
+       dev_err(ddata->dev, "could not initialize VBUS or ID IIO: %i\n",
+               error);
+
+       return error;
+}
+
+static const struct power_supply_desc cpcap_charger_usb_desc = {
+       .name           = "cpcap_usb",
+       .type           = POWER_SUPPLY_TYPE_USB,
+       .properties     = cpcap_charger_props,
+       .num_properties = ARRAY_SIZE(cpcap_charger_props),
+       .get_property   = cpcap_charger_get_property,
+};
+
+#ifdef CONFIG_OF
+static const struct of_device_id cpcap_charger_id_table[] = {
+       {
+               .compatible = "motorola,mapphone-cpcap-charger",
+       },
+       {},
+};
+MODULE_DEVICE_TABLE(of, cpcap_charger_id_table);
+#endif
+
+static int cpcap_charger_probe(struct platform_device *pdev)
+{
+       struct cpcap_charger_ddata *ddata;
+       const struct of_device_id *of_id;
+       int error;
+
+       of_id = of_match_device(of_match_ptr(cpcap_charger_id_table),
+                               &pdev->dev);
+       if (!of_id)
+               return -EINVAL;
+
+       ddata = devm_kzalloc(&pdev->dev, sizeof(*ddata), GFP_KERNEL);
+       if (!ddata)
+               return -ENOMEM;
+
+       ddata->dev = &pdev->dev;
+
+       ddata->reg = dev_get_regmap(ddata->dev->parent, NULL);
+       if (!ddata->reg)
+               return -ENODEV;
+
+       INIT_LIST_HEAD(&ddata->irq_list);
+       INIT_DELAYED_WORK(&ddata->detect_work, cpcap_usb_detect);
+       INIT_DELAYED_WORK(&ddata->vbus_work, cpcap_charger_vbus_work);
+       platform_set_drvdata(pdev, ddata);
+
+       error = cpcap_charger_init_iio(ddata);
+       if (error)
+               return error;
+
+       atomic_set(&ddata->active, 1);
+
+       ddata->usb = devm_power_supply_register(ddata->dev,
+                                               &cpcap_charger_usb_desc,
+                                               NULL);
+       if (IS_ERR(ddata->usb)) {
+               error = PTR_ERR(ddata->usb);
+               dev_err(ddata->dev, "failed to register USB charger: %i\n",
+                       error);
+
+               return error;
+       }
+
+       error = cpcap_usb_init_interrupts(pdev, ddata);
+       if (error)
+               return error;
+
+       ddata->comparator.set_vbus = cpcap_charger_set_vbus;
+       error = omap_usb2_set_comparator(&ddata->comparator);
+       if (error == -ENODEV) {
+               dev_info(ddata->dev, "charger needs phy, deferring probe\n");
+               return -EPROBE_DEFER;
+       }
+
+       cpcap_charger_init_optional_gpios(ddata);
+
+       schedule_delayed_work(&ddata->detect_work, 0);
+
+       return 0;
+}
+
+static int cpcap_charger_remove(struct platform_device *pdev)
+{
+       struct cpcap_charger_ddata *ddata = platform_get_drvdata(pdev);
+       int error;
+
+       atomic_set(&ddata->active, 0);
+       error = omap_usb2_set_comparator(NULL);
+       if (error)
+               dev_warn(ddata->dev, "could not clear USB comparator: %i\n",
+                        error);
+
+       error = cpcap_charger_set_state(ddata, 0, 0, 0);
+       if (error)
+               dev_warn(ddata->dev, "could not clear charger: %i\n",
+                        error);
+       cancel_delayed_work_sync(&ddata->vbus_work);
+       cancel_delayed_work_sync(&ddata->detect_work);
+
+       return 0;
+}
+
+static struct platform_driver cpcap_charger_driver = {
+       .probe = cpcap_charger_probe,
+       .driver = {
+               .name   = "cpcap-charger",
+               .of_match_table = of_match_ptr(cpcap_charger_id_table),
+       },
+       .remove = cpcap_charger_remove,
+};
+module_platform_driver(cpcap_charger_driver);
+
+MODULE_AUTHOR("Tony Lindgren <tony@atomide.com>");
+MODULE_DESCRIPTION("CPCAP Battery Charger Interface driver");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:cpcap-charger");
diff --git a/drivers/power/supply/lego_ev3_battery.c b/drivers/power/supply/lego_ev3_battery.c
new file mode 100644 (file)
index 0000000..7b993d6
--- /dev/null
@@ -0,0 +1,228 @@
+/*
+ * Battery driver for LEGO MINDSTORMS EV3
+ *
+ * Copyright (C) 2017 David Lechner <david@lechnology.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/gpio/consumer.h>
+#include <linux/iio/consumer.h>
+#include <linux/iio/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/power_supply.h>
+
+struct lego_ev3_battery {      /* per-device driver state */
+       struct iio_channel *iio_v;      /* "voltage" IIO channel */
+       struct iio_channel *iio_i;      /* "current" IIO channel */
+       struct gpio_desc *rechargeable_gpio;    /* "rechargeable" input GPIO */
+       struct power_supply *psy;       /* registered power supply */
+       int technology; /* POWER_SUPPLY_TECHNOLOGY_* value */
+       int v_max;      /* reported as VOLTAGE_MAX_DESIGN */
+       int v_min;      /* reported as VOLTAGE_MIN_DESIGN */
+};
+
+/*
+ * Report one battery property in @val.
+ *
+ * VOLTAGE_NOW and CURRENT_NOW are computed from the two IIO channels. A
+ * failed channel read is propagated to the caller instead of being folded
+ * into the result, which previously could be built from an uninitialized
+ * value. Properties this driver does not provide yield -EINVAL.
+ */
+static int lego_ev3_battery_get_property(struct power_supply *psy,
+                                        enum power_supply_property psp,
+                                        union power_supply_propval *val)
+{
+       struct lego_ev3_battery *batt = power_supply_get_drvdata(psy);
+       int ret, val2;
+
+       switch (psp) {
+       case POWER_SUPPLY_PROP_TECHNOLOGY:
+               val->intval = batt->technology;
+               break;
+       case POWER_SUPPLY_PROP_VOLTAGE_NOW:
+               /* battery voltage is iio channel * 2 + Vce of transistor */
+               ret = iio_read_channel_processed(batt->iio_v, &val->intval);
+               if (ret < 0)
+                       return ret;
+               val->intval *= 2000;
+               val->intval += 200000;
+               /* plus adjust for shunt resistor drop */
+               ret = iio_read_channel_processed(batt->iio_i, &val2);
+               if (ret < 0)
+                       return ret;
+               val2 *= 1000;
+               val2 /= 15;
+               val->intval += val2;
+               break;
+       case POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN:
+               val->intval = batt->v_max;
+               break;
+       case POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN:
+               val->intval = batt->v_min;
+               break;
+       case POWER_SUPPLY_PROP_CURRENT_NOW:
+               /* battery current is iio channel / 15 / 0.05 ohms */
+               ret = iio_read_channel_processed(batt->iio_i, &val->intval);
+               if (ret < 0)
+                       return ret;
+               val->intval *= 20000;
+               val->intval /= 15;
+               break;
+       case POWER_SUPPLY_PROP_SCOPE:
+               val->intval = POWER_SUPPLY_SCOPE_SYSTEM;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * Write a battery property. The only accepted write is switching the
+ * technology from Unknown to NiMH; everything else is -EINVAL.
+ */
+static int lego_ev3_battery_set_property(struct power_supply *psy,
+                                        enum power_supply_property psp,
+                                        const union power_supply_propval *val)
+{
+       struct lego_ev3_battery *batt = power_supply_get_drvdata(psy);
+
+       if (psp != POWER_SUPPLY_PROP_TECHNOLOGY)
+               return -EINVAL;
+
+       /*
+        * Li-ion packs are detected automatically and must not be
+        * overridden, so only an Unknown technology may be changed.
+        * Rechargeable AA cells cannot be detected and therefore have to
+        * be announced by hand. This is meant to happen once during
+        * system init, hence there is no way back to Unknown.
+        */
+       if (batt->technology != POWER_SUPPLY_TECHNOLOGY_UNKNOWN)
+               return -EINVAL;
+
+       if (val->intval != POWER_SUPPLY_TECHNOLOGY_NiMH)
+               return -EINVAL;
+
+       batt->technology = POWER_SUPPLY_TECHNOLOGY_NiMH;
+       batt->v_max = 7800000;
+       batt->v_min = 5400000;
+
+       return 0;
+}
+
+/*
+ * Only the technology property is writeable, and only for as long as the
+ * technology is still Unknown.
+ */
+static int lego_ev3_battery_property_is_writeable(struct power_supply *psy,
+                                                 enum power_supply_property psp)
+{
+       struct lego_ev3_battery *batt = power_supply_get_drvdata(psy);
+
+       if (psp != POWER_SUPPLY_PROP_TECHNOLOGY)
+               return 0;
+
+       return batt->technology == POWER_SUPPLY_TECHNOLOGY_UNKNOWN;
+}
+
+static enum power_supply_property lego_ev3_battery_props[] = { /* handled by get_property */
+       POWER_SUPPLY_PROP_TECHNOLOGY,
+       POWER_SUPPLY_PROP_VOLTAGE_NOW,
+       POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN,
+       POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN,
+       POWER_SUPPLY_PROP_CURRENT_NOW,
+       POWER_SUPPLY_PROP_SCOPE,
+};
+
+static const struct power_supply_desc lego_ev3_battery_desc = {        /* passed to devm_power_supply_register() */
+       .name                   = "lego-ev3-battery",
+       .type                   = POWER_SUPPLY_TYPE_BATTERY,
+       .properties             = lego_ev3_battery_props,
+       .num_properties         = ARRAY_SIZE(lego_ev3_battery_props),
+       .get_property           = lego_ev3_battery_get_property,
+       .set_property           = lego_ev3_battery_set_property,        /* accepts Unknown -> NiMH only */
+       .property_is_writeable  = lego_ev3_battery_property_is_writeable,
+};
+
+/*
+ * Bind to the battery device: fetch the "voltage" and "current" IIO
+ * channels and the "rechargeable" GPIO, derive the battery technology and
+ * design voltage limits from the GPIO level and register the power supply.
+ *
+ * All resources are device-managed, so no remove callback is required.
+ * Returns 0 on success or the negative error of the failing lookup or
+ * registration (-EPROBE_DEFER from the lookups is passed on silently).
+ */
+static int lego_ev3_battery_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct lego_ev3_battery *batt;
+       struct power_supply_config psy_cfg = {};
+       int err;
+
+       batt = devm_kzalloc(dev, sizeof(*batt), GFP_KERNEL);
+       if (!batt)
+               return -ENOMEM;
+
+       platform_set_drvdata(pdev, batt);
+
+       batt->iio_v = devm_iio_channel_get(dev, "voltage");
+       err = PTR_ERR_OR_ZERO(batt->iio_v);
+       if (err) {
+               if (err != -EPROBE_DEFER)
+                       dev_err(dev, "Failed to get voltage iio channel\n");
+               return err;
+       }
+
+       batt->iio_i = devm_iio_channel_get(dev, "current");
+       err = PTR_ERR_OR_ZERO(batt->iio_i);
+       if (err) {
+               if (err != -EPROBE_DEFER)
+                       dev_err(dev, "Failed to get current iio channel\n");
+               return err;
+       }
+
+       batt->rechargeable_gpio = devm_gpiod_get(dev, "rechargeable", GPIOD_IN);
+       err = PTR_ERR_OR_ZERO(batt->rechargeable_gpio);
+       if (err) {
+               if (err != -EPROBE_DEFER)
+                       dev_err(dev, "Failed to get rechargeable gpio\n");
+               return err;
+       }
+
+       /*
+        * The rechargeable battery indication switch cannot be changed without
+        * removing the battery, so we only need to read it once.
+        *
+        * Design limits are in microvolts, the same unit the NiMH limits
+        * in the set_property handler use (7800000 / 5400000).
+        */
+       if (gpiod_get_value(batt->rechargeable_gpio)) {
+               /* 2-cell Li-ion, 7.4V nominal: 8.4V full, 6.0V empty */
+               batt->technology = POWER_SUPPLY_TECHNOLOGY_LION;
+               batt->v_max = 8400000;
+               batt->v_min = 6000000;
+       } else {
+               /* 6x AA Alkaline, 9V nominal: 9.0V full, 4.8V empty */
+               batt->technology = POWER_SUPPLY_TECHNOLOGY_UNKNOWN;
+               batt->v_max = 9000000;
+               batt->v_min = 4800000;
+       }
+
+       psy_cfg.of_node = pdev->dev.of_node;
+       psy_cfg.drv_data = batt;
+
+       batt->psy = devm_power_supply_register(dev, &lego_ev3_battery_desc,
+                                              &psy_cfg);
+       err = PTR_ERR_OR_ZERO(batt->psy);
+       if (err) {
+               dev_err(dev, "failed to register power supply\n");
+               return err;
+       }
+
+       return 0;
+}
+
+static const struct of_device_id of_lego_ev3_battery_match[] = {       /* device tree match table */
+       { .compatible = "lego,ev3-battery", },
+       { }     /* sentinel */
+};
+MODULE_DEVICE_TABLE(of, of_lego_ev3_battery_match);
+
+static struct platform_driver lego_ev3_battery_driver = {      /* no .remove: probe uses devm_* only */
+       .driver = {
+               .name           = "lego-ev3-battery",
+               .of_match_table = of_lego_ev3_battery_match,
+       },
+       .probe  = lego_ev3_battery_probe,
+};
+module_platform_driver(lego_ev3_battery_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David Lechner <david@lechnology.com>");
+MODULE_DESCRIPTION("LEGO MINDSTORMS EV3 Battery Driver");
index 509e2b3..677f7c4 100644 (file)
@@ -651,7 +651,7 @@ static ssize_t lp8788_show_eoc_time(struct device *dev,
 {
        struct lp8788_charger *pchg = dev_get_drvdata(dev);
        char *stime[] = { "400ms", "5min", "10min", "15min",
-                       "20min", "25min", "30min" "No timeout" };
+                       "20min", "25min", "30min", "No timeout" };
        u8 val;
 
        lp8788_read_byte(pchg->lp, LP8788_CHG_EOC, &val);
index 4adf2ba..7efb908 100644 (file)
@@ -9,6 +9,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/of_device.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/swab.h>
@@ -61,7 +62,7 @@ struct ltc294x_info {
        struct power_supply *supply;    /* Supply pointer */
        struct power_supply_desc supply_desc;   /* Supply description */
        struct delayed_work work;       /* Work scheduler */
-       int num_regs;   /* Number of registers (chip type) */
+       unsigned long num_regs; /* Number of registers (chip type) */
        int charge;     /* Last charge register content */
        int r_sense;    /* mOhm */
        int Qlsb;       /* nAh */
@@ -387,7 +388,7 @@ static int ltc294x_i2c_probe(struct i2c_client *client,
 
        np = of_node_get(client->dev.of_node);
 
-       info->num_regs = id->driver_data;
+       info->num_regs = (unsigned long)of_device_get_match_data(&client->dev);
        info->supply_desc.name = np->name;
 
        /* r_sense can be negative, when sense+ is connected to the battery
@@ -497,9 +498,23 @@ static const struct i2c_device_id ltc294x_i2c_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, ltc294x_i2c_id);
 
+static const struct of_device_id ltc294x_i2c_of_match[] = {
+       {
+               .compatible = "lltc,ltc2941",
+               .data = (void *)LTC2941_NUM_REGS
+       },
+       {
+               .compatible = "lltc,ltc2943",
+               .data = (void *)LTC2943_NUM_REGS
+       },
+       { },
+};
+MODULE_DEVICE_TABLE(of, ltc294x_i2c_of_match);
+
 static struct i2c_driver ltc294x_driver = {
        .driver = {
                .name   = "LTC2941",
+               .of_match_table = ltc294x_i2c_of_match,
                .pm     = LTC294X_PM_OPS,
        },
        .probe          = ltc294x_i2c_probe,
index e7c3649..33c40f7 100644 (file)
@@ -277,9 +277,17 @@ static const struct i2c_device_id max17040_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, max17040_id);
 
+static const struct of_device_id max17040_of_match[] = {
+       { .compatible = "maxim,max17040" },
+       { .compatible = "maxim,max77836-battery" },
+       { },
+};
+MODULE_DEVICE_TABLE(of, max17040_of_match);
+
 static struct i2c_driver max17040_i2c_driver = {
        .driver = {
                .name   = "max17040",
+               .of_match_table = max17040_of_match,
                .pm     = MAX17040_PM_OPS,
        },
        .probe          = max17040_probe,
index 353765a..15947db 100644 (file)
@@ -137,10 +137,7 @@ static enum power_supply_property sbs_properties[] = {
 
 static bool sbs_readable_reg(struct device *dev, unsigned int reg)
 {
-       if (reg < SBS_CHARGER_REG_SPEC_INFO)
-               return false;
-       else
-               return true;
+       return reg >= SBS_CHARGER_REG_SPEC_INFO;
 }
 
 static bool sbs_volatile_reg(struct device *dev, unsigned int reg)
index 29b61e8..1f52340 100644 (file)
@@ -58,8 +58,6 @@ static int tps65217_config_charger(struct tps65217_charger *charger)
 {
        int ret;
 
-       dev_dbg(charger->dev, "%s\n", __func__);
-
        /*
         * tps65217 rev. G, p. 31 (see p. 32 for NTC schematic)
         *
@@ -205,8 +203,6 @@ static int tps65217_charger_probe(struct platform_device *pdev)
        int ret;
        int i;
 
-       dev_dbg(&pdev->dev, "%s\n", __func__);
-
        charger = devm_kzalloc(&pdev->dev, sizeof(*charger), GFP_KERNEL);
        if (!charger)
                return -ENOMEM;
index bcd4dc3..990ff3d 100644 (file)
@@ -1117,7 +1117,7 @@ fail:
        return ret;
 }
 
-static int __exit twl4030_bci_remove(struct platform_device *pdev)
+static int twl4030_bci_remove(struct platform_device *pdev)
 {
        struct twl4030_bci *bci = platform_get_drvdata(pdev);
 
@@ -1148,11 +1148,11 @@ MODULE_DEVICE_TABLE(of, twl_bci_of_match);
 
 static struct platform_driver twl4030_bci_driver = {
        .probe = twl4030_bci_probe,
+       .remove = twl4030_bci_remove,
        .driver = {
                .name   = "twl4030_bci",
                .of_match_table = of_match_ptr(twl_bci_of_match),
        },
-       .remove = __exit_p(twl4030_bci_remove),
 };
 module_platform_driver(twl4030_bci_driver);
 
index e814280..b774357 100644 (file)
@@ -97,30 +97,26 @@ static s32 scaled_ppm_to_ppb(long ppm)
 
 /* posix clock implementation */
 
-static int ptp_clock_getres(struct posix_clock *pc, struct timespec *tp)
+static int ptp_clock_getres(struct posix_clock *pc, struct timespec64 *tp)
 {
        tp->tv_sec = 0;
        tp->tv_nsec = 1;
        return 0;
 }
 
-static int ptp_clock_settime(struct posix_clock *pc, const struct timespec *tp)
+static int ptp_clock_settime(struct posix_clock *pc, const struct timespec64 *tp)
 {
        struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
-       struct timespec64 ts = timespec_to_timespec64(*tp);
 
-       return  ptp->info->settime64(ptp->info, &ts);
+       return  ptp->info->settime64(ptp->info, tp);
 }
 
-static int ptp_clock_gettime(struct posix_clock *pc, struct timespec *tp)
+static int ptp_clock_gettime(struct posix_clock *pc, struct timespec64 *tp)
 {
        struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
-       struct timespec64 ts;
        int err;
 
-       err = ptp->info->gettime64(ptp->info, &ts);
-       if (!err)
-               *tp = timespec64_to_timespec(ts);
+       err = ptp->info->gettime64(ptp->info, tp);
        return err;
 }
 
@@ -133,7 +129,7 @@ static int ptp_clock_adjtime(struct posix_clock *pc, struct timex *tx)
        ops = ptp->info;
 
        if (tx->modes & ADJ_SETOFFSET) {
-               struct timespec ts;
+               struct timespec64 ts;
                ktime_t kt;
                s64 delta;
 
@@ -146,7 +142,7 @@ static int ptp_clock_adjtime(struct posix_clock *pc, struct timex *tx)
                if ((unsigned long) ts.tv_nsec >= NSEC_PER_SEC)
                        return -EINVAL;
 
-               kt = timespec_to_ktime(ts);
+               kt = timespec64_to_ktime(ts);
                delta = ktime_to_ns(kt);
                err = ops->adjtime(ops, delta);
        } else if (tx->modes & ADJ_FREQUENCY) {
index 053088b..c1527cb 100644 (file)
@@ -36,6 +36,14 @@ static const struct pwm_lpss_boardinfo pwm_lpss_bxt_info = {
        .clk_rate = 19200000,
        .npwm = 4,
        .base_unit_bits = 22,
+       .bypass = true,
+};
+
+/* Tangier */
+static const struct pwm_lpss_boardinfo pwm_lpss_tng_info = {
+       .clk_rate = 19200000,
+       .npwm = 4,
+       .base_unit_bits = 22,
 };
 
 static int pwm_lpss_probe_pci(struct pci_dev *pdev,
@@ -97,7 +105,7 @@ static const struct pci_device_id pwm_lpss_pci_ids[] = {
        { PCI_VDEVICE(INTEL, 0x0ac8), (unsigned long)&pwm_lpss_bxt_info},
        { PCI_VDEVICE(INTEL, 0x0f08), (unsigned long)&pwm_lpss_byt_info},
        { PCI_VDEVICE(INTEL, 0x0f09), (unsigned long)&pwm_lpss_byt_info},
-       { PCI_VDEVICE(INTEL, 0x11a5), (unsigned long)&pwm_lpss_bxt_info},
+       { PCI_VDEVICE(INTEL, 0x11a5), (unsigned long)&pwm_lpss_tng_info},
        { PCI_VDEVICE(INTEL, 0x1ac8), (unsigned long)&pwm_lpss_bxt_info},
        { PCI_VDEVICE(INTEL, 0x2288), (unsigned long)&pwm_lpss_bsw_info},
        { PCI_VDEVICE(INTEL, 0x2289), (unsigned long)&pwm_lpss_bsw_info},
index b22b6fd..5d6ed15 100644 (file)
@@ -37,6 +37,7 @@ static const struct pwm_lpss_boardinfo pwm_lpss_bxt_info = {
        .clk_rate = 19200000,
        .npwm = 4,
        .base_unit_bits = 22,
+       .bypass = true,
 };
 
 static int pwm_lpss_probe_platform(struct platform_device *pdev)
index 689d2c1..8db0d40 100644 (file)
@@ -57,7 +57,7 @@ static inline void pwm_lpss_write(const struct pwm_device *pwm, u32 value)
        writel(value, lpwm->regs + pwm->hwpwm * PWM_SIZE + PWM);
 }
 
-static int pwm_lpss_update(struct pwm_device *pwm)
+static int pwm_lpss_wait_for_update(struct pwm_device *pwm)
 {
        struct pwm_lpss_chip *lpwm = to_lpwm(pwm->chip);
        const void __iomem *addr = lpwm->regs + pwm->hwpwm * PWM_SIZE + PWM;
@@ -65,8 +65,6 @@ static int pwm_lpss_update(struct pwm_device *pwm)
        u32 val;
        int err;
 
-       pwm_lpss_write(pwm, pwm_lpss_read(pwm) | PWM_SW_UPDATE);
-
        /*
         * PWM Configuration register has SW_UPDATE bit that is set when a new
         * configuration is written to the register. The bit is automatically
@@ -122,6 +120,12 @@ static void pwm_lpss_prepare(struct pwm_lpss_chip *lpwm, struct pwm_device *pwm,
        pwm_lpss_write(pwm, ctrl);
 }
 
+/* Set PWM_ENABLE in the control register, but only when @cond holds. */
+static inline void pwm_lpss_cond_enable(struct pwm_device *pwm, bool cond)
+{
+       if (!cond)
+               return;
+
+       pwm_lpss_write(pwm, pwm_lpss_read(pwm) | PWM_ENABLE);
+}
+
 static int pwm_lpss_apply(struct pwm_chip *chip, struct pwm_device *pwm,
                          struct pwm_state *state)
 {
@@ -137,18 +141,21 @@ static int pwm_lpss_apply(struct pwm_chip *chip, struct pwm_device *pwm,
                                return ret;
                        }
                        pwm_lpss_prepare(lpwm, pwm, state->duty_cycle, state->period);
-                       ret = pwm_lpss_update(pwm);
+                       pwm_lpss_write(pwm, pwm_lpss_read(pwm) | PWM_SW_UPDATE);
+                       pwm_lpss_cond_enable(pwm, lpwm->info->bypass == false);
+                       ret = pwm_lpss_wait_for_update(pwm);
                        if (ret) {
                                pm_runtime_put(chip->dev);
                                return ret;
                        }
-                       pwm_lpss_write(pwm, pwm_lpss_read(pwm) | PWM_ENABLE);
+                       pwm_lpss_cond_enable(pwm, lpwm->info->bypass == true);
                } else {
                        ret = pwm_lpss_is_updating(pwm);
                        if (ret)
                                return ret;
                        pwm_lpss_prepare(lpwm, pwm, state->duty_cycle, state->period);
-                       return pwm_lpss_update(pwm);
+                       pwm_lpss_write(pwm, pwm_lpss_read(pwm) | PWM_SW_UPDATE);
+                       return pwm_lpss_wait_for_update(pwm);
                }
        } else if (pwm_is_enabled(pwm)) {
                pwm_lpss_write(pwm, pwm_lpss_read(pwm) & ~PWM_ENABLE);
index c94cd7c..98306bb 100644 (file)
@@ -22,6 +22,7 @@ struct pwm_lpss_boardinfo {
        unsigned long clk_rate;
        unsigned int npwm;
        unsigned long base_unit_bits;
+       bool bypass;
 };
 
 struct pwm_lpss_chip *pwm_lpss_probe(struct device *dev, struct resource *r,
index ef89df1..744d561 100644 (file)
@@ -191,6 +191,28 @@ static int rockchip_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
        return 0;
 }
 
+/*
+ * Switch the PWM on or off together with its clock: the clock is enabled
+ * before the hardware is turned on and disabled after it is turned off.
+ * Returns 0, or the clk_enable() error when enabling fails.
+ */
+static int rockchip_pwm_enable(struct pwm_chip *chip,
+                        struct pwm_device *pwm,
+                        bool enable,
+                        enum pwm_polarity polarity)
+{
+       struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip);
+       int err;
+
+       if (!enable) {
+               pc->data->set_enable(chip, pwm, enable, polarity);
+               clk_disable(pc->clk);
+               return 0;
+       }
+
+       err = clk_enable(pc->clk);
+       if (err)
+               return err;
+
+       pc->data->set_enable(chip, pwm, enable, polarity);
+
+       return 0;
+}
+
 static int rockchip_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
                              struct pwm_state *state)
 {
@@ -207,22 +229,26 @@ static int rockchip_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
                return ret;
 
        if (state->polarity != curstate.polarity && enabled) {
-               pc->data->set_enable(chip, pwm, false, state->polarity);
+               ret = rockchip_pwm_enable(chip, pwm, false, state->polarity);
+               if (ret)
+                       goto out;
                enabled = false;
        }
 
        ret = rockchip_pwm_config(chip, pwm, state->duty_cycle, state->period);
        if (ret) {
                if (enabled != curstate.enabled)
-                       pc->data->set_enable(chip, pwm, !enabled,
-                                            state->polarity);
-
+                       rockchip_pwm_enable(chip, pwm, !enabled,
+                                     state->polarity);
                goto out;
        }
 
-       if (state->enabled != enabled)
-               pc->data->set_enable(chip, pwm, state->enabled,
-                                    state->polarity);
+       if (state->enabled != enabled) {
+               ret = rockchip_pwm_enable(chip, pwm, state->enabled,
+                                   state->polarity);
+               if (ret)
+                       goto out;
+       }
 
        /*
         * Update the state with the real hardware, which can differ a bit
index d7f7334..7b26dd3 100644 (file)
@@ -1 +1,2 @@
-obj-$(CONFIG_RAS) += ras.o debugfs.o
+obj-$(CONFIG_RAS)      += ras.o debugfs.o
+obj-$(CONFIG_RAS_CEC)  += cec.o
diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
new file mode 100644 (file)
index 0000000..6aab46d
--- /dev/null
@@ -0,0 +1,532 @@
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+
+#include <asm/mce.h>
+
+#include "debugfs.h"
+
+/*
+ * RAS Correctable Errors Collector
+ *
+ * This is a simple gadget which collects correctable errors and counts their
+ * occurrence per physical page address.
+ *
+ * We've opted for possibly the simplest data structure to collect those - an
+ * array of the size of a memory page. It stores 512 u64's with the following
+ * structure:
+ *
+ * [63 ... PFN ... 12 | 11 ... generation ... 10 | 9 ... count ... 0]
+ *
+ * The generation in the two highest order bits is two bits which are set to 11b
+ * on every insertion. During the course of each entry's existence, the
+ * generation field gets decremented during spring cleaning to 10b, then 01b and
+ * then 00b.
+ *
+ * This way we're employing the natural numeric ordering to make sure that newly
+ * inserted/touched elements have higher 12-bit counts (which we've manufactured)
+ * and thus iterating over the array initially won't kick out those elements
+ * which were inserted last.
+ *
+ * Spring cleaning is what we do when we reach a certain number CLEAN_ELEMS of
+ * elements entered into the array, during which, we're decaying all elements.
+ * If, after decay, an element gets inserted again, its generation is set to 11b
+ * to make sure it has higher numerical count than other, older elements and
+ * thus emulate an LRU-like behavior when deleting elements to free up space
+ * in the page.
+ *
+ * When an element reaches its max count of count_threshold, we try to poison
+ * it by assuming that errors triggered count_threshold times in a single page
+ * are excessive and that page shouldn't be used anymore. count_threshold is
+ * initialized to COUNT_MASK which is the maximum.
+ *
+ * That error event entry causes cec_add_elem() to return !0 value and thus
+ * signal to its callers to log the error.
+ *
+ * To the question why we've chosen a page and moving elements around with
+ * memmove(), it is because it is a very simple structure to handle and max data
+ * movement is 4K which on highly optimized modern CPUs is almost unnoticeable.
+ * We wanted to avoid the pointer traversal of more complex structures like a
+ * linked list or some sort of a balancing search tree.
+ *
+ * Deleting an element takes O(n) but since it is only a single page, it should
+ * be fast enough and it shouldn't happen all too often depending on error
+ * patterns.
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) "RAS: " fmt
+
+/*
+ * We use DECAY_BITS bits of PAGE_SHIFT bits for counting decay, i.e., how long
+ * elements have stayed in the array without having been accessed again.
+ */
+#define DECAY_BITS             2
+#define DECAY_MASK             ((1ULL << DECAY_BITS) - 1)
+#define MAX_ELEMS              (PAGE_SIZE / sizeof(u64))
+
+/*
+ * Threshold amount of inserted elements after which we start spring
+ * cleaning.
+ */
+#define CLEAN_ELEMS            (MAX_ELEMS >> DECAY_BITS)
+
+/* Bits which count the number of errors which happened in this 4K page. */
+#define COUNT_BITS             (PAGE_SHIFT - DECAY_BITS)
+#define COUNT_MASK             ((1ULL << COUNT_BITS) - 1)
+#define FULL_COUNT_MASK                (PAGE_SIZE - 1)
+
+/*
+ * u64: [ 63 ... 12 | DECAY_BITS | COUNT_BITS ]
+ */
+
+#define PFN(e)                 ((e) >> PAGE_SHIFT)
+#define DECAY(e)               (((e) >> COUNT_BITS) & DECAY_MASK)
+#define COUNT(e)               ((unsigned int)(e) & COUNT_MASK)
+#define FULL_COUNT(e)          ((e) & (PAGE_SIZE - 1))
+
+static struct ce_array {       /* the collector's single instance is ce_arr */
+       u64 *array;                     /* container page */
+       unsigned int n;                 /* number of elements in the array */
+
+       unsigned int decay_count;       /*
+                                        * number of element insertions/increments
+                                        * since the last spring cleaning.
+                                        */
+
+       u64 pfns_poisoned;              /*
+                                        * number of PFNs which got poisoned.
+                                        */
+
+       u64 ces_entered;                /*
+                                        * The number of correctable errors
+                                        * entered into the collector.
+                                        */
+
+       u64 decays_done;                /*
+                                        * Times we did spring cleaning.
+                                        */
+
+       union {
+               struct {
+                       __u32   disabled : 1,   /* cmdline disabled */
+                       __resv   : 31;          /* remaining, unused bits */
+               };
+               __u32 flags;            /* all flag bits viewed as one word */
+       };
+} ce_arr;
+
+static DEFINE_MUTEX(ce_mutex);
+static u64 dfs_pfn;
+
+/* Amount of errors after which we offline */
+static unsigned int count_threshold = COUNT_MASK;
+
+/*
+ * The timer "decays" element count each timer_interval which is 24hrs by
+ * default.
+ *
+ * All intervals are in seconds. The expressions are parenthesized so that
+ * the macros expand safely inside any surrounding expression.
+ */
+
+#define CEC_TIMER_DEFAULT_INTERVAL     (24 * 60 * 60)          /* 24 hrs */
+#define CEC_TIMER_MIN_INTERVAL         (1 * 60 * 60)           /* 1h */
+#define CEC_TIMER_MAX_INTERVAL         (30 * 24 * 60 * 60)     /* one month */
+static struct timer_list cec_timer;
+static u64 timer_interval = CEC_TIMER_DEFAULT_INTERVAL;
+
+/*
+ * Age every element by one generation: the DECAY_BITS wide decay field of
+ * each entry is decremented unless it is already zero. Afterwards the
+ * insertion counter restarts and the number of cleanings is bumped.
+ */
+static void do_spring_cleaning(struct ce_array *ca)
+{
+       int idx;
+
+       for (idx = 0; idx < ca->n; idx++) {
+               u64 *elem = &ca->array[idx];
+               u8 gen = DECAY(*elem);
+
+               if (gen) {
+                       gen--;
+
+                       /* swap the old decay field for the decremented one */
+                       *elem &= ~(DECAY_MASK << COUNT_BITS);
+                       *elem |= (gen << COUNT_BITS);
+               }
+       }
+
+       ca->decay_count = 0;
+       ca->decays_done++;
+}
+
+/*
+ * Re-arm @t to expire @interval seconds from now, rounded with
+ * round_jiffies().
+ */
+static void cec_mod_timer(struct timer_list *t, unsigned long interval)
+{
+       unsigned long expires = jiffies + interval * HZ;
+
+       mod_timer(t, round_jiffies(expires));
+}
+
+/*
+ * Timer callback: decay all elements of the array passed in @data, then
+ * re-arm the timer for the next timer_interval.
+ *
+ * NOTE(review): the array is modified here without taking ce_mutex.
+ */
+static void cec_timer_fn(unsigned long data)
+{
+       do_spring_cleaning((struct ce_array *)data);
+
+       cec_mod_timer(&cec_timer, timer_interval);
+}
+
+/*
+ * Binary-search the PFN-sorted array for @pfn.
+ *
+ * @to: if non-NULL, receives the index of @pfn when it is found, otherwise
+ *      the index of the smallest element which is >= than @pfn, i.e. the
+ *      slot where @pfn has to be inserted (equal to ca->n when @pfn sorts
+ *      behind every element).
+ *
+ * Return the index of the pfn if found, otherwise -ENOKEY.
+ */
+static int __find_elem(struct ce_array *ca, u64 pfn, unsigned int *to)
+{
+       u64 this_pfn;
+       int min = 0, max = ca->n;
+
+       while (min < max) {
+               int tmp = (max + min) >> 1;
+
+               this_pfn = PFN(ca->array[tmp]);
+
+               if (this_pfn < pfn) {
+                       min = tmp + 1;
+               } else if (this_pfn > pfn) {
+                       max = tmp;
+               } else {
+                       if (to)
+                               *to = tmp;
+
+                       return tmp;
+               }
+       }
+
+       /*
+        * Not found. Do not look at array[min] here: min can be equal to
+        * ca->n, which is past the valid elements and may hold a stale
+        * entry left behind by an earlier deletion (or lie outside the
+        * page when the array is full).
+        */
+       if (to)
+               *to = min;
+
+       return -ENOKEY;
+}
+
+/*
+ * Look up @pfn, short-circuiting the empty array: with no elements the
+ * insertion slot reported through @to is 0 and the result is -ENOKEY.
+ */
+static int find_elem(struct ce_array *ca, u64 pfn, unsigned int *to)
+{
+       WARN_ON(!to);
+
+       if (ca->n)
+               return __find_elem(ca, pfn, to);
+
+       *to = 0;
+
+       return -ENOKEY;
+}
+
+/*
+ * Remove the element at @idx by moving everything behind it one slot
+ * towards the front, then shrink the element count.
+ */
+static void del_elem(struct ce_array *ca, int idx)
+{
+       unsigned int tail = ca->n - (idx + 1);
+
+       /* Nothing to move when the last element goes away. */
+       if (tail)
+               memmove((void *)&ca->array[idx],
+                       (void *)&ca->array[idx + 1],
+                       tail * sizeof(u64));
+
+       ca->n--;
+}
+
+/*
+ * Delete the element with the smallest combined decay|count value and
+ * return its PFN. Does no locking itself; both callers in this file hold
+ * ce_mutex around the call.
+ *
+ * Returns 0 when the array is empty.
+ */
+static u64 del_lru_elem_unlocked(struct ce_array *ca)
+{
+       unsigned int min = FULL_COUNT_MASK;
+       int i, min_idx = 0;
+       u64 pfn;
+
+       /* del_elem() must never see an empty array. */
+       if (!ca->n)
+               return 0;
+
+       for (i = 0; i < ca->n; i++) {
+               unsigned int this = FULL_COUNT(ca->array[i]);
+
+               if (min > this) {
+                       min = this;
+                       min_idx = i;
+               }
+       }
+
+       /*
+        * Read the PFN before deleting: afterwards slot min_idx holds the
+        * following element (or stale data if it was the last one).
+        */
+       pfn = PFN(ca->array[min_idx]);
+
+       del_elem(ca, min_idx);
+
+       return pfn;
+}
+
+/*
+ * Locked wrapper around del_lru_elem_unlocked() for the global array.
+ *
+ * An empty array yields pfn 0; the assumption is that the 0th pfn cannot
+ * be poisoned and that excessive CEs in there are a serious deal anyway.
+ */
+static u64 __maybe_unused del_lru_elem(void)
+{
+       struct ce_array *ca = &ce_arr;
+       u64 victim = 0;
+
+       if (ca->n) {
+               mutex_lock(&ce_mutex);
+               victim = del_lru_elem_unlocked(ca);
+               mutex_unlock(&ce_mutex);
+       }
+
+       return victim;
+}
+
+
+/*
+ * Account one correctable error for the page @pfn.
+ *
+ * A pfn not yet in the array is inserted with count 1 and the decay field
+ * at its maximum. A known pfn below count_threshold gets its count bumped
+ * and its decay field reset to the maximum. A known pfn which already sits
+ * at count_threshold is removed from the array and, if the pfn is valid,
+ * queued for soft-offlining.
+ *
+ * Return: 0 if the error was only counted, 1 if the threshold was reached,
+ * -ENODEV if the collector is not initialized or disabled.
+ */
+int cec_add_elem(u64 pfn)
+{
+       struct ce_array *ca = &ce_arr;
+       unsigned int to, count;
+       int ret = 0;
+
+       /*
+        * We can be called very early on the identify_cpu() path where we are
+        * not initialized yet. We ignore the error for simplicity.
+        */
+       if (!ce_arr.array || ce_arr.disabled)
+               return -ENODEV;
+
+       mutex_lock(&ce_mutex);
+
+       /* The statistics are shared state as well: update them locked. */
+       ca->ces_entered++;
+
+       if (ca->n == MAX_ELEMS)
+               WARN_ON(!del_lru_elem_unlocked(ca));
+
+       ret = find_elem(ca, pfn, &to);
+       if (ret < 0) {
+               /*
+                * Shift range [to-end] to make room for one more element.
+                */
+               memmove(&ca->array[to + 1],
+                       &ca->array[to],
+                       (ca->n - to) * sizeof(u64));
+
+               ca->array[to] = (pfn << PAGE_SHIFT) |
+                               (DECAY_MASK << COUNT_BITS) | 1;
+
+               ca->n++;
+
+               ret = 0;
+
+               goto decay;
+       }
+
+       count = COUNT(ca->array[to]);
+
+       if (count < count_threshold) {
+               /* touched again: youngest generation, one more error */
+               ca->array[to] |= (DECAY_MASK << COUNT_BITS);
+               ca->array[to]++;
+
+               ret = 0;
+       } else {
+               u64 bad_pfn = PFN(ca->array[to]);
+
+               if (!pfn_valid(bad_pfn)) {
+                       pr_warn("CEC: Invalid pfn: 0x%llx\n", bad_pfn);
+               } else {
+                       /* We have reached max count for this page, soft-offline it. */
+                       pr_err("Soft-offlining pfn: 0x%llx\n", bad_pfn);
+                       memory_failure_queue(bad_pfn, 0, MF_SOFT_OFFLINE);
+                       ca->pfns_poisoned++;
+               }
+
+               del_elem(ca, to);
+
+               /*
+                * Return a >0 value to denote that we've reached the offlining
+                * threshold.
+                */
+               ret = 1;
+
+               goto unlock;
+       }
+
+decay:
+       ca->decay_count++;
+
+       if (ca->decay_count >= CLEAN_ELEMS)
+               do_spring_cleaning(ca);
+
+unlock:
+       mutex_unlock(&ce_mutex);
+
+       return ret;
+}
+
+/* Generic debugfs getter: hand back the u64 that @data points to. */
+static int u64_get(void *data, u64 *val)
+{
+       const u64 *src = data;
+
+       *val = *src;
+
+       return 0;
+}
+
+/*
+ * debugfs setter for the "pfn" attribute: store @val in the variable behind
+ * @data and feed it to cec_add_elem(), whose result is returned.
+ */
+static int pfn_set(void *data, u64 val)
+{
+       u64 *slot = data;
+
+       *slot = val;
+
+       return cec_add_elem(val);
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(pfn_ops, u64_get, pfn_set, "0x%llx\n");
+
+/*
+ * debugfs setter for "decay_interval": accept @val only when it lies within
+ * [CEC_TIMER_MIN_INTERVAL, CEC_TIMER_MAX_INTERVAL], then store it and call
+ * cec_mod_timer() on cec_timer with the new interval.
+ *
+ * Returns 0 on success and -EINVAL when @val is out of range.
+ */
+static int decay_interval_set(void *data, u64 val)
+{
+       if (val < CEC_TIMER_MIN_INTERVAL)
+               return -EINVAL;
+
+       if (val > CEC_TIMER_MAX_INTERVAL)
+               return -EINVAL;
+
+       /*
+        * Store only after validation: @data is the attribute's backing
+        * variable, so a rejected value must never end up in it.
+        */
+       *(u64 *)data = val;
+
+       timer_interval = val;
+
+       cec_mod_timer(&cec_timer, timer_interval);
+       return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(decay_interval_ops, u64_get, decay_interval_set, "%lld\n");
+
+/*
+ * debugfs setter for "count_threshold": values above COUNT_MASK are clamped
+ * to COUNT_MASK. Always returns 0.
+ *
+ * The value is written to count_threshold only, after clamping, and not
+ * through @data as a raw u64: the file is created with @data pointing at
+ * count_threshold, so the extra store would merely expose the unclamped
+ * value for a moment.
+ * NOTE(review): count_threshold is printed with %d elsewhere in this file,
+ * which suggests it is narrower than u64 - confirm its type; a 64-bit store
+ * through @data would then also write past the variable.
+ */
+static int count_threshold_set(void *data, u64 val)
+{
+       if (val > COUNT_MASK)
+               val = COUNT_MASK;
+
+       count_threshold = val;
+
+       return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(count_threshold_ops, u64_get, count_threshold_set, "%lld\n");
+
+/*
+ * seq_file show callback behind the "array" debugfs file: prints every
+ * element of the CE array followed by the collector statistics, all under
+ * ce_mutex. Always returns 0.
+ */
+static int array_dump(struct seq_file *m, void *v)
+{
+       struct ce_array *ca = &ce_arr;
+       u64 last_pfn = 0;
+       int idx;
+
+       mutex_lock(&ce_mutex);
+
+       seq_printf(m, "{ n: %d\n", ca->n);
+       for (idx = 0; idx < ca->n; idx++) {
+               u64 elem = ca->array[idx];
+               u64 cur_pfn = PFN(elem);
+
+               seq_printf(m, " %03d: [%016llx|%03llx]\n", idx, cur_pfn, FULL_COUNT(elem));
+
+               /* Complain when an element's pfn is smaller than its predecessor's. */
+               WARN_ON(last_pfn > cur_pfn);
+
+               last_pfn = cur_pfn;
+       }
+
+       seq_printf(m, "}\n");
+
+       seq_printf(m, "Stats:\nCEs: %llu\nofflined pages: %llu\n",
+                  ca->ces_entered, ca->pfns_poisoned);
+
+       seq_printf(m, "Flags: 0x%x\n", ca->flags);
+
+       seq_printf(m, "Timer interval: %lld seconds\n", timer_interval);
+       seq_printf(m, "Decays: %lld\n", ca->decays_done);
+
+       seq_printf(m, "Action threshold: %d\n", count_threshold);
+
+       mutex_unlock(&ce_mutex);
+
+       return 0;
+}
+
+/* open() handler of the "array" file: bind array_dump() as the single show routine. */
+static int array_open(struct inode *inode, struct file *filp)
+{
+       return single_open(filp, array_dump, NULL);
+}
+
+/* File operations of the read-only "array" debugfs file (seq_file based). */
+static const struct file_operations array_ops = {
+       .owner   = THIS_MODULE,
+       .open    = array_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = single_release,
+};
+
+/*
+ * Create the "cec" directory below ras_debugfs_dir together with its "pfn",
+ * "array", "decay_interval" and "count_threshold" files.
+ *
+ * Returns 0 on success and a non-zero value on failure. When creating one of
+ * the files fails, the whole "cec" directory is removed again.
+ */
+static int __init create_debugfs_nodes(void)
+{
+       struct dentry *d, *pfn, *decay, *count, *array;
+
+       d = debugfs_create_dir("cec", ras_debugfs_dir);
+       if (!d) {
+               pr_warn("Error creating cec debugfs node!\n");
+               return -1;
+       }
+
+       pfn = debugfs_create_file("pfn", S_IRUSR | S_IWUSR, d, &dfs_pfn, &pfn_ops);
+       if (!pfn) {
+               pr_warn("Error creating pfn debugfs node!\n");
+               goto err;
+       }
+
+       array = debugfs_create_file("array", S_IRUSR, d, NULL, &array_ops);
+       if (!array) {
+               pr_warn("Error creating array debugfs node!\n");
+               goto err;
+       }
+
+       decay = debugfs_create_file("decay_interval", S_IRUSR | S_IWUSR, d,
+                                   &timer_interval, &decay_interval_ops);
+       if (!decay) {
+               pr_warn("Error creating decay_interval debugfs node!\n");
+               goto err;
+       }
+
+       count = debugfs_create_file("count_threshold", S_IRUSR | S_IWUSR, d,
+                                   &count_threshold, &count_threshold_ops);
+       /* Check the dentry that was just created, not the previous one. */
+       if (!count) {
+               pr_warn("Error creating count_threshold debugfs node!\n");
+               goto err;
+       }
+
+
+       return 0;
+
+err:
+       debugfs_remove_recursive(d);
+
+       return 1;
+}
+
+/*
+ * Boot-time setup of the collector: allocate one zeroed page for the element
+ * array, create the debugfs files, then set up cec_timer and call
+ * cec_mod_timer() with CEC_TIMER_DEFAULT_INTERVAL. Does nothing when the
+ * collector has been disabled.
+ */
+void __init cec_init(void)
+{
+       if (ce_arr.disabled)
+               return;
+
+       ce_arr.array = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!ce_arr.array) {
+               pr_err("Error allocating CE array page!\n");
+               return;
+       }
+
+       /*
+        * NOTE(review): on failure here the page allocated above stays
+        * assigned to ce_arr.array while the timer is never set up -
+        * confirm that this half-initialized state is intended.
+        */
+       if (create_debugfs_nodes())
+               return;
+
+       setup_timer(&cec_timer, cec_timer_fn, (unsigned long)&ce_arr);
+       cec_mod_timer(&cec_timer, CEC_TIMER_DEFAULT_INTERVAL);
+
+       pr_info("Correctable Errors collector initialized.\n");
+}
+
+/*
+ * Parse the collector's part of a kernel command line argument: an optional
+ * leading '=' followed by "cec_disable" switches the collector off.
+ *
+ * Returns 1 when the collector was disabled and 0 otherwise (including a
+ * NULL @str).
+ */
+int __init parse_cec_param(char *str)
+{
+       if (!str)
+               return 0;
+
+       if (*str == '=')
+               str++;
+
+       /* Match the whole keyword, not just its first seven characters. */
+       if (strcmp(str, "cec_disable"))
+               return 0;
+
+       ce_arr.disabled = 1;
+
+       return 1;
+}
index 0322acf..5016030 100644 (file)
@@ -1,6 +1,6 @@
 #include <linux/debugfs.h>
 
-static struct dentry *ras_debugfs_dir;
+struct dentry *ras_debugfs_dir;
 
 static atomic_t trace_count = ATOMIC_INIT(0);
 
diff --git a/drivers/ras/debugfs.h b/drivers/ras/debugfs.h
new file mode 100644 (file)
index 0000000..db72e45
--- /dev/null
@@ -0,0 +1,8 @@
+#ifndef __RAS_DEBUGFS_H__
+#define __RAS_DEBUGFS_H__
+
+#include <linux/debugfs.h>
+
+extern struct dentry *ras_debugfs_dir;
+
+#endif /* __RAS_DEBUGFS_H__ */
index b67dd36..94f8038 100644 (file)
@@ -27,3 +27,14 @@ subsys_initcall(ras_init);
 EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
 #endif
 EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
+
+
+/*
+ * Handler registered for the "ras" kernel command line parameter. With
+ * CONFIG_RAS_CEC enabled the argument string is passed on to
+ * parse_cec_param(); its result is ignored and 1 is always returned.
+ */
+int __init parse_ras_param(char *str)
+{
+#ifdef CONFIG_RAS_CEC
+       parse_cec_param(str);
+#endif
+
+       return 1;
+}
+__setup("ras", parse_ras_param);
index f1e5e65..cd739d2 100644 (file)
@@ -275,7 +275,7 @@ int reset_control_status(struct reset_control *rstc)
 }
 EXPORT_SYMBOL_GPL(reset_control_status);
 
-static struct reset_control *__reset_control_get(
+static struct reset_control *__reset_control_get_internal(
                                struct reset_controller_dev *rcdev,
                                unsigned int index, bool shared)
 {
@@ -308,7 +308,7 @@ static struct reset_control *__reset_control_get(
        return rstc;
 }
 
-static void __reset_control_put(struct reset_control *rstc)
+static void __reset_control_put_internal(struct reset_control *rstc)
 {
        lockdep_assert_held(&reset_list_mutex);
 
@@ -377,7 +377,7 @@ struct reset_control *__of_reset_control_get(struct device_node *node,
        }
 
        /* reset_list_mutex also protects the rcdev's reset_control list */
-       rstc = __reset_control_get(rcdev, rstc_id, shared);
+       rstc = __reset_control_get_internal(rcdev, rstc_id, shared);
 
        mutex_unlock(&reset_list_mutex);
 
@@ -385,6 +385,17 @@ struct reset_control *__of_reset_control_get(struct device_node *node,
 }
 EXPORT_SYMBOL_GPL(__of_reset_control_get);
 
+/*
+ * Look up a reset control for @dev. Devices with an of_node are resolved
+ * through __of_reset_control_get(); for all others the result is NULL when
+ * the control is @optional and ERR_PTR(-EINVAL) when it is not.
+ */
+struct reset_control *__reset_control_get(struct device *dev, const char *id,
+                                         int index, bool shared, bool optional)
+{
+       if (!dev->of_node)
+               return optional ? NULL : ERR_PTR(-EINVAL);
+
+       return __of_reset_control_get(dev->of_node, id, index, shared,
+                                     optional);
+}
+EXPORT_SYMBOL_GPL(__reset_control_get);
+
 /**
  * reset_control_put - free the reset controller
  * @rstc: reset controller
@@ -396,7 +407,7 @@ void reset_control_put(struct reset_control *rstc)
                return;
 
        mutex_lock(&reset_list_mutex);
-       __reset_control_put(rstc);
+       __reset_control_put_internal(rstc);
        mutex_unlock(&reset_list_mutex);
 }
 EXPORT_SYMBOL_GPL(reset_control_put);
@@ -417,8 +428,7 @@ struct reset_control *__devm_reset_control_get(struct device *dev,
        if (!ptr)
                return ERR_PTR(-ENOMEM);
 
-       rstc = __of_reset_control_get(dev ? dev->of_node : NULL,
-                                     id, index, shared, optional);
+       rstc = __reset_control_get(dev, id, index, shared, optional);
        if (!IS_ERR(rstc)) {
                *ptr = rstc;
                devres_add(dev, ptr);
index e7addea..d9561e3 100644 (file)
@@ -961,7 +961,8 @@ int qeth_bridgeport_query_ports(struct qeth_card *card,
 int qeth_bridgeport_setrole(struct qeth_card *card, enum qeth_sbp_roles role);
 int qeth_bridgeport_an_set(struct qeth_card *card, int enable);
 int qeth_get_priority_queue(struct qeth_card *, struct sk_buff *, int, int);
-int qeth_get_elements_no(struct qeth_card *, struct sk_buff *, int);
+int qeth_get_elements_no(struct qeth_card *card, struct sk_buff *skb,
+                        int extra_elems, int data_offset);
 int qeth_get_elements_for_frags(struct sk_buff *);
 int qeth_do_send_packet_fast(struct qeth_card *, struct qeth_qdio_out_q *,
                        struct sk_buff *, struct qeth_hdr *, int, int, int);
index 315d8a2..9a5f99c 100644 (file)
@@ -3837,6 +3837,7 @@ EXPORT_SYMBOL_GPL(qeth_get_elements_for_frags);
  * @card:                      qeth card structure, to check max. elems.
  * @skb:                       SKB address
  * @extra_elems:               extra elems needed, to check against max.
+ * @data_offset:               range starts at skb->data + data_offset
  *
  * Returns the number of pages, and thus QDIO buffer elements, needed to cover
  * skb data, including linear part and fragments. Checks if the result plus
@@ -3844,10 +3845,10 @@ EXPORT_SYMBOL_GPL(qeth_get_elements_for_frags);
  * Note: extra_elems is not included in the returned result.
  */
 int qeth_get_elements_no(struct qeth_card *card,
-                    struct sk_buff *skb, int extra_elems)
+                    struct sk_buff *skb, int extra_elems, int data_offset)
 {
        int elements = qeth_get_elements_for_range(
-                               (addr_t)skb->data,
+                               (addr_t)skb->data + data_offset,
                                (addr_t)skb->data + skb_headlen(skb)) +
                        qeth_get_elements_for_frags(skb);
 
index bea4833..af4e6a6 100644 (file)
@@ -849,7 +849,7 @@ static int qeth_l2_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
         * chaining we can not send long frag lists
         */
        if ((card->info.type != QETH_CARD_TYPE_IQD) &&
-           !qeth_get_elements_no(card, new_skb, 0)) {
+           !qeth_get_elements_no(card, new_skb, 0, 0)) {
                int lin_rc = skb_linearize(new_skb);
 
                if (card->options.performance_stats) {
@@ -894,7 +894,8 @@ static int qeth_l2_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
                }
        }
 
-       elements = qeth_get_elements_no(card, new_skb, elements_needed);
+       elements = qeth_get_elements_no(card, new_skb, elements_needed,
+                                       (data_offset > 0) ? data_offset : 0);
        if (!elements) {
                if (data_offset >= 0)
                        kmem_cache_free(qeth_core_header_cache, hdr);
index 06d0add..653f0fb 100644 (file)
@@ -2609,17 +2609,13 @@ static void qeth_l3_fill_af_iucv_hdr(struct qeth_card *card,
        char daddr[16];
        struct af_iucv_trans_hdr *iucv_hdr;
 
-       skb_pull(skb, 14);
-       card->dev->header_ops->create(skb, card->dev, 0,
-                                     card->dev->dev_addr, card->dev->dev_addr,
-                                     card->dev->addr_len);
-       skb_pull(skb, 14);
-       iucv_hdr = (struct af_iucv_trans_hdr *)skb->data;
        memset(hdr, 0, sizeof(struct qeth_hdr));
        hdr->hdr.l3.id = QETH_HEADER_TYPE_LAYER3;
        hdr->hdr.l3.ext_flags = 0;
-       hdr->hdr.l3.length = skb->len;
+       hdr->hdr.l3.length = skb->len - ETH_HLEN;
        hdr->hdr.l3.flags = QETH_HDR_IPV6 | QETH_CAST_UNICAST;
+
+       iucv_hdr = (struct af_iucv_trans_hdr *) (skb->data + ETH_HLEN);
        memset(daddr, 0, sizeof(daddr));
        daddr[0] = 0xfe;
        daddr[1] = 0x80;
@@ -2823,10 +2819,7 @@ static int qeth_l3_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
        if ((card->info.type == QETH_CARD_TYPE_IQD) &&
            !skb_is_nonlinear(skb)) {
                new_skb = skb;
-               if (new_skb->protocol == ETH_P_AF_IUCV)
-                       data_offset = 0;
-               else
-                       data_offset = ETH_HLEN;
+               data_offset = ETH_HLEN;
                hdr = kmem_cache_alloc(qeth_core_header_cache, GFP_ATOMIC);
                if (!hdr)
                        goto tx_drop;
@@ -2867,7 +2860,7 @@ static int qeth_l3_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
         */
        if ((card->info.type != QETH_CARD_TYPE_IQD) &&
            ((use_tso && !qeth_l3_get_elements_no_tso(card, new_skb, 1)) ||
-            (!use_tso && !qeth_get_elements_no(card, new_skb, 0)))) {
+            (!use_tso && !qeth_get_elements_no(card, new_skb, 0, 0)))) {
                int lin_rc = skb_linearize(new_skb);
 
                if (card->options.performance_stats) {
@@ -2909,7 +2902,8 @@ static int qeth_l3_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
        elements = use_tso ?
                   qeth_l3_get_elements_no_tso(card, new_skb, hdr_elements) :
-                  qeth_get_elements_no(card, new_skb, hdr_elements);
+                  qeth_get_elements_no(card, new_skb, hdr_elements,
+                                       (data_offset > 0) ? data_offset : 0);
        if (!elements) {
                if (data_offset >= 0)
                        kmem_cache_free(qeth_core_header_cache, hdr);
index 6ff61da..62fed9d 100644 (file)
@@ -183,11 +183,33 @@ static void jsfd_read(char *buf, unsigned long p, size_t togo) {
        }
 }
 
-static void jsfd_do_request(struct request_queue *q)
+static int jsfd_queue;
+
+/*
+ * Fetch the next request, visiting the per-disk queues in round-robin order
+ * starting at jsfd_queue. jsfd_queue is advanced (wrapping at JSF_MAX) for
+ * every queue looked at. Returns NULL once each queue has been tried without
+ * yielding a request.
+ */
+static struct request *jsfd_next_request(void)
+{
+       const int start = jsfd_queue;
+
+       do {
+               struct request_queue *q = jsfd_disk[jsfd_queue]->queue;
+               struct request *rq;
+
+               jsfd_queue++;
+               if (jsfd_queue == JSF_MAX)
+                       jsfd_queue = 0;
+
+               if (!q)
+                       continue;
+
+               rq = blk_fetch_request(q);
+               if (rq)
+                       return rq;
+       } while (jsfd_queue != start);
+
+       return NULL;
+}
+
+static void jsfd_request(void)
 {
        struct request *req;
 
-       req = blk_fetch_request(q);
+       req = jsfd_next_request();
        while (req) {
                struct jsfd_part *jdp = req->rq_disk->private_data;
                unsigned long offset = blk_rq_pos(req) << 9;
@@ -211,10 +233,15 @@ static void jsfd_do_request(struct request_queue *q)
                err = 0;
        end:
                if (!__blk_end_request_cur(req, err))
-                       req = blk_fetch_request(q);
+                       req = jsfd_next_request();
        }
 }
 
+/*
+ * Request function handed to blk_init_queue() for the jsfd disks. @q is not
+ * used: jsfd_request() obtains its requests through jsfd_next_request().
+ */
+static void jsfd_do_request(struct request_queue *q)
+{
+       jsfd_request();
+}
+
 /*
  * The memory devices use the full 32/64 bits of the offset, and so we cannot
  * check against negative addresses: they are ok. The return value is weird,
@@ -544,8 +571,6 @@ static int jsflash_init(void)
        return 0;
 }
 
-static struct request_queue *jsf_queue;
-
 static int jsfd_init(void)
 {
        static DEFINE_SPINLOCK(lock);
@@ -562,6 +587,11 @@ static int jsfd_init(void)
                struct gendisk *disk = alloc_disk(1);
                if (!disk)
                        goto out;
+               disk->queue = blk_init_queue(jsfd_do_request, &lock);
+               if (!disk->queue) {
+                       put_disk(disk);
+                       goto out;
+               }
                jsfd_disk[i] = disk;
        }
 
@@ -570,13 +600,6 @@ static int jsfd_init(void)
                goto out;
        }
 
-       jsf_queue = blk_init_queue(jsfd_do_request, &lock);
-       if (!jsf_queue) {
-               err = -ENOMEM;
-               unregister_blkdev(JSFD_MAJOR, "jsfd");
-               goto out;
-       }
-
        for (i = 0; i < JSF_MAX; i++) {
                struct gendisk *disk = jsfd_disk[i];
                if ((i & JSF_PART_MASK) >= JSF_NPART) continue;
@@ -589,7 +612,6 @@ static int jsfd_init(void)
                disk->fops = &jsfd_fops;
                set_capacity(disk, jdp->dsize >> 9);
                disk->private_data = jdp;
-               disk->queue = jsf_queue;
                add_disk(disk);
                set_disk_ro(disk, 1);
        }
@@ -619,6 +641,7 @@ static void __exit jsflash_cleanup_module(void)
        for (i = 0; i < JSF_MAX; i++) {
                if ((i & JSF_PART_MASK) >= JSF_NPART) continue;
                del_gendisk(jsfd_disk[i]);
+               blk_cleanup_queue(jsfd_disk[i]->queue);
                put_disk(jsfd_disk[i]);
        }
        if (jsf0.busy)
@@ -628,7 +651,6 @@ static void __exit jsflash_cleanup_module(void)
 
        misc_deregister(&jsf_dev);
        unregister_blkdev(JSFD_MAJOR, "jsfd");
-       blk_cleanup_queue(jsf_queue);
 }
 
 module_init(jsflash_init_module);
index fc28555..93dbe58 100644 (file)
@@ -166,6 +166,7 @@ scsi_mod-y                  += scsi_scan.o scsi_sysfs.o scsi_devinfo.o
 scsi_mod-$(CONFIG_SCSI_NETLINK)        += scsi_netlink.o
 scsi_mod-$(CONFIG_SYSCTL)      += scsi_sysctl.o
 scsi_mod-$(CONFIG_SCSI_PROC_FS)        += scsi_proc.o
+scsi_mod-$(CONFIG_BLK_DEBUG_FS)        += scsi_debugfs.o
 scsi_mod-y                     += scsi_trace.o scsi_logging.o
 scsi_mod-$(CONFIG_PM)          += scsi_pm.o
 scsi_mod-$(CONFIG_SCSI_DH)     += scsi_dh.o
index d036a80..d281492 100644 (file)
@@ -1690,9 +1690,6 @@ struct aac_dev
 #define aac_adapter_sync_cmd(dev, command, p1, p2, p3, p4, p5, p6, status, r1, r2, r3, r4) \
        (dev)->a_ops.adapter_sync_cmd(dev, command, p1, p2, p3, p4, p5, p6, status, r1, r2, r3, r4)
 
-#define aac_adapter_check_health(dev) \
-       (dev)->a_ops.adapter_check_health(dev)
-
 #define aac_adapter_restart(dev, bled, reset_type) \
        ((dev)->a_ops.adapter_restart(dev, bled, reset_type))
 
@@ -2615,6 +2612,14 @@ static inline unsigned int cap_to_cyls(sector_t capacity, unsigned divisor)
        return capacity;
 }
 
+/*
+ * Report adapter health through the adapter-specific hook. While the PCI
+ * channel is offline the hook is not called and -1 is returned instead.
+ */
+static inline int aac_adapter_check_health(struct aac_dev *dev)
+{
+       if (unlikely(pci_channel_offline(dev->pdev)))
+               return -1;
+
+       return dev->a_ops.adapter_check_health(dev);
+}
+
 /* SCp.phase values */
 #define AAC_OWNER_MIDLEVEL     0x101
 #define AAC_OWNER_LOWLEVEL     0x102
index c8172f1..1f49183 100644 (file)
@@ -1873,7 +1873,8 @@ int aac_check_health(struct aac_dev * aac)
        spin_unlock_irqrestore(&aac->fib_lock, flagv);
 
        if (BlinkLED < 0) {
-               printk(KERN_ERR "%s: Host adapter dead %d\n", aac->name, BlinkLED);
+               printk(KERN_ERR "%s: Host adapter is dead (or got a PCI error) %d\n",
+                               aac->name, BlinkLED);
                goto out;
        }
 
index b35ed38..2d4b7f0 100644 (file)
@@ -1289,32 +1289,13 @@ int esas2r_ioctl_handler(void *hostdata, int cmd, void __user *arg)
            || (cmd > EXPRESS_IOCTL_MAX))
                return -ENOTSUPP;
 
-       if (!access_ok(VERIFY_WRITE, arg, sizeof(struct atto_express_ioctl))) {
+       ioctl = memdup_user(arg, sizeof(struct atto_express_ioctl));
+       if (IS_ERR(ioctl)) {
                esas2r_log(ESAS2R_LOG_WARN,
                           "ioctl_handler access_ok failed for cmd %d, "
                           "address %p", cmd,
                           arg);
-               return -EFAULT;
-       }
-
-       /* allocate a kernel memory buffer for the IOCTL data */
-       ioctl = kzalloc(sizeof(struct atto_express_ioctl), GFP_KERNEL);
-       if (ioctl == NULL) {
-               esas2r_log(ESAS2R_LOG_WARN,
-                          "ioctl_handler kzalloc failed for %zu bytes",
-                          sizeof(struct atto_express_ioctl));
-               return -ENOMEM;
-       }
-
-       err = __copy_from_user(ioctl, arg, sizeof(struct atto_express_ioctl));
-       if (err != 0) {
-               esas2r_log(ESAS2R_LOG_WARN,
-                          "copy_from_user didn't copy everything (err %d, cmd %d)",
-                          err,
-                          cmd);
-               kfree(ioctl);
-
-               return -EFAULT;
+               return PTR_ERR(ioctl);
        }
 
        /* verify the signature */
index b29afaf..5d5e272 100644 (file)
@@ -6293,7 +6293,12 @@ static void ipr_erp_start(struct ipr_ioa_cfg *ioa_cfg,
                break;
        case IPR_IOASC_MED_DO_NOT_REALLOC: /* prevent retries */
        case IPR_IOASA_IR_DUAL_IOA_DISABLED:
-               scsi_cmd->result |= (DID_PASSTHROUGH << 16);
+               /*
+                * exception: do not set DID_PASSTHROUGH on CHECK CONDITION
+                * so SCSI mid-layer and upper layers handle it accordingly.
+                */
+               if (scsi_cmd->result != SAM_STAT_CHECK_CONDITION)
+                       scsi_cmd->result |= (DID_PASSTHROUGH << 16);
                break;
        case IPR_IOASC_BUS_WAS_RESET:
        case IPR_IOASC_BUS_WAS_RESET_BY_OTHER:
index 4228aba..bbea8ea 100644 (file)
@@ -387,7 +387,7 @@ static int iscsi_sw_tcp_pdu_xmit(struct iscsi_task *task)
                rc = 0;
        }
 
-       tsk_restore_flags(current, pflags, PF_MEMALLOC);
+       current_restore_flags(pflags, PF_MEMALLOC);
        return rc;
 }
 
index 257bbdd..6d7840b 100644 (file)
@@ -56,7 +56,7 @@ struct lpfc_sli2_slim;
 #define LPFC_MAX_SG_SEG_CNT    4096    /* sg element count per scsi cmnd */
 #define LPFC_MAX_SGL_SEG_CNT   512     /* SGL element count per scsi cmnd */
 #define LPFC_MAX_BPL_SEG_CNT   4096    /* BPL element count per scsi cmnd */
-#define LPFC_MIN_NVME_SEG_CNT  254
+#define LPFC_MAX_NVME_SEG_CNT  128     /* max SGL element cnt per NVME cmnd */
 
 #define LPFC_MAX_SGE_SIZE       0x80000000 /* Maximum data allowed in a SGE */
 #define LPFC_IOCB_LIST_CNT     2250    /* list of IOCBs for fast-path usage. */
@@ -474,6 +474,8 @@ struct lpfc_vport {
        unsigned long rcv_buffer_time_stamp;
        uint32_t vport_flag;
 #define STATIC_VPORT   1
+#define FAWWPN_SET     2
+#define FAWWPN_PARAM_CHG       4
 
        uint16_t fdmi_num_disc;
        uint32_t fdmi_hba_mask;
@@ -781,6 +783,7 @@ struct lpfc_hba {
        uint32_t cfg_nvmet_fb_size;
        uint32_t cfg_total_seg_cnt;
        uint32_t cfg_sg_seg_cnt;
+       uint32_t cfg_nvme_seg_cnt;
        uint32_t cfg_sg_dma_buf_size;
        uint64_t cfg_soft_wwnn;
        uint64_t cfg_soft_wwpn;
index 22819af..513fd07 100644 (file)
@@ -2292,6 +2292,8 @@ lpfc_soft_wwn_enable_store(struct device *dev, struct device_attribute *attr,
        struct lpfc_vport *vport = (struct lpfc_vport *) shost->hostdata;
        struct lpfc_hba   *phba = vport->phba;
        unsigned int cnt = count;
+       uint8_t vvvl = vport->fc_sparam.cmn.valid_vendor_ver_level;
+       u32 *fawwpn_key = (uint32_t *)&vport->fc_sparam.un.vendorVersion[0];
 
        /*
         * We're doing a simple sanity check for soft_wwpn setting.
@@ -2305,6 +2307,12 @@ lpfc_soft_wwn_enable_store(struct device *dev, struct device_attribute *attr,
         * here. The intent is to protect against the random user or
         * application that is just writing attributes.
         */
+       if (vvvl == 1 && cpu_to_be32(*fawwpn_key) == FAPWWN_KEY_VENDOR) {
+               lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
+                                "0051 "LPFC_DRIVER_NAME" soft wwpn can not"
+                                " be enabled: fawwpn is enabled\n");
+               return -EINVAL;
+       }
 
        /* count may include a LF at end of string */
        if (buf[cnt-1] == '\n')
@@ -3335,7 +3343,7 @@ LPFC_ATTR_R(enable_fc4_type, LPFC_ENABLE_FCP,
  * percentage will go to NVME.
  */
 LPFC_ATTR_R(xri_split, 50, 10, 90,
-            "Division of XRI resources between SCSI and NVME");
+           "Division of XRI resources between SCSI and NVME");
 
 /*
 # lpfc_log_verbose: Only turn this flag on if you are willing to risk being
index 18157d2..a1686c2 100644 (file)
@@ -2486,6 +2486,10 @@ static int lpfcdiag_loop_self_reg(struct lpfc_hba *phba, uint16_t *rpi)
                                mbox, *rpi);
        else {
                *rpi = lpfc_sli4_alloc_rpi(phba);
+               if (*rpi == LPFC_RPI_ALLOC_ERROR) {
+                       mempool_free(mbox, phba->mbox_mem_pool);
+                       return -EBUSY;
+               }
                status = lpfc_reg_rpi(phba, phba->pport->vpi,
                                phba->pport->fc_myDID,
                                (uint8_t *)&phba->pport->fc_sparam,
index 54e6ac4..944b32c 100644 (file)
@@ -24,6 +24,7 @@ typedef int (*node_filter)(struct lpfc_nodelist *, void *);
 
 struct fc_rport;
 struct fc_frame_header;
+struct lpfc_nvmet_rcv_ctx;
 void lpfc_down_link(struct lpfc_hba *, LPFC_MBOXQ_t *);
 void lpfc_sli_read_link_ste(struct lpfc_hba *);
 void lpfc_dump_mem(struct lpfc_hba *, LPFC_MBOXQ_t *, uint16_t, uint16_t);
@@ -99,7 +100,7 @@ void lpfc_issue_reg_vpi(struct lpfc_hba *, struct lpfc_vport *);
 
 int lpfc_check_sli_ndlp(struct lpfc_hba *, struct lpfc_sli_ring *,
                        struct lpfc_iocbq *, struct lpfc_nodelist *);
-void lpfc_nlp_init(struct lpfc_vport *, struct lpfc_nodelist *, uint32_t);
+struct lpfc_nodelist *lpfc_nlp_init(struct lpfc_vport *vport, uint32_t did);
 struct lpfc_nodelist *lpfc_nlp_get(struct lpfc_nodelist *);
 int  lpfc_nlp_put(struct lpfc_nodelist *);
 int  lpfc_nlp_not_used(struct lpfc_nodelist *ndlp);
@@ -245,6 +246,10 @@ struct hbq_dmabuf *lpfc_sli4_rb_alloc(struct lpfc_hba *);
 void lpfc_sli4_rb_free(struct lpfc_hba *, struct hbq_dmabuf *);
 struct rqb_dmabuf *lpfc_sli4_nvmet_alloc(struct lpfc_hba *phba);
 void lpfc_sli4_nvmet_free(struct lpfc_hba *phba, struct rqb_dmabuf *dmab);
+void lpfc_nvmet_rq_post(struct lpfc_hba *phba, struct lpfc_nvmet_rcv_ctx *ctxp,
+                       struct lpfc_dmabuf *mp);
+int lpfc_nvmet_rcv_unsol_abort(struct lpfc_vport *vport,
+                              struct fc_frame_header *fc_hdr);
 void lpfc_sli4_build_dflt_fcf_record(struct lpfc_hba *, struct fcf_record *,
                        uint16_t);
 int lpfc_sli4_rq_put(struct lpfc_queue *hq, struct lpfc_queue *dq,
@@ -302,6 +307,8 @@ int lpfc_sli_check_eratt(struct lpfc_hba *);
 void lpfc_sli_handle_slow_ring_event(struct lpfc_hba *,
                                    struct lpfc_sli_ring *, uint32_t);
 void lpfc_sli4_handle_received_buffer(struct lpfc_hba *, struct hbq_dmabuf *);
+void lpfc_sli4_seq_abort_rsp(struct lpfc_vport *vport,
+                            struct fc_frame_header *fc_hdr, bool aborted);
 void lpfc_sli_def_mbox_cmpl(struct lpfc_hba *, LPFC_MBOXQ_t *);
 void lpfc_sli4_unreg_rpi_cmpl_clr(struct lpfc_hba *, LPFC_MBOXQ_t *);
 int lpfc_sli_issue_iocb(struct lpfc_hba *, uint32_t,
index d3e9af9..1487406 100644 (file)
@@ -537,19 +537,53 @@ lpfc_prep_node_fc4type(struct lpfc_vport *vport, uint32_t Did, uint8_t fc4_type)
        }
 }
 
+/*
+ * Process one DID taken from a name server response.
+ *
+ * Our own DID is ignored, and so are DIDs of other vports on this HBA unless
+ * cfg_peer_port_login is set. Without nvmet_support the DID is handed to
+ * lpfc_prep_node_fc4type(). With nvmet_support every unmapped NVME initiator
+ * node of @vport gets NLP_NVMET_RECOV cleared when its DID matches @Did and
+ * set when it does not.
+ */
+static void
+lpfc_ns_rsp_audit_did(struct lpfc_vport *vport, uint32_t Did, uint8_t fc4_type)
+{
+       struct lpfc_hba *phba = vport->phba;
+       struct Scsi_Host *shost = lpfc_shost_from_vport(vport);
+       struct lpfc_nodelist *node;
+
+       if (Did == vport->fc_myDID)
+               return;
+
+       /*
+        * To conserve rpi's, filter out addresses for other
+        * vports on the same physical HBAs.
+        */
+       if (lpfc_find_vport_by_did(phba, Did) && !vport->cfg_peer_port_login)
+               return;
+
+       if (!phba->nvmet_support) {
+               /* FCPI/NVMEI path. Process Did */
+               lpfc_prep_node_fc4type(vport, Did, fc4_type);
+               return;
+       }
+
+       /* NVMET path.  NVMET only cares about NVMEI nodes. */
+       list_for_each_entry(node, &vport->fc_nodes, nlp_listp) {
+               if (node->nlp_type != NLP_NVME_INITIATOR ||
+                   node->nlp_state != NLP_STE_UNMAPPED_NODE)
+                       continue;
+
+               spin_lock_irq(shost->host_lock);
+               if (node->nlp_DID == Did)
+                       node->nlp_flag &= ~NLP_NVMET_RECOV;
+               else
+                       node->nlp_flag |= NLP_NVMET_RECOV;
+               spin_unlock_irq(shost->host_lock);
+       }
+}
+
 static int
 lpfc_ns_rsp(struct lpfc_vport *vport, struct lpfc_dmabuf *mp, uint8_t fc4_type,
            uint32_t Size)
 {
-       struct lpfc_hba  *phba = vport->phba;
        struct lpfc_sli_ct_request *Response =
                (struct lpfc_sli_ct_request *) mp->virt;
-       struct lpfc_nodelist *ndlp = NULL;
        struct lpfc_dmabuf *mlast, *next_mp;
        uint32_t *ctptr = (uint32_t *) & Response->un.gid.PortType;
        uint32_t Did, CTentry;
        int Cnt;
        struct list_head head;
+       struct Scsi_Host *shost = lpfc_shost_from_vport(vport);
+       struct lpfc_nodelist *ndlp = NULL;
 
        lpfc_set_disctmo(vport);
        vport->num_disc_nodes = 0;
@@ -574,19 +608,7 @@ lpfc_ns_rsp(struct lpfc_vport *vport, struct lpfc_dmabuf *mp, uint8_t fc4_type,
                        /* Get next DID from NameServer List */
                        CTentry = *ctptr++;
                        Did = ((be32_to_cpu(CTentry)) & Mask_DID);
-
-                       ndlp = NULL;
-
-                       /*
-                        * Check for rscn processing or not
-                        * To conserve rpi's, filter out addresses for other
-                        * vports on the same physical HBAs.
-                        */
-                       if ((Did != vport->fc_myDID) &&
-                           ((lpfc_find_vport_by_did(phba, Did) == NULL) ||
-                            vport->cfg_peer_port_login))
-                               lpfc_prep_node_fc4type(vport, Did, fc4_type);
-
+                       lpfc_ns_rsp_audit_did(vport, Did, fc4_type);
                        if (CTentry & (cpu_to_be32(SLI_CT_LAST_ENTRY)))
                                goto nsout1;
 
@@ -596,6 +618,22 @@ lpfc_ns_rsp(struct lpfc_vport *vport, struct lpfc_dmabuf *mp, uint8_t fc4_type,
 
        }
 
+       /* All GID_FT entries processed.  If the driver is running in
+        * target mode, put impacted nodes into recovery and drop
+        * the RPI to flush outstanding IO.
+        */
+       if (vport->phba->nvmet_support) {
+               list_for_each_entry(ndlp, &vport->fc_nodes, nlp_listp) {
+                       if (!(ndlp->nlp_flag & NLP_NVMET_RECOV))
+                               continue;
+                       lpfc_disc_state_machine(vport, ndlp, NULL,
+                                               NLP_EVT_DEVICE_RECOVERY);
+                       /* Clear the marker under host_lock, then release it. */
+                       spin_lock_irq(shost->host_lock);
+                       ndlp->nlp_flag &= ~NLP_NVMET_RECOV;
+                       spin_unlock_irq(shost->host_lock);
+               }
+       }
+
 nsout1:
        list_del(&head);
        return 0;
index 913eed8..fce549a 100644 (file)
@@ -745,73 +745,102 @@ lpfc_debugfs_nvmestat_data(struct lpfc_vport *vport, char *buf, int size)
 {
        struct lpfc_hba   *phba = vport->phba;
        struct lpfc_nvmet_tgtport *tgtp;
+       struct lpfc_nvmet_rcv_ctx *ctxp, *next_ctxp;
        int len = 0;
+       int cnt;
 
        if (phba->nvmet_support) {
                if (!phba->targetport)
                        return len;
                tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private;
-               len += snprintf(buf+len, size-len,
+               len += snprintf(buf + len, size - len,
                                "\nNVME Targetport Statistics\n");
 
-               len += snprintf(buf+len, size-len,
+               len += snprintf(buf + len, size - len,
                                "LS: Rcv %08x Drop %08x Abort %08x\n",
                                atomic_read(&tgtp->rcv_ls_req_in),
                                atomic_read(&tgtp->rcv_ls_req_drop),
                                atomic_read(&tgtp->xmt_ls_abort));
                if (atomic_read(&tgtp->rcv_ls_req_in) !=
                    atomic_read(&tgtp->rcv_ls_req_out)) {
-                       len += snprintf(buf+len, size-len,
+                       len += snprintf(buf + len, size - len,
                                        "Rcv LS: in %08x != out %08x\n",
                                        atomic_read(&tgtp->rcv_ls_req_in),
                                        atomic_read(&tgtp->rcv_ls_req_out));
                }
 
-               len += snprintf(buf+len, size-len,
+               len += snprintf(buf + len, size - len,
                                "LS: Xmt %08x Drop %08x Cmpl %08x Err %08x\n",
                                atomic_read(&tgtp->xmt_ls_rsp),
                                atomic_read(&tgtp->xmt_ls_drop),
                                atomic_read(&tgtp->xmt_ls_rsp_cmpl),
                                atomic_read(&tgtp->xmt_ls_rsp_error));
 
-               len += snprintf(buf+len, size-len,
+               len += snprintf(buf + len, size - len,
                                "FCP: Rcv %08x Drop %08x\n",
                                atomic_read(&tgtp->rcv_fcp_cmd_in),
                                atomic_read(&tgtp->rcv_fcp_cmd_drop));
 
                if (atomic_read(&tgtp->rcv_fcp_cmd_in) !=
                    atomic_read(&tgtp->rcv_fcp_cmd_out)) {
-                       len += snprintf(buf+len, size-len,
+                       len += snprintf(buf + len, size - len,
                                        "Rcv FCP: in %08x != out %08x\n",
                                        atomic_read(&tgtp->rcv_fcp_cmd_in),
                                        atomic_read(&tgtp->rcv_fcp_cmd_out));
                }
 
-               len += snprintf(buf+len, size-len,
-                               "FCP Rsp: read %08x readrsp %08x write %08x rsp %08x\n",
+               len += snprintf(buf + len, size - len,
+                               "FCP Rsp: read %08x readrsp %08x "
+                               "write %08x rsp %08x\n",
                                atomic_read(&tgtp->xmt_fcp_read),
                                atomic_read(&tgtp->xmt_fcp_read_rsp),
                                atomic_read(&tgtp->xmt_fcp_write),
                                atomic_read(&tgtp->xmt_fcp_rsp));
 
-               len += snprintf(buf+len, size-len,
+               len += snprintf(buf + len, size - len,
                                "FCP Rsp: abort %08x drop %08x\n",
                                atomic_read(&tgtp->xmt_fcp_abort),
                                atomic_read(&tgtp->xmt_fcp_drop));
 
-               len += snprintf(buf+len, size-len,
+               len += snprintf(buf + len, size - len,
                                "FCP Rsp Cmpl: %08x err %08x drop %08x\n",
                                atomic_read(&tgtp->xmt_fcp_rsp_cmpl),
                                atomic_read(&tgtp->xmt_fcp_rsp_error),
                                atomic_read(&tgtp->xmt_fcp_rsp_drop));
 
-               len += snprintf(buf+len, size-len,
+               len += snprintf(buf + len, size - len,
                                "ABORT: Xmt %08x Err %08x Cmpl %08x",
                                atomic_read(&tgtp->xmt_abort_rsp),
                                atomic_read(&tgtp->xmt_abort_rsp_error),
                                atomic_read(&tgtp->xmt_abort_cmpl));
 
-               len +=  snprintf(buf+len, size-len, "\n");
+               len +=  snprintf(buf + len, size - len, "\n");
+
+               cnt = 0;
+               spin_lock(&phba->sli4_hba.abts_nvme_buf_list_lock);
+               list_for_each_entry_safe(ctxp, next_ctxp,
+                               &phba->sli4_hba.lpfc_abts_nvmet_ctx_list,
+                               list) {
+                       cnt++;
+               }
+               spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock);
+               if (cnt) {
+                       len += snprintf(buf + len, size - len,
+                                       "ABORT: %d ctx entries\n", cnt);
+                       spin_lock(&phba->sli4_hba.abts_nvme_buf_list_lock);
+                       list_for_each_entry_safe(ctxp, next_ctxp,
+                                   &phba->sli4_hba.lpfc_abts_nvmet_ctx_list,
+                                   list) {
+                               if (len >= (size - LPFC_DEBUG_OUT_LINE_SZ))
+                                       break;
+                               len += snprintf(buf + len, size - len,
+                                               "Entry: oxid %x state %x "
+                                               "flag %x\n",
+                                               ctxp->oxid, ctxp->state,
+                                               ctxp->flag);
+                       }
+                       spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock);
+               }
        } else {
                if (!(phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME))
                        return len;
@@ -3128,8 +3157,6 @@ __lpfc_idiag_print_rqpair(struct lpfc_queue *qp, struct lpfc_queue *datqp,
                        datqp->queue_id, datqp->entry_count,
                        datqp->entry_size, datqp->host_index,
                        datqp->hba_index);
-       len +=  snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, "\n");
-
        return len;
 }
 
@@ -5700,10 +5727,8 @@ lpfc_debugfs_terminate(struct lpfc_vport *vport)
 #ifdef CONFIG_SCSI_LPFC_DEBUG_FS
        struct lpfc_hba   *phba = vport->phba;
 
-       if (vport->disc_trc) {
-               kfree(vport->disc_trc);
-               vport->disc_trc = NULL;
-       }
+       kfree(vport->disc_trc);
+       vport->disc_trc = NULL;
 
        debugfs_remove(vport->debug_disc_trc); /* discovery_trace */
        vport->debug_disc_trc = NULL;
@@ -5770,10 +5795,8 @@ lpfc_debugfs_terminate(struct lpfc_vport *vport)
                debugfs_remove(phba->debug_readRef); /* readRef */
                phba->debug_readRef = NULL;
 
-               if (phba->slow_ring_trc) {
-                       kfree(phba->slow_ring_trc);
-                       phba->slow_ring_trc = NULL;
-               }
+               kfree(phba->slow_ring_trc);
+               phba->slow_ring_trc = NULL;
 
                /* slow_ring_trace */
                debugfs_remove(phba->debug_slow_ring_trc);
index f4ff99d..9d5a379 100644 (file)
@@ -157,6 +157,7 @@ struct lpfc_node_rrq {
 #define NLP_LOGO_SND       0x00000100  /* sent LOGO request for this entry */
 #define NLP_RNID_SND       0x00000400  /* sent RNID request for this entry */
 #define NLP_ELS_SND_MASK   0x000007e0  /* sent ELS request for this entry */
+#define NLP_NVMET_RECOV    0x00001000   /* NVMET auditing node for recovery. */
 #define NLP_DEFER_RM       0x00010000  /* Remove this ndlp if no longer used */
 #define NLP_DELAY_TMO      0x00020000  /* delay timeout is running for node */
 #define NLP_NPR_2B_DISC    0x00040000  /* node is included in num_disc_nodes */
index a5ca37e..67827e3 100644 (file)
@@ -603,9 +603,11 @@ lpfc_check_clean_addr_bit(struct lpfc_vport *vport,
                memcmp(&vport->fabric_portname, &sp->portName,
                        sizeof(struct lpfc_name)) ||
                memcmp(&vport->fabric_nodename, &sp->nodeName,
-                       sizeof(struct lpfc_name)))
+                       sizeof(struct lpfc_name)) ||
+               (vport->vport_flag & FAWWPN_PARAM_CHG)) {
                fabric_param_changed = 1;
-
+               vport->vport_flag &= ~FAWWPN_PARAM_CHG;
+       }
        /*
         * Word 1 Bit 31 in common service parameter is overloaded.
         * Word 1 Bit 31 in FLOGI request is multiple NPort request
@@ -895,10 +897,9 @@ lpfc_cmpl_els_flogi_nport(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
                         * Cannot find existing Fabric ndlp, so allocate a
                         * new one
                         */
-                       ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+                       ndlp = lpfc_nlp_init(vport, PT2PT_RemoteID);
                        if (!ndlp)
                                goto fail;
-                       lpfc_nlp_init(vport, ndlp, PT2PT_RemoteID);
                } else if (!NLP_CHK_NODE_ACT(ndlp)) {
                        ndlp = lpfc_enable_node(vport, ndlp,
                                                NLP_STE_UNUSED_NODE);
@@ -1364,7 +1365,6 @@ lpfc_els_abort_flogi(struct lpfc_hba *phba)
 int
 lpfc_initial_flogi(struct lpfc_vport *vport)
 {
-       struct lpfc_hba *phba = vport->phba;
        struct lpfc_nodelist *ndlp;
 
        vport->port_state = LPFC_FLOGI;
@@ -1374,10 +1374,9 @@ lpfc_initial_flogi(struct lpfc_vport *vport)
        ndlp = lpfc_findnode_did(vport, Fabric_DID);
        if (!ndlp) {
                /* Cannot find existing Fabric ndlp, so allocate a new one */
-               ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+               ndlp = lpfc_nlp_init(vport, Fabric_DID);
                if (!ndlp)
                        return 0;
-               lpfc_nlp_init(vport, ndlp, Fabric_DID);
                /* Set the node type */
                ndlp->nlp_type |= NLP_FABRIC;
                /* Put ndlp onto node list */
@@ -1418,17 +1417,15 @@ lpfc_initial_flogi(struct lpfc_vport *vport)
 int
 lpfc_initial_fdisc(struct lpfc_vport *vport)
 {
-       struct lpfc_hba *phba = vport->phba;
        struct lpfc_nodelist *ndlp;
 
        /* First look for the Fabric ndlp */
        ndlp = lpfc_findnode_did(vport, Fabric_DID);
        if (!ndlp) {
                /* Cannot find existing Fabric ndlp, so allocate a new one */
-               ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+               ndlp = lpfc_nlp_init(vport, Fabric_DID);
                if (!ndlp)
                        return 0;
-               lpfc_nlp_init(vport, ndlp, Fabric_DID);
                /* Put ndlp onto node list */
                lpfc_enqueue_node(vport, ndlp);
        } else if (!NLP_CHK_NODE_ACT(ndlp)) {
@@ -1564,14 +1561,13 @@ lpfc_plogi_confirm_nport(struct lpfc_hba *phba, uint32_t *prsp,
                                             phba->active_rrq_pool);
                        return ndlp;
                }
-               new_ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_ATOMIC);
+               new_ndlp = lpfc_nlp_init(vport, ndlp->nlp_DID);
                if (!new_ndlp) {
                        if (active_rrqs_xri_bitmap)
                                mempool_free(active_rrqs_xri_bitmap,
                                             phba->active_rrq_pool);
                        return ndlp;
                }
-               lpfc_nlp_init(vport, new_ndlp, ndlp->nlp_DID);
        } else if (!NLP_CHK_NODE_ACT(new_ndlp)) {
                rc = memcmp(&ndlp->nlp_portname, name,
                            sizeof(struct lpfc_name));
@@ -2845,10 +2841,9 @@ lpfc_issue_els_scr(struct lpfc_vport *vport, uint32_t nportid, uint8_t retry)
 
        ndlp = lpfc_findnode_did(vport, nportid);
        if (!ndlp) {
-               ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+               ndlp = lpfc_nlp_init(vport, nportid);
                if (!ndlp)
                        return 1;
-               lpfc_nlp_init(vport, ndlp, nportid);
                lpfc_enqueue_node(vport, ndlp);
        } else if (!NLP_CHK_NODE_ACT(ndlp)) {
                ndlp = lpfc_enable_node(vport, ndlp, NLP_STE_UNUSED_NODE);
@@ -2938,10 +2933,9 @@ lpfc_issue_els_farpr(struct lpfc_vport *vport, uint32_t nportid, uint8_t retry)
 
        ndlp = lpfc_findnode_did(vport, nportid);
        if (!ndlp) {
-               ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+               ndlp = lpfc_nlp_init(vport, nportid);
                if (!ndlp)
                        return 1;
-               lpfc_nlp_init(vport, ndlp, nportid);
                lpfc_enqueue_node(vport, ndlp);
        } else if (!NLP_CHK_NODE_ACT(ndlp)) {
                ndlp = lpfc_enable_node(vport, ndlp, NLP_STE_UNUSED_NODE);
@@ -4403,7 +4397,7 @@ lpfc_els_rsp_prli_acc(struct lpfc_vport *vport, struct lpfc_iocbq *oldiocb,
        pcmd = (uint8_t *) (((struct lpfc_dmabuf *) elsiocb->context2)->virt);
        memset(pcmd, 0, cmdsize);
 
-       *((uint32_t *) (pcmd)) = (ELS_CMD_ACC | (ELS_CMD_PRLI & ~ELS_RSP_MASK));
+       *((uint32_t *)(pcmd)) = elsrspcmd;
        pcmd += sizeof(uint32_t);
 
        /* For PRLI, remainder of payload is PRLI parameter page */
@@ -5867,8 +5861,11 @@ lpfc_rscn_recovery_check(struct lpfc_vport *vport)
                    (ndlp->nlp_state == NLP_STE_UNUSED_NODE) ||
                    !lpfc_rscn_payload_check(vport, ndlp->nlp_DID))
                        continue;
+
+               /* NVME Target mode does not do RSCN Recovery. */
                if (vport->phba->nvmet_support)
                        continue;
+
                lpfc_disc_state_machine(vport, ndlp, NULL,
                                        NLP_EVT_DEVICE_RECOVERY);
                lpfc_cancel_retry_delay_tmo(vport, ndlp);
@@ -6133,7 +6130,6 @@ int
 lpfc_els_handle_rscn(struct lpfc_vport *vport)
 {
        struct lpfc_nodelist *ndlp;
-       struct lpfc_hba *phba = vport->phba;
 
        /* Ignore RSCN if the port is being torn down. */
        if (vport->load_flag & FC_UNLOADING) {
@@ -6157,22 +6153,16 @@ lpfc_els_handle_rscn(struct lpfc_vport *vport)
        ndlp = lpfc_findnode_did(vport, NameServer_DID);
        if (ndlp && NLP_CHK_NODE_ACT(ndlp)
            && ndlp->nlp_state == NLP_STE_UNMAPPED_NODE) {
-               /* Good ndlp, issue CT Request to NameServer */
+               /* Good ndlp, issue CT Request to NameServer.  Need to
+                * know how many gidfts were issued.  If none, then just
+                * flush the RSCN.  Otherwise, the outstanding requests
+                * need to complete.
+                */
                vport->gidft_inp = 0;
-               if (lpfc_issue_gidft(vport) == 0)
-                       /* Wait for NameServer query cmpl before we can
-                        * continue
-                        */
+               if (lpfc_issue_gidft(vport) > 0)
                        return 1;
        } else {
-               /* If login to NameServer does not exist, issue one */
-               /* Good status, issue PLOGI to NameServer */
-               ndlp = lpfc_findnode_did(vport, NameServer_DID);
-               if (ndlp && NLP_CHK_NODE_ACT(ndlp))
-                       /* Wait for NameServer login cmpl before we can
-                          continue */
-                       return 1;
-
+               /* Nameserver login in question.  Revalidate. */
                if (ndlp) {
                        ndlp = lpfc_enable_node(vport, ndlp,
                                                NLP_STE_PLOGI_ISSUE);
@@ -6182,12 +6172,11 @@ lpfc_els_handle_rscn(struct lpfc_vport *vport)
                        }
                        ndlp->nlp_prev_state = NLP_STE_UNUSED_NODE;
                } else {
-                       ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+                       ndlp = lpfc_nlp_init(vport, NameServer_DID);
                        if (!ndlp) {
                                lpfc_els_flush_rscn(vport);
                                return 0;
                        }
-                       lpfc_nlp_init(vport, ndlp, NameServer_DID);
                        ndlp->nlp_prev_state = ndlp->nlp_state;
                        lpfc_nlp_set_state(vport, ndlp, NLP_STE_PLOGI_ISSUE);
                }
@@ -7746,11 +7735,9 @@ lpfc_els_unsol_buffer(struct lpfc_hba *phba, struct lpfc_sli_ring *pring,
        ndlp = lpfc_findnode_did(vport, did);
        if (!ndlp) {
                /* Cannot find existing Fabric ndlp, so allocate a new one */
-               ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+               ndlp = lpfc_nlp_init(vport, did);
                if (!ndlp)
                        goto dropit;
-
-               lpfc_nlp_init(vport, ndlp, did);
                lpfc_nlp_set_state(vport, ndlp, NLP_STE_NPR_NODE);
                newnode = 1;
                if ((did & Fabric_DID_MASK) == Fabric_DID_MASK)
@@ -8193,7 +8180,6 @@ lpfc_els_unsol_event(struct lpfc_hba *phba, struct lpfc_sli_ring *pring,
 static void
 lpfc_start_fdmi(struct lpfc_vport *vport)
 {
-       struct lpfc_hba *phba = vport->phba;
        struct lpfc_nodelist *ndlp;
 
        /* If this is the first time, allocate an ndlp and initialize
@@ -8202,9 +8188,8 @@ lpfc_start_fdmi(struct lpfc_vport *vport)
         */
        ndlp = lpfc_findnode_did(vport, FDMI_DID);
        if (!ndlp) {
-               ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+               ndlp = lpfc_nlp_init(vport, FDMI_DID);
                if (ndlp) {
-                       lpfc_nlp_init(vport, ndlp, FDMI_DID);
                        ndlp->nlp_type |= NLP_FABRIC;
                } else {
                        return;
@@ -8257,7 +8242,7 @@ lpfc_do_scr_ns_plogi(struct lpfc_hba *phba, struct lpfc_vport *vport)
 
        ndlp = lpfc_findnode_did(vport, NameServer_DID);
        if (!ndlp) {
-               ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+               ndlp = lpfc_nlp_init(vport, NameServer_DID);
                if (!ndlp) {
                        if (phba->fc_topology == LPFC_TOPOLOGY_LOOP) {
                                lpfc_disc_start(vport);
@@ -8268,7 +8253,6 @@ lpfc_do_scr_ns_plogi(struct lpfc_hba *phba, struct lpfc_vport *vport)
                                         "0251 NameServer login: no memory\n");
                        return;
                }
-               lpfc_nlp_init(vport, ndlp, NameServer_DID);
        } else if (!NLP_CHK_NODE_ACT(ndlp)) {
                ndlp = lpfc_enable_node(vport, ndlp, NLP_STE_UNUSED_NODE);
                if (!ndlp) {
@@ -8771,7 +8755,7 @@ lpfc_issue_els_fdisc(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
        pcmd += sizeof(uint32_t); /* Node Name */
        pcmd += sizeof(uint32_t); /* Node Name */
        memcpy(pcmd, &vport->fc_nodename, 8);
-
+       memset(sp->un.vendorVersion, 0, sizeof(sp->un.vendorVersion));
        lpfc_set_disctmo(vport);
 
        phba->fc_stat.elsXmitFDISC++;
index 180b072..0482c55 100644 (file)
@@ -3002,6 +3002,7 @@ lpfc_mbx_cmpl_read_sparam(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
        MAILBOX_t *mb = &pmb->u.mb;
        struct lpfc_dmabuf *mp = (struct lpfc_dmabuf *) pmb->context1;
        struct lpfc_vport  *vport = pmb->vport;
+       struct Scsi_Host *shost = lpfc_shost_from_vport(vport);
        struct serv_parm *sp = &vport->fc_sparam;
        uint32_t ed_tov;
 
@@ -3031,6 +3032,7 @@ lpfc_mbx_cmpl_read_sparam(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
        }
 
        lpfc_update_vport_wwn(vport);
+       fc_host_port_name(shost) = wwn_to_u64(vport->fc_portname.u.wwn);
        if (vport->port_type == LPFC_PHYSICAL_PORT) {
                memcpy(&phba->wwnn, &vport->fc_nodename, sizeof(phba->wwnn));
                memcpy(&phba->wwpn, &vport->fc_portname, sizeof(phba->wwnn));
@@ -3309,6 +3311,7 @@ lpfc_mbx_cmpl_read_topology(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
        struct lpfc_sli_ring *pring;
        MAILBOX_t *mb = &pmb->u.mb;
        struct lpfc_dmabuf *mp = (struct lpfc_dmabuf *) (pmb->context1);
+       uint8_t attn_type;
 
        /* Unblock ELS traffic */
        pring = lpfc_phba_elsring(phba);
@@ -3325,6 +3328,7 @@ lpfc_mbx_cmpl_read_topology(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
        }
 
        la = (struct lpfc_mbx_read_top *) &pmb->u.mb.un.varReadTop;
+       attn_type = bf_get(lpfc_mbx_read_top_att_type, la);
 
        memcpy(&phba->alpa_map[0], mp->virt, 128);
 
@@ -3337,7 +3341,7 @@ lpfc_mbx_cmpl_read_topology(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
 
        if (phba->fc_eventTag <= la->eventTag) {
                phba->fc_stat.LinkMultiEvent++;
-               if (bf_get(lpfc_mbx_read_top_att_type, la) == LPFC_ATT_LINK_UP)
+               if (attn_type == LPFC_ATT_LINK_UP)
                        if (phba->fc_eventTag != 0)
                                lpfc_linkdown(phba);
        }
@@ -3353,7 +3357,7 @@ lpfc_mbx_cmpl_read_topology(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
        }
 
        phba->link_events++;
-       if ((bf_get(lpfc_mbx_read_top_att_type, la) == LPFC_ATT_LINK_UP) &&
+       if ((attn_type == LPFC_ATT_LINK_UP) &&
            !(phba->sli.sli_flag & LPFC_MENLO_MAINT)) {
                phba->fc_stat.LinkUp++;
                if (phba->link_flag & LS_LOOPBACK_MODE) {
@@ -3379,8 +3383,8 @@ lpfc_mbx_cmpl_read_topology(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
                                        phba->wait_4_mlo_maint_flg);
                }
                lpfc_mbx_process_link_up(phba, la);
-       } else if (bf_get(lpfc_mbx_read_top_att_type, la) ==
-                  LPFC_ATT_LINK_DOWN) {
+       } else if (attn_type == LPFC_ATT_LINK_DOWN ||
+                  attn_type == LPFC_ATT_UNEXP_WWPN) {
                phba->fc_stat.LinkDown++;
                if (phba->link_flag & LS_LOOPBACK_MODE)
                        lpfc_printf_log(phba, KERN_ERR, LOG_LINK_EVENT,
@@ -3389,6 +3393,14 @@ lpfc_mbx_cmpl_read_topology(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
                                "Data: x%x x%x x%x\n",
                                la->eventTag, phba->fc_eventTag,
                                phba->pport->port_state, vport->fc_flag);
+               else if (attn_type == LPFC_ATT_UNEXP_WWPN)
+                       lpfc_printf_log(phba, KERN_ERR, LOG_LINK_EVENT,
+                               "1313 Link Down UNEXP WWPN Event x%x received "
+                               "Data: x%x x%x x%x x%x x%x\n",
+                               la->eventTag, phba->fc_eventTag,
+                               phba->pport->port_state, vport->fc_flag,
+                               bf_get(lpfc_mbx_read_top_mm, la),
+                               bf_get(lpfc_mbx_read_top_fa, la));
                else
                        lpfc_printf_log(phba, KERN_ERR, LOG_LINK_EVENT,
                                "1305 Link Down Event x%x received "
@@ -3399,8 +3411,8 @@ lpfc_mbx_cmpl_read_topology(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmb)
                                bf_get(lpfc_mbx_read_top_fa, la));
                lpfc_mbx_issue_link_down(phba);
        }
-       if ((phba->sli.sli_flag & LPFC_MENLO_MAINT) &&
-           ((bf_get(lpfc_mbx_read_top_att_type, la) == LPFC_ATT_LINK_UP))) {
+       if (phba->sli.sli_flag & LPFC_MENLO_MAINT &&
+           attn_type == LPFC_ATT_LINK_UP) {
                if (phba->link_state != LPFC_LINK_DOWN) {
                        phba->fc_stat.LinkDown++;
                        lpfc_printf_log(phba, KERN_ERR, LOG_LINK_EVENT,
@@ -4136,7 +4148,6 @@ lpfc_nlp_state_cleanup(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
                       int old_state, int new_state)
 {
        struct Scsi_Host *shost = lpfc_shost_from_vport(vport);
-       struct lpfc_hba *phba = vport->phba;
 
        if (new_state == NLP_STE_UNMAPPED_NODE) {
                ndlp->nlp_flag &= ~NLP_NODEV_REMOVE;
@@ -4155,14 +4166,14 @@ lpfc_nlp_state_cleanup(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
                        lpfc_unregister_remote_port(ndlp);
                }
 
-               /* Notify the NVME transport of this rport's loss */
-               if (((phba->cfg_enable_fc4_type == LPFC_ENABLE_BOTH) ||
-                    (phba->cfg_enable_fc4_type == LPFC_ENABLE_NVME)) &&
-                   (vport->phba->nvmet_support == 0) &&
-                   ((ndlp->nlp_fc4_type & NLP_FC4_NVME) ||
-                   (ndlp->nlp_DID == Fabric_DID))) {
+               /* Notify the NVME transport of this rport's loss on the
+                * Initiator.  For NVME Target, should upcall transport
+                * in the else clause when API available.
+                */
+               if (ndlp->nlp_fc4_type & NLP_FC4_NVME) {
                        vport->phba->nport_event_cnt++;
-                       lpfc_nvme_unregister_port(vport, ndlp);
+                       if (vport->phba->nvmet_support == 0)
+                               lpfc_nvme_unregister_port(vport, ndlp);
                }
        }
 
@@ -4368,10 +4379,17 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
        uint32_t did;
        unsigned long flags;
        unsigned long *active_rrqs_xri_bitmap = NULL;
+       int rpi = LPFC_RPI_ALLOC_ERROR;
 
        if (!ndlp)
                return NULL;
 
+       if (phba->sli_rev == LPFC_SLI_REV4) {
+               rpi = lpfc_sli4_alloc_rpi(vport->phba);
+               if (rpi == LPFC_RPI_ALLOC_ERROR)
+                       return NULL;
+       }
+
        spin_lock_irqsave(&phba->ndlp_lock, flags);
        /* The ndlp should not be in memory free mode */
        if (NLP_CHK_FREE_REQ(ndlp)) {
@@ -4381,7 +4399,7 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
                                "usgmap:x%x refcnt:%d\n",
                                (void *)ndlp, ndlp->nlp_usg_map,
                                kref_read(&ndlp->kref));
-               return NULL;
+               goto free_rpi;
        }
        /* The ndlp should not already be in active mode */
        if (NLP_CHK_NODE_ACT(ndlp)) {
@@ -4391,7 +4409,7 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
                                "usgmap:x%x refcnt:%d\n",
                                (void *)ndlp, ndlp->nlp_usg_map,
                                kref_read(&ndlp->kref));
-               return NULL;
+               goto free_rpi;
        }
 
        /* Keep the original DID */
@@ -4409,7 +4427,7 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
 
        spin_unlock_irqrestore(&phba->ndlp_lock, flags);
        if (vport->phba->sli_rev == LPFC_SLI_REV4) {
-               ndlp->nlp_rpi = lpfc_sli4_alloc_rpi(vport->phba);
+               ndlp->nlp_rpi = rpi;
                lpfc_printf_vlog(vport, KERN_INFO, LOG_NODE,
                                 "0008 rpi:%x DID:%x flg:%x refcnt:%d "
                                 "map:%x %p\n", ndlp->nlp_rpi, ndlp->nlp_DID,
@@ -4426,6 +4444,11 @@ lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
                "node enable:       did:x%x",
                ndlp->nlp_DID, 0, 0);
        return ndlp;
+
+free_rpi:
+       if (phba->sli_rev == LPFC_SLI_REV4)
+               lpfc_sli4_free_rpi(vport->phba, rpi);
+       return NULL;
 }
 
 void
@@ -5104,65 +5127,82 @@ lpfc_setup_disc_node(struct lpfc_vport *vport, uint32_t did)
 
        ndlp = lpfc_findnode_did(vport, did);
        if (!ndlp) {
+               if (vport->phba->nvmet_support)
+                       return NULL;
                if ((vport->fc_flag & FC_RSCN_MODE) != 0 &&
                    lpfc_rscn_payload_check(vport, did) == 0)
                        return NULL;
-               ndlp = (struct lpfc_nodelist *)
-                    mempool_alloc(vport->phba->nlp_mem_pool, GFP_KERNEL);
+               ndlp = lpfc_nlp_init(vport, did);
                if (!ndlp)
                        return NULL;
-               lpfc_nlp_init(vport, ndlp, did);
                lpfc_nlp_set_state(vport, ndlp, NLP_STE_NPR_NODE);
-               if (vport->phba->nvmet_support)
-                       return ndlp;
                spin_lock_irq(shost->host_lock);
                ndlp->nlp_flag |= NLP_NPR_2B_DISC;
                spin_unlock_irq(shost->host_lock);
                return ndlp;
        } else if (!NLP_CHK_NODE_ACT(ndlp)) {
+               if (vport->phba->nvmet_support)
+                       return NULL;
                ndlp = lpfc_enable_node(vport, ndlp, NLP_STE_NPR_NODE);
                if (!ndlp)
                        return NULL;
-               if (vport->phba->nvmet_support)
-                       return ndlp;
                spin_lock_irq(shost->host_lock);
                ndlp->nlp_flag |= NLP_NPR_2B_DISC;
                spin_unlock_irq(shost->host_lock);
                return ndlp;
        }
 
+       /* The NVME Target does not want to actively manage an rport.
+        * The goal is to allow the target to reset its state and clear
+        * pending IO in preparation for the initiator to recover.
+        */
        if ((vport->fc_flag & FC_RSCN_MODE) &&
            !(vport->fc_flag & FC_NDISC_ACTIVE)) {
                if (lpfc_rscn_payload_check(vport, did)) {
-                       /* If we've already received a PLOGI from this NPort
-                        * we don't need to try to discover it again.
-                        */
-                       if (ndlp->nlp_flag & NLP_RCV_PLOGI)
-                               return NULL;
 
                        /* Since this node is marked for discovery,
                         * delay timeout is not needed.
                         */
                        lpfc_cancel_retry_delay_tmo(vport, ndlp);
+
+                       /* NVME Target mode waits until rport is known to be
+                        * impacted by the RSCN before it transitions.  No
+                        * active management - just go to NPR provided the
+                        * node had a valid login.
+                        */
                        if (vport->phba->nvmet_support)
                                return ndlp;
+
+                       /* If we've already received a PLOGI from this NPort
+                        * we don't need to try to discover it again.
+                        */
+                       if (ndlp->nlp_flag & NLP_RCV_PLOGI)
+                               return NULL;
+
                        spin_lock_irq(shost->host_lock);
                        ndlp->nlp_flag |= NLP_NPR_2B_DISC;
                        spin_unlock_irq(shost->host_lock);
                } else
                        ndlp = NULL;
        } else {
-               /* If we've already received a PLOGI from this NPort,
-                * or we are already in the process of discovery on it,
-                * we don't need to try to discover it again.
+               /* If the initiator received a PLOGI from this NPort or if the
+                * initiator is already in the process of discovery on it,
+                * there's no need to try to discover it again.
                 */
                if (ndlp->nlp_state == NLP_STE_ADISC_ISSUE ||
                    ndlp->nlp_state == NLP_STE_PLOGI_ISSUE ||
-                   ndlp->nlp_flag & NLP_RCV_PLOGI)
+                   (!vport->phba->nvmet_support &&
+                    ndlp->nlp_flag & NLP_RCV_PLOGI))
                        return NULL;
-               lpfc_nlp_set_state(vport, ndlp, NLP_STE_NPR_NODE);
+
                if (vport->phba->nvmet_support)
                        return ndlp;
+
+               /* Moving to NPR state clears unsolicited flags and
+                * allows for rediscovery
+                */
+               lpfc_nlp_set_state(vport, ndlp, NLP_STE_NPR_NODE);
+
                spin_lock_irq(shost->host_lock);
                ndlp->nlp_flag |= NLP_NPR_2B_DISC;
                spin_unlock_irq(shost->host_lock);
@@ -5887,16 +5927,31 @@ lpfc_find_vport_by_vpid(struct lpfc_hba *phba, uint16_t vpi)
        return NULL;
 }
 
-void
-lpfc_nlp_init(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
-             uint32_t did)
+struct lpfc_nodelist *
+lpfc_nlp_init(struct lpfc_vport *vport, uint32_t did)
 {
+       struct lpfc_nodelist *ndlp;
+       int rpi = LPFC_RPI_ALLOC_ERROR;
+
+       if (vport->phba->sli_rev == LPFC_SLI_REV4) {
+               rpi = lpfc_sli4_alloc_rpi(vport->phba);
+               if (rpi == LPFC_RPI_ALLOC_ERROR)
+                       return NULL;
+       }
+
+       ndlp = mempool_alloc(vport->phba->nlp_mem_pool, GFP_KERNEL);
+       if (!ndlp) {
+               if (vport->phba->sli_rev == LPFC_SLI_REV4)
+                       lpfc_sli4_free_rpi(vport->phba, rpi);
+               return NULL;
+       }
+
        memset(ndlp, 0, sizeof (struct lpfc_nodelist));
 
        lpfc_initialize_node(vport, ndlp, did);
        INIT_LIST_HEAD(&ndlp->nlp_listp);
        if (vport->phba->sli_rev == LPFC_SLI_REV4) {
-               ndlp->nlp_rpi = lpfc_sli4_alloc_rpi(vport->phba);
+               ndlp->nlp_rpi = rpi;
                lpfc_printf_vlog(vport, KERN_INFO, LOG_NODE,
                                 "0007 rpi:%x DID:%x flg:%x refcnt:%d "
                                 "map:%x %p\n", ndlp->nlp_rpi, ndlp->nlp_DID,
@@ -5918,7 +5973,7 @@ lpfc_nlp_init(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
                "node init:       did:x%x",
                ndlp->nlp_DID, 0, 0);
 
-       return;
+       return ndlp;
 }
 
 /* This routine releases all resources associated with a specifc NPort's ndlp
index 15ca214..26a5647 100644 (file)
@@ -509,6 +509,8 @@ struct class_parms {
        uint8_t word3Reserved2; /* Fc Word 3, bit  0: 7 */
 };
 
+#define FAPWWN_KEY_VENDOR      0x42524344 /*valid vendor version fawwpn key*/
+
 struct serv_parm {     /* Structure is in Big Endian format */
        struct csp cmn;
        struct lpfc_name portName;
@@ -2885,6 +2887,7 @@ struct lpfc_mbx_read_top {
 #define LPFC_ATT_RESERVED    0x00      /* Reserved - attType */
 #define LPFC_ATT_LINK_UP     0x01      /* Link is up */
 #define LPFC_ATT_LINK_DOWN   0x02      /* Link is down */
+#define LPFC_ATT_UNEXP_WWPN  0x06      /* Link is down Unexpected WWPN */
        uint32_t word3;
 #define lpfc_mbx_read_top_alpa_granted_SHIFT   24
 #define lpfc_mbx_read_top_alpa_granted_MASK    0x000000FF
index 1527770..1d12f2b 100644 (file)
@@ -2720,6 +2720,9 @@ struct lpfc_mbx_request_features {
 #define lpfc_mbx_rq_ftr_rq_ifip_SHIFT          7
 #define lpfc_mbx_rq_ftr_rq_ifip_MASK           0x00000001
 #define lpfc_mbx_rq_ftr_rq_ifip_WORD           word2
+#define lpfc_mbx_rq_ftr_rq_iaar_SHIFT          9
+#define lpfc_mbx_rq_ftr_rq_iaar_MASK           0x00000001
+#define lpfc_mbx_rq_ftr_rq_iaar_WORD           word2
 #define lpfc_mbx_rq_ftr_rq_perfh_SHIFT         11
 #define lpfc_mbx_rq_ftr_rq_perfh_MASK          0x00000001
 #define lpfc_mbx_rq_ftr_rq_perfh_WORD          word2
@@ -3853,6 +3856,7 @@ struct lpfc_acqe_fc_la {
 #define LPFC_FC_LA_TYPE_NO_HARD_ALPA   0x3
 #define LPFC_FC_LA_TYPE_MDS_LINK_DOWN  0x4
 #define LPFC_FC_LA_TYPE_MDS_LOOPBACK   0x5
+#define LPFC_FC_LA_TYPE_UNEXP_WWPN     0x6
 #define lpfc_acqe_fc_la_port_type_SHIFT                6
 #define lpfc_acqe_fc_la_port_type_MASK         0x00000003
 #define lpfc_acqe_fc_la_port_type_WORD         word0
index 6cc561b..90ae354 100644 (file)
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_host.h>
 #include <scsi/scsi_transport_fc.h>
+#include <scsi/scsi_tcq.h>
+#include <scsi/fc/fc_fs.h>
+
+#include <linux/nvme-fc-driver.h>
 
 #include "lpfc_hw4.h"
 #include "lpfc_hw.h"
@@ -52,6 +56,7 @@
 #include "lpfc.h"
 #include "lpfc_scsi.h"
 #include "lpfc_nvme.h"
+#include "lpfc_nvmet.h"
 #include "lpfc_logmsg.h"
 #include "lpfc_crtn.h"
 #include "lpfc_vport.h"
@@ -335,6 +340,9 @@ lpfc_dump_wakeup_param_cmpl(struct lpfc_hba *phba, LPFC_MBOXQ_t *pmboxq)
 void
 lpfc_update_vport_wwn(struct lpfc_vport *vport)
 {
+       uint8_t vvvl = vport->fc_sparam.cmn.valid_vendor_ver_level;
+       u32 *fawwpn_key = (u32 *)&vport->fc_sparam.un.vendorVersion[0];
+
        /* If the soft name exists then update it using the service params */
        if (vport->phba->cfg_soft_wwnn)
                u64_to_wwn(vport->phba->cfg_soft_wwnn,
@@ -354,9 +362,25 @@ lpfc_update_vport_wwn(struct lpfc_vport *vport)
                memcpy(&vport->fc_sparam.nodeName, &vport->fc_nodename,
                        sizeof(struct lpfc_name));
 
-       if (vport->fc_portname.u.wwn[0] == 0 || vport->phba->cfg_soft_wwpn)
+       /*
+        * If the port name has changed, then set the Param changes flag
+        * to unreg the login
+        */
+       if (vport->fc_portname.u.wwn[0] != 0 &&
+               memcmp(&vport->fc_portname, &vport->fc_sparam.portName,
+                       sizeof(struct lpfc_name)))
+               vport->vport_flag |= FAWWPN_PARAM_CHG;
+
+       if (vport->fc_portname.u.wwn[0] == 0 ||
+           vport->phba->cfg_soft_wwpn ||
+           (vvvl == 1 && cpu_to_be32(*fawwpn_key) == FAPWWN_KEY_VENDOR) ||
+           vport->vport_flag & FAWWPN_SET) {
                memcpy(&vport->fc_portname, &vport->fc_sparam.portName,
                        sizeof(struct lpfc_name));
+               vport->vport_flag &= ~FAWWPN_SET;
+               if (vvvl == 1 && cpu_to_be32(*fawwpn_key) == FAPWWN_KEY_VENDOR)
+                       vport->vport_flag |= FAWWPN_SET;
+       }
        else
                memcpy(&vport->fc_sparam.portName, &vport->fc_portname,
                        sizeof(struct lpfc_name));
@@ -1003,8 +1027,10 @@ static int
 lpfc_hba_down_post_s4(struct lpfc_hba *phba)
 {
        struct lpfc_scsi_buf *psb, *psb_next;
+       struct lpfc_nvmet_rcv_ctx *ctxp, *ctxp_next;
        LIST_HEAD(aborts);
        LIST_HEAD(nvme_aborts);
+       LIST_HEAD(nvmet_aborts);
        unsigned long iflag = 0;
        struct lpfc_sglq *sglq_entry = NULL;
 
@@ -1027,16 +1053,10 @@ lpfc_hba_down_post_s4(struct lpfc_hba *phba)
        list_for_each_entry(sglq_entry,
                &phba->sli4_hba.lpfc_abts_els_sgl_list, list)
                sglq_entry->state = SGL_FREED;
-       list_for_each_entry(sglq_entry,
-               &phba->sli4_hba.lpfc_abts_nvmet_sgl_list, list)
-               sglq_entry->state = SGL_FREED;
 
        list_splice_init(&phba->sli4_hba.lpfc_abts_els_sgl_list,
                        &phba->sli4_hba.lpfc_els_sgl_list);
 
-       if (phba->sli4_hba.nvme_wq)
-               list_splice_init(&phba->sli4_hba.lpfc_abts_nvmet_sgl_list,
-                                &phba->sli4_hba.lpfc_nvmet_sgl_list);
 
        spin_unlock(&phba->sli4_hba.sgl_list_lock);
        /* abts_scsi_buf_list_lock required because worker thread uses this
@@ -1053,6 +1073,8 @@ lpfc_hba_down_post_s4(struct lpfc_hba *phba)
                spin_lock(&phba->sli4_hba.abts_nvme_buf_list_lock);
                list_splice_init(&phba->sli4_hba.lpfc_abts_nvme_buf_list,
                                 &nvme_aborts);
+               list_splice_init(&phba->sli4_hba.lpfc_abts_nvmet_ctx_list,
+                                &nvmet_aborts);
                spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock);
        }
 
@@ -1066,13 +1088,20 @@ lpfc_hba_down_post_s4(struct lpfc_hba *phba)
        list_splice(&aborts, &phba->lpfc_scsi_buf_list_put);
        spin_unlock_irqrestore(&phba->scsi_buf_list_put_lock, iflag);
 
-       list_for_each_entry_safe(psb, psb_next, &nvme_aborts, list) {
-               psb->pCmd = NULL;
-               psb->status = IOSTAT_SUCCESS;
+       if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME) {
+               list_for_each_entry_safe(psb, psb_next, &nvme_aborts, list) {
+                       psb->pCmd = NULL;
+                       psb->status = IOSTAT_SUCCESS;
+               }
+               spin_lock_irqsave(&phba->nvme_buf_list_put_lock, iflag);
+               list_splice(&nvme_aborts, &phba->lpfc_nvme_buf_list_put);
+               spin_unlock_irqrestore(&phba->nvme_buf_list_put_lock, iflag);
+
+               list_for_each_entry_safe(ctxp, ctxp_next, &nvmet_aborts, list) {
+                       ctxp->flag &= ~(LPFC_NVMET_XBUSY | LPFC_NVMET_ABORT_OP);
+                       lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+               }
        }
-       spin_lock_irqsave(&phba->nvme_buf_list_put_lock, iflag);
-       list_splice(&nvme_aborts, &phba->lpfc_nvme_buf_list_put);
-       spin_unlock_irqrestore(&phba->nvme_buf_list_put_lock, iflag);
 
        lpfc_sli4_free_sp_events(phba);
        return 0;
@@ -2874,34 +2903,38 @@ lpfc_sli4_node_prep(struct lpfc_hba *phba)
 {
        struct lpfc_nodelist  *ndlp, *next_ndlp;
        struct lpfc_vport **vports;
-       int i;
+       int i, rpi;
+       unsigned long flags;
 
        if (phba->sli_rev != LPFC_SLI_REV4)
                return;
 
        vports = lpfc_create_vport_work_array(phba);
-       if (vports != NULL) {
-               for (i = 0; i <= phba->max_vports && vports[i] != NULL; i++) {
-                       if (vports[i]->load_flag & FC_UNLOADING)
-                               continue;
+       if (vports == NULL)
+               return;
 
-                       list_for_each_entry_safe(ndlp, next_ndlp,
-                                                &vports[i]->fc_nodes,
-                                                nlp_listp) {
-                               if (NLP_CHK_NODE_ACT(ndlp)) {
-                                       ndlp->nlp_rpi =
-                                               lpfc_sli4_alloc_rpi(phba);
-                                       lpfc_printf_vlog(ndlp->vport, KERN_INFO,
-                                                        LOG_NODE,
-                                                        "0009 rpi:%x DID:%x "
-                                                        "flg:%x map:%x %p\n",
-                                                        ndlp->nlp_rpi,
-                                                        ndlp->nlp_DID,
-                                                        ndlp->nlp_flag,
-                                                        ndlp->nlp_usg_map,
-                                                        ndlp);
-                               }
+       for (i = 0; i <= phba->max_vports && vports[i] != NULL; i++) {
+               if (vports[i]->load_flag & FC_UNLOADING)
+                       continue;
+
+               list_for_each_entry_safe(ndlp, next_ndlp,
+                                        &vports[i]->fc_nodes,
+                                        nlp_listp) {
+                       if (!NLP_CHK_NODE_ACT(ndlp))
+                               continue;
+                       rpi = lpfc_sli4_alloc_rpi(phba);
+                       if (rpi == LPFC_RPI_ALLOC_ERROR) {
+                               spin_lock_irqsave(&phba->ndlp_lock, flags);
+                               NLP_CLR_NODE_ACT(ndlp);
+                               spin_unlock_irqrestore(&phba->ndlp_lock, flags);
+                               continue;
                        }
+                       ndlp->nlp_rpi = rpi;
+                       lpfc_printf_vlog(ndlp->vport, KERN_INFO, LOG_NODE,
+                                        "0009 rpi:%x DID:%x "
+                                        "flg:%x map:%x %p\n", ndlp->nlp_rpi,
+                                        ndlp->nlp_DID, ndlp->nlp_flag,
+                                        ndlp->nlp_usg_map, ndlp);
                }
        }
        lpfc_destroy_vport_work_array(phba, vports);
@@ -3508,6 +3541,12 @@ lpfc_sli4_scsi_sgl_update(struct lpfc_hba *phba)
        spin_unlock(&phba->scsi_buf_list_put_lock);
        spin_unlock_irq(&phba->scsi_buf_list_get_lock);
 
+       lpfc_printf_log(phba, KERN_INFO, LOG_SLI,
+                       "6060 Current allocated SCSI xri-sgl count:%d, "
+                       "maximum  SCSI xri count:%d (split:%d)\n",
+                       phba->sli4_hba.scsi_xri_cnt,
+                       phba->sli4_hba.scsi_xri_max, phba->cfg_xri_split);
+
        if (phba->sli4_hba.scsi_xri_cnt > phba->sli4_hba.scsi_xri_max) {
                /* max scsi xri shrinked below the allocated scsi buffers */
                scsi_xri_cnt = phba->sli4_hba.scsi_xri_cnt -
@@ -4508,9 +4547,15 @@ lpfc_sli4_async_fc_evt(struct lpfc_hba *phba, struct lpfc_acqe_fc_la *acqe_fc)
                /* Parse and translate link attention fields */
                la = (struct lpfc_mbx_read_top *)&pmb->u.mb.un.varReadTop;
                la->eventTag = acqe_fc->event_tag;
-               bf_set(lpfc_mbx_read_top_att_type, la,
-                      LPFC_FC_LA_TYPE_LINK_DOWN);
 
+               if (phba->sli4_hba.link_state.status ==
+                   LPFC_FC_LA_TYPE_UNEXP_WWPN) {
+                       bf_set(lpfc_mbx_read_top_att_type, la,
+                              LPFC_FC_LA_TYPE_UNEXP_WWPN);
+               } else {
+                       bf_set(lpfc_mbx_read_top_att_type, la,
+                              LPFC_FC_LA_TYPE_LINK_DOWN);
+               }
                /* Invoke the mailbox command callback function */
                lpfc_mbx_cmpl_read_topology(phba, pmb);
 
@@ -4716,10 +4761,9 @@ lpfc_sli4_perform_vport_cvl(struct lpfc_vport *vport)
        ndlp = lpfc_findnode_did(vport, Fabric_DID);
        if (!ndlp) {
                /* Cannot find existing Fabric ndlp, so allocate a new one */
-               ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+               ndlp = lpfc_nlp_init(vport, Fabric_DID);
                if (!ndlp)
                        return 0;
-               lpfc_nlp_init(vport, ndlp, Fabric_DID);
                /* Set the node type */
                ndlp->nlp_type |= NLP_FABRIC;
                /* Put ndlp onto node list */
@@ -5778,6 +5822,7 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba)
                /* Initialize the Abort nvme buffer list used by driver */
                spin_lock_init(&phba->sli4_hba.abts_nvme_buf_list_lock);
                INIT_LIST_HEAD(&phba->sli4_hba.lpfc_abts_nvme_buf_list);
+               INIT_LIST_HEAD(&phba->sli4_hba.lpfc_abts_nvmet_ctx_list);
                /* Fast-path XRI aborted CQ Event work queue list */
                INIT_LIST_HEAD(&phba->sli4_hba.sp_nvme_xri_aborted_work_queue);
        }
@@ -5809,6 +5854,12 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba)
        INIT_LIST_HEAD(&phba->sli4_hba.lpfc_vfi_blk_list);
        INIT_LIST_HEAD(&phba->lpfc_vpi_blk_list);
 
+       /* Initialize mboxq lists. If the early init routines fail
+        * these lists need to be correctly initialized.
+        */
+       INIT_LIST_HEAD(&phba->sli.mboxq);
+       INIT_LIST_HEAD(&phba->sli.mboxq_cmpl);
+
        /* initialize optic_state to 0xFF */
        phba->sli4_hba.lnk_info.optic_state = 0xff;
 
@@ -5874,6 +5925,7 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba)
                                        "READ_NV, mbxStatus x%x\n",
                                        bf_get(lpfc_mqe_command, &mboxq->u.mqe),
                                        bf_get(lpfc_mqe_status, &mboxq->u.mqe));
+                       mempool_free(mboxq, phba->mbox_mem_pool);
                        rc = -EIO;
                        goto out_free_bsmbx;
                }
@@ -6398,7 +6450,7 @@ lpfc_init_sgl_list(struct lpfc_hba *phba)
        INIT_LIST_HEAD(&phba->sli4_hba.lpfc_els_sgl_list);
        INIT_LIST_HEAD(&phba->sli4_hba.lpfc_abts_els_sgl_list);
        INIT_LIST_HEAD(&phba->sli4_hba.lpfc_nvmet_sgl_list);
-       INIT_LIST_HEAD(&phba->sli4_hba.lpfc_abts_nvmet_sgl_list);
+       INIT_LIST_HEAD(&phba->sli4_hba.lpfc_abts_nvmet_ctx_list);
 
        /* els xri-sgl book keeping */
        phba->sli4_hba.els_xri_cnt = 0;
@@ -7799,7 +7851,7 @@ lpfc_alloc_fcp_wq_cq(struct lpfc_hba *phba, int wqidx)
 
        /* Create Fast Path FCP WQs */
        wqesize = (phba->fcp_embed_io) ?
-                               LPFC_WQE128_SIZE : phba->sli4_hba.wq_esize;
+               LPFC_WQE128_SIZE : phba->sli4_hba.wq_esize;
        qdesc = lpfc_sli4_queue_alloc(phba, wqesize, phba->sli4_hba.wq_ecount);
        if (!qdesc) {
                lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
@@ -7830,7 +7882,7 @@ int
 lpfc_sli4_queue_create(struct lpfc_hba *phba)
 {
        struct lpfc_queue *qdesc;
-       int idx, io_channel, max;
+       int idx, io_channel;
 
        /*
         * Create HBA Record arrays.
@@ -7991,15 +8043,6 @@ lpfc_sli4_queue_create(struct lpfc_hba *phba)
                if (lpfc_alloc_nvme_wq_cq(phba, idx))
                        goto out_error;
 
-       /* allocate MRQ CQs */
-       max = phba->cfg_nvme_io_channel;
-       if (max < phba->cfg_nvmet_mrq)
-               max = phba->cfg_nvmet_mrq;
-
-       for (idx = 0; idx < max; idx++)
-               if (lpfc_alloc_nvme_wq_cq(phba, idx))
-                       goto out_error;
-
        if (phba->nvmet_support) {
                for (idx = 0; idx < phba->cfg_nvmet_mrq; idx++) {
                        qdesc = lpfc_sli4_queue_alloc(phba,
@@ -8221,11 +8264,11 @@ lpfc_sli4_queue_destroy(struct lpfc_hba *phba)
 
        /* Release FCP cqs */
        lpfc_sli4_release_queues(&phba->sli4_hba.fcp_cq,
-                                       phba->cfg_fcp_io_channel);
+                                phba->cfg_fcp_io_channel);
 
        /* Release FCP wqs */
        lpfc_sli4_release_queues(&phba->sli4_hba.fcp_wq,
-                                       phba->cfg_fcp_io_channel);
+                                phba->cfg_fcp_io_channel);
 
        /* Release FCP CQ mapping array */
        lpfc_sli4_release_queue_map(&phba->sli4_hba.fcp_cq_map);
@@ -8571,15 +8614,15 @@ lpfc_sli4_queue_setup(struct lpfc_hba *phba)
                lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
                                "0528 %s not allocated\n",
                                phba->sli4_hba.mbx_cq ?
-                                               "Mailbox WQ" : "Mailbox CQ");
+                               "Mailbox WQ" : "Mailbox CQ");
                rc = -ENOMEM;
                goto out_destroy;
        }
 
        rc = lpfc_create_wq_cq(phba, phba->sli4_hba.hba_eq[0],
-                                       phba->sli4_hba.mbx_cq,
-                                       phba->sli4_hba.mbx_wq,
-                                       NULL, 0, LPFC_MBOX);
+                              phba->sli4_hba.mbx_cq,
+                              phba->sli4_hba.mbx_wq,
+                              NULL, 0, LPFC_MBOX);
        if (rc) {
                lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
                        "0529 Failed setup of mailbox WQ/CQ: rc = 0x%x\n",
@@ -9934,17 +9977,19 @@ lpfc_sli4_xri_exchange_busy_wait(struct lpfc_hba *phba)
 {
        int wait_time = 0;
        int nvme_xri_cmpl = 1;
+       int nvmet_xri_cmpl = 1;
        int fcp_xri_cmpl = 1;
        int els_xri_cmpl = list_empty(&phba->sli4_hba.lpfc_abts_els_sgl_list);
-       int nvmet_xri_cmpl =
-                       list_empty(&phba->sli4_hba.lpfc_abts_nvmet_sgl_list);
 
        if (phba->cfg_enable_fc4_type & LPFC_ENABLE_FCP)
                fcp_xri_cmpl =
                        list_empty(&phba->sli4_hba.lpfc_abts_scsi_buf_list);
-       if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME)
+       if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME) {
                nvme_xri_cmpl =
                        list_empty(&phba->sli4_hba.lpfc_abts_nvme_buf_list);
+               nvmet_xri_cmpl =
+                       list_empty(&phba->sli4_hba.lpfc_abts_nvmet_ctx_list);
+       }
 
        while (!fcp_xri_cmpl || !els_xri_cmpl || !nvme_xri_cmpl ||
               !nvmet_xri_cmpl) {
@@ -9970,9 +10015,12 @@ lpfc_sli4_xri_exchange_busy_wait(struct lpfc_hba *phba)
                        msleep(LPFC_XRI_EXCH_BUSY_WAIT_T1);
                        wait_time += LPFC_XRI_EXCH_BUSY_WAIT_T1;
                }
-               if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME)
+               if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME) {
                        nvme_xri_cmpl = list_empty(
                                &phba->sli4_hba.lpfc_abts_nvme_buf_list);
+                       nvmet_xri_cmpl = list_empty(
+                               &phba->sli4_hba.lpfc_abts_nvmet_ctx_list);
+               }
 
                if (phba->cfg_enable_fc4_type & LPFC_ENABLE_FCP)
                        fcp_xri_cmpl = list_empty(
@@ -9981,8 +10029,6 @@ lpfc_sli4_xri_exchange_busy_wait(struct lpfc_hba *phba)
                els_xri_cmpl =
                        list_empty(&phba->sli4_hba.lpfc_abts_els_sgl_list);
 
-               nvmet_xri_cmpl =
-                       list_empty(&phba->sli4_hba.lpfc_abts_nvmet_sgl_list);
        }
 }
 
@@ -10048,9 +10094,14 @@ lpfc_sli4_hba_unset(struct lpfc_hba *phba)
        /* Stop kthread signal shall trigger work_done one more time */
        kthread_stop(phba->worker_thread);
 
+       /* Unset the queues shared with the hardware then release all
+        * allocated resources.
+        */
+       lpfc_sli4_queue_unset(phba);
+       lpfc_sli4_queue_destroy(phba);
+
        /* Reset SLI4 HBA FCoE function */
        lpfc_pci_function_reset(phba);
-       lpfc_sli4_queue_destroy(phba);
 
        /* Stop the SLI4 device port */
        phba->pport->work_port_events = 0;
@@ -10306,6 +10357,7 @@ lpfc_pci_probe_one_s3(struct pci_dev *pdev, const struct pci_device_id *pid)
        }
 
        /* Initialize and populate the iocb list per host */
+
        error = lpfc_init_iocb_list(phba, LPFC_IOCB_LIST_CNT);
        if (error) {
                lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
@@ -11051,7 +11103,7 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid)
        struct lpfc_hba   *phba;
        struct lpfc_vport *vport = NULL;
        struct Scsi_Host  *shost = NULL;
-       int error;
+       int error, cnt;
        uint32_t cfg_mode, intr_mode;
 
        /* Allocate memory for HBA structure */
@@ -11085,12 +11137,15 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid)
                goto out_unset_pci_mem_s4;
        }
 
-       /* Initialize and populate the iocb list per host */
+       cnt = phba->cfg_iocb_cnt * 1024;
+       if (phba->nvmet_support)
+               cnt += phba->cfg_nvmet_mrq_post * phba->cfg_nvmet_mrq;
 
+       /* Initialize and populate the iocb list per host */
        lpfc_printf_log(phba, KERN_INFO, LOG_INIT,
-                       "2821 initialize iocb list %d.\n",
-                       phba->cfg_iocb_cnt*1024);
-       error = lpfc_init_iocb_list(phba, phba->cfg_iocb_cnt*1024);
+                       "2821 initialize iocb list %d total %d\n",
+                       phba->cfg_iocb_cnt, cnt);
+       error = lpfc_init_iocb_list(phba, cnt);
 
        if (error) {
                lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
@@ -11177,7 +11232,9 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid)
        if ((phba->nvmet_support == 0) &&
            (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME)) {
                /* Create NVME binding with nvme_fc_transport. This
-                * ensures the vport is initialized.
+                * ensures the vport is initialized.  If the localport
+                * create fails, do not unload the driver; keeping it
+                * loaded helps support field issues.
                 */
                error = lpfc_nvme_create_localport(vport);
                if (error) {
@@ -11185,7 +11242,6 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid)
                                        "6004 NVME registration failed, "
                                        "error x%x\n",
                                        error);
-                       goto out_disable_intr;
                }
        }
 
@@ -11984,6 +12040,7 @@ int
 lpfc_fof_queue_create(struct lpfc_hba *phba)
 {
        struct lpfc_queue *qdesc;
+       uint32_t wqesize;
 
        /* Create FOF EQ */
        qdesc = lpfc_sli4_queue_alloc(phba, phba->sli4_hba.eq_esize,
@@ -12004,8 +12061,11 @@ lpfc_fof_queue_create(struct lpfc_hba *phba)
                phba->sli4_hba.oas_cq = qdesc;
 
                /* Create OAS WQ */
-               qdesc = lpfc_sli4_queue_alloc(phba, phba->sli4_hba.wq_esize,
+               wqesize = (phba->fcp_embed_io) ?
+                               LPFC_WQE128_SIZE : phba->sli4_hba.wq_esize;
+               qdesc = lpfc_sli4_queue_alloc(phba, wqesize,
                                              phba->sli4_hba.wq_ecount);
+
                if (!qdesc)
                        goto out_error;
 
index a928f51..ce25a18 100644 (file)
@@ -2083,9 +2083,12 @@ lpfc_request_features(struct lpfc_hba *phba, struct lpfcMboxq *mboxq)
        if (phba->max_vpi && phba->cfg_enable_npiv)
                bf_set(lpfc_mbx_rq_ftr_rq_npiv, &mboxq->u.mqe.un.req_ftrs, 1);
 
-       if (phba->nvmet_support)
+       if (phba->nvmet_support) {
                bf_set(lpfc_mbx_rq_ftr_rq_mrqp, &mboxq->u.mqe.un.req_ftrs, 1);
-
+               /* iaab/iaar NOT set for now */
+                bf_set(lpfc_mbx_rq_ftr_rq_iaab, &mboxq->u.mqe.un.req_ftrs, 0);
+                bf_set(lpfc_mbx_rq_ftr_rq_iaar, &mboxq->u.mqe.un.req_ftrs, 0);
+       }
        return;
 }
 
index 061626b..8777c2d 100644 (file)
@@ -361,8 +361,12 @@ lpfc_rcv_plogi(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
        case  NLP_STE_PRLI_ISSUE:
        case  NLP_STE_UNMAPPED_NODE:
        case  NLP_STE_MAPPED_NODE:
-               /* lpfc_plogi_confirm_nport skips fabric did, handle it here */
-               if (!(ndlp->nlp_type & NLP_FABRIC)) {
+               /* For initiators, lpfc_plogi_confirm_nport skips fabric did.
+                * For target mode, execute implicit logo.
+                * Fabric nodes go into NPR.
+                */
+               if (!(ndlp->nlp_type & NLP_FABRIC) &&
+                   !(phba->nvmet_support)) {
                        lpfc_els_rsp_acc(vport, ELS_CMD_PLOGI, cmdiocb,
                                         ndlp, NULL);
                        return 1;
index 0024de1..8008c82 100644 (file)
@@ -401,6 +401,7 @@ lpfc_nvme_ls_req(struct nvme_fc_local_port *pnvme_lport,
        struct lpfc_nodelist *ndlp;
        struct ulp_bde64 *bpl;
        struct lpfc_dmabuf *bmp;
+       uint16_t ntype, nstate;
 
        /* there are two dma buf in the request, actually there is one and
         * the second one is just the start address + cmd size.
@@ -417,11 +418,26 @@ lpfc_nvme_ls_req(struct nvme_fc_local_port *pnvme_lport,
        vport = lport->vport;
 
        ndlp = lpfc_findnode_did(vport, pnvme_rport->port_id);
-       if (!ndlp) {
-               lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_DISC,
-                                "6043 Could not find node for DID %x\n",
+       if (!ndlp || !NLP_CHK_NODE_ACT(ndlp)) {
+               lpfc_printf_vlog(vport, KERN_ERR, LOG_NODE | LOG_NVME_IOERR,
+                                "6051 DID x%06x not an active rport.\n",
                                 pnvme_rport->port_id);
-               return 1;
+               return -ENODEV;
+       }
+
+       /* The remote node has to be a mapped nvme target or an
+        * unmapped nvme initiator or it's an error.
+        */
+       ntype = ndlp->nlp_type;
+       nstate = ndlp->nlp_state;
+       if ((ntype & NLP_NVME_TARGET && nstate != NLP_STE_MAPPED_NODE) ||
+           (ntype & NLP_NVME_INITIATOR && nstate != NLP_STE_UNMAPPED_NODE)) {
+               lpfc_printf_vlog(vport, KERN_ERR, LOG_NODE | LOG_NVME_IOERR,
+                                "6088 DID x%06x not ready for "
+                                "IO. State x%x, Type x%x\n",
+                                pnvme_rport->port_id,
+                                ndlp->nlp_state, ndlp->nlp_type);
+               return -ENODEV;
        }
        bmp = kmalloc(sizeof(struct lpfc_dmabuf), GFP_KERNEL);
        if (!bmp) {
@@ -456,7 +472,7 @@ lpfc_nvme_ls_req(struct nvme_fc_local_port *pnvme_lport,
 
        /* Expand print to include key fields. */
        lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME_DISC,
-                        "6051 ENTER.  lport %p, rport %p lsreq%p rqstlen:%d "
+                        "6149 ENTER.  lport %p, rport %p lsreq%p rqstlen:%d "
                         "rsplen:%d %pad %pad\n",
                         pnvme_lport, pnvme_rport,
                         pnvme_lsreq, pnvme_lsreq->rqstlen,
@@ -745,6 +761,7 @@ lpfc_nvme_io_cmd_wqe_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *pwqeIn,
        struct nvme_fc_cmd_iu *cp;
        struct lpfc_nvme_rport *rport;
        struct lpfc_nodelist *ndlp;
+       struct lpfc_nvme_fcpreq_priv *freqpriv;
        unsigned long flags;
        uint32_t code;
        uint16_t cid, sqhd, data;
@@ -772,9 +789,8 @@ lpfc_nvme_io_cmd_wqe_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *pwqeIn,
        ndlp = rport->ndlp;
        if (!ndlp || !NLP_CHK_NODE_ACT(ndlp)) {
                lpfc_printf_vlog(vport, KERN_ERR, LOG_NODE | LOG_NVME_IOERR,
-                                "6061 rport %p, ndlp %p, DID x%06x ndlp "
-                                "not ready.\n",
-                                rport, ndlp, rport->remoteport->port_id);
+                                "6061 rport %p,  DID x%06x node not ready.\n",
+                                rport, rport->remoteport->port_id);
 
                ndlp = lpfc_findnode_did(vport, rport->remoteport->port_id);
                if (!ndlp) {
@@ -853,15 +869,18 @@ lpfc_nvme_io_cmd_wqe_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *pwqeIn,
                                break;
                        lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_IOERR,
                                         "6081 NVME Completion Protocol Error: "
-                                        "status x%x result x%x placed x%x\n",
+                                        "xri %x status x%x result x%x "
+                                        "placed x%x\n",
+                                        lpfc_ncmd->cur_iocbq.sli4_xritag,
                                         lpfc_ncmd->status, lpfc_ncmd->result,
                                         wcqe->total_data_placed);
                        break;
                default:
 out_err:
                        lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_IOERR,
-                                        "6072 NVME Completion Error: "
+                                        "6072 NVME Completion Error: xri %x "
                                         "status x%x result x%x placed x%x\n",
+                                        lpfc_ncmd->cur_iocbq.sli4_xritag,
                                         lpfc_ncmd->status, lpfc_ncmd->result,
                                         wcqe->total_data_placed);
                        nCmd->transferred_length = 0;
@@ -900,6 +919,8 @@ out_err:
                        phba->cpucheck_cmpl_io[lpfc_ncmd->cpu]++;
        }
 #endif
+       freqpriv = nCmd->private;
+       freqpriv->nvme_buf = NULL;
        nCmd->done(nCmd);
 
        spin_lock_irqsave(&phba->hbalock, flags);
@@ -1099,12 +1120,12 @@ lpfc_nvme_prep_io_dma(struct lpfc_vport *vport,
 
                first_data_sgl = sgl;
                lpfc_ncmd->seg_cnt = nCmd->sg_cnt;
-               if (lpfc_ncmd->seg_cnt > phba->cfg_sg_seg_cnt) {
+               if (lpfc_ncmd->seg_cnt > phba->cfg_nvme_seg_cnt) {
                        lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR,
                                        "6058 Too many sg segments from "
                                        "NVME Transport.  Max %d, "
                                        "nvmeIO sg_cnt %d\n",
-                                       phba->cfg_sg_seg_cnt,
+                                       phba->cfg_nvme_seg_cnt,
                                        lpfc_ncmd->seg_cnt);
                        lpfc_ncmd->seg_cnt = 0;
                        return 1;
@@ -1196,6 +1217,7 @@ lpfc_nvme_fcp_io_submit(struct nvme_fc_local_port *pnvme_lport,
        struct lpfc_nvme_buf *lpfc_ncmd;
        struct lpfc_nvme_rport *rport;
        struct lpfc_nvme_qhandle *lpfc_queue_info;
+       struct lpfc_nvme_fcpreq_priv *freqpriv = pnvme_fcreq->private;
 #ifdef CONFIG_SCSI_LPFC_DEBUG_FS
        uint64_t start = 0;
 #endif
@@ -1274,7 +1296,7 @@ lpfc_nvme_fcp_io_submit(struct nvme_fc_local_port *pnvme_lport,
         * Do not let the IO hang out forever.  There is no midlayer issuing
         * an abort so inform the FW of the maximum IO pending time.
         */
-       pnvme_fcreq->private = (void *)lpfc_ncmd;
+       freqpriv->nvme_buf = lpfc_ncmd;
        lpfc_ncmd->nvmeCmd = pnvme_fcreq;
        lpfc_ncmd->nrport = rport;
        lpfc_ncmd->ndlp = ndlp;
@@ -1404,6 +1426,7 @@ lpfc_nvme_fcp_abort(struct nvme_fc_local_port *pnvme_lport,
        struct lpfc_nvme_buf *lpfc_nbuf;
        struct lpfc_iocbq *abts_buf;
        struct lpfc_iocbq *nvmereq_wqe;
+       struct lpfc_nvme_fcpreq_priv *freqpriv = pnvme_fcreq->private;
        union lpfc_wqe *abts_wqe;
        unsigned long flags;
        int ret_val;
@@ -1414,7 +1437,7 @@ lpfc_nvme_fcp_abort(struct nvme_fc_local_port *pnvme_lport,
        phba = vport->phba;
 
        /* Announce entry to new IO submit field. */
-       lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
+       lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME_ABTS,
                         "6002 Abort Request to rport DID x%06x "
                         "for nvme_fc_req %p\n",
                         pnvme_rport->port_id,
@@ -1444,7 +1467,7 @@ lpfc_nvme_fcp_abort(struct nvme_fc_local_port *pnvme_lport,
        /* The remote node has to be ready to send an abort. */
        if ((ndlp->nlp_state != NLP_STE_MAPPED_NODE) &&
            !(ndlp->nlp_type & NLP_NVME_TARGET)) {
-               lpfc_printf_vlog(vport, KERN_ERR, LOG_NODE | LOG_NVME_ABTS,
+               lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
                                 "6048 rport %p, DID x%06x not ready for "
                                 "IO. State x%x, Type x%x\n",
                                 rport, pnvme_rport->port_id,
@@ -1459,27 +1482,28 @@ lpfc_nvme_fcp_abort(struct nvme_fc_local_port *pnvme_lport,
        /* driver queued commands are in process of being flushed */
        if (phba->hba_flag & HBA_NVME_IOQ_FLUSH) {
                spin_unlock_irqrestore(&phba->hbalock, flags);
-               lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME,
+               lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
                                 "6139 Driver in reset cleanup - flushing "
                                 "NVME Req now.  hba_flag x%x\n",
                                 phba->hba_flag);
                return;
        }
 
-       lpfc_nbuf = (struct lpfc_nvme_buf *)pnvme_fcreq->private;
+       lpfc_nbuf = freqpriv->nvme_buf;
        if (!lpfc_nbuf) {
                spin_unlock_irqrestore(&phba->hbalock, flags);
-               lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME,
+               lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
                                 "6140 NVME IO req has no matching lpfc nvme "
                                 "io buffer.  Skipping abort req.\n");
                return;
        } else if (!lpfc_nbuf->nvmeCmd) {
                spin_unlock_irqrestore(&phba->hbalock, flags);
-               lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME,
+               lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
                                 "6141 lpfc NVME IO req has no nvme_fcreq "
                                 "io buffer.  Skipping abort req.\n");
                return;
        }
+       nvmereq_wqe = &lpfc_nbuf->cur_iocbq;
 
        /*
         * The lpfc_nbuf and the mapped nvme_fcreq in the driver's
@@ -1490,23 +1514,22 @@ lpfc_nvme_fcp_abort(struct nvme_fc_local_port *pnvme_lport,
         */
        if (lpfc_nbuf->nvmeCmd != pnvme_fcreq) {
                spin_unlock_irqrestore(&phba->hbalock, flags);
-               lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME,
+               lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
                                 "6143 NVME req mismatch: "
                                 "lpfc_nbuf %p nvmeCmd %p, "
-                                "pnvme_fcreq %p.  Skipping Abort\n",
+                                "pnvme_fcreq %p.  Skipping Abort xri x%x\n",
                                 lpfc_nbuf, lpfc_nbuf->nvmeCmd,
-                                pnvme_fcreq);
+                                pnvme_fcreq, nvmereq_wqe->sli4_xritag);
                return;
        }
 
        /* Don't abort IOs no longer on the pending queue. */
-       nvmereq_wqe = &lpfc_nbuf->cur_iocbq;
        if (!(nvmereq_wqe->iocb_flag & LPFC_IO_ON_TXCMPLQ)) {
                spin_unlock_irqrestore(&phba->hbalock, flags);
-               lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME,
+               lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
                                 "6142 NVME IO req %p not queued - skipping "
-                                "abort req\n",
-                                pnvme_fcreq);
+                                "abort req xri x%x\n",
+                                pnvme_fcreq, nvmereq_wqe->sli4_xritag);
                return;
        }
 
@@ -1517,21 +1540,22 @@ lpfc_nvme_fcp_abort(struct nvme_fc_local_port *pnvme_lport,
        /* Outstanding abort is in progress */
        if (nvmereq_wqe->iocb_flag & LPFC_DRIVER_ABORTED) {
                spin_unlock_irqrestore(&phba->hbalock, flags);
-               lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME,
+               lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
                                 "6144 Outstanding NVME I/O Abort Request "
                                 "still pending on nvme_fcreq %p, "
-                                "lpfc_ncmd %p\n",
-                                pnvme_fcreq, lpfc_nbuf);
+                                "lpfc_ncmd %p xri x%x\n",
+                                pnvme_fcreq, lpfc_nbuf,
+                                nvmereq_wqe->sli4_xritag);
                return;
        }
 
        abts_buf = __lpfc_sli_get_iocbq(phba);
        if (!abts_buf) {
                spin_unlock_irqrestore(&phba->hbalock, flags);
-               lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME,
+               lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
                                 "6136 No available abort wqes. Skipping "
-                                "Abts req for nvme_fcreq %p.\n",
-                                pnvme_fcreq);
+                                "Abts req for nvme_fcreq %p xri x%x\n",
+                                pnvme_fcreq, nvmereq_wqe->sli4_xritag);
                return;
        }
 
@@ -1580,7 +1604,7 @@ lpfc_nvme_fcp_abort(struct nvme_fc_local_port *pnvme_lport,
        ret_val = lpfc_sli4_issue_wqe(phba, LPFC_FCP_RING, abts_buf);
        spin_unlock_irqrestore(&phba->hbalock, flags);
        if (ret_val == IOCB_ERROR) {
-               lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME,
+               lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_ABTS,
                                 "6137 Failed abts issue_wqe with status x%x "
                                 "for nvme_fcreq %p.\n",
                                 ret_val, pnvme_fcreq);
@@ -1588,8 +1612,8 @@ lpfc_nvme_fcp_abort(struct nvme_fc_local_port *pnvme_lport,
                return;
        }
 
-       lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME,
-                        "6138 Transport Abort NVME Request Issued for\n"
+       lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME_ABTS,
+                        "6138 Transport Abort NVME Request Issued for "
                         "ox_id x%x on reqtag x%x\n",
                         nvmereq_wqe->sli4_xritag,
                         abts_buf->iotag);
@@ -1618,7 +1642,7 @@ static struct nvme_fc_port_template lpfc_nvme_template = {
        .local_priv_sz = sizeof(struct lpfc_nvme_lport),
        .remote_priv_sz = sizeof(struct lpfc_nvme_rport),
        .lsrqst_priv_sz = 0,
-       .fcprqst_priv_sz = 0,
+       .fcprqst_priv_sz = sizeof(struct lpfc_nvme_fcpreq_priv),
 };
 
 /**
@@ -2049,7 +2073,7 @@ lpfc_get_nvme_buf(struct lpfc_hba *phba, struct lpfc_nodelist *ndlp)
                if (lpfc_test_rrq_active(phba, ndlp,
                                         lpfc_ncmd->cur_iocbq.sli4_lxritag))
                        continue;
-               list_del(&lpfc_ncmd->list);
+               list_del_init(&lpfc_ncmd->list);
                found = 1;
                break;
        }
@@ -2064,7 +2088,7 @@ lpfc_get_nvme_buf(struct lpfc_hba *phba, struct lpfc_nodelist *ndlp)
                        if (lpfc_test_rrq_active(
                                phba, ndlp, lpfc_ncmd->cur_iocbq.sli4_lxritag))
                                continue;
-                       list_del(&lpfc_ncmd->list);
+                       list_del_init(&lpfc_ncmd->list);
                        found = 1;
                        break;
                }
@@ -2092,6 +2116,12 @@ lpfc_release_nvme_buf(struct lpfc_hba *phba, struct lpfc_nvme_buf *lpfc_ncmd)
 
        lpfc_ncmd->nonsg_phys = 0;
        if (lpfc_ncmd->flags & LPFC_SBUF_XBUSY) {
+               lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+                               "6310 XB release deferred for "
+                               "ox_id x%x on reqtag x%x\n",
+                               lpfc_ncmd->cur_iocbq.sli4_xritag,
+                               lpfc_ncmd->cur_iocbq.iotag);
+
                spin_lock_irqsave(&phba->sli4_hba.abts_nvme_buf_list_lock,
                                        iflag);
                lpfc_ncmd->nvmeCmd = NULL;
@@ -2142,8 +2172,18 @@ lpfc_nvme_create_localport(struct lpfc_vport *vport)
        nfcp_info.node_name = wwn_to_u64(vport->fc_nodename.u.wwn);
        nfcp_info.port_name = wwn_to_u64(vport->fc_portname.u.wwn);
 
-       /* For now need + 1 to get around NVME transport logic */
-       lpfc_nvme_template.max_sgl_segments = phba->cfg_sg_seg_cnt + 1;
+       /* Limit to LPFC_MAX_NVME_SEG_CNT.
+        * For now need + 1 to get around NVME transport logic.
+        */
+       if (phba->cfg_sg_seg_cnt > LPFC_MAX_NVME_SEG_CNT) {
+               lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME | LOG_INIT,
+                                "6300 Reducing sg segment cnt to %d\n",
+                                LPFC_MAX_NVME_SEG_CNT);
+               phba->cfg_nvme_seg_cnt = LPFC_MAX_NVME_SEG_CNT;
+       } else {
+               phba->cfg_nvme_seg_cnt = phba->cfg_sg_seg_cnt;
+       }
+       lpfc_nvme_template.max_sgl_segments = phba->cfg_nvme_seg_cnt + 1;
        lpfc_nvme_template.max_hw_queues = phba->cfg_nvme_io_channel;
 
        /* localport is allocated from the stack, but the registration
@@ -2249,12 +2289,23 @@ lpfc_nvme_destroy_localport(struct lpfc_vport *vport)
 void
 lpfc_nvme_update_localport(struct lpfc_vport *vport)
 {
+#if (IS_ENABLED(CONFIG_NVME_FC))
        struct nvme_fc_local_port *localport;
        struct lpfc_nvme_lport *lport;
 
        localport = vport->localport;
+       if (!localport) {
+               lpfc_printf_vlog(vport, KERN_WARNING, LOG_NVME,
+                                "6710 Update NVME fail. No localport\n");
+               return;
+       }
        lport = (struct lpfc_nvme_lport *)localport->private;
-
+       if (!lport) {
+               lpfc_printf_vlog(vport, KERN_WARNING, LOG_NVME,
+                                "6171 Update NVME fail. localP %p, No lport\n",
+                                localport);
+               return;
+       }
        lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME,
                         "6012 Update NVME lport %p did x%x\n",
                         localport, vport->fc_myDID);
@@ -2268,7 +2319,7 @@ lpfc_nvme_update_localport(struct lpfc_vport *vport)
        lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME_DISC,
                         "6030 bound lport %p to DID x%06x\n",
                         lport, localport->port_id);
-
+#endif
 }
 
 int
@@ -2409,6 +2460,7 @@ lpfc_nvme_unregister_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
        struct lpfc_nvme_lport *lport;
        struct lpfc_nvme_rport *rport;
        struct nvme_fc_remote_port *remoteport;
+       unsigned long wait_tmo;
 
        localport = vport->localport;
 
@@ -2451,11 +2503,12 @@ lpfc_nvme_unregister_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
                 * before proceeding.  This guarantees the transport and driver
                 * have completed the unreg process.
                 */
-               ret = wait_for_completion_timeout(&rport->rport_unreg_done, 5);
+               wait_tmo = msecs_to_jiffies(5000);
+               ret = wait_for_completion_timeout(&rport->rport_unreg_done,
+                                                 wait_tmo);
                if (ret == 0) {
                        lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_DISC,
-                                        "6169 Unreg nvme wait failed %d\n",
-                                        ret);
+                                        "6169 Unreg nvme wait timeout\n");
                }
        }
        return;
@@ -2463,7 +2516,7 @@ lpfc_nvme_unregister_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
  input_err:
 #endif
        lpfc_printf_vlog(vport, KERN_ERR, LOG_NVME_DISC,
-                        "6168: State error: lport %p, rport%p FCID x%06x\n",
+                        "6168 State error: lport %p, rport%p FCID x%06x\n",
                         vport->localport, ndlp->rport, ndlp->nlp_DID);
 }
 
@@ -2494,7 +2547,7 @@ lpfc_sli4_nvme_xri_aborted(struct lpfc_hba *phba,
                                 &phba->sli4_hba.lpfc_abts_nvme_buf_list,
                                 list) {
                if (lpfc_ncmd->cur_iocbq.sli4_xritag == xri) {
-                       list_del(&lpfc_ncmd->list);
+                       list_del_init(&lpfc_ncmd->list);
                        lpfc_ncmd->flags &= ~LPFC_SBUF_XBUSY;
                        lpfc_ncmd->status = IOSTAT_SUCCESS;
                        spin_unlock(
@@ -2510,6 +2563,12 @@ lpfc_sli4_nvme_xri_aborted(struct lpfc_hba *phba,
                                        rxid, 1);
                                lpfc_sli4_abts_err_handler(phba, ndlp, axri);
                        }
+
+                       lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+                                       "6311 XRI Aborted xri x%x tag x%x "
+                                       "released\n",
+                                       xri, lpfc_ncmd->cur_iocbq.iotag);
+
                        lpfc_release_nvme_buf(phba, lpfc_ncmd);
                        if (rrq_empty)
                                lpfc_worker_wake_up(phba);
@@ -2518,4 +2577,8 @@ lpfc_sli4_nvme_xri_aborted(struct lpfc_hba *phba,
        }
        spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock);
        spin_unlock_irqrestore(&phba->hbalock, iflag);
+
+       lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+                       "6312 XRI Aborted xri x%x not found\n", xri);
+
 }
index 1347deb..ec32f45 100644 (file)
  * included with this package.                                     *
  ********************************************************************/
 
-#define LPFC_NVME_MIN_SEGS             16
-#define LPFC_NVME_DEFAULT_SEGS         66      /* 256K IOs - 64 + 2 */
-#define LPFC_NVME_MAX_SEGS             510
-#define LPFC_NVMET_MIN_POSTBUF         16
-#define LPFC_NVMET_DEFAULT_POSTBUF     1024
-#define LPFC_NVMET_MAX_POSTBUF         4096
+#define LPFC_NVME_DEFAULT_SEGS         (64 + 1)        /* 256K IOs */
 #define LPFC_NVME_WQSIZE               256
 
 #define LPFC_NVME_ERSP_LEN             0x20
@@ -102,3 +97,7 @@ struct lpfc_nvme_buf {
        uint64_t ts_data_nvme;
 #endif
 };
+
+struct lpfc_nvme_fcpreq_priv { /* driver data kept in an FCP request's ->private area; its size is published as .fcprqst_priv_sz */
+       struct lpfc_nvme_buf *nvme_buf; /* set in lpfc_nvme_fcp_io_submit(); reset to NULL just before nCmd->done(); read by lpfc_nvme_fcp_abort() */
+};
index acba1b6..94434e6 100644 (file)
@@ -71,6 +71,26 @@ static int lpfc_nvmet_unsol_ls_issue_abort(struct lpfc_hba *,
                                           struct lpfc_nvmet_rcv_ctx *,
                                           uint32_t, uint16_t);
 
+void
+lpfc_nvmet_defer_release(struct lpfc_hba *phba, struct lpfc_nvmet_rcv_ctx *ctxp)
+{
+       unsigned long iflag;    /* IRQ state saved/restored around the list lock */
+       /* Flag ctxp LPFC_NVMET_CTX_RLS and append it to lpfc_abts_nvmet_ctx_list; a ctxp that already carries the flag is left untouched. */
+       lpfc_printf_log(phba, KERN_ERR, LOG_NVME_ABTS,
+                       "6313 NVMET Defer ctx release xri x%x flg x%x\n",
+                       ctxp->oxid, ctxp->flag);
+
+       spin_lock_irqsave(&phba->sli4_hba.abts_nvme_buf_list_lock, iflag);
+       if (ctxp->flag & LPFC_NVMET_CTX_RLS) {  /* flag already set: return without adding to the list again */
+               spin_unlock_irqrestore(&phba->sli4_hba.abts_nvme_buf_list_lock,
+                                      iflag);
+               return;
+       }
+       ctxp->flag |= LPFC_NVMET_CTX_RLS;       /* set under the same lock that guards the test above */
+       list_add_tail(&ctxp->list, &phba->sli4_hba.lpfc_abts_nvmet_ctx_list);
+       spin_unlock_irqrestore(&phba->sli4_hba.abts_nvme_buf_list_lock, iflag);
+}
+
 /**
  * lpfc_nvmet_xmt_ls_rsp_cmp - Completion handler for LS Response
  * @phba: Pointer to HBA context object.
@@ -139,6 +159,11 @@ lpfc_nvmet_rq_post(struct lpfc_hba *phba, struct lpfc_nvmet_rcv_ctx *ctxp,
                   struct lpfc_dmabuf *mp)
 {
        if (ctxp) {
+               if (ctxp->flag)
+                       lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+                               "6314 rq_post ctx xri x%x flag x%x\n",
+                               ctxp->oxid, ctxp->flag);
+
                if (ctxp->txrdy) {
                        pci_pool_free(phba->txrdy_payload_pool, ctxp->txrdy,
                                      ctxp->txrdy_phys);
@@ -337,39 +362,55 @@ lpfc_nvmet_xmt_fcp_op_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
 #endif
 
        ctxp = cmdwqe->context2;
+       ctxp->flag &= ~LPFC_NVMET_IO_INP;
+
        rsp = &ctxp->ctx.fcp_req;
        op = rsp->op;
-       ctxp->flag &= ~LPFC_NVMET_IO_INP;
 
        status = bf_get(lpfc_wcqe_c_status, wcqe);
        result = wcqe->parameter;
 
-       if (!phba->targetport)
-               goto out;
+       if (phba->targetport)
+               tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private;
+       else
+               tgtp = NULL;
 
        lpfc_nvmeio_data(phba, "NVMET FCP CMPL: xri x%x op x%x status x%x\n",
                         ctxp->oxid, op, status);
 
-       tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private;
        if (status) {
                rsp->fcp_error = NVME_SC_DATA_XFER_ERROR;
                rsp->transferred_length = 0;
-               atomic_inc(&tgtp->xmt_fcp_rsp_error);
+               if (tgtp)
+                       atomic_inc(&tgtp->xmt_fcp_rsp_error);
+
+               /* pick up SLI4 exchange busy condition */
+               if (bf_get(lpfc_wcqe_c_xb, wcqe)) {
+                       ctxp->flag |= LPFC_NVMET_XBUSY;
+
+                       lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+                                       "6315 IO Cmpl XBUSY: xri x%x: %x/%x\n",
+                                       ctxp->oxid, status, result);
+               } else {
+                       ctxp->flag &= ~LPFC_NVMET_XBUSY;
+               }
+
        } else {
                rsp->fcp_error = NVME_SC_SUCCESS;
                if (op == NVMET_FCOP_RSP)
                        rsp->transferred_length = rsp->rsplen;
                else
                        rsp->transferred_length = rsp->transfer_length;
-               atomic_inc(&tgtp->xmt_fcp_rsp_cmpl);
+               if (tgtp)
+                       atomic_inc(&tgtp->xmt_fcp_rsp_cmpl);
        }
 
-out:
        if ((op == NVMET_FCOP_READDATA_RSP) ||
            (op == NVMET_FCOP_RSP)) {
                /* Sanity check */
                ctxp->state = LPFC_NVMET_STE_DONE;
                ctxp->entry_cnt++;
+
 #ifdef CONFIG_SCSI_LPFC_DEBUG_FS
                if (phba->ktime_on) {
                        if (rsp->op == NVMET_FCOP_READDATA_RSP) {
@@ -408,9 +449,7 @@ out:
                if (phba->ktime_on)
                        lpfc_nvmet_ktime(phba, ctxp);
 #endif
-               /* Let Abort cmpl repost the context */
-               if (!(ctxp->flag & LPFC_NVMET_ABORT_OP))
-                       lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+               /* lpfc_nvmet_xmt_fcp_release() will recycle the context */
        } else {
                ctxp->entry_cnt++;
                start_clean = offsetof(struct lpfc_iocbq, wqe);
@@ -519,7 +558,6 @@ lpfc_nvmet_xmt_fcp_op(struct nvmet_fc_target_port *tgtport,
                container_of(rsp, struct lpfc_nvmet_rcv_ctx, ctx.fcp_req);
        struct lpfc_hba *phba = ctxp->phba;
        struct lpfc_iocbq *nvmewqeq;
-       unsigned long iflags;
        int rc;
 
 #ifdef CONFIG_SCSI_LPFC_DEBUG_FS
@@ -544,32 +582,12 @@ lpfc_nvmet_xmt_fcp_op(struct nvmet_fc_target_port *tgtport,
        }
 #endif
 
-       if (rsp->op == NVMET_FCOP_ABORT) {
-               lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
-                               "6103 Abort op: oxri x%x %d cnt %d\n",
-                               ctxp->oxid, ctxp->state, ctxp->entry_cnt);
-
-               lpfc_nvmeio_data(phba, "NVMET FCP ABRT: "
-                                "xri x%x state x%x cnt x%x\n",
-                                ctxp->oxid, ctxp->state, ctxp->entry_cnt);
-
-               atomic_inc(&lpfc_nvmep->xmt_fcp_abort);
-               ctxp->entry_cnt++;
-               ctxp->flag |= LPFC_NVMET_ABORT_OP;
-               if (ctxp->flag & LPFC_NVMET_IO_INP)
-                       lpfc_nvmet_sol_fcp_issue_abort(phba, ctxp, ctxp->sid,
-                                                      ctxp->oxid);
-               else
-                       lpfc_nvmet_unsol_fcp_issue_abort(phba, ctxp, ctxp->sid,
-                                                        ctxp->oxid);
-               return 0;
-       }
-
        /* Sanity check */
-       if (ctxp->state == LPFC_NVMET_STE_ABORT) {
+       if ((ctxp->flag & LPFC_NVMET_ABTS_RCV) ||
+           (ctxp->state == LPFC_NVMET_STE_ABORT)) {
                atomic_inc(&lpfc_nvmep->xmt_fcp_drop);
                lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR,
-                               "6102 Bad state IO x%x aborted\n",
+                               "6102 IO xri x%x aborted\n",
                                ctxp->oxid);
                rc = -ENXIO;
                goto aerr;
@@ -594,10 +612,7 @@ lpfc_nvmet_xmt_fcp_op(struct nvmet_fc_target_port *tgtport,
        lpfc_nvmeio_data(phba, "NVMET FCP CMND: xri x%x op x%x len x%x\n",
                         ctxp->oxid, rsp->op, rsp->rsplen);
 
-       /* For now we take hbalock */
-       spin_lock_irqsave(&phba->hbalock, iflags);
        rc = lpfc_sli4_issue_wqe(phba, LPFC_FCP_RING, nvmewqeq);
-       spin_unlock_irqrestore(&phba->hbalock, iflags);
        if (rc == WQE_SUCCESS) {
                ctxp->flag |= LPFC_NVMET_IO_INP;
 #ifdef CONFIG_SCSI_LPFC_DEBUG_FS
@@ -634,10 +649,79 @@ lpfc_nvmet_targetport_delete(struct nvmet_fc_target_port *targetport)
        complete(&tport->tport_unreg_done);
 }
 
+static void
+lpfc_nvmet_xmt_fcp_abort(struct nvmet_fc_target_port *tgtport,
+                        struct nvmefc_tgt_fcp_req *req)
+{
+       struct lpfc_nvmet_tgtport *lpfc_nvmep = tgtport->private;
+       struct lpfc_nvmet_rcv_ctx *ctxp =
+               container_of(req, struct lpfc_nvmet_rcv_ctx, ctx.fcp_req);      /* req is the ctx.fcp_req member of its receive context */
+       struct lpfc_hba *phba = ctxp->phba;
+       unsigned long flags;
+       /* .fcp_abort entry of lpfc_tgttemplate: log and count the abort, then set LPFC_NVMET_ABORT_OP and issue an abort unless LPFC_NVMET_XBUSY is already set. */
+       lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+                       "6103 Abort op: oxri x%x flg x%x cnt %d\n",
+                       ctxp->oxid, ctxp->flag, ctxp->entry_cnt);
+
+       lpfc_nvmeio_data(phba, "NVMET FCP ABRT: "
+                        "xri x%x flg x%x cnt x%x\n",
+                        ctxp->oxid, ctxp->flag, ctxp->entry_cnt);
+
+       atomic_inc(&lpfc_nvmep->xmt_fcp_abort);
+       ctxp->entry_cnt++;      /* NOTE(review): updated before ctxlock is taken */
+       spin_lock_irqsave(&ctxp->ctxlock, flags);
+
+       /* Since iaab/iaar are NOT set, we need to check
+        * if the firmware is in process of aborting IO
+        */
+       if (ctxp->flag & LPFC_NVMET_XBUSY) {
+               spin_unlock_irqrestore(&ctxp->ctxlock, flags);
+               return; /* XBUSY set: no abort is issued and ABORT_OP is not set */
+       }
+       ctxp->flag |= LPFC_NVMET_ABORT_OP;
+       if (ctxp->flag & LPFC_NVMET_IO_INP)     /* IO_INP is set by lpfc_nvmet_xmt_fcp_op() after a successful WQE issue */
+               lpfc_nvmet_sol_fcp_issue_abort(phba, ctxp, ctxp->sid,
+                                              ctxp->oxid);
+       else
+               lpfc_nvmet_unsol_fcp_issue_abort(phba, ctxp, ctxp->sid,
+                                                ctxp->oxid);
+       spin_unlock_irqrestore(&ctxp->ctxlock, flags);  /* both issue_abort calls above run with ctxlock held */
+}
+
+static void
+lpfc_nvmet_xmt_fcp_release(struct nvmet_fc_target_port *tgtport,
+                          struct nvmefc_tgt_fcp_req *rsp)
+{
+       struct lpfc_nvmet_rcv_ctx *ctxp =
+               container_of(rsp, struct lpfc_nvmet_rcv_ctx, ctx.fcp_req);      /* rsp is the ctx.fcp_req member of its receive context */
+       struct lpfc_hba *phba = ctxp->phba;
+       unsigned long flags;
+       bool aborting = false;  /* true once the release was handed to lpfc_nvmet_defer_release() */
+       /* .fcp_req_release entry of lpfc_tgttemplate: call lpfc_nvmet_rq_post() right away, or defer when ABORT_OP or XBUSY is set on the context. */
+       spin_lock_irqsave(&ctxp->ctxlock, flags);
+       if ((ctxp->flag & LPFC_NVMET_ABORT_OP) ||
+           (ctxp->flag & LPFC_NVMET_XBUSY)) {
+               aborting = true;
+               /* let the abort path do the real release */
+               lpfc_nvmet_defer_release(phba, ctxp);   /* called with ctxlock held; takes abts_nvme_buf_list_lock inside */
+       }
+       spin_unlock_irqrestore(&ctxp->ctxlock, flags);
+
+       lpfc_nvmeio_data(phba, "NVMET FCP FREE: xri x%x ste %d\n", ctxp->oxid,
+                        ctxp->state, 0);       /* the trailing 0 is not referenced by the format string */
+
+       if (aborting)
+               return; /* deferred: lpfc_nvmet_rq_post() is not called here */
+
+       lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+}
+
 static struct nvmet_fc_target_template lpfc_tgttemplate = {
        .targetport_delete = lpfc_nvmet_targetport_delete,
        .xmt_ls_rsp     = lpfc_nvmet_xmt_ls_rsp,
        .fcp_op         = lpfc_nvmet_xmt_fcp_op,
+       .fcp_abort      = lpfc_nvmet_xmt_fcp_abort,
+       .fcp_req_release = lpfc_nvmet_xmt_fcp_release,
 
        .max_hw_queues  = 1,
        .max_sgl_segments = LPFC_NVMET_DEFAULT_SEGS,
@@ -666,10 +750,23 @@ lpfc_nvmet_create_targetport(struct lpfc_hba *phba)
        pinfo.port_name = wwn_to_u64(vport->fc_portname.u.wwn);
        pinfo.port_id = vport->fc_myDID;
 
+       /* Limit to LPFC_MAX_NVME_SEG_CNT.
+        * For now need + 1 to get around NVME transport logic.
+        */
+       if (phba->cfg_sg_seg_cnt > LPFC_MAX_NVME_SEG_CNT) {
+               lpfc_printf_log(phba, KERN_INFO, LOG_NVME | LOG_INIT,
+                               "6400 Reducing sg segment cnt to %d\n",
+                               LPFC_MAX_NVME_SEG_CNT);
+               phba->cfg_nvme_seg_cnt = LPFC_MAX_NVME_SEG_CNT;
+       } else {
+               phba->cfg_nvme_seg_cnt = phba->cfg_sg_seg_cnt;
+       }
+       lpfc_tgttemplate.max_sgl_segments = phba->cfg_nvme_seg_cnt + 1;
        lpfc_tgttemplate.max_hw_queues = phba->cfg_nvme_io_channel;
-       lpfc_tgttemplate.max_sgl_segments = phba->cfg_sg_seg_cnt;
        lpfc_tgttemplate.target_features = NVMET_FCTGTFEAT_READDATA_RSP |
-                                          NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED;
+                                          NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED |
+                                          NVMET_FCTGTFEAT_CMD_IN_ISR |
+                                          NVMET_FCTGTFEAT_OPDONE_IN_ISR;
 
 #if (IS_ENABLED(CONFIG_NVME_TARGET_FC))
        error = nvmet_fc_register_targetport(&pinfo, &lpfc_tgttemplate,
@@ -750,7 +847,120 @@ void
 lpfc_sli4_nvmet_xri_aborted(struct lpfc_hba *phba,
                             struct sli4_wcqe_xri_aborted *axri)
 {
-       /* TODO: work in progress */
+       uint16_t xri = bf_get(lpfc_wcqe_xa_xri, axri);  /* xri field of the aborted-XRI entry */
+       uint16_t rxid = bf_get(lpfc_wcqe_xa_remote_xid, axri);  /* remote_xid field of the same entry */
+       struct lpfc_nvmet_rcv_ctx *ctxp, *next_ctxp;
+       struct lpfc_nodelist *ndlp;
+       unsigned long iflag = 0;
+       int rrq_empty = 0;
+       bool released = false;  /* set when ctxp is unlinked below and must be reposted */
+       /* Find the context on lpfc_abts_nvmet_ctx_list whose XRI matches, clear its XBUSY flag, and repost it when CTX_RLS is set and ABORT_OP is not. */
+       lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+                       "6317 XB aborted xri x%x rxid x%x\n", xri, rxid);
+
+       if (!(phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME))
+               return; /* LPFC_ENABLE_NVME not set in cfg_enable_fc4_type: nothing to do */
+       spin_lock_irqsave(&phba->hbalock, iflag);
+       spin_lock(&phba->sli4_hba.abts_nvme_buf_list_lock);     /* list lock nests inside hbalock */
+       list_for_each_entry_safe(ctxp, next_ctxp,
+                                &phba->sli4_hba.lpfc_abts_nvmet_ctx_list,
+                                list) {
+               if (ctxp->rqb_buffer->sglq->sli4_xritag != xri)
+                       continue;
+
+               /* Check if we already received a free context call
+                * and we have completed processing an abort situation.
+                */
+               if (ctxp->flag & LPFC_NVMET_CTX_RLS &&
+                   !(ctxp->flag & LPFC_NVMET_ABORT_OP)) {
+                       list_del(&ctxp->list);
+                       released = true;
+               }
+               ctxp->flag &= ~LPFC_NVMET_XBUSY;        /* cleared for the matching context whether or not it was unlinked */
+               spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock);
+
+               rrq_empty = list_empty(&phba->active_rrq_list); /* sampled while hbalock is still held */
+               spin_unlock_irqrestore(&phba->hbalock, iflag);  /* both locks are dropped before the node lookup below */
+               ndlp = lpfc_findnode_did(phba->pport, ctxp->sid);
+               if (ndlp && NLP_CHK_NODE_ACT(ndlp) &&
+                   (ndlp->nlp_state == NLP_STE_UNMAPPED_NODE ||
+                    ndlp->nlp_state == NLP_STE_MAPPED_NODE)) {
+                       lpfc_set_rrq_active(phba, ndlp,
+                               ctxp->rqb_buffer->sglq->sli4_lxritag,
+                               rxid, 1);
+                       lpfc_sli4_abts_err_handler(phba, ndlp, axri);
+               }
+
+               lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+                               "6318 XB aborted %x flg x%x (%x)\n",
+                               ctxp->oxid, ctxp->flag, released);
+               if (released)
+                       lpfc_nvmet_rq_post(phba, ctxp,
+                                          &ctxp->rqb_buffer->hbuf);
+               if (rrq_empty)
+                       lpfc_worker_wake_up(phba);
+               return; /* only the first matching context is processed */
+       }
+       spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock);   /* reached only when no context matched xri */
+       spin_unlock_irqrestore(&phba->hbalock, iflag);
+}
+
+int
+lpfc_nvmet_rcv_unsol_abort(struct lpfc_vport *vport,
+                          struct fc_frame_header *fc_hdr)
+
+{
+#if (IS_ENABLED(CONFIG_NVME_TARGET_FC))
+       struct lpfc_hba *phba = vport->phba;
+       struct lpfc_nvmet_rcv_ctx *ctxp, *next_ctxp;
+       struct nvmefc_tgt_fcp_req *rsp;
+       uint16_t xri;
+       unsigned long iflag = 0;
+       /* Handle a received ABTS: if fh_ox_id matches a context on lpfc_abts_nvmet_ctx_list, flag it ABTS_RCV and answer BA_ACC; otherwise answer BA_RJT. Always returns 0. */
+       xri = be16_to_cpu(fc_hdr->fh_ox_id);    /* fh_ox_id converted with be16_to_cpu() */
+
+       spin_lock_irqsave(&phba->hbalock, iflag);
+       spin_lock(&phba->sli4_hba.abts_nvme_buf_list_lock);
+       list_for_each_entry_safe(ctxp, next_ctxp,
+                                &phba->sli4_hba.lpfc_abts_nvmet_ctx_list,
+                                list) {
+               if (ctxp->rqb_buffer->sglq->sli4_xritag != xri)
+                       continue;
+
+               spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock);
+               spin_unlock_irqrestore(&phba->hbalock, iflag);  /* list locks are dropped before ctxlock is taken */
+
+               spin_lock_irqsave(&ctxp->ctxlock, iflag);
+               ctxp->flag |= LPFC_NVMET_ABTS_RCV;      /* lpfc_nvmet_xmt_fcp_op() returns -ENXIO for this context once the flag is set */
+               spin_unlock_irqrestore(&ctxp->ctxlock, iflag);
+
+               lpfc_nvmeio_data(phba,
+                       "NVMET ABTS RCV: xri x%x CPU %02x rjt %d\n",
+                       xri, smp_processor_id(), 0);
+
+               lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+                               "6319 NVMET Rcv ABTS:acc xri x%x\n", xri);
+
+               rsp = &ctxp->ctx.fcp_req;
+               nvmet_fc_rcv_fcp_abort(phba->targetport, rsp);
+
+               /* Respond with BA_ACC accordingly */
+               lpfc_sli4_seq_abort_rsp(vport, fc_hdr, 1);
+               return 0;       /* first matching context wins; the rest of the list is not scanned */
+       }
+       spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock);   /* reached only when no context matched xri */
+       spin_unlock_irqrestore(&phba->hbalock, iflag);
+
+       lpfc_nvmeio_data(phba, "NVMET ABTS RCV: xri x%x CPU %02x rjt %d\n",
+                        xri, smp_processor_id(), 1);
+
+       lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+                       "6320 NVMET Rcv ABTS:rjt xri x%x\n", xri);
+
+       /* Respond with BA_RJT accordingly */
+       lpfc_sli4_seq_abort_rsp(vport, fc_hdr, 0);
+#endif /* without CONFIG_NVME_TARGET_FC only the return below is compiled */
+       return 0;
 }
 
 void
@@ -940,6 +1150,7 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba,
        ctxp->rqb_buffer = nvmebuf;
        ctxp->entry_cnt = 1;
        ctxp->flag = 0;
+       spin_lock_init(&ctxp->ctxlock);
 
 #ifdef CONFIG_SCSI_LPFC_DEBUG_FS
        if (phba->ktime_on) {
@@ -962,8 +1173,8 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba,
        }
 #endif
 
-       lpfc_nvmeio_data(phba, "NVMET FCP  RCV: xri x%x sz %d from %06x\n",
-                        oxid, size, sid);
+       lpfc_nvmeio_data(phba, "NVMET FCP  RCV: xri x%x sz %d CPU %02x\n",
+                        oxid, size, smp_processor_id());
 
        atomic_inc(&tgtp->rcv_fcp_cmd_in);
        /*
@@ -1237,11 +1448,11 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba *phba,
                return NULL;
        }
 
-       if (rsp->sg_cnt > phba->cfg_sg_seg_cnt) {
+       if (rsp->sg_cnt > phba->cfg_nvme_seg_cnt) {
                lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR,
                                "6109 lpfc_nvmet_prep_fcp_wqe: seg cnt err: "
-                               "NPORT x%x oxid:x%x\n",
-                               ctxp->sid, ctxp->oxid);
+                               "NPORT x%x oxid:x%x cnt %d\n",
+                               ctxp->sid, ctxp->oxid, phba->cfg_nvme_seg_cnt);
                return NULL;
        }
 
@@ -1593,6 +1804,8 @@ lpfc_nvmet_sol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
        struct lpfc_nvmet_rcv_ctx *ctxp;
        struct lpfc_nvmet_tgtport *tgtp;
        uint32_t status, result;
+       unsigned long flags;
+       bool released = false;
 
        ctxp = cmdwqe->context2;
        status = bf_get(lpfc_wcqe_c_status, wcqe);
@@ -1601,21 +1814,46 @@ lpfc_nvmet_sol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
        tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private;
        atomic_inc(&tgtp->xmt_abort_cmpl);
 
+       ctxp->state = LPFC_NVMET_STE_DONE;
+
+       /* Check if we already received a free context call
+        * and we have completed processing an abort situation.
+        */
+       spin_lock_irqsave(&ctxp->ctxlock, flags);
+       if ((ctxp->flag & LPFC_NVMET_CTX_RLS) &&
+           !(ctxp->flag & LPFC_NVMET_XBUSY)) {
+               list_del(&ctxp->list);
+               released = true;
+       }
+       ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
+       spin_unlock_irqrestore(&ctxp->ctxlock, flags);
+
        lpfc_printf_log(phba, KERN_ERR, LOG_NVME_ABTS,
-                       "6165 Abort cmpl: xri x%x WCQE: %08x %08x %08x %08x\n",
-                       ctxp->oxid, wcqe->word0, wcqe->total_data_placed,
+                       "6165 ABORT cmpl: xri x%x flg x%x (%d) "
+                       "WCQE: %08x %08x %08x %08x\n",
+                       ctxp->oxid, ctxp->flag, released,
+                       wcqe->word0, wcqe->total_data_placed,
                        result, wcqe->word3);
 
-       ctxp->state = LPFC_NVMET_STE_DONE;
-       lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+       /*
+        * if transport has released ctx, then can reuse it. Otherwise,
+        * will be recycled by transport release call.
+        */
+       if (released)
+               lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
 
        cmdwqe->context2 = NULL;
        cmdwqe->context3 = NULL;
        lpfc_sli_release_iocbq(phba, cmdwqe);
+
+       /* Since iaab/iaar are NOT set, there is no work left.
+        * For LPFC_NVMET_XBUSY, lpfc_sli4_nvmet_xri_aborted
+        * should have been called already.
+        */
 }
 
 /**
- * lpfc_nvmet_xmt_fcp_abort_cmp - Completion handler for ABTS
+ * lpfc_nvmet_unsol_fcp_abort_cmp - Completion handler for ABTS
  * @phba: Pointer to HBA context object.
  * @cmdwqe: Pointer to driver command WQE object.
  * @wcqe: Pointer to driver response CQE object.
@@ -1625,12 +1863,14 @@ lpfc_nvmet_sol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
  * The function frees memory resources used for the NVME commands.
  **/
 static void
-lpfc_nvmet_xmt_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
-                            struct lpfc_wcqe_complete *wcqe)
+lpfc_nvmet_unsol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
+                              struct lpfc_wcqe_complete *wcqe)
 {
        struct lpfc_nvmet_rcv_ctx *ctxp;
        struct lpfc_nvmet_tgtport *tgtp;
+       unsigned long flags;
        uint32_t status, result;
+       bool released = false;
 
        ctxp = cmdwqe->context2;
        status = bf_get(lpfc_wcqe_c_status, wcqe);
@@ -1639,23 +1879,55 @@ lpfc_nvmet_xmt_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
        tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private;
        atomic_inc(&tgtp->xmt_abort_cmpl);
 
+       if (!ctxp) {
+               /* if context is clear, related io already complete */
+               lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+                               "6070 ABTS cmpl: WCQE: %08x %08x %08x %08x\n",
+                               wcqe->word0, wcqe->total_data_placed,
+                               result, wcqe->word3);
+               return;
+       }
+
+       /* Sanity check */
+       if (ctxp->state != LPFC_NVMET_STE_ABORT) {
+               lpfc_printf_log(phba, KERN_ERR, LOG_NVME_ABTS,
+                               "6112 ABTS Wrong state:%d oxid x%x\n",
+                               ctxp->state, ctxp->oxid);
+       }
+
+       /* Check if we already received a free context call
+        * and we have completed processing an abort situation.
+        */
+       ctxp->state = LPFC_NVMET_STE_DONE;
+       spin_lock_irqsave(&ctxp->ctxlock, flags);
+       if ((ctxp->flag & LPFC_NVMET_CTX_RLS) &&
+           !(ctxp->flag & LPFC_NVMET_XBUSY)) {
+               list_del(&ctxp->list);
+               released = true;
+       }
+       ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
+       spin_unlock_irqrestore(&ctxp->ctxlock, flags);
+
        lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
-                       "6070 Abort cmpl: ctx %p WCQE: %08x %08x %08x %08x\n",
-                       ctxp, wcqe->word0, wcqe->total_data_placed,
+                       "6316 ABTS cmpl xri x%x flg x%x (%x) "
+                       "WCQE: %08x %08x %08x %08x\n",
+                       ctxp->oxid, ctxp->flag, released,
+                       wcqe->word0, wcqe->total_data_placed,
                        result, wcqe->word3);
-
-       if (ctxp) {
-               /* Sanity check */
-               if (ctxp->state != LPFC_NVMET_STE_ABORT) {
-                       lpfc_printf_log(phba, KERN_ERR, LOG_NVME_ABTS,
-                                       "6112 ABORT Wrong state:%d oxid x%x\n",
-                                       ctxp->state, ctxp->oxid);
-               }
-               ctxp->state = LPFC_NVMET_STE_DONE;
+       /*
+        * if transport has released ctx, then can reuse it. Otherwise,
+        * will be recycled by transport release call.
+        */
+       if (released)
                lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
-               cmdwqe->context2 = NULL;
-               cmdwqe->context3 = NULL;
-       }
+
+       cmdwqe->context2 = NULL;
+       cmdwqe->context3 = NULL;
+
+       /* Since iaab/iaar are NOT set, there is no work left.
+        * For LPFC_NVMET_XBUSY, lpfc_sli4_nvmet_xri_aborted
+        * should have been called already.
+        */
 }
 
 /**
@@ -1708,10 +1980,14 @@ lpfc_nvmet_unsol_issue_abort(struct lpfc_hba *phba,
        struct lpfc_nodelist *ndlp;
 
        lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
-                       "6067 Abort: sid %x xri x%x/x%x\n",
+                       "6067 ABTS: sid %x xri x%x/x%x\n",
                        sid, xri, ctxp->wqeq->sli4_xritag);
 
        tgtp = (struct lpfc_nvmet_tgtport *)phba->targetport->private;
+       if (!ctxp->wqeq) {
+               ctxp->wqeq = ctxp->rqb_buffer->iocbq;
+               ctxp->wqeq->hba_wqidx = 0;
+       }
 
        ndlp = lpfc_findnode_did(phba->pport, sid);
        if (!ndlp || !NLP_CHK_NODE_ACT(ndlp) ||
@@ -1817,10 +2093,11 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba,
            (ndlp->nlp_state != NLP_STE_MAPPED_NODE))) {
                atomic_inc(&tgtp->xmt_abort_rsp_error);
                lpfc_printf_log(phba, KERN_WARNING, LOG_NVME_ABTS,
-                               "6160 Drop ABTS - wrong NDLP state x%x.\n",
+                               "6160 Drop ABORT - wrong NDLP state x%x.\n",
                                (ndlp) ? ndlp->nlp_state : NLP_STE_MAX_STATE);
 
                /* No failure to an ABTS request. */
+               ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
                return 0;
        }
 
@@ -1828,9 +2105,10 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba,
        ctxp->abort_wqeq = lpfc_sli_get_iocbq(phba);
        if (!ctxp->abort_wqeq) {
                lpfc_printf_log(phba, KERN_WARNING, LOG_NVME_ABTS,
-                               "6161 Abort failed: No wqeqs: "
+                               "6161 ABORT failed: No wqeqs: "
                                "xri: x%x\n", ctxp->oxid);
                /* No failure to an ABTS request. */
+               ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
                return 0;
        }
        abts_wqeq = ctxp->abort_wqeq;
@@ -1838,8 +2116,8 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba,
        ctxp->state = LPFC_NVMET_STE_ABORT;
 
        /* Announce entry to new IO submit field. */
-       lpfc_printf_log(phba, KERN_ERR, LOG_NVME_ABTS,
-                       "6162 Abort Request to rport DID x%06x "
+       lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+                       "6162 ABORT Request to rport DID x%06x "
                        "for xri x%x x%x\n",
                        ctxp->sid, ctxp->oxid, ctxp->wqeq->sli4_xritag);
 
@@ -1855,6 +2133,7 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba,
                                "NVME Req now. hba_flag x%x oxid x%x\n",
                                phba->hba_flag, ctxp->oxid);
                lpfc_sli_release_iocbq(phba, abts_wqeq);
+               ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
                return 0;
        }
 
@@ -1866,6 +2145,7 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba,
                                "still pending on oxid x%x\n",
                                ctxp->oxid);
                lpfc_sli_release_iocbq(phba, abts_wqeq);
+               ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
                return 0;
        }
 
@@ -1913,9 +2193,10 @@ lpfc_nvmet_sol_fcp_issue_abort(struct lpfc_hba *phba,
        if (rc == WQE_SUCCESS)
                return 0;
 
+       ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
        lpfc_sli_release_iocbq(phba, abts_wqeq);
-       lpfc_printf_log(phba, KERN_ERR, LOG_NVME,
-                       "6166 Failed abts issue_wqe with status x%x "
+       lpfc_printf_log(phba, KERN_ERR, LOG_NVME_ABTS,
+                       "6166 Failed ABORT issue_wqe with status x%x "
                        "for oxid x%x.\n",
                        rc, ctxp->oxid);
        return 1;
@@ -1944,8 +2225,8 @@ lpfc_nvmet_unsol_fcp_issue_abort(struct lpfc_hba *phba,
 
        spin_lock_irqsave(&phba->hbalock, flags);
        abts_wqeq = ctxp->wqeq;
-       abts_wqeq->wqe_cmpl = lpfc_nvmet_xmt_fcp_abort_cmp;
-       abts_wqeq->iocb_cmpl = 0;
+       abts_wqeq->wqe_cmpl = lpfc_nvmet_unsol_fcp_abort_cmp;
+       abts_wqeq->iocb_cmpl = NULL;
        abts_wqeq->iocb_flag |= LPFC_IO_NVMET;
        rc = lpfc_sli4_issue_wqe(phba, LPFC_FCP_RING, abts_wqeq);
        spin_unlock_irqrestore(&phba->hbalock, flags);
@@ -1955,7 +2236,7 @@ lpfc_nvmet_unsol_fcp_issue_abort(struct lpfc_hba *phba,
        }
 
 aerr:
-       lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+       ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
        atomic_inc(&tgtp->xmt_abort_rsp_error);
        lpfc_printf_log(phba, KERN_WARNING, LOG_NVME_ABTS,
                        "6135 Failed to Issue ABTS for oxid x%x. Status x%x\n",
index ca96f05..128759f 100644 (file)
@@ -21,9 +21,7 @@
  * included with this package.                                     *
  ********************************************************************/
 
-#define LPFC_NVMET_MIN_SEGS            16
-#define LPFC_NVMET_DEFAULT_SEGS                64      /* 256K IOs */
-#define LPFC_NVMET_MAX_SEGS            510
+#define LPFC_NVMET_DEFAULT_SEGS                (64 + 1)        /* 256K IOs */
 #define LPFC_NVMET_SUCCESS_LEN 12
 
 /* Used for NVME Target */
@@ -77,10 +75,12 @@ struct lpfc_nvmet_rcv_ctx {
                struct nvmefc_tgt_ls_req ls_req;
                struct nvmefc_tgt_fcp_req fcp_req;
        } ctx;
+       struct list_head list;
        struct lpfc_hba *phba;
        struct lpfc_iocbq *wqeq;
        struct lpfc_iocbq *abort_wqeq;
        dma_addr_t txrdy_phys;
+       spinlock_t ctxlock; /* protect flag access */
        uint32_t *txrdy;
        uint32_t sid;
        uint32_t offset;
@@ -97,8 +97,11 @@ struct lpfc_nvmet_rcv_ctx {
 #define LPFC_NVMET_STE_RSP             4
 #define LPFC_NVMET_STE_DONE            5
        uint16_t flag;
-#define LPFC_NVMET_IO_INP              1
-#define LPFC_NVMET_ABORT_OP            2
+#define LPFC_NVMET_IO_INP              0x1  /* IO is in progress on exchange */
+#define LPFC_NVMET_ABORT_OP            0x2  /* Abort WQE issued on exchange */
+#define LPFC_NVMET_XBUSY               0x4  /* XB bit set on IO cmpl */
+#define LPFC_NVMET_CTX_RLS             0x8  /* ctx free requested */
+#define LPFC_NVMET_ABTS_RCV            0x10  /* ABTS received on exchange */
        struct rqb_dmabuf *rqb_buffer;
 
 #ifdef CONFIG_SCSI_LPFC_DEBUG_FS
index 1c9fa45..cf19f49 100644 (file)
@@ -6338,7 +6338,7 @@ lpfc_sli4_get_allocated_extnts(struct lpfc_hba *phba, uint16_t type,
 }
 
 /**
- * lpfc_sli4_repost_sgl_list - Repsot the buffers sgl pages as block
+ * lpfc_sli4_repost_sgl_list - Repost the buffers sgl pages as block
  * @phba: pointer to lpfc hba data structure.
  * @pring: Pointer to driver SLI ring object.
  * @sgl_list: linked link of sgl buffers to post
@@ -13758,7 +13758,10 @@ lpfc_sli4_queue_free(struct lpfc_queue *queue)
                lpfc_free_rq_buffer(queue->phba, queue);
                kfree(queue->rqbp);
        }
-       kfree(queue->pring);
+
+       if (!list_empty(&queue->wq_list))
+               list_del(&queue->wq_list);
+
        kfree(queue);
        return;
 }
@@ -14738,6 +14741,9 @@ lpfc_wq_create(struct lpfc_hba *phba, struct lpfc_queue *wq,
        case LPFC_Q_CREATE_VERSION_1:
                bf_set(lpfc_mbx_wq_create_wqe_count, &wq_create->u.request_1,
                       wq->entry_count);
+               bf_set(lpfc_mbox_hdr_version, &shdr->request,
+                      LPFC_Q_CREATE_VERSION_1);
+
                switch (wq->entry_size) {
                default:
                case 64:
@@ -15561,6 +15567,8 @@ lpfc_wq_destroy(struct lpfc_hba *phba, struct lpfc_queue *wq)
        }
        /* Remove wq from any list */
        list_del_init(&wq->list);
+       kfree(wq->pring);
+       wq->pring = NULL;
        mempool_free(mbox, wq->phba->mbox_mem_pool);
        return status;
 }
@@ -16513,7 +16521,7 @@ lpfc_sli4_xri_inrange(struct lpfc_hba *phba,
  * This function sends a basic response to a previous unsol sequence abort
  * event after aborting the sequence handling.
  **/
-static void
+void
 lpfc_sli4_seq_abort_rsp(struct lpfc_vport *vport,
                        struct fc_frame_header *fc_hdr, bool aborted)
 {
@@ -16534,14 +16542,13 @@ lpfc_sli4_seq_abort_rsp(struct lpfc_vport *vport,
 
        ndlp = lpfc_findnode_did(vport, sid);
        if (!ndlp) {
-               ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+               ndlp = lpfc_nlp_init(vport, sid);
                if (!ndlp) {
                        lpfc_printf_vlog(vport, KERN_WARNING, LOG_ELS,
                                         "1268 Failed to allocate ndlp for "
                                         "oxid:x%x SID:x%x\n", oxid, sid);
                        return;
                }
-               lpfc_nlp_init(vport, ndlp, sid);
                /* Put ndlp onto pport node list */
                lpfc_enqueue_node(vport, ndlp);
        } else if (!NLP_CHK_NODE_ACT(ndlp)) {
@@ -16690,6 +16697,11 @@ lpfc_sli4_handle_unsol_abort(struct lpfc_vport *vport,
        }
        lpfc_in_buf_free(phba, &dmabuf->dbuf);
 
+       if (phba->nvmet_support) {
+               lpfc_nvmet_rcv_unsol_abort(vport, &fc_hdr);
+               return;
+       }
+
        /* Respond with BA_ACC or BA_RJT accordingly */
        lpfc_sli4_seq_abort_rsp(vport, &fc_hdr, aborted);
 }
index 710458c..da46471 100644 (file)
@@ -620,7 +620,7 @@ struct lpfc_sli4_hba {
        struct list_head lpfc_els_sgl_list;
        struct list_head lpfc_abts_els_sgl_list;
        struct list_head lpfc_nvmet_sgl_list;
-       struct list_head lpfc_abts_nvmet_sgl_list;
+       struct list_head lpfc_abts_nvmet_ctx_list;
        struct list_head lpfc_abts_scsi_buf_list;
        struct list_head lpfc_abts_nvme_buf_list;
        struct lpfc_sglq **lpfc_sglq_active_list;
index d4e95e2..1c26dc6 100644 (file)
@@ -20,7 +20,7 @@
  * included with this package.                                     *
  *******************************************************************/
 
-#define LPFC_DRIVER_VERSION "11.2.0.10"
+#define LPFC_DRIVER_VERSION "11.2.0.12"
 #define LPFC_DRIVER_NAME               "lpfc"
 
 /* Used for SLI 2/3 */
index 9a0339d..c714482 100644 (file)
@@ -738,10 +738,9 @@ lpfc_vport_delete(struct fc_vport *fc_vport)
                ndlp = lpfc_findnode_did(vport, Fabric_DID);
                if (!ndlp) {
                        /* Cannot find existing Fabric ndlp, allocate one */
-                       ndlp = mempool_alloc(phba->nlp_mem_pool, GFP_KERNEL);
+                       ndlp = lpfc_nlp_init(vport, Fabric_DID);
                        if (!ndlp)
                                goto skip_logo;
-                       lpfc_nlp_init(vport, ndlp, Fabric_DID);
                        /* Indicate free memory when release */
                        NLP_SET_FREE_REQ(ndlp);
                } else {
index 6903f03..8a1b948 100644 (file)
@@ -477,7 +477,7 @@ static void _set_error_resid(struct osd_request *or, struct request *req,
                             int error)
 {
        or->async_error = error;
-       or->req_errors = req->errors ? : error;
+       or->req_errors = scsi_req(req)->result ? : error;
        or->sense_len = scsi_req(req)->sense_len;
        if (or->sense_len)
                memcpy(or->sense, scsi_req(req)->sense, or->sense_len);
@@ -489,7 +489,10 @@ static void _set_error_resid(struct osd_request *or, struct request *req,
 
 int osd_execute_request(struct osd_request *or)
 {
-       int error = blk_execute_rq(or->request->q, NULL, or->request, 0);
+       int error;
+
+       blk_execute_rq(or->request->q, NULL, or->request, 0);
+       error = scsi_req(or->request)->result ? -EIO : 0;
 
        _set_error_resid(or, or->request, error);
        return error;
@@ -1602,7 +1605,7 @@ static int _init_blk_request(struct osd_request *or,
        req->rq_flags |= RQF_QUIET;
 
        req->timeout = or->timeout;
-       req->retries = or->retries;
+       scsi_req(req)->retries = or->retries;
 
        if (has_out) {
                or->out.req = req;
index c47f4b3..67cbed9 100644 (file)
@@ -327,7 +327,7 @@ static void osst_end_async(struct request *req, int update)
        struct osst_tape *STp = SRpnt->stp;
        struct rq_map_data *mdata = &SRpnt->stp->buffer->map_data;
 
-       STp->buffer->cmdstat.midlevel_result = SRpnt->result = req->errors;
+       STp->buffer->cmdstat.midlevel_result = SRpnt->result = rq->result;
 #if DEBUG
        STp->write_pending = 0;
 #endif
@@ -414,7 +414,7 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd,
        memset(rq->cmd, 0, BLK_MAX_CDB); /* ATAPI hates garbage after CDB */
        memcpy(rq->cmd, cmd, rq->cmd_len);
        req->timeout = timeout;
-       req->retries = retries;
+       rq->retries = retries;
        req->end_io_data = SRpnt;
 
        blk_execute_rq_nowait(req->q, NULL, req, 1, osst_end_async);
index ed58b91..e10b91c 100644 (file)
@@ -99,7 +99,8 @@ static void qedf_fcoe_process_vlan_resp(struct qedf_ctx *qedf,
                qedf_set_vlan_id(qedf, vid);
 
                /* Inform waiter that it's ok to call fcoe_ctlr_link up() */
-               complete(&qedf->fipvlan_compl);
+               if (!completion_done(&qedf->fipvlan_compl))
+                       complete(&qedf->fipvlan_compl);
        }
 }
 
index 8e2a160..cceddd9 100644 (file)
@@ -2803,6 +2803,7 @@ static int __qedf_probe(struct pci_dev *pdev, int mode)
                atomic_set(&qedf->num_offloads, 0);
                qedf->stop_io_on_error = false;
                pci_set_drvdata(pdev, qedf);
+               init_completion(&qedf->fipvlan_compl);
 
                QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_INFO,
                   "QLogic FastLinQ FCoE Module qedf %s, "
index 84c9098..b6e40fd 100644 (file)
@@ -2553,13 +2553,13 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job)
                                                ql_log(ql_log_warn, vha, 0x7089,
                                                    "mbx abort_command "
                                                    "failed.\n");
-                                               bsg_job->req->errors =
+                                               scsi_req(bsg_job->req)->result =
                                                bsg_reply->result = -EIO;
                                        } else {
                                                ql_dbg(ql_dbg_user, vha, 0x708a,
                                                    "mbx abort_command "
                                                    "success.\n");
-                                               bsg_job->req->errors =
+                                               scsi_req(bsg_job->req)->result =
                                                bsg_reply->result = 0;
                                        }
                                        spin_lock_irqsave(&ha->hardware_lock, flags);
@@ -2570,7 +2570,7 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job)
        }
        spin_unlock_irqrestore(&ha->hardware_lock, flags);
        ql_log(ql_log_info, vha, 0x708b, "SRB not found to abort.\n");
-       bsg_job->req->errors = bsg_reply->result = -ENXIO;
+       scsi_req(bsg_job->req)->result = bsg_reply->result = -ENXIO;
        return 0;
 
 done:
index 3e70117..83d61d2 100644 (file)
@@ -1160,8 +1160,13 @@ static inline
 uint32_t qla2x00_isp_reg_stat(struct qla_hw_data *ha)
 {
        struct device_reg_24xx __iomem *reg = &ha->iobase->isp24;
+       struct device_reg_82xx __iomem *reg82 = &ha->iobase->isp82;
 
-       return ((RD_REG_DWORD(&reg->host_status)) == ISP_REG_DISCONNECT);
+       if (IS_P3P_TYPE(ha))
+               return ((RD_REG_DWORD(&reg82->host_int)) == ISP_REG_DISCONNECT);
+       else
+               return ((RD_REG_DWORD(&reg->host_status)) ==
+                       ISP_REG_DISCONNECT);
 }
 
 /**************************************************************************
diff --git a/drivers/scsi/scsi_debugfs.c b/drivers/scsi/scsi_debugfs.c
new file mode 100644 (file)
index 0000000..a97c950
--- /dev/null
@@ -0,0 +1,13 @@
+#include <linux/seq_file.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_dbg.h>
+#include "scsi_debugfs.h"
+
+/*
+ * scsi_show_rq - append a request's SCSI command bytes to a seq_file
+ * @m: seq_file receiving the text
+ * @rq: request to describe
+ *
+ * Installed as the ->show_rq hook of the SCSI blk_mq_ops when
+ * CONFIG_BLK_DEBUG_FS is enabled.  Emits ", .cmd=<formatted command>".
+ */
+void scsi_show_rq(struct seq_file *m, struct request *rq)
+{
+       char cdb_text[80];
+       struct scsi_cmnd *scmd;
+
+       /* The scsi_request is embedded in the scsi_cmnd as its "req" member. */
+       scmd = container_of(scsi_req(rq), struct scsi_cmnd, req);
+
+       __scsi_format_command(cdb_text, sizeof(cdb_text),
+                             scmd->cmnd, scmd->cmd_len);
+       seq_printf(m, ", .cmd=%s", cdb_text);
+}
diff --git a/drivers/scsi/scsi_debugfs.h b/drivers/scsi/scsi_debugfs.h
new file mode 100644 (file)
index 0000000..951b043
--- /dev/null
@@ -0,0 +1,4 @@
+struct request;
+struct seq_file;
+
+void scsi_show_rq(struct seq_file *m, struct request *rq);
index f2cafae..2db412d 100644 (file)
@@ -1988,7 +1988,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev)
 
        req->rq_flags |= RQF_QUIET;
        req->timeout = 10 * HZ;
-       req->retries = 5;
+       rq->retries = 5;
 
        blk_execute_rq_nowait(req->q, NULL, req, 1, eh_lock_door_done);
 }
index 19125d7..1c3e87d 100644 (file)
@@ -34,6 +34,7 @@
 
 #include <trace/events/scsi.h>
 
+#include "scsi_debugfs.h"
 #include "scsi_priv.h"
 #include "scsi_logging.h"
 
@@ -229,8 +230,8 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason)
  * @rq_flags:  flags for ->rq_flags
  * @resid:     optional residual length
  *
- * returns the req->errors value which is the scsi_cmnd result
- * field.
+ * Returns the scsi_cmnd result field if a command was executed, or a negative
+ * Linux error code if we didn't get that far.
  */
 int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
                 int data_direction, void *buffer, unsigned bufflen,
@@ -256,7 +257,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 
        rq->cmd_len = COMMAND_SIZE(cmd[0]);
        memcpy(rq->cmd, cmd, rq->cmd_len);
-       req->retries = retries;
+       rq->retries = retries;
        req->timeout = timeout;
        req->cmd_flags |= flags;
        req->rq_flags |= rq_flags | RQF_QUIET | RQF_PREEMPT;
@@ -281,7 +282,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
                memcpy(sense, rq->sense, SCSI_SENSE_BUFFERSIZE);
        if (sshdr)
                scsi_normalize_sense(rq->sense, rq->sense_len, sshdr);
-       ret = req->errors;
+       ret = rq->result;
  out:
        blk_put_request(req);
 
@@ -496,7 +497,7 @@ static void scsi_run_queue(struct request_queue *q)
                scsi_starved_list_run(sdev->host);
 
        if (q->mq_ops)
-               blk_mq_start_stopped_hw_queues(q, false);
+               blk_mq_run_hw_queues(q, false);
        else
                blk_run_queue(q);
 }
@@ -667,7 +668,7 @@ static bool scsi_end_request(struct request *req, int error,
                    !list_empty(&sdev->host->starved_list))
                        kblockd_schedule_work(&sdev->requeue_work);
                else
-                       blk_mq_start_stopped_hw_queues(q, true);
+                       blk_mq_run_hw_queues(q, true);
        } else {
                unsigned long flags;
 
@@ -797,8 +798,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
                /*
                 * __scsi_error_from_host_byte may have reset the host_byte
                 */
-               req->errors = cmd->result;
-
+               scsi_req(req)->result = cmd->result;
                scsi_req(req)->resid_len = scsi_get_resid(cmd);
 
                if (scsi_bidi_cmnd(cmd)) {
@@ -835,7 +835,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
        /*
         * Recovered errors need reporting, but they're always treated as
         * success, so fiddle the result code here.  For passthrough requests
-        * we already took a copy of the original into rq->errors which
+        * we already took a copy of the original into sreq->result which
         * is what gets returned to the user
         */
        if (sense_valid && (sshdr.sense_key == RECOVERED_ERROR)) {
@@ -1061,10 +1061,10 @@ int scsi_init_io(struct scsi_cmnd *cmd)
        struct scsi_device *sdev = cmd->device;
        struct request *rq = cmd->request;
        bool is_mq = (rq->mq_ctx != NULL);
-       int error;
+       int error = BLKPREP_KILL;
 
        if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq)))
-               return -EINVAL;
+               goto err_exit;
 
        error = scsi_init_sgtable(rq, &cmd->sdb);
        if (error)
@@ -1177,7 +1177,7 @@ static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req)
        cmd->cmd_len = scsi_req(req)->cmd_len;
        cmd->cmnd = scsi_req(req)->cmd;
        cmd->transfersize = blk_rq_bytes(req);
-       cmd->allowed = req->retries;
+       cmd->allowed = scsi_req(req)->retries;
        return BLKPREP_OK;
 }
 
@@ -1281,7 +1281,7 @@ scsi_prep_return(struct request_queue *q, struct request *req, int ret)
        switch (ret) {
        case BLKPREP_KILL:
        case BLKPREP_INVALID:
-               req->errors = DID_NO_CONNECT << 16;
+               scsi_req(req)->result = DID_NO_CONNECT << 16;
                /* release the command and kill it */
                if (req->special) {
                        struct scsi_cmnd *cmd = req->special;
@@ -1905,7 +1905,7 @@ static int scsi_mq_prep_fn(struct request *req)
 static void scsi_mq_done(struct scsi_cmnd *cmd)
 {
        trace_scsi_dispatch_cmd_done(cmd);
-       blk_mq_complete_request(cmd->request, cmd->request->errors);
+       blk_mq_complete_request(cmd->request);
 }
 
 static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -1974,7 +1974,7 @@ out:
        case BLK_MQ_RQ_QUEUE_BUSY:
                if (atomic_read(&sdev->device_busy) == 0 &&
                    !scsi_device_blocked(sdev))
-                       blk_mq_delay_queue(hctx, SCSI_QUEUE_DELAY);
+                       blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY);
                break;
        case BLK_MQ_RQ_QUEUE_ERROR:
                /*
@@ -2154,10 +2154,13 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
        return q;
 }
 
-static struct blk_mq_ops scsi_mq_ops = {
+static const struct blk_mq_ops scsi_mq_ops = {
        .queue_rq       = scsi_queue_rq,
        .complete       = scsi_softirq_done,
        .timeout        = scsi_timeout,
+#ifdef CONFIG_BLK_DEBUG_FS
+       .show_rq        = scsi_show_rq,
+#endif
        .init_request   = scsi_init_request,
        .exit_request   = scsi_exit_request,
        .map_queues     = scsi_map_queues,
index cdbb293..9fdbd50 100644 (file)
@@ -184,9 +184,9 @@ static void sas_smp_request(struct request_queue *q, struct Scsi_Host *shost,
                                blk_rq_bytes(req->next_rq);
                handler = to_sas_internal(shost->transportt)->f->smp_handler;
                ret = handler(shost, rphy, req);
-               req->errors = ret;
+               scsi_req(req)->result = ret;
 
-               blk_end_request_all(req, ret);
+               blk_end_request_all(req, 0);
 
                spin_lock_irq(q->queue_lock);
        }
index fcfeddc..0dc95e1 100644 (file)
@@ -418,6 +418,46 @@ provisioning_mode_store(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RW(provisioning_mode);
 
+/* sysfs names of the SD_ZERO_* zeroing modes, indexed by mode value. */
+static const char *zeroing_mode[] = {
+       [SD_ZERO_WRITE]         = "write",
+       [SD_ZERO_WS]            = "writesame",
+       [SD_ZERO_WS16_UNMAP]    = "writesame_16_unmap",
+       [SD_ZERO_WS10_UNMAP]    = "writesame_10_unmap",
+};
+
+/*
+ * zeroing_mode_show - sysfs read handler for the zeroing_mode attribute
+ * @dev: device the attribute belongs to; mapped to its scsi_disk via
+ *       to_scsi_disk()
+ * @attr: attribute being read (unused)
+ * @buf: sysfs output buffer
+ *
+ * Writes the name of the disk's current zeroing mode, followed by a
+ * newline, into @buf and returns the number of bytes written.
+ */
+static ssize_t
+zeroing_mode_show(struct device *dev, struct device_attribute *attr,
+                 char *buf)
+{
+       struct scsi_disk *sdkp = to_scsi_disk(dev);
+
+       /*
+        * Bound the output by the sysfs page rather than a hand-counted
+        * literal.  scnprintf() returns what was actually stored, so adding
+        * a longer name to zeroing_mode[] can never make the reported length
+        * exceed the data placed in @buf.
+        */
+       return scnprintf(buf, PAGE_SIZE, "%s\n",
+                        zeroing_mode[sdkp->zeroing_mode]);
+}
+
+/*
+ * zeroing_mode_store - sysfs write handler for the zeroing_mode attribute
+ * @dev: device the attribute belongs to; mapped to its scsi_disk via
+ *       to_scsi_disk()
+ * @attr: attribute being written (unused)
+ * @buf: mode name supplied by the writer
+ * @count: number of bytes in @buf
+ *
+ * Selects the zeroing mode whose name in zeroing_mode[] matches @buf.
+ *
+ * Returns @count on success, -EACCES if the caller lacks CAP_SYS_ADMIN, or
+ * -EINVAL if @buf does not name a known mode.
+ */
+static ssize_t
+zeroing_mode_store(struct device *dev, struct device_attribute *attr,
+                  const char *buf, size_t count)
+{
+       struct scsi_disk *sdkp = to_scsi_disk(dev);
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EACCES;
+
+       /*
+        * sysfs_streq() treats a single trailing newline as end of string,
+        * so both "echo writesame" and "echo -n writesame" are accepted.
+        * An exact strncmp() against the table entry would reject the
+        * newline-terminated form.
+        */
+       if (sysfs_streq(buf, zeroing_mode[SD_ZERO_WRITE]))
+               sdkp->zeroing_mode = SD_ZERO_WRITE;
+       else if (sysfs_streq(buf, zeroing_mode[SD_ZERO_WS]))
+               sdkp->zeroing_mode = SD_ZERO_WS;
+       else if (sysfs_streq(buf, zeroing_mode[SD_ZERO_WS16_UNMAP]))
+               sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP;
+       else if (sysfs_streq(buf, zeroing_mode[SD_ZERO_WS10_UNMAP]))
+               sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP;
+       else
+               return -EINVAL;
+
+       return count;
+}
+static DEVICE_ATTR_RW(zeroing_mode);
+
 static ssize_t
 max_medium_access_timeouts_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
@@ -496,6 +536,7 @@ static struct attribute *sd_disk_attrs[] = {
        &dev_attr_app_tag_own.attr,
        &dev_attr_thin_provisioning.attr,
        &dev_attr_provisioning_mode.attr,
+       &dev_attr_zeroing_mode.attr,
        &dev_attr_max_write_same_blocks.attr,
        &dev_attr_max_medium_access_timeouts.attr,
        NULL,
@@ -644,26 +685,11 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
        unsigned int logical_block_size = sdkp->device->sector_size;
        unsigned int max_blocks = 0;
 
-       q->limits.discard_zeroes_data = 0;
-
-       /*
-        * When LBPRZ is reported, discard alignment and granularity
-        * must be fixed to the logical block size. Otherwise the block
-        * layer will drop misaligned portions of the request which can
-        * lead to data corruption. If LBPRZ is not set, we honor the
-        * device preference.
-        */
-       if (sdkp->lbprz) {
-               q->limits.discard_alignment = 0;
-               q->limits.discard_granularity = logical_block_size;
-       } else {
-               q->limits.discard_alignment = sdkp->unmap_alignment *
-                       logical_block_size;
-               q->limits.discard_granularity =
-                       max(sdkp->physical_block_size,
-                           sdkp->unmap_granularity * logical_block_size);
-       }
-
+       q->limits.discard_alignment =
+               sdkp->unmap_alignment * logical_block_size;
+       q->limits.discard_granularity =
+               max(sdkp->physical_block_size,
+                   sdkp->unmap_granularity * logical_block_size);
        sdkp->provisioning_mode = mode;
 
        switch (mode) {
@@ -681,19 +707,16 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
        case SD_LBP_WS16:
                max_blocks = min_not_zero(sdkp->max_ws_blocks,
                                          (u32)SD_MAX_WS16_BLOCKS);
-               q->limits.discard_zeroes_data = sdkp->lbprz;
                break;
 
        case SD_LBP_WS10:
                max_blocks = min_not_zero(sdkp->max_ws_blocks,
                                          (u32)SD_MAX_WS10_BLOCKS);
-               q->limits.discard_zeroes_data = sdkp->lbprz;
                break;
 
        case SD_LBP_ZERO:
                max_blocks = min_not_zero(sdkp->max_ws_blocks,
                                          (u32)SD_MAX_WS10_BLOCKS);
-               q->limits.discard_zeroes_data = 1;
                break;
        }
 
@@ -701,93 +724,122 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
 }
 
-/**
- * sd_setup_discard_cmnd - unmap blocks on thinly provisioned device
- * @sdp: scsi device to operate on
- * @rq: Request to prepare
- *
- * Will issue either UNMAP or WRITE SAME(16) depending on preference
- * indicated by target device.
- **/
-static int sd_setup_discard_cmnd(struct scsi_cmnd *cmd)
+static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
 {
-       struct request *rq = cmd->request;
        struct scsi_device *sdp = cmd->device;
-       struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
-       sector_t sector = blk_rq_pos(rq);
-       unsigned int nr_sectors = blk_rq_sectors(rq);
-       unsigned int len;
-       int ret;
+       struct request *rq = cmd->request;
+       u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
+       u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
+       unsigned int data_len = 24;
        char *buf;
-       struct page *page;
-
-       sector >>= ilog2(sdp->sector_size) - 9;
-       nr_sectors >>= ilog2(sdp->sector_size) - 9;
 
-       page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
-       if (!page)
+       rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+       if (!rq->special_vec.bv_page)
                return BLKPREP_DEFER;
+       rq->special_vec.bv_offset = 0;
+       rq->special_vec.bv_len = data_len;
+       rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
 
-       switch (sdkp->provisioning_mode) {
-       case SD_LBP_UNMAP:
-               buf = page_address(page);
+       cmd->cmd_len = 10;
+       cmd->cmnd[0] = UNMAP;
+       cmd->cmnd[8] = 24;
 
-               cmd->cmd_len = 10;
-               cmd->cmnd[0] = UNMAP;
-               cmd->cmnd[8] = 24;
+       buf = page_address(rq->special_vec.bv_page);
+       put_unaligned_be16(6 + 16, &buf[0]);
+       put_unaligned_be16(16, &buf[2]);
+       put_unaligned_be64(sector, &buf[8]);
+       put_unaligned_be32(nr_sectors, &buf[16]);
 
-               put_unaligned_be16(6 + 16, &buf[0]);
-               put_unaligned_be16(16, &buf[2]);
-               put_unaligned_be64(sector, &buf[8]);
-               put_unaligned_be32(nr_sectors, &buf[16]);
+       cmd->allowed = SD_MAX_RETRIES;
+       cmd->transfersize = data_len;
+       rq->timeout = SD_TIMEOUT;
+       scsi_req(rq)->resid_len = data_len;
 
-               len = 24;
-               break;
+       return scsi_init_io(cmd);
+}
 
-       case SD_LBP_WS16:
-               cmd->cmd_len = 16;
-               cmd->cmnd[0] = WRITE_SAME_16;
+static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap)
+{
+       struct scsi_device *sdp = cmd->device;
+       struct request *rq = cmd->request;
+       u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
+       u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
+       u32 data_len = sdp->sector_size;
+
+       rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+       if (!rq->special_vec.bv_page)
+               return BLKPREP_DEFER;
+       rq->special_vec.bv_offset = 0;
+       rq->special_vec.bv_len = data_len;
+       rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
+
+       cmd->cmd_len = 16;
+       cmd->cmnd[0] = WRITE_SAME_16;
+       if (unmap)
                cmd->cmnd[1] = 0x8; /* UNMAP */
-               put_unaligned_be64(sector, &cmd->cmnd[2]);
-               put_unaligned_be32(nr_sectors, &cmd->cmnd[10]);
+       put_unaligned_be64(sector, &cmd->cmnd[2]);
+       put_unaligned_be32(nr_sectors, &cmd->cmnd[10]);
 
-               len = sdkp->device->sector_size;
-               break;
+       cmd->allowed = SD_MAX_RETRIES;
+       cmd->transfersize = data_len;
+       rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT;
+       scsi_req(rq)->resid_len = data_len;
 
-       case SD_LBP_WS10:
-       case SD_LBP_ZERO:
-               cmd->cmd_len = 10;
-               cmd->cmnd[0] = WRITE_SAME;
-               if (sdkp->provisioning_mode == SD_LBP_WS10)
-                       cmd->cmnd[1] = 0x8; /* UNMAP */
-               put_unaligned_be32(sector, &cmd->cmnd[2]);
-               put_unaligned_be16(nr_sectors, &cmd->cmnd[7]);
+       return scsi_init_io(cmd);
+}
 
-               len = sdkp->device->sector_size;
-               break;
+static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap)
+{
+       struct scsi_device *sdp = cmd->device;
+       struct request *rq = cmd->request;
+       u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
+       u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
+       u32 data_len = sdp->sector_size;
 
-       default:
-               ret = BLKPREP_INVALID;
-               goto out;
-       }
+       rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+       if (!rq->special_vec.bv_page)
+               return BLKPREP_DEFER;
+       rq->special_vec.bv_offset = 0;
+       rq->special_vec.bv_len = data_len;
+       rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
 
-       rq->timeout = SD_TIMEOUT;
+       cmd->cmd_len = 10;
+       cmd->cmnd[0] = WRITE_SAME;
+       if (unmap)
+               cmd->cmnd[1] = 0x8; /* UNMAP */
+       put_unaligned_be32(sector, &cmd->cmnd[2]);
+       put_unaligned_be16(nr_sectors, &cmd->cmnd[7]);
 
-       cmd->transfersize = len;
        cmd->allowed = SD_MAX_RETRIES;
+       cmd->transfersize = data_len;
+       rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT;
+       scsi_req(rq)->resid_len = data_len;
 
-       rq->special_vec.bv_page = page;
-       rq->special_vec.bv_offset = 0;
-       rq->special_vec.bv_len = len;
+       return scsi_init_io(cmd);
+}
 
-       rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
-       scsi_req(rq)->resid_len = len;
+static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd)
+{
+       struct request *rq = cmd->request;
+       struct scsi_device *sdp = cmd->device;
+       struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
+       u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
+       u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
+
+       if (!(rq->cmd_flags & REQ_NOUNMAP)) {
+               switch (sdkp->zeroing_mode) {
+               case SD_ZERO_WS16_UNMAP:
+                       return sd_setup_write_same16_cmnd(cmd, true);
+               case SD_ZERO_WS10_UNMAP:
+                       return sd_setup_write_same10_cmnd(cmd, true);
+               }
+       }
 
-       ret = scsi_init_io(cmd);
-out:
-       if (ret != BLKPREP_OK)
-               __free_page(page);
-       return ret;
+       if (sdp->no_write_same)
+               return BLKPREP_INVALID;
+       if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff)
+               return sd_setup_write_same16_cmnd(cmd, false);
+       return sd_setup_write_same10_cmnd(cmd, false);
 }
 
 static void sd_config_write_same(struct scsi_disk *sdkp)
@@ -816,9 +868,20 @@ static void sd_config_write_same(struct scsi_disk *sdkp)
                sdkp->max_ws_blocks = 0;
        }
 
+       if (sdkp->lbprz && sdkp->lbpws)
+               sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP;
+       else if (sdkp->lbprz && sdkp->lbpws10)
+               sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP;
+       else if (sdkp->max_ws_blocks)
+               sdkp->zeroing_mode = SD_ZERO_WS;
+       else
+               sdkp->zeroing_mode = SD_ZERO_WRITE;
+
 out:
        blk_queue_max_write_same_sectors(q, sdkp->max_ws_blocks *
                                         (logical_block_size >> 9));
+       blk_queue_max_write_zeroes_sectors(q, sdkp->max_ws_blocks *
+                                        (logical_block_size >> 9));
 }
 
 /**
@@ -1155,7 +1218,20 @@ static int sd_init_command(struct scsi_cmnd *cmd)
 
        switch (req_op(rq)) {
        case REQ_OP_DISCARD:
-               return sd_setup_discard_cmnd(cmd);
+               switch (scsi_disk(rq->rq_disk)->provisioning_mode) {
+               case SD_LBP_UNMAP:
+                       return sd_setup_unmap_cmnd(cmd);
+               case SD_LBP_WS16:
+                       return sd_setup_write_same16_cmnd(cmd, true);
+               case SD_LBP_WS10:
+                       return sd_setup_write_same10_cmnd(cmd, true);
+               case SD_LBP_ZERO:
+                       return sd_setup_write_same10_cmnd(cmd, false);
+               default:
+                       return BLKPREP_INVALID;
+               }
+       case REQ_OP_WRITE_ZEROES:
+               return sd_setup_write_zeroes_cmnd(cmd);
        case REQ_OP_WRITE_SAME:
                return sd_setup_write_same_cmnd(cmd);
        case REQ_OP_FLUSH:
@@ -1795,6 +1871,7 @@ static int sd_done(struct scsi_cmnd *SCpnt)
 
        switch (req_op(req)) {
        case REQ_OP_DISCARD:
+       case REQ_OP_WRITE_ZEROES:
        case REQ_OP_WRITE_SAME:
        case REQ_OP_ZONE_RESET:
                if (!result) {
@@ -2102,6 +2179,22 @@ static void read_capacity_error(struct scsi_disk *sdkp, struct scsi_device *sdp,
 
 #define READ_CAPACITY_RETRIES_ON_RESET 10
 
+/*
+ * Ensure that we don't overflow sector_t when CONFIG_LBDAF is not set
+ * and the reported logical block size is bigger than 512 bytes. Note
+ * that last_sector is a u64 and therefore logical_to_sectors() is not
+ * applicable.
+ */
+static bool sd_addressable_capacity(u64 lba, unsigned int sector_size)
+{
+       u64 last_sector = (lba + 1ULL) << (ilog2(sector_size) - 9);
+
+       if (sizeof(sector_t) == 4 && last_sector > U32_MAX)
+               return false;
+
+       return true;
+}
+
 static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp,
                                                unsigned char *buffer)
 {
@@ -2167,7 +2260,7 @@ static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp,
                return -ENODEV;
        }
 
-       if ((sizeof(sdkp->capacity) == 4) && (lba >= 0xffffffffULL)) {
+       if (!sd_addressable_capacity(lba, sector_size)) {
                sd_printk(KERN_ERR, sdkp, "Too big for this kernel. Use a "
                        "kernel compiled with support for large block "
                        "devices.\n");
@@ -2256,7 +2349,7 @@ static int read_capacity_10(struct scsi_disk *sdkp, struct scsi_device *sdp,
                return sector_size;
        }
 
-       if ((sizeof(sdkp->capacity) == 4) && (lba == 0xffffffff)) {
+       if (!sd_addressable_capacity(lba, sector_size)) {
                sd_printk(KERN_ERR, sdkp, "Too big for this kernel. Use a "
                        "kernel compiled with support for large block "
                        "devices.\n");
@@ -2752,7 +2845,7 @@ static void sd_read_block_limits(struct scsi_disk *sdkp)
                                sd_config_discard(sdkp, SD_LBP_WS16);
 
                } else {        /* LBP VPD page tells us what to use */
-                       if (sdkp->lbpu && sdkp->max_unmap_blocks && !sdkp->lbprz)
+                       if (sdkp->lbpu && sdkp->max_unmap_blocks)
                                sd_config_discard(sdkp, SD_LBP_UNMAP);
                        else if (sdkp->lbpws)
                                sd_config_discard(sdkp, SD_LBP_WS16);
@@ -2956,7 +3049,8 @@ static int sd_revalidate_disk(struct gendisk *disk)
                q->limits.io_opt = logical_to_bytes(sdp, sdkp->opt_xfer_blocks);
                rw_max = logical_to_sectors(sdp, sdkp->opt_xfer_blocks);
        } else
-               rw_max = BLK_DEF_MAX_SECTORS;
+               rw_max = min_not_zero(logical_to_sectors(sdp, dev_max),
+                                     (sector_t)BLK_DEF_MAX_SECTORS);
 
        /* Combine with controller limits */
        q->limits.max_sectors = min(rw_max, queue_max_hw_sectors(q));
index 4dac35e..a2c4b5c 100644 (file)
@@ -59,6 +59,13 @@ enum {
        SD_LBP_DISABLE,         /* Discard disabled due to failed cmd */
 };
 
+enum {
+       SD_ZERO_WRITE = 0,      /* Use WRITE(10/16) command */
+       SD_ZERO_WS,             /* Use WRITE SAME(10/16) command */
+       SD_ZERO_WS16_UNMAP,     /* Use WRITE SAME(16) with UNMAP */
+       SD_ZERO_WS10_UNMAP,     /* Use WRITE SAME(10) with UNMAP */
+};
+
 struct scsi_disk {
        struct scsi_driver *driver;     /* always &sd_template */
        struct scsi_device *device;
@@ -89,6 +96,7 @@ struct scsi_disk {
        u8              write_prot;
        u8              protection_type;/* Data Integrity Field */
        u8              provisioning_mode;
+       u8              zeroing_mode;
        unsigned        ATO : 1;        /* state of disk ATO bit */
        unsigned        cache_override : 1; /* temp override of WCE,RCD */
        unsigned        WCE : 1;        /* state of disk WCE bit */
index 92620c8..1994f77 100644 (file)
@@ -329,6 +329,7 @@ void sd_zbc_complete(struct scsi_cmnd *cmd,
 
        switch (req_op(rq)) {
        case REQ_OP_WRITE:
+       case REQ_OP_WRITE_ZEROES:
        case REQ_OP_WRITE_SAME:
        case REQ_OP_ZONE_RESET:
 
index 225abaa..0b60245 100644 (file)
@@ -581,7 +581,7 @@ sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos)
        sg_io_hdr_t *hp;
        unsigned char cmnd[SG_MAX_CDB_SIZE];
 
-       if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
+       if (unlikely(uaccess_kernel()))
                return -EINVAL;
 
        if ((!(sfp = (Sg_fd *) filp->private_data)) || (!(sdp = sfp->parentdp)))
@@ -1300,7 +1300,7 @@ sg_rq_end_io(struct request *rq, int uptodate)
                pr_info("%s: device detaching\n", __func__);
 
        sense = req->sense;
-       result = rq->errors;
+       result = req->result;
        resid = req->resid_len;
 
        SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, sdp,
@@ -1718,7 +1718,7 @@ sg_start_req(Sg_request *srp, unsigned char *cmd)
 
        srp->rq = rq;
        rq->end_io_data = srp;
-       rq->retries = SG_DEFAULT_RETRIES;
+       req->retries = SG_DEFAULT_RETRIES;
 
        if ((dxfer_len <= 0) || (dxfer_dir == SG_DXFER_NONE))
                return 0;
index 0b29b93..a8f6302 100644 (file)
@@ -836,6 +836,7 @@ static void get_capabilities(struct scsi_cd *cd)
        unsigned char *buffer;
        struct scsi_mode_data data;
        struct scsi_sense_hdr sshdr;
+       unsigned int ms_len = 128;
        int rc, n;
 
        static const char *loadmech[] =
@@ -862,10 +863,11 @@ static void get_capabilities(struct scsi_cd *cd)
        scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr);
 
        /* ask for mode page 0x2a */
-       rc = scsi_mode_sense(cd->device, 0, 0x2a, buffer, 128,
+       rc = scsi_mode_sense(cd->device, 0, 0x2a, buffer, ms_len,
                             SR_TIMEOUT, 3, &data, NULL);
 
-       if (!scsi_status_is_good(rc)) {
+       if (!scsi_status_is_good(rc) || data.length > ms_len ||
+           data.header_length + data.block_descriptor_length > data.length) {
                /* failed, drive doesn't have capabilities mode page */
                cd->cdi.speed = 1;
                cd->cdi.mask |= (CDC_CD_R | CDC_CD_RW | CDC_DVD_R |
index e5ef78a..1ea34d6 100644 (file)
@@ -480,7 +480,7 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req)
                atomic64_add(ktime_to_ns(now), &STp->stats->tot_write_time);
                atomic64_add(ktime_to_ns(now), &STp->stats->tot_io_time);
                atomic64_inc(&STp->stats->write_cnt);
-               if (req->errors) {
+               if (scsi_req(req)->result) {
                        atomic64_add(atomic_read(&STp->stats->last_write_size)
                                - STp->buffer->cmdstat.residual,
                                &STp->stats->write_byte_cnt);
@@ -494,7 +494,7 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req)
                atomic64_add(ktime_to_ns(now), &STp->stats->tot_read_time);
                atomic64_add(ktime_to_ns(now), &STp->stats->tot_io_time);
                atomic64_inc(&STp->stats->read_cnt);
-               if (req->errors) {
+               if (scsi_req(req)->result) {
                        atomic64_add(atomic_read(&STp->stats->last_read_size)
                                - STp->buffer->cmdstat.residual,
                                &STp->stats->read_byte_cnt);
@@ -518,7 +518,7 @@ static void st_scsi_execute_end(struct request *req, int uptodate)
        struct scsi_tape *STp = SRpnt->stp;
        struct bio *tmp;
 
-       STp->buffer->cmdstat.midlevel_result = SRpnt->result = req->errors;
+       STp->buffer->cmdstat.midlevel_result = SRpnt->result = rq->result;
        STp->buffer->cmdstat.residual = rq->resid_len;
 
        st_do_stats(STp, req);
@@ -579,7 +579,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
        memset(rq->cmd, 0, BLK_MAX_CDB);
        memcpy(rq->cmd, cmd, rq->cmd_len);
        req->timeout = timeout;
-       req->retries = retries;
+       rq->retries = retries;
        req->end_io_data = SRpnt;
 
        blk_execute_rq_nowait(req->q, NULL, req, 1, st_scsi_execute_end);
index 7cbad0d..6ba270e 100644 (file)
@@ -409,6 +409,7 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
                        ret = PTR_ERR(vmfile);
                        goto out;
                }
+               vmfile->f_mode |= FMODE_LSEEK;
                asma->file = vmfile;
        }
        get_file(asma->file);
index 8886458..a676bcc 100644 (file)
@@ -133,13 +133,9 @@ struct lustre_sb_info {
        struct obd_export        *lsi_osd_exp;
        char                      lsi_osd_type[16];
        char                      lsi_fstype[16];
-       struct backing_dev_info   lsi_bdi;     /* each client mountpoint needs
-                                               * own backing_dev_info
-                                               */
 };
 
 #define LSI_UMOUNT_FAILOVER          0x00200000
-#define LSI_BDI_INITIALIZED          0x00400000
 
 #define     s2lsi(sb)  ((struct lustre_sb_info *)((sb)->s_fs_info))
 #define     s2lsi_nocast(sb) ((sb)->s_fs_info)
index b229cbc..d483c44 100644 (file)
@@ -863,15 +863,6 @@ void ll_lli_init(struct ll_inode_info *lli)
        mutex_init(&lli->lli_layout_mutex);
 }
 
-static inline int ll_bdi_register(struct backing_dev_info *bdi)
-{
-       static atomic_t ll_bdi_num = ATOMIC_INIT(0);
-
-       bdi->name = "lustre";
-       return bdi_register(bdi, NULL, "lustre-%d",
-                           atomic_inc_return(&ll_bdi_num));
-}
-
 int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
 {
        struct lustre_profile *lprof = NULL;
@@ -881,6 +872,7 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
        char  *profilenm = get_profile_name(sb);
        struct config_llog_instance *cfg;
        int    err;
+       static atomic_t ll_bdi_num = ATOMIC_INIT(0);
 
        CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
 
@@ -903,16 +895,11 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
        if (err)
                goto out_free;
 
-       err = bdi_init(&lsi->lsi_bdi);
-       if (err)
-               goto out_free;
-       lsi->lsi_flags |= LSI_BDI_INITIALIZED;
-       lsi->lsi_bdi.capabilities = 0;
-       err = ll_bdi_register(&lsi->lsi_bdi);
+       err = super_setup_bdi_name(sb, "lustre-%d",
+                                  atomic_inc_return(&ll_bdi_num));
        if (err)
                goto out_free;
 
-       sb->s_bdi = &lsi->lsi_bdi;
        /* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */
        sb->s_d_op = &ll_d_ops;
 
@@ -1033,11 +1020,6 @@ void ll_put_super(struct super_block *sb)
        if (profilenm)
                class_del_profile(profilenm);
 
-       if (lsi->lsi_flags & LSI_BDI_INITIALIZED) {
-               bdi_destroy(&lsi->lsi_bdi);
-               lsi->lsi_flags &= ~LSI_BDI_INITIALIZED;
-       }
-
        ll_free_sbi(sb);
        lsi->lsi_llsbi = NULL;
 
index a918024..e3f9ed3 100644 (file)
@@ -485,8 +485,7 @@ static void iscsit_get_rx_pdu(struct iscsi_conn *);
 
 int iscsit_queue_rsp(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
 {
-       iscsit_add_cmd_to_response_queue(cmd, cmd->conn, cmd->i_state);
-       return 0;
+       return iscsit_add_cmd_to_response_queue(cmd, cmd->conn, cmd->i_state);
 }
 EXPORT_SYMBOL(iscsit_queue_rsp);
 
index bf40f03..5798810 100644 (file)
@@ -167,10 +167,7 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
        struct iscsi_portal_group *tpg;
        struct iscsi_tpg_np *tpg_np;
        char *str, *str2, *ip_str, *port_str;
-       struct sockaddr_storage sockaddr;
-       struct sockaddr_in *sock_in;
-       struct sockaddr_in6 *sock_in6;
-       unsigned long port;
+       struct sockaddr_storage sockaddr = { };
        int ret;
        char buf[MAX_PORTAL_LEN + 1];
 
@@ -182,21 +179,19 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
        memset(buf, 0, MAX_PORTAL_LEN + 1);
        snprintf(buf, MAX_PORTAL_LEN + 1, "%s", name);
 
-       memset(&sockaddr, 0, sizeof(struct sockaddr_storage));
-
        str = strstr(buf, "[");
        if (str) {
-               const char *end;
-
                str2 = strstr(str, "]");
                if (!str2) {
                        pr_err("Unable to locate trailing \"]\""
                                " in IPv6 iSCSI network portal address\n");
                        return ERR_PTR(-EINVAL);
                }
-               str++; /* Skip over leading "[" */
+
+               ip_str = str + 1; /* Skip over leading "[" */
                *str2 = '\0'; /* Terminate the unbracketed IPv6 address */
                str2++; /* Skip over the \0 */
+
                port_str = strstr(str2, ":");
                if (!port_str) {
                        pr_err("Unable to locate \":port\""
@@ -205,23 +200,8 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
                }
                *port_str = '\0'; /* Terminate string for IP */
                port_str++; /* Skip over ":" */
-
-               ret = kstrtoul(port_str, 0, &port);
-               if (ret < 0) {
-                       pr_err("kstrtoul() failed for port_str: %d\n", ret);
-                       return ERR_PTR(ret);
-               }
-               sock_in6 = (struct sockaddr_in6 *)&sockaddr;
-               sock_in6->sin6_family = AF_INET6;
-               sock_in6->sin6_port = htons((unsigned short)port);
-               ret = in6_pton(str, -1,
-                               (void *)&sock_in6->sin6_addr.in6_u, -1, &end);
-               if (ret <= 0) {
-                       pr_err("in6_pton returned: %d\n", ret);
-                       return ERR_PTR(-EINVAL);
-               }
        } else {
-               str = ip_str = &buf[0];
+               ip_str = &buf[0];
                port_str = strstr(ip_str, ":");
                if (!port_str) {
                        pr_err("Unable to locate \":port\""
@@ -230,17 +210,15 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
                }
                *port_str = '\0'; /* Terminate string for IP */
                port_str++; /* Skip over ":" */
+       }
 
-               ret = kstrtoul(port_str, 0, &port);
-               if (ret < 0) {
-                       pr_err("kstrtoul() failed for port_str: %d\n", ret);
-                       return ERR_PTR(ret);
-               }
-               sock_in = (struct sockaddr_in *)&sockaddr;
-               sock_in->sin_family = AF_INET;
-               sock_in->sin_port = htons((unsigned short)port);
-               sock_in->sin_addr.s_addr = in_aton(ip_str);
+       ret = inet_pton_with_scope(&init_net, AF_UNSPEC, ip_str,
+                       port_str, &sockaddr);
+       if (ret) {
+               pr_err("malformed ip/port passed: %s\n", name);
+               return ERR_PTR(ret);
        }
+
        tpg = container_of(se_tpg, struct iscsi_portal_group, tpg_se_tpg);
        ret = iscsit_get_tpg(tpg);
        if (ret < 0)
@@ -1398,11 +1376,10 @@ static u32 lio_sess_get_initiator_sid(
 static int lio_queue_data_in(struct se_cmd *se_cmd)
 {
        struct iscsi_cmd *cmd = container_of(se_cmd, struct iscsi_cmd, se_cmd);
+       struct iscsi_conn *conn = cmd->conn;
 
        cmd->i_state = ISTATE_SEND_DATAIN;
-       cmd->conn->conn_transport->iscsit_queue_data_in(cmd->conn, cmd);
-
-       return 0;
+       return conn->conn_transport->iscsit_queue_data_in(conn, cmd);
 }
 
 static int lio_write_pending(struct se_cmd *se_cmd)
@@ -1431,16 +1408,14 @@ static int lio_write_pending_status(struct se_cmd *se_cmd)
 static int lio_queue_status(struct se_cmd *se_cmd)
 {
        struct iscsi_cmd *cmd = container_of(se_cmd, struct iscsi_cmd, se_cmd);
+       struct iscsi_conn *conn = cmd->conn;
 
        cmd->i_state = ISTATE_SEND_STATUS;
 
        if (cmd->se_cmd.scsi_status || cmd->sense_reason) {
-               iscsit_add_cmd_to_response_queue(cmd, cmd->conn, cmd->i_state);
-               return 0;
+               return iscsit_add_cmd_to_response_queue(cmd, conn, cmd->i_state);
        }
-       cmd->conn->conn_transport->iscsit_queue_status(cmd->conn, cmd);
-
-       return 0;
+       return conn->conn_transport->iscsit_queue_status(conn, cmd);
 }
 
 static void lio_queue_tm_rsp(struct se_cmd *se_cmd)
index e65bf78..fce6276 100644 (file)
@@ -782,22 +782,6 @@ static void iscsi_check_proposer_for_optional_reply(struct iscsi_param *param)
                if (!strcmp(param->name, MAXRECVDATASEGMENTLENGTH))
                        SET_PSTATE_REPLY_OPTIONAL(param);
                /*
-                * The GlobalSAN iSCSI Initiator for MacOSX does
-                * not respond to MaxBurstLength, FirstBurstLength,
-                * DefaultTime2Wait or DefaultTime2Retain parameter keys.
-                * So, we set them to 'reply optional' here, and assume the
-                * the defaults from iscsi_parameters.h if the initiator
-                * is not RFC compliant and the keys are not negotiated.
-                */
-               if (!strcmp(param->name, MAXBURSTLENGTH))
-                       SET_PSTATE_REPLY_OPTIONAL(param);
-               if (!strcmp(param->name, FIRSTBURSTLENGTH))
-                       SET_PSTATE_REPLY_OPTIONAL(param);
-               if (!strcmp(param->name, DEFAULTTIME2WAIT))
-                       SET_PSTATE_REPLY_OPTIONAL(param);
-               if (!strcmp(param->name, DEFAULTTIME2RETAIN))
-                       SET_PSTATE_REPLY_OPTIONAL(param);
-               /*
                 * Required for gPXE iSCSI boot client
                 */
                if (!strcmp(param->name, MAXCONNECTIONS))
index 5041a9c..7d3e2fc 100644 (file)
@@ -567,7 +567,7 @@ static void iscsit_remove_cmd_from_immediate_queue(
        }
 }
 
-void iscsit_add_cmd_to_response_queue(
+int iscsit_add_cmd_to_response_queue(
        struct iscsi_cmd *cmd,
        struct iscsi_conn *conn,
        u8 state)
@@ -578,7 +578,7 @@ void iscsit_add_cmd_to_response_queue(
        if (!qr) {
                pr_err("Unable to allocate memory for"
                        " struct iscsi_queue_req\n");
-               return;
+               return -ENOMEM;
        }
        INIT_LIST_HEAD(&qr->qr_list);
        qr->cmd = cmd;
@@ -590,6 +590,7 @@ void iscsit_add_cmd_to_response_queue(
        spin_unlock_bh(&conn->response_queue_lock);
 
        wake_up(&conn->queues_wq);
+       return 0;
 }
 
 struct iscsi_queue_req *iscsit_get_cmd_from_response_queue(struct iscsi_conn *conn)
@@ -737,21 +738,23 @@ void iscsit_free_cmd(struct iscsi_cmd *cmd, bool shutdown)
 {
        struct se_cmd *se_cmd = NULL;
        int rc;
+       bool op_scsi = false;
        /*
         * Determine if a struct se_cmd is associated with
         * this struct iscsi_cmd.
         */
        switch (cmd->iscsi_opcode) {
        case ISCSI_OP_SCSI_CMD:
-               se_cmd = &cmd->se_cmd;
-               __iscsit_free_cmd(cmd, true, shutdown);
+               op_scsi = true;
                /*
                 * Fallthrough
                 */
        case ISCSI_OP_SCSI_TMFUNC:
-               rc = transport_generic_free_cmd(&cmd->se_cmd, shutdown);
-               if (!rc && shutdown && se_cmd && se_cmd->se_sess) {
-                       __iscsit_free_cmd(cmd, true, shutdown);
+               se_cmd = &cmd->se_cmd;
+               __iscsit_free_cmd(cmd, op_scsi, shutdown);
+               rc = transport_generic_free_cmd(se_cmd, shutdown);
+               if (!rc && shutdown && se_cmd->se_sess) {
+                       __iscsit_free_cmd(cmd, op_scsi, shutdown);
                        target_put_sess_cmd(se_cmd);
                }
                break;
index 8ff0885..9e4197a 100644 (file)
@@ -31,7 +31,7 @@ extern int iscsit_find_cmd_for_recovery(struct iscsi_session *, struct iscsi_cmd
                        struct iscsi_conn_recovery **, itt_t);
 extern void iscsit_add_cmd_to_immediate_queue(struct iscsi_cmd *, struct iscsi_conn *, u8);
 extern struct iscsi_queue_req *iscsit_get_cmd_from_immediate_queue(struct iscsi_conn *);
-extern void iscsit_add_cmd_to_response_queue(struct iscsi_cmd *, struct iscsi_conn *, u8);
+extern int iscsit_add_cmd_to_response_queue(struct iscsi_cmd *, struct iscsi_conn *, u8);
 extern struct iscsi_queue_req *iscsit_get_cmd_from_response_queue(struct iscsi_conn *);
 extern void iscsit_remove_cmd_from_tx_queues(struct iscsi_cmd *, struct iscsi_conn *);
 extern bool iscsit_conn_all_queues_empty(struct iscsi_conn *);
index fd7c16a..fc4a9c3 100644 (file)
@@ -197,8 +197,7 @@ target_emulate_report_target_port_groups(struct se_cmd *cmd)
                /*
                 * Set the ASYMMETRIC ACCESS State
                 */
-               buf[off++] |= (atomic_read(
-                       &tg_pt_gp->tg_pt_gp_alua_access_state) & 0xff);
+               buf[off++] |= tg_pt_gp->tg_pt_gp_alua_access_state & 0xff;
                /*
                 * Set supported ASYMMETRIC ACCESS State bits
                 */
@@ -710,7 +709,7 @@ target_alua_state_check(struct se_cmd *cmd)
 
        spin_lock(&lun->lun_tg_pt_gp_lock);
        tg_pt_gp = lun->lun_tg_pt_gp;
-       out_alua_state = atomic_read(&tg_pt_gp->tg_pt_gp_alua_access_state);
+       out_alua_state = tg_pt_gp->tg_pt_gp_alua_access_state;
        nonop_delay_msecs = tg_pt_gp->tg_pt_gp_nonop_delay_msecs;
 
        // XXX: keeps using tg_pt_gp witout reference after unlock
@@ -911,7 +910,7 @@ static int core_alua_write_tpg_metadata(
 }
 
 /*
- * Called with tg_pt_gp->tg_pt_gp_md_mutex held
+ * Called with tg_pt_gp->tg_pt_gp_transition_mutex held
  */
 static int core_alua_update_tpg_primary_metadata(
        struct t10_alua_tg_pt_gp *tg_pt_gp)
@@ -934,7 +933,7 @@ static int core_alua_update_tpg_primary_metadata(
                        "alua_access_state=0x%02x\n"
                        "alua_access_status=0x%02x\n",
                        tg_pt_gp->tg_pt_gp_id,
-                       tg_pt_gp->tg_pt_gp_alua_pending_state,
+                       tg_pt_gp->tg_pt_gp_alua_access_state,
                        tg_pt_gp->tg_pt_gp_alua_access_status);
 
        snprintf(path, ALUA_METADATA_PATH_LEN,
@@ -1013,93 +1012,41 @@ static void core_alua_queue_state_change_ua(struct t10_alua_tg_pt_gp *tg_pt_gp)
        spin_unlock(&tg_pt_gp->tg_pt_gp_lock);
 }
 
-static void core_alua_do_transition_tg_pt_work(struct work_struct *work)
-{
-       struct t10_alua_tg_pt_gp *tg_pt_gp = container_of(work,
-               struct t10_alua_tg_pt_gp, tg_pt_gp_transition_work);
-       struct se_device *dev = tg_pt_gp->tg_pt_gp_dev;
-       bool explicit = (tg_pt_gp->tg_pt_gp_alua_access_status ==
-                        ALUA_STATUS_ALTERED_BY_EXPLICIT_STPG);
-
-       /*
-        * Update the ALUA metadata buf that has been allocated in
-        * core_alua_do_port_transition(), this metadata will be written
-        * to struct file.
-        *
-        * Note that there is the case where we do not want to update the
-        * metadata when the saved metadata is being parsed in userspace
-        * when setting the existing port access state and access status.
-        *
-        * Also note that the failure to write out the ALUA metadata to
-        * struct file does NOT affect the actual ALUA transition.
-        */
-       if (tg_pt_gp->tg_pt_gp_write_metadata) {
-               mutex_lock(&tg_pt_gp->tg_pt_gp_md_mutex);
-               core_alua_update_tpg_primary_metadata(tg_pt_gp);
-               mutex_unlock(&tg_pt_gp->tg_pt_gp_md_mutex);
-       }
-       /*
-        * Set the current primary ALUA access state to the requested new state
-        */
-       atomic_set(&tg_pt_gp->tg_pt_gp_alua_access_state,
-                  tg_pt_gp->tg_pt_gp_alua_pending_state);
-
-       pr_debug("Successful %s ALUA transition TG PT Group: %s ID: %hu"
-               " from primary access state %s to %s\n", (explicit) ? "explicit" :
-               "implicit", config_item_name(&tg_pt_gp->tg_pt_gp_group.cg_item),
-               tg_pt_gp->tg_pt_gp_id,
-               core_alua_dump_state(tg_pt_gp->tg_pt_gp_alua_previous_state),
-               core_alua_dump_state(tg_pt_gp->tg_pt_gp_alua_pending_state));
-
-       core_alua_queue_state_change_ua(tg_pt_gp);
-
-       spin_lock(&dev->t10_alua.tg_pt_gps_lock);
-       atomic_dec(&tg_pt_gp->tg_pt_gp_ref_cnt);
-       spin_unlock(&dev->t10_alua.tg_pt_gps_lock);
-
-       if (tg_pt_gp->tg_pt_gp_transition_complete)
-               complete(tg_pt_gp->tg_pt_gp_transition_complete);
-}
-
 static int core_alua_do_transition_tg_pt(
        struct t10_alua_tg_pt_gp *tg_pt_gp,
        int new_state,
        int explicit)
 {
-       struct se_device *dev = tg_pt_gp->tg_pt_gp_dev;
-       DECLARE_COMPLETION_ONSTACK(wait);
+       int prev_state;
 
+       mutex_lock(&tg_pt_gp->tg_pt_gp_transition_mutex);
        /* Nothing to be done here */
-       if (atomic_read(&tg_pt_gp->tg_pt_gp_alua_access_state) == new_state)
+       if (tg_pt_gp->tg_pt_gp_alua_access_state == new_state) {
+               mutex_unlock(&tg_pt_gp->tg_pt_gp_transition_mutex);
                return 0;
+       }
 
-       if (explicit && new_state == ALUA_ACCESS_STATE_TRANSITION)
+       if (explicit && new_state == ALUA_ACCESS_STATE_TRANSITION) {
+               mutex_unlock(&tg_pt_gp->tg_pt_gp_transition_mutex);
                return -EAGAIN;
-
-       /*
-        * Flush any pending transitions
-        */
-       if (!explicit)
-               flush_work(&tg_pt_gp->tg_pt_gp_transition_work);
+       }
 
        /*
         * Save the old primary ALUA access state, and set the current state
         * to ALUA_ACCESS_STATE_TRANSITION.
         */
-       atomic_set(&tg_pt_gp->tg_pt_gp_alua_access_state,
-                       ALUA_ACCESS_STATE_TRANSITION);
+       prev_state = tg_pt_gp->tg_pt_gp_alua_access_state;
+       tg_pt_gp->tg_pt_gp_alua_access_state = ALUA_ACCESS_STATE_TRANSITION;
        tg_pt_gp->tg_pt_gp_alua_access_status = (explicit) ?
                                ALUA_STATUS_ALTERED_BY_EXPLICIT_STPG :
                                ALUA_STATUS_ALTERED_BY_IMPLICIT_ALUA;
 
        core_alua_queue_state_change_ua(tg_pt_gp);
 
-       if (new_state == ALUA_ACCESS_STATE_TRANSITION)
+       if (new_state == ALUA_ACCESS_STATE_TRANSITION) {
+               mutex_unlock(&tg_pt_gp->tg_pt_gp_transition_mutex);
                return 0;
-
-       tg_pt_gp->tg_pt_gp_alua_previous_state =
-               atomic_read(&tg_pt_gp->tg_pt_gp_alua_access_state);
-       tg_pt_gp->tg_pt_gp_alua_pending_state = new_state;
+       }
 
        /*
         * Check for the optional ALUA primary state transition delay
@@ -1108,19 +1055,36 @@ static int core_alua_do_transition_tg_pt(
                msleep_interruptible(tg_pt_gp->tg_pt_gp_trans_delay_msecs);
 
        /*
-        * Take a reference for workqueue item
+        * Set the current primary ALUA access state to the requested new state
         */
-       spin_lock(&dev->t10_alua.tg_pt_gps_lock);
-       atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt);
-       spin_unlock(&dev->t10_alua.tg_pt_gps_lock);
+       tg_pt_gp->tg_pt_gp_alua_access_state = new_state;
 
-       schedule_work(&tg_pt_gp->tg_pt_gp_transition_work);
-       if (explicit) {
-               tg_pt_gp->tg_pt_gp_transition_complete = &wait;
-               wait_for_completion(&wait);
-               tg_pt_gp->tg_pt_gp_transition_complete = NULL;
+       /*
+        * Update the ALUA metadata buf that has been allocated in
+        * core_alua_do_port_transition(), this metadata will be written
+        * to struct file.
+        *
+        * Note that there is the case where we do not want to update the
+        * metadata when the saved metadata is being parsed in userspace
+        * when setting the existing port access state and access status.
+        *
+        * Also note that the failure to write out the ALUA metadata to
+        * struct file does NOT affect the actual ALUA transition.
+        */
+       if (tg_pt_gp->tg_pt_gp_write_metadata) {
+               core_alua_update_tpg_primary_metadata(tg_pt_gp);
        }
 
+       pr_debug("Successful %s ALUA transition TG PT Group: %s ID: %hu"
+               " from primary access state %s to %s\n", (explicit) ? "explicit" :
+               "implicit", config_item_name(&tg_pt_gp->tg_pt_gp_group.cg_item),
+               tg_pt_gp->tg_pt_gp_id,
+               core_alua_dump_state(prev_state),
+               core_alua_dump_state(new_state));
+
+       core_alua_queue_state_change_ua(tg_pt_gp);
+
+       mutex_unlock(&tg_pt_gp->tg_pt_gp_transition_mutex);
        return 0;
 }
 
@@ -1685,14 +1649,12 @@ struct t10_alua_tg_pt_gp *core_alua_allocate_tg_pt_gp(struct se_device *dev,
        }
        INIT_LIST_HEAD(&tg_pt_gp->tg_pt_gp_list);
        INIT_LIST_HEAD(&tg_pt_gp->tg_pt_gp_lun_list);
-       mutex_init(&tg_pt_gp->tg_pt_gp_md_mutex);
+       mutex_init(&tg_pt_gp->tg_pt_gp_transition_mutex);
        spin_lock_init(&tg_pt_gp->tg_pt_gp_lock);
        atomic_set(&tg_pt_gp->tg_pt_gp_ref_cnt, 0);
-       INIT_WORK(&tg_pt_gp->tg_pt_gp_transition_work,
-                 core_alua_do_transition_tg_pt_work);
        tg_pt_gp->tg_pt_gp_dev = dev;
-       atomic_set(&tg_pt_gp->tg_pt_gp_alua_access_state,
-               ALUA_ACCESS_STATE_ACTIVE_OPTIMIZED);
+       tg_pt_gp->tg_pt_gp_alua_access_state =
+                       ALUA_ACCESS_STATE_ACTIVE_OPTIMIZED;
        /*
         * Enable both explicit and implicit ALUA support by default
         */
@@ -1797,8 +1759,6 @@ void core_alua_free_tg_pt_gp(
        dev->t10_alua.alua_tg_pt_gps_counter--;
        spin_unlock(&dev->t10_alua.tg_pt_gps_lock);
 
-       flush_work(&tg_pt_gp->tg_pt_gp_transition_work);
-
        /*
         * Allow a struct t10_alua_tg_pt_gp_member * referenced by
         * core_alua_get_tg_pt_gp_by_name() in
@@ -1938,8 +1898,8 @@ ssize_t core_alua_show_tg_pt_gp_info(struct se_lun *lun, char *page)
                        "Primary Access Status: %s\nTG Port Secondary Access"
                        " State: %s\nTG Port Secondary Access Status: %s\n",
                        config_item_name(tg_pt_ci), tg_pt_gp->tg_pt_gp_id,
-                       core_alua_dump_state(atomic_read(
-                                       &tg_pt_gp->tg_pt_gp_alua_access_state)),
+                       core_alua_dump_state(
+                               tg_pt_gp->tg_pt_gp_alua_access_state),
                        core_alua_dump_status(
                                tg_pt_gp->tg_pt_gp_alua_access_status),
                        atomic_read(&lun->lun_tg_pt_secondary_offline) ?
index 38b5025..70657fd 100644 (file)
@@ -2392,7 +2392,7 @@ static ssize_t target_tg_pt_gp_alua_access_state_show(struct config_item *item,
                char *page)
 {
        return sprintf(page, "%d\n",
-               atomic_read(&to_tg_pt_gp(item)->tg_pt_gp_alua_access_state));
+                      to_tg_pt_gp(item)->tg_pt_gp_alua_access_state);
 }
 
 static ssize_t target_tg_pt_gp_alua_access_state_store(struct config_item *item,
index c754ae3..d2f089c 100644 (file)
@@ -851,7 +851,7 @@ bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib,
        attrib->unmap_granularity = q->limits.discard_granularity / block_size;
        attrib->unmap_granularity_alignment = q->limits.discard_alignment /
                                                                block_size;
-       attrib->unmap_zeroes_data = q->limits.discard_zeroes_data;
+       attrib->unmap_zeroes_data = 0;
        return true;
 }
 EXPORT_SYMBOL(target_configure_unmap_from_queue);
index d8a16ca..d1e6cab 100644 (file)
@@ -92,6 +92,11 @@ static int target_fabric_mappedlun_link(
                pr_err("Source se_lun->lun_se_dev does not exist\n");
                return -EINVAL;
        }
+       if (lun->lun_shutdown) {
+               pr_err("Unable to create mappedlun symlink because"
+                       " lun->lun_shutdown=true\n");
+               return -EINVAL;
+       }
        se_tpg = lun->lun_tpg;
 
        nacl_ci = &lun_acl_ci->ci_parent->ci_group->cg_item;
index 94cda79..a93d94e 100644 (file)
@@ -1008,7 +1008,7 @@ pscsi_execute_cmd(struct se_cmd *cmd)
                req->timeout = PS_TIMEOUT_DISK;
        else
                req->timeout = PS_TIMEOUT_OTHER;
-       req->retries = PS_RETRY;
+       scsi_req(req)->retries = PS_RETRY;
 
        blk_execute_rq_nowait(pdv->pdv_sd->request_queue, NULL, req,
                        (cmd->sam_task_attr == TCM_HEAD_TAG),
@@ -1050,7 +1050,7 @@ static void pscsi_req_done(struct request *req, int uptodate)
        struct se_cmd *cmd = req->end_io_data;
        struct pscsi_plugin_task *pt = cmd->priv;
 
-       pt->pscsi_result = req->errors;
+       pt->pscsi_result = scsi_req(req)->result;
        pt->pscsi_resid = scsi_req(req)->resid_len;
 
        cmd->scsi_status = status_byte(pt->pscsi_result) << 1;
index 6fb1919..dfaef4d 100644 (file)
@@ -642,6 +642,8 @@ void core_tpg_remove_lun(
         */
        struct se_device *dev = rcu_dereference_raw(lun->lun_se_dev);
 
+       lun->lun_shutdown = true;
+
        core_clear_lun_from_tpg(lun, tpg);
        /*
         * Wait for any active I/O references to percpu se_lun->lun_ref to
@@ -663,6 +665,8 @@ void core_tpg_remove_lun(
        }
        if (!(dev->se_hba->hba_flags & HBA_FLAGS_INTERNAL_USE))
                hlist_del_rcu(&lun->link);
+
+       lun->lun_shutdown = false;
        mutex_unlock(&tpg->tpg_lun_mutex);
 
        percpu_ref_exit(&lun->lun_ref);
index b1a3cdb..a0cd56e 100644 (file)
@@ -64,8 +64,9 @@ struct kmem_cache *t10_alua_lba_map_cache;
 struct kmem_cache *t10_alua_lba_map_mem_cache;
 
 static void transport_complete_task_attr(struct se_cmd *cmd);
+static int translate_sense_reason(struct se_cmd *cmd, sense_reason_t reason);
 static void transport_handle_queue_full(struct se_cmd *cmd,
-               struct se_device *dev);
+               struct se_device *dev, int err, bool write_pending);
 static int transport_put_cmd(struct se_cmd *cmd);
 static void target_complete_ok_work(struct work_struct *work);
 
@@ -804,7 +805,8 @@ void target_qf_do_work(struct work_struct *work)
 
                if (cmd->t_state == TRANSPORT_COMPLETE_QF_WP)
                        transport_write_pending_qf(cmd);
-               else if (cmd->t_state == TRANSPORT_COMPLETE_QF_OK)
+               else if (cmd->t_state == TRANSPORT_COMPLETE_QF_OK ||
+                        cmd->t_state == TRANSPORT_COMPLETE_QF_ERR)
                        transport_complete_qf(cmd);
        }
 }
@@ -1719,7 +1721,7 @@ void transport_generic_request_failure(struct se_cmd *cmd,
                }
                trace_target_cmd_complete(cmd);
                ret = cmd->se_tfo->queue_status(cmd);
-               if (ret == -EAGAIN || ret == -ENOMEM)
+               if (ret)
                        goto queue_full;
                goto check_stop;
        default:
@@ -1730,7 +1732,7 @@ void transport_generic_request_failure(struct se_cmd *cmd,
        }
 
        ret = transport_send_check_condition_and_sense(cmd, sense_reason, 0);
-       if (ret == -EAGAIN || ret == -ENOMEM)
+       if (ret)
                goto queue_full;
 
 check_stop:
@@ -1739,8 +1741,7 @@ check_stop:
        return;
 
 queue_full:
-       cmd->t_state = TRANSPORT_COMPLETE_QF_OK;
-       transport_handle_queue_full(cmd, cmd->se_dev);
+       transport_handle_queue_full(cmd, cmd->se_dev, ret, false);
 }
 EXPORT_SYMBOL(transport_generic_request_failure);
 
@@ -1977,13 +1978,29 @@ static void transport_complete_qf(struct se_cmd *cmd)
        int ret = 0;
 
        transport_complete_task_attr(cmd);
+       /*
+        * If a fabric driver ->write_pending() or ->queue_data_in() callback
+        * has returned neither -ENOMEM or -EAGAIN, assume it's fatal and
+        * the same callbacks should not be retried.  Return CHECK_CONDITION
+        * if a scsi_status is not already set.
+        *
+        * If a fabric driver ->queue_status() has returned non zero, always
+        * keep retrying no matter what..
+        */
+       if (cmd->t_state == TRANSPORT_COMPLETE_QF_ERR) {
+               if (cmd->scsi_status)
+                       goto queue_status;
 
-       if (cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) {
-               trace_target_cmd_complete(cmd);
-               ret = cmd->se_tfo->queue_status(cmd);
-               goto out;
+               cmd->se_cmd_flags |= SCF_EMULATED_TASK_SENSE;
+               cmd->scsi_status = SAM_STAT_CHECK_CONDITION;
+               cmd->scsi_sense_length  = TRANSPORT_SENSE_BUFFER;
+               translate_sense_reason(cmd, TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE);
+               goto queue_status;
        }
 
+       if (cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE)
+               goto queue_status;
+
        switch (cmd->data_direction) {
        case DMA_FROM_DEVICE:
                if (cmd->scsi_status)
@@ -2007,19 +2024,33 @@ queue_status:
                break;
        }
 
-out:
        if (ret < 0) {
-               transport_handle_queue_full(cmd, cmd->se_dev);
+               transport_handle_queue_full(cmd, cmd->se_dev, ret, false);
                return;
        }
        transport_lun_remove_cmd(cmd);
        transport_cmd_check_stop_to_fabric(cmd);
 }
 
-static void transport_handle_queue_full(
-       struct se_cmd *cmd,
-       struct se_device *dev)
+static void transport_handle_queue_full(struct se_cmd *cmd, struct se_device *dev,
+                                       int err, bool write_pending)
 {
+       /*
+        * -EAGAIN or -ENOMEM signals retry of ->write_pending() and/or
+        * ->queue_data_in() callbacks from new process context.
+        *
+        * Otherwise for other errors, transport_complete_qf() will send
+        * CHECK_CONDITION via ->queue_status() instead of attempting to
+        * retry associated fabric driver data-transfer callbacks.
+        */
+       if (err == -EAGAIN || err == -ENOMEM) {
+               cmd->t_state = (write_pending) ? TRANSPORT_COMPLETE_QF_WP :
+                                                TRANSPORT_COMPLETE_QF_OK;
+       } else {
+               pr_warn_ratelimited("Got unknown fabric queue status: %d\n", err);
+               cmd->t_state = TRANSPORT_COMPLETE_QF_ERR;
+       }
+
        spin_lock_irq(&dev->qf_cmd_lock);
        list_add_tail(&cmd->se_qf_node, &cmd->se_dev->qf_cmd_list);
        atomic_inc_mb(&dev->dev_qf_count);
@@ -2083,7 +2114,7 @@ static void target_complete_ok_work(struct work_struct *work)
                WARN_ON(!cmd->scsi_status);
                ret = transport_send_check_condition_and_sense(
                                        cmd, 0, 1);
-               if (ret == -EAGAIN || ret == -ENOMEM)
+               if (ret)
                        goto queue_full;
 
                transport_lun_remove_cmd(cmd);
@@ -2109,7 +2140,7 @@ static void target_complete_ok_work(struct work_struct *work)
                } else if (rc) {
                        ret = transport_send_check_condition_and_sense(cmd,
                                                rc, 0);
-                       if (ret == -EAGAIN || ret == -ENOMEM)
+                       if (ret)
                                goto queue_full;
 
                        transport_lun_remove_cmd(cmd);
@@ -2134,7 +2165,7 @@ queue_rsp:
                if (target_read_prot_action(cmd)) {
                        ret = transport_send_check_condition_and_sense(cmd,
                                                cmd->pi_err, 0);
-                       if (ret == -EAGAIN || ret == -ENOMEM)
+                       if (ret)
                                goto queue_full;
 
                        transport_lun_remove_cmd(cmd);
@@ -2144,7 +2175,7 @@ queue_rsp:
 
                trace_target_cmd_complete(cmd);
                ret = cmd->se_tfo->queue_data_in(cmd);
-               if (ret == -EAGAIN || ret == -ENOMEM)
+               if (ret)
                        goto queue_full;
                break;
        case DMA_TO_DEVICE:
@@ -2157,7 +2188,7 @@ queue_rsp:
                        atomic_long_add(cmd->data_length,
                                        &cmd->se_lun->lun_stats.tx_data_octets);
                        ret = cmd->se_tfo->queue_data_in(cmd);
-                       if (ret == -EAGAIN || ret == -ENOMEM)
+                       if (ret)
                                goto queue_full;
                        break;
                }
@@ -2166,7 +2197,7 @@ queue_rsp:
 queue_status:
                trace_target_cmd_complete(cmd);
                ret = cmd->se_tfo->queue_status(cmd);
-               if (ret == -EAGAIN || ret == -ENOMEM)
+               if (ret)
                        goto queue_full;
                break;
        default:
@@ -2180,8 +2211,8 @@ queue_status:
 queue_full:
        pr_debug("Handling complete_ok QUEUE_FULL: se_cmd: %p,"
                " data_direction: %d\n", cmd, cmd->data_direction);
-       cmd->t_state = TRANSPORT_COMPLETE_QF_OK;
-       transport_handle_queue_full(cmd, cmd->se_dev);
+
+       transport_handle_queue_full(cmd, cmd->se_dev, ret, false);
 }
 
 void target_free_sgl(struct scatterlist *sgl, int nents)
@@ -2449,18 +2480,14 @@ transport_generic_new_cmd(struct se_cmd *cmd)
        spin_unlock_irqrestore(&cmd->t_state_lock, flags);
 
        ret = cmd->se_tfo->write_pending(cmd);
-       if (ret == -EAGAIN || ret == -ENOMEM)
+       if (ret)
                goto queue_full;
 
-       /* fabric drivers should only return -EAGAIN or -ENOMEM as error */
-       WARN_ON(ret);
-
-       return (!ret) ? 0 : TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
+       return 0;
 
 queue_full:
        pr_debug("Handling write_pending QUEUE__FULL: se_cmd: %p\n", cmd);
-       cmd->t_state = TRANSPORT_COMPLETE_QF_WP;
-       transport_handle_queue_full(cmd, cmd->se_dev);
+       transport_handle_queue_full(cmd, cmd->se_dev, ret, true);
        return 0;
 }
 EXPORT_SYMBOL(transport_generic_new_cmd);
@@ -2470,10 +2497,10 @@ static void transport_write_pending_qf(struct se_cmd *cmd)
        int ret;
 
        ret = cmd->se_tfo->write_pending(cmd);
-       if (ret == -EAGAIN || ret == -ENOMEM) {
+       if (ret) {
                pr_debug("Handling write_pending QUEUE__FULL: se_cmd: %p\n",
                         cmd);
-               transport_handle_queue_full(cmd, cmd->se_dev);
+               transport_handle_queue_full(cmd, cmd->se_dev, ret, true);
        }
 }
 
@@ -3011,6 +3038,8 @@ static int __transport_check_aborted_status(struct se_cmd *cmd, int send_status)
        __releases(&cmd->t_state_lock)
        __acquires(&cmd->t_state_lock)
 {
+       int ret;
+
        assert_spin_locked(&cmd->t_state_lock);
        WARN_ON_ONCE(!irqs_disabled());
 
@@ -3034,7 +3063,9 @@ static int __transport_check_aborted_status(struct se_cmd *cmd, int send_status)
        trace_target_cmd_complete(cmd);
 
        spin_unlock_irq(&cmd->t_state_lock);
-       cmd->se_tfo->queue_status(cmd);
+       ret = cmd->se_tfo->queue_status(cmd);
+       if (ret)
+               transport_handle_queue_full(cmd, cmd->se_dev, ret, false);
        spin_lock_irq(&cmd->t_state_lock);
 
        return 1;
@@ -3055,6 +3086,7 @@ EXPORT_SYMBOL(transport_check_aborted_status);
 void transport_send_task_abort(struct se_cmd *cmd)
 {
        unsigned long flags;
+       int ret;
 
        spin_lock_irqsave(&cmd->t_state_lock, flags);
        if (cmd->se_cmd_flags & (SCF_SENT_CHECK_CONDITION)) {
@@ -3090,7 +3122,9 @@ send_abort:
                 cmd->t_task_cdb[0], cmd->tag);
 
        trace_target_cmd_complete(cmd);
-       cmd->se_tfo->queue_status(cmd);
+       ret = cmd->se_tfo->queue_status(cmd);
+       if (ret)
+               transport_handle_queue_full(cmd, cmd->se_dev, ret, false);
 }
 
 static void target_tmr_work(struct work_struct *work)
index c6874c3..f615c3b 100644 (file)
@@ -311,24 +311,50 @@ static void free_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd)
                   DATA_BLOCK_BITS);
 }
 
-static void gather_data_area(struct tcmu_dev *udev, unsigned long *cmd_bitmap,
-               struct scatterlist *data_sg, unsigned int data_nents)
+static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
+                            bool bidi)
 {
+       struct se_cmd *se_cmd = cmd->se_cmd;
        int i, block;
        int block_remaining = 0;
        void *from, *to;
        size_t copy_bytes, from_offset;
-       struct scatterlist *sg;
+       struct scatterlist *sg, *data_sg;
+       unsigned int data_nents;
+       DECLARE_BITMAP(bitmap, DATA_BLOCK_BITS);
+
+       bitmap_copy(bitmap, cmd->data_bitmap, DATA_BLOCK_BITS);
+
+       if (!bidi) {
+               data_sg = se_cmd->t_data_sg;
+               data_nents = se_cmd->t_data_nents;
+       } else {
+               uint32_t count;
+
+               /*
+                * For bidi case, the first count blocks are for Data-Out
+                * buffer blocks, and before gathering the Data-In buffer
+                * the Data-Out buffer blocks should be discarded.
+                */
+               count = DIV_ROUND_UP(se_cmd->data_length, DATA_BLOCK_SIZE);
+               while (count--) {
+                       block = find_first_bit(bitmap, DATA_BLOCK_BITS);
+                       clear_bit(block, bitmap);
+               }
+
+               data_sg = se_cmd->t_bidi_data_sg;
+               data_nents = se_cmd->t_bidi_data_nents;
+       }
 
        for_each_sg(data_sg, sg, data_nents, i) {
                int sg_remaining = sg->length;
                to = kmap_atomic(sg_page(sg)) + sg->offset;
                while (sg_remaining > 0) {
                        if (block_remaining == 0) {
-                               block = find_first_bit(cmd_bitmap,
+                               block = find_first_bit(bitmap,
                                                DATA_BLOCK_BITS);
                                block_remaining = DATA_BLOCK_SIZE;
-                               clear_bit(block, cmd_bitmap);
+                               clear_bit(block, bitmap);
                        }
                        copy_bytes = min_t(size_t, sg_remaining,
                                        block_remaining);
@@ -394,6 +420,27 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t d
        return true;
 }
 
+static inline size_t tcmu_cmd_get_data_length(struct tcmu_cmd *tcmu_cmd)
+{
+       struct se_cmd *se_cmd = tcmu_cmd->se_cmd;
+       size_t data_length = round_up(se_cmd->data_length, DATA_BLOCK_SIZE);
+
+       if (se_cmd->se_cmd_flags & SCF_BIDI) {
+               BUG_ON(!(se_cmd->t_bidi_data_sg && se_cmd->t_bidi_data_nents));
+               data_length += round_up(se_cmd->t_bidi_data_sg->length,
+                               DATA_BLOCK_SIZE);
+       }
+
+       return data_length;
+}
+
+static inline uint32_t tcmu_cmd_get_block_cnt(struct tcmu_cmd *tcmu_cmd)
+{
+       size_t data_length = tcmu_cmd_get_data_length(tcmu_cmd);
+
+       return data_length / DATA_BLOCK_SIZE;
+}
+
 static sense_reason_t
 tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
 {
@@ -407,7 +454,7 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
        uint32_t cmd_head;
        uint64_t cdb_off;
        bool copy_to_data_area;
-       size_t data_length;
+       size_t data_length = tcmu_cmd_get_data_length(tcmu_cmd);
        DECLARE_BITMAP(old_bitmap, DATA_BLOCK_BITS);
 
        if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags))
@@ -421,8 +468,7 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
         * expensive to tell how many regions are freed in the bitmap
        */
        base_command_size = max(offsetof(struct tcmu_cmd_entry,
-                               req.iov[se_cmd->t_bidi_data_nents +
-                                       se_cmd->t_data_nents]),
+                               req.iov[tcmu_cmd_get_block_cnt(tcmu_cmd)]),
                                sizeof(struct tcmu_cmd_entry));
        command_size = base_command_size
                + round_up(scsi_command_size(se_cmd->t_task_cdb), TCMU_OP_ALIGN_SIZE);
@@ -433,11 +479,6 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
 
        mb = udev->mb_addr;
        cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */
-       data_length = se_cmd->data_length;
-       if (se_cmd->se_cmd_flags & SCF_BIDI) {
-               BUG_ON(!(se_cmd->t_bidi_data_sg && se_cmd->t_bidi_data_nents));
-               data_length += se_cmd->t_bidi_data_sg->length;
-       }
        if ((command_size > (udev->cmdr_size / 2)) ||
            data_length > udev->data_size) {
                pr_warn("TCMU: Request of size %zu/%zu is too big for %u/%zu "
@@ -511,11 +552,14 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
        entry->req.iov_dif_cnt = 0;
 
        /* Handle BIDI commands */
-       iov_cnt = 0;
-       alloc_and_scatter_data_area(udev, se_cmd->t_bidi_data_sg,
-               se_cmd->t_bidi_data_nents, &iov, &iov_cnt, false);
-       entry->req.iov_bidi_cnt = iov_cnt;
-
+       if (se_cmd->se_cmd_flags & SCF_BIDI) {
+               iov_cnt = 0;
+               iov++;
+               alloc_and_scatter_data_area(udev, se_cmd->t_bidi_data_sg,
+                               se_cmd->t_bidi_data_nents, &iov, &iov_cnt,
+                               false);
+               entry->req.iov_bidi_cnt = iov_cnt;
+       }
        /* cmd's data_bitmap is what changed in process */
        bitmap_xor(tcmu_cmd->data_bitmap, old_bitmap, udev->data_bitmap,
                        DATA_BLOCK_BITS);
@@ -592,19 +636,11 @@ static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *
                               se_cmd->scsi_sense_length);
                free_data_area(udev, cmd);
        } else if (se_cmd->se_cmd_flags & SCF_BIDI) {
-               DECLARE_BITMAP(bitmap, DATA_BLOCK_BITS);
-
                /* Get Data-In buffer before clean up */
-               bitmap_copy(bitmap, cmd->data_bitmap, DATA_BLOCK_BITS);
-               gather_data_area(udev, bitmap,
-                       se_cmd->t_bidi_data_sg, se_cmd->t_bidi_data_nents);
+               gather_data_area(udev, cmd, true);
                free_data_area(udev, cmd);
        } else if (se_cmd->data_direction == DMA_FROM_DEVICE) {
-               DECLARE_BITMAP(bitmap, DATA_BLOCK_BITS);
-
-               bitmap_copy(bitmap, cmd->data_bitmap, DATA_BLOCK_BITS);
-               gather_data_area(udev, bitmap,
-                       se_cmd->t_data_sg, se_cmd->t_data_nents);
+               gather_data_area(udev, cmd, false);
                free_data_area(udev, cmd);
        } else if (se_cmd->data_direction == DMA_TO_DEVICE) {
                free_data_area(udev, cmd);
@@ -1196,11 +1232,6 @@ static ssize_t tcmu_cmd_time_out_store(struct config_item *item, const char *pag
        if (ret < 0)
                return ret;
 
-       if (!val) {
-               pr_err("Illegal value for cmd_time_out\n");
-               return -EINVAL;
-       }
-
        udev->cmd_time_out = val * MSEC_PER_SEC;
        return count;
 }
index 776b343..0a16cf4 100644 (file)
@@ -291,18 +291,6 @@ config ARMADA_THERMAL
          Enable this option if you want to have support for thermal management
          controller present in Armada 370 and Armada XP SoC.
 
-config DB8500_CPUFREQ_COOLING
-       tristate "DB8500 cpufreq cooling"
-       depends on ARCH_U8500 || COMPILE_TEST
-       depends on HAS_IOMEM
-       depends on CPU_THERMAL
-       default y
-       help
-         Adds DB8500 cpufreq cooling devices, and these cooling devices can be
-         bound to thermal zone trip points. When a trip point reached, the
-         bound cpufreq cooling device turns active to set CPU frequency low to
-         cool down the CPU.
-
 config INTEL_POWERCLAMP
        tristate "Intel PowerClamp idle injection driver"
        depends on THERMAL
index 7adae20..c2372f1 100644 (file)
@@ -41,7 +41,6 @@ obj-$(CONFIG_TANGO_THERMAL)   += tango_thermal.o
 obj-$(CONFIG_IMX_THERMAL)      += imx_thermal.o
 obj-$(CONFIG_MAX77620_THERMAL) += max77620_thermal.o
 obj-$(CONFIG_QORIQ_THERMAL)    += qoriq_thermal.o
-obj-$(CONFIG_DB8500_CPUFREQ_COOLING)   += db8500_cpufreq_cooling.o
 obj-$(CONFIG_INTEL_POWERCLAMP) += intel_powerclamp.o
 obj-$(CONFIG_X86_PKG_TEMP_THERMAL)     += x86_pkg_temp_thermal.o
 obj-$(CONFIG_INTEL_SOC_DTS_IOSF_CORE)  += intel_soc_dts_iosf.o
diff --git a/drivers/thermal/db8500_cpufreq_cooling.c b/drivers/thermal/db8500_cpufreq_cooling.c
deleted file mode 100644 (file)
index e58bd0b..0000000
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * db8500_cpufreq_cooling.c - DB8500 cpufreq works as cooling device.
- *
- * Copyright (C) 2012 ST-Ericsson
- * Copyright (C) 2012 Linaro Ltd.
- *
- * Author: Hongbo Zhang <hongbo.zhang@linaro.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#include <linux/cpu_cooling.h>
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/of.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
-
-static int db8500_cpufreq_cooling_probe(struct platform_device *pdev)
-{
-       struct thermal_cooling_device *cdev;
-
-       cdev = cpufreq_cooling_register(cpu_present_mask);
-       if (IS_ERR(cdev)) {
-               int ret = PTR_ERR(cdev);
-
-               if (ret != -EPROBE_DEFER)
-                       dev_err(&pdev->dev,
-                               "Failed to register cooling device %d\n",
-                               ret);
-                               
-               return ret;
-       }
-
-       platform_set_drvdata(pdev, cdev);
-
-       dev_info(&pdev->dev, "Cooling device registered: %s\n", cdev->type);
-
-       return 0;
-}
-
-static int db8500_cpufreq_cooling_remove(struct platform_device *pdev)
-{
-       struct thermal_cooling_device *cdev = platform_get_drvdata(pdev);
-
-       cpufreq_cooling_unregister(cdev);
-
-       return 0;
-}
-
-static int db8500_cpufreq_cooling_suspend(struct platform_device *pdev,
-               pm_message_t state)
-{
-       return -ENOSYS;
-}
-
-static int db8500_cpufreq_cooling_resume(struct platform_device *pdev)
-{
-       return -ENOSYS;
-}
-
-#ifdef CONFIG_OF
-static const struct of_device_id db8500_cpufreq_cooling_match[] = {
-       { .compatible = "stericsson,db8500-cpufreq-cooling" },
-       {},
-};
-MODULE_DEVICE_TABLE(of, db8500_cpufreq_cooling_match);
-#endif
-
-static struct platform_driver db8500_cpufreq_cooling_driver = {
-       .driver = {
-               .name = "db8500-cpufreq-cooling",
-               .of_match_table = of_match_ptr(db8500_cpufreq_cooling_match),
-       },
-       .probe = db8500_cpufreq_cooling_probe,
-       .suspend = db8500_cpufreq_cooling_suspend,
-       .resume = db8500_cpufreq_cooling_resume,
-       .remove = db8500_cpufreq_cooling_remove,
-};
-
-static int __init db8500_cpufreq_cooling_init(void)
-{
-       return platform_driver_register(&db8500_cpufreq_cooling_driver);
-}
-
-static void __exit db8500_cpufreq_cooling_exit(void)
-{
-       platform_driver_unregister(&db8500_cpufreq_cooling_driver);
-}
-
-/* Should be later than db8500_cpufreq_register */
-late_initcall(db8500_cpufreq_cooling_init);
-module_exit(db8500_cpufreq_cooling_exit);
-
-MODULE_AUTHOR("Hongbo Zhang <hongbo.zhang@stericsson.com>");
-MODULE_DESCRIPTION("DB8500 cpufreq cooling driver");
-MODULE_LICENSE("GPL");
index b0500a0..e4603b0 100644 (file)
@@ -492,6 +492,41 @@ static void tty_ldisc_close(struct tty_struct *tty, struct tty_ldisc *ld)
 }
 
 /**
+ *     tty_ldisc_restore       -       helper for tty ldisc change
+ *     @tty: tty to recover
+ *     @old: previous ldisc
+ *
+ *     Restore the previous line discipline or N_TTY when a line discipline
+ *     change fails due to an open error
+ */
+
+static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old)
+{
+       struct tty_ldisc *new_ldisc;
+       int r;
+
+       /* There is an outstanding reference here so this is safe */
+       old = tty_ldisc_get(tty, old->ops->num);
+       WARN_ON(IS_ERR(old));
+       tty->ldisc = old;
+       tty_set_termios_ldisc(tty, old->ops->num);
+       if (tty_ldisc_open(tty, old) < 0) {
+               tty_ldisc_put(old);
+               /* This driver is always present */
+               new_ldisc = tty_ldisc_get(tty, N_TTY);
+               if (IS_ERR(new_ldisc))
+                       panic("n_tty: get");
+               tty->ldisc = new_ldisc;
+               tty_set_termios_ldisc(tty, N_TTY);
+               r = tty_ldisc_open(tty, new_ldisc);
+               if (r < 0)
+                       panic("Couldn't open N_TTY ldisc for "
+                             "%s --- error %d.",
+                             tty_name(tty), r);
+       }
+}
+
+/**
  *     tty_set_ldisc           -       set line discipline
  *     @tty: the terminal to set
  *     @ldisc: the line discipline
@@ -504,7 +539,12 @@ static void tty_ldisc_close(struct tty_struct *tty, struct tty_ldisc *ld)
 
 int tty_set_ldisc(struct tty_struct *tty, int disc)
 {
-       int retval, old_disc;
+       int retval;
+       struct tty_ldisc *old_ldisc, *new_ldisc;
+
+       new_ldisc = tty_ldisc_get(tty, disc);
+       if (IS_ERR(new_ldisc))
+               return PTR_ERR(new_ldisc);
 
        tty_lock(tty);
        retval = tty_ldisc_lock(tty, 5 * HZ);
@@ -517,8 +557,7 @@ int tty_set_ldisc(struct tty_struct *tty, int disc)
        }
 
        /* Check the no-op case */
-       old_disc = tty->ldisc->ops->num;
-       if (old_disc == disc)
+       if (tty->ldisc->ops->num == disc)
                goto out;
 
        if (test_bit(TTY_HUPPED, &tty->flags)) {
@@ -527,25 +566,34 @@ int tty_set_ldisc(struct tty_struct *tty, int disc)
                goto out;
        }
 
-       retval = tty_ldisc_reinit(tty, disc);
+       old_ldisc = tty->ldisc;
+
+       /* Shutdown the old discipline. */
+       tty_ldisc_close(tty, old_ldisc);
+
+       /* Now set up the new line discipline. */
+       tty->ldisc = new_ldisc;
+       tty_set_termios_ldisc(tty, disc);
+
+       retval = tty_ldisc_open(tty, new_ldisc);
        if (retval < 0) {
                /* Back to the old one or N_TTY if we can't */
-               if (tty_ldisc_reinit(tty, old_disc) < 0) {
-                       pr_err("tty: TIOCSETD failed, reinitializing N_TTY\n");
-                       if (tty_ldisc_reinit(tty, N_TTY) < 0) {
-                               /* At this point we have tty->ldisc == NULL. */
-                               pr_err("tty: reinitializing N_TTY failed\n");
-                       }
-               }
+               tty_ldisc_put(new_ldisc);
+               tty_ldisc_restore(tty, old_ldisc);
        }
 
-       if (tty->ldisc && tty->ldisc->ops->num != old_disc &&
-           tty->ops->set_ldisc) {
+       if (tty->ldisc->ops->num != old_ldisc->ops->num && tty->ops->set_ldisc) {
                down_read(&tty->termios_rwsem);
                tty->ops->set_ldisc(tty);
                up_read(&tty->termios_rwsem);
        }
 
+       /* At this point we hold a reference to the new ldisc and a
+          reference to the old ldisc, or we hold two references to
+          the old ldisc (if it was restored as part of error cleanup
+          above). In either case, releasing a single reference from
+          the old ldisc is correct. */
+       new_ldisc = old_ldisc;
 out:
        tty_ldisc_unlock(tty);
 
@@ -553,6 +601,7 @@ out:
           already running */
        tty_buffer_restart_work(tty->port);
 err:
+       tty_ldisc_put(new_ldisc);       /* drop the extra reference */
        tty_unlock(tty);
        return retval;
 }
@@ -613,8 +662,10 @@ int tty_ldisc_reinit(struct tty_struct *tty, int disc)
        int retval;
 
        ld = tty_ldisc_get(tty, disc);
-       if (IS_ERR(ld))
+       if (IS_ERR(ld)) {
+               BUG_ON(disc == N_TTY);
                return PTR_ERR(ld);
+       }
 
        if (tty->ldisc) {
                tty_ldisc_close(tty, tty->ldisc);
@@ -626,8 +677,10 @@ int tty_ldisc_reinit(struct tty_struct *tty, int disc)
        tty_set_termios_ldisc(tty, disc);
        retval = tty_ldisc_open(tty, tty->ldisc);
        if (retval) {
-               tty_ldisc_put(tty->ldisc);
-               tty->ldisc = NULL;
+               if (!WARN_ON(disc == N_TTY)) {
+                       tty_ldisc_put(tty->ldisc);
+                       tty->ldisc = NULL;
+               }
        }
        return retval;
 }
index d235113..a82e2bd 100644 (file)
@@ -373,7 +373,7 @@ static void bot_cleanup_old_alt(struct f_uas *fu)
        usb_ep_free_request(fu->ep_in, fu->bot_req_in);
        usb_ep_free_request(fu->ep_out, fu->bot_req_out);
        usb_ep_free_request(fu->ep_out, fu->cmd.req);
-       usb_ep_free_request(fu->ep_out, fu->bot_status.req);
+       usb_ep_free_request(fu->ep_in, fu->bot_status.req);
 
        kfree(fu->cmd.buf);
 
index d7efcb6..002f1ce 100644 (file)
@@ -297,14 +297,15 @@ static int pwm_backlight_probe(struct platform_device *pdev)
        }
 
        /*
-        * If the GPIO is configured as input, change the direction to output
-        * and set the GPIO as active.
+        * If the GPIO is not known to be already configured as output, that
+        * is, if gpiod_get_direction returns either GPIOF_DIR_IN or -EINVAL,
+        * change the direction to output and set the GPIO as active.
         * Do not force the GPIO to active when it was already output as it
         * could cause backlight flickering or we would enable the backlight too
         * early. Leave the decision of the initial backlight state for later.
         */
        if (pb->enable_gpio &&
-           gpiod_get_direction(pb->enable_gpio) == GPIOF_DIR_IN)
+           gpiod_get_direction(pb->enable_gpio) != GPIOF_DIR_OUT)
                gpiod_direction_output(pb->enable_gpio, 1);
 
        pb->power_supply = devm_regulator_get(&pdev->dev, "power");
index 8c4dc1e..b827a81 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/efi.h>
 #include <linux/errno.h>
 #include <linux/fb.h>
+#include <linux/pci.h>
 #include <linux/platform_device.h>
 #include <linux/screen_info.h>
 #include <video/vga.h>
@@ -143,6 +144,8 @@ static struct attribute *efifb_attrs[] = {
 };
 ATTRIBUTE_GROUPS(efifb);
 
+static bool pci_dev_disabled;  /* FB base matches BAR of a disabled device */
+
 static int efifb_probe(struct platform_device *dev)
 {
        struct fb_info *info;
@@ -152,7 +155,7 @@ static int efifb_probe(struct platform_device *dev)
        unsigned int size_total;
        char *option = NULL;
 
-       if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI)
+       if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI || pci_dev_disabled)
                return -ENODEV;
 
        if (fb_get_options("efifb", &option))
@@ -360,3 +363,64 @@ static struct platform_driver efifb_driver = {
 };
 
 builtin_platform_driver(efifb_driver);
+
+#if defined(CONFIG_PCI) && !defined(CONFIG_X86)
+
+static bool pci_bar_found;     /* did we find a BAR matching the efifb base? */
+
+static void claim_efifb_bar(struct pci_dev *dev, int idx)
+{
+       u16 word;
+
+       pci_bar_found = true;
+
+       pci_read_config_word(dev, PCI_COMMAND, &word);
+       if (!(word & PCI_COMMAND_MEMORY)) {
+               pci_dev_disabled = true;
+               dev_err(&dev->dev,
+                       "BAR %d: assigned to efifb but device is disabled!\n",
+                       idx);
+               return;
+       }
+
+       if (pci_claim_resource(dev, idx)) {
+               pci_dev_disabled = true;
+               dev_err(&dev->dev,
+                       "BAR %d: failed to claim resource for efifb!\n", idx);
+               return;
+       }
+
+       dev_info(&dev->dev, "BAR %d: assigned to efifb\n", idx);
+}
+
+static void efifb_fixup_resources(struct pci_dev *dev)
+{
+       u64 base = screen_info.lfb_base;
+       u64 size = screen_info.lfb_size;
+       int i;
+
+       if (pci_bar_found || screen_info.orig_video_isVGA != VIDEO_TYPE_EFI)
+               return;
+
+       if (screen_info.capabilities & VIDEO_CAPABILITY_64BIT_BASE)
+               base |= (u64)screen_info.ext_lfb_base << 32;
+
+       if (!base)
+               return;
+
+       for (i = 0; i < PCI_STD_RESOURCE_END; i++) {
+               struct resource *res = &dev->resource[i];
+
+               if (!(res->flags & IORESOURCE_MEM))
+                       continue;
+
+               if (res->start <= base && res->end >= base + size - 1) {
+                       claim_efifb_bar(dev, i);
+                       break;
+               }
+       }
+}
+DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_ANY_ID, PCI_ANY_ID, PCI_BASE_CLASS_DISPLAY,
+                              16, efifb_fixup_resources);
+
+#endif
index 1abba07..f4cbfb3 100644 (file)
@@ -1608,19 +1608,6 @@ static int omapfb_find_ctrl(struct omapfb_device *fbdev)
        return 0;
 }
 
-static void check_required_callbacks(struct omapfb_device *fbdev)
-{
-#define _C(x) (fbdev->ctrl->x != NULL)
-#define _P(x) (fbdev->panel->x != NULL)
-       BUG_ON(fbdev->ctrl == NULL || fbdev->panel == NULL);
-       BUG_ON(!(_C(init) && _C(cleanup) && _C(get_caps) &&
-                _C(set_update_mode) && _C(setup_plane) && _C(enable_plane) &&
-                _P(init) && _P(cleanup) && _P(enable) && _P(disable) &&
-                _P(get_caps)));
-#undef _P
-#undef _C
-}
-
 /*
  * Called by LDM binding to probe and attach a new device.
  * Initialization sequence:
@@ -1705,8 +1692,6 @@ static int omapfb_do_probe(struct platform_device *pdev,
                omapfb_ops.fb_mmap = omapfb_mmap;
        init_state++;
 
-       check_required_callbacks(fbdev);
-
        r = planes_init(fbdev);
        if (r)
                goto cleanup;
index bd017b5..f599520 100644 (file)
@@ -578,10 +578,14 @@ static int ssd1307fb_probe(struct i2c_client *client,
 
        par->vbat_reg = devm_regulator_get_optional(&client->dev, "vbat");
        if (IS_ERR(par->vbat_reg)) {
-               dev_err(&client->dev, "failed to get VBAT regulator: %ld\n",
-                       PTR_ERR(par->vbat_reg));
                ret = PTR_ERR(par->vbat_reg);
-               goto fb_alloc_error;
+               if (ret == -ENODEV) {
+                       par->vbat_reg = NULL;
+               } else {
+                       dev_err(&client->dev, "failed to get VBAT regulator: %d\n",
+                               ret);
+                       goto fb_alloc_error;
+               }
        }
 
        if (of_property_read_u32(node, "solomon,width", &par->width))
@@ -668,10 +672,13 @@ static int ssd1307fb_probe(struct i2c_client *client,
                udelay(4);
        }
 
-       ret = regulator_enable(par->vbat_reg);
-       if (ret) {
-               dev_err(&client->dev, "failed to enable VBAT: %d\n", ret);
-               goto reset_oled_error;
+       if (par->vbat_reg) {
+               ret = regulator_enable(par->vbat_reg);
+               if (ret) {
+                       dev_err(&client->dev, "failed to enable VBAT: %d\n",
+                               ret);
+                       goto reset_oled_error;
+               }
        }
 
        ret = ssd1307fb_init(par);
@@ -710,7 +717,8 @@ panel_init_error:
                pwm_put(par->pwm);
        };
 regulator_enable_error:
-       regulator_disable(par->vbat_reg);
+       if (par->vbat_reg)
+               regulator_disable(par->vbat_reg);
 reset_oled_error:
        fb_deferred_io_cleanup(info);
 fb_alloc_error:
index d0115a7..3ee309c 100644 (file)
@@ -643,7 +643,6 @@ static void xenfb_backend_changed(struct xenbus_device *dev,
                break;
 
        case XenbusStateInitWait:
-InitWait:
                xenbus_switch_state(dev, XenbusStateConnected);
                break;
 
@@ -654,7 +653,8 @@ InitWait:
                 * get Connected twice here.
                 */
                if (dev->state != XenbusStateConnected)
-                       goto InitWait; /* no InitWait seen yet, fudge it */
+                       /* no InitWait seen yet, fudge it */
+                       xenbus_switch_state(dev, XenbusStateConnected);
 
                if (xenbus_read_unsigned(info->xbdev->otherend,
                                         "request-update", 0))
index 400d70b..48230a5 100644 (file)
@@ -232,6 +232,12 @@ static int virtio_dev_probe(struct device *_d)
                if (device_features & (1ULL << i))
                        __virtio_set_bit(dev, i);
 
+       if (drv->validate) {
+               err = drv->validate(dev);
+               if (err)
+                       goto err;
+       }
+
        err = virtio_finalize_features(dev);
        if (err)
                goto err;
index 5905349..698d5d0 100644 (file)
@@ -33,8 +33,10 @@ void vp_synchronize_vectors(struct virtio_device *vdev)
        struct virtio_pci_device *vp_dev = to_vp_device(vdev);
        int i;
 
-       synchronize_irq(pci_irq_vector(vp_dev->pci_dev, 0));
-       for (i = 1; i < vp_dev->msix_vectors; i++)
+       if (vp_dev->intx_enabled)
+               synchronize_irq(vp_dev->pci_dev->irq);
+
+       for (i = 0; i < vp_dev->msix_vectors; ++i)
                synchronize_irq(pci_irq_vector(vp_dev->pci_dev, i));
 }
 
@@ -60,13 +62,16 @@ static irqreturn_t vp_config_changed(int irq, void *opaque)
 static irqreturn_t vp_vring_interrupt(int irq, void *opaque)
 {
        struct virtio_pci_device *vp_dev = opaque;
+       struct virtio_pci_vq_info *info;
        irqreturn_t ret = IRQ_NONE;
-       struct virtqueue *vq;
+       unsigned long flags;
 
-       list_for_each_entry(vq, &vp_dev->vdev.vqs, list) {
-               if (vq->callback && vring_interrupt(irq, vq) == IRQ_HANDLED)
+       spin_lock_irqsave(&vp_dev->lock, flags);
+       list_for_each_entry(info, &vp_dev->virtqueues, node) {
+               if (vring_interrupt(irq, info->vq) == IRQ_HANDLED)
                        ret = IRQ_HANDLED;
        }
+       spin_unlock_irqrestore(&vp_dev->lock, flags);
 
        return ret;
 }
@@ -97,186 +102,244 @@ static irqreturn_t vp_interrupt(int irq, void *opaque)
        return vp_vring_interrupt(irq, opaque);
 }
 
-static void vp_remove_vqs(struct virtio_device *vdev)
+static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
+                                  bool per_vq_vectors, struct irq_affinity *desc)
 {
        struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-       struct virtqueue *vq, *n;
+       const char *name = dev_name(&vp_dev->vdev.dev);
+       unsigned i, v;
+       int err = -ENOMEM;
 
-       list_for_each_entry_safe(vq, n, &vdev->vqs, list) {
-               if (vp_dev->msix_vector_map) {
-                       int v = vp_dev->msix_vector_map[vq->index];
+       vp_dev->msix_vectors = nvectors;
 
-                       if (v != VIRTIO_MSI_NO_VECTOR)
-                               free_irq(pci_irq_vector(vp_dev->pci_dev, v),
-                                       vq);
-               }
-               vp_dev->del_vq(vq);
+       vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names,
+                                    GFP_KERNEL);
+       if (!vp_dev->msix_names)
+               goto error;
+       vp_dev->msix_affinity_masks
+               = kzalloc(nvectors * sizeof *vp_dev->msix_affinity_masks,
+                         GFP_KERNEL);
+       if (!vp_dev->msix_affinity_masks)
+               goto error;
+       for (i = 0; i < nvectors; ++i)
+               if (!alloc_cpumask_var(&vp_dev->msix_affinity_masks[i],
+                                       GFP_KERNEL))
+                       goto error;
+
+       err = pci_alloc_irq_vectors_affinity(vp_dev->pci_dev, nvectors,
+                                            nvectors, PCI_IRQ_MSIX |
+                                            (desc ? PCI_IRQ_AFFINITY : 0),
+                                            desc);
+       if (err < 0)
+               goto error;
+       vp_dev->msix_enabled = 1;
+
+       /* Set the vector used for configuration */
+       v = vp_dev->msix_used_vectors;
+       snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
+                "%s-config", name);
+       err = request_irq(pci_irq_vector(vp_dev->pci_dev, v),
+                         vp_config_changed, 0, vp_dev->msix_names[v],
+                         vp_dev);
+       if (err)
+               goto error;
+       ++vp_dev->msix_used_vectors;
+
+       v = vp_dev->config_vector(vp_dev, v);
+       /* Verify we had enough resources to assign the vector */
+       if (v == VIRTIO_MSI_NO_VECTOR) {
+               err = -EBUSY;
+               goto error;
        }
+
+       if (!per_vq_vectors) {
+               /* Shared vector for all VQs */
+               v = vp_dev->msix_used_vectors;
+               snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
+                        "%s-virtqueues", name);
+               err = request_irq(pci_irq_vector(vp_dev->pci_dev, v),
+                                 vp_vring_interrupt, 0, vp_dev->msix_names[v],
+                                 vp_dev);
+               if (err)
+                       goto error;
+               ++vp_dev->msix_used_vectors;
+       }
+       return 0;
+error:
+       return err;
+}
+
+static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned index,
+                                    void (*callback)(struct virtqueue *vq),
+                                    const char *name,
+                                    u16 msix_vec)
+{
+       struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+       struct virtio_pci_vq_info *info = kmalloc(sizeof *info, GFP_KERNEL);
+       struct virtqueue *vq;
+       unsigned long flags;
+
+       /* fill out our structure that represents an active queue */
+       if (!info)
+               return ERR_PTR(-ENOMEM);
+
+       vq = vp_dev->setup_vq(vp_dev, info, index, callback, name,
+                             msix_vec);
+       if (IS_ERR(vq))
+               goto out_info;
+
+       info->vq = vq;
+       if (callback) {
+               spin_lock_irqsave(&vp_dev->lock, flags);
+               list_add(&info->node, &vp_dev->virtqueues);
+               spin_unlock_irqrestore(&vp_dev->lock, flags);
+       } else {
+               INIT_LIST_HEAD(&info->node);
+       }
+
+       vp_dev->vqs[index] = info;
+       return vq;
+
+out_info:
+       kfree(info);
+       return vq;
+}
+
+static void vp_del_vq(struct virtqueue *vq)
+{
+       struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
+       struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index];
+       unsigned long flags;
+
+       spin_lock_irqsave(&vp_dev->lock, flags);
+       list_del(&info->node);
+       spin_unlock_irqrestore(&vp_dev->lock, flags);
+
+       vp_dev->del_vq(info);
+       kfree(info);
 }
 
 /* the config->del_vqs() implementation */
 void vp_del_vqs(struct virtio_device *vdev)
 {
        struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+       struct virtqueue *vq, *n;
        int i;
 
-       if (WARN_ON_ONCE(list_empty_careful(&vdev->vqs)))
-               return;
+       list_for_each_entry_safe(vq, n, &vdev->vqs, list) {
+               if (vp_dev->per_vq_vectors) {
+                       int v = vp_dev->vqs[vq->index]->msix_vector;
 
-       vp_remove_vqs(vdev);
+                       if (v != VIRTIO_MSI_NO_VECTOR) {
+                               int irq = pci_irq_vector(vp_dev->pci_dev, v);
+
+                               irq_set_affinity_hint(irq, NULL);
+                               free_irq(irq, vq);
+                       }
+               }
+               vp_del_vq(vq);
+       }
+       vp_dev->per_vq_vectors = false;
+
+       if (vp_dev->intx_enabled) {
+               free_irq(vp_dev->pci_dev->irq, vp_dev);
+               vp_dev->intx_enabled = 0;
+       }
 
-       if (vp_dev->pci_dev->msix_enabled) {
-               for (i = 0; i < vp_dev->msix_vectors; i++)
+       for (i = 0; i < vp_dev->msix_used_vectors; ++i)
+               free_irq(pci_irq_vector(vp_dev->pci_dev, i), vp_dev);
+
+       for (i = 0; i < vp_dev->msix_vectors; i++)
+               if (vp_dev->msix_affinity_masks[i])
                        free_cpumask_var(vp_dev->msix_affinity_masks[i]);
 
+       if (vp_dev->msix_enabled) {
                /* Disable the vector used for configuration */
                vp_dev->config_vector(vp_dev, VIRTIO_MSI_NO_VECTOR);
 
-               kfree(vp_dev->msix_affinity_masks);
-               kfree(vp_dev->msix_names);
-               kfree(vp_dev->msix_vector_map);
+               pci_free_irq_vectors(vp_dev->pci_dev);
+               vp_dev->msix_enabled = 0;
        }
 
-       free_irq(pci_irq_vector(vp_dev->pci_dev, 0), vp_dev);
-       pci_free_irq_vectors(vp_dev->pci_dev);
+       vp_dev->msix_vectors = 0;
+       vp_dev->msix_used_vectors = 0;
+       kfree(vp_dev->msix_names);
+       vp_dev->msix_names = NULL;
+       kfree(vp_dev->msix_affinity_masks);
+       vp_dev->msix_affinity_masks = NULL;
+       kfree(vp_dev->vqs);
+       vp_dev->vqs = NULL;
 }
 
 static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs,
                struct virtqueue *vqs[], vq_callback_t *callbacks[],
-               const char * const names[], struct irq_affinity *desc)
+               const char * const names[], bool per_vq_vectors,
+               struct irq_affinity *desc)
 {
        struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-       const char *name = dev_name(&vp_dev->vdev.dev);
-       int i, j, err = -ENOMEM, allocated_vectors, nvectors;
-       unsigned flags = PCI_IRQ_MSIX;
-       bool shared = false;
        u16 msix_vec;
+       int i, err, nvectors, allocated_vectors;
 
-       if (desc) {
-               flags |= PCI_IRQ_AFFINITY;
-               desc->pre_vectors++; /* virtio config vector */
-       }
-
-       nvectors = 1;
-       for (i = 0; i < nvqs; i++)
-               if (callbacks[i])
-                       nvectors++;
-
-       /* Try one vector per queue first. */
-       err = pci_alloc_irq_vectors_affinity(vp_dev->pci_dev, nvectors,
-                       nvectors, flags, desc);
-       if (err < 0) {
-               /* Fallback to one vector for config, one shared for queues. */
-               shared = true;
-               err = pci_alloc_irq_vectors(vp_dev->pci_dev, 2, 2,
-                               PCI_IRQ_MSIX);
-               if (err < 0)
-                       return err;
-       }
-       if (err < 0)
-               return err;
-
-       vp_dev->msix_vectors = nvectors;
-       vp_dev->msix_names = kmalloc_array(nvectors,
-                       sizeof(*vp_dev->msix_names), GFP_KERNEL);
-       if (!vp_dev->msix_names)
-               goto out_free_irq_vectors;
-
-       vp_dev->msix_affinity_masks = kcalloc(nvectors,
-                       sizeof(*vp_dev->msix_affinity_masks), GFP_KERNEL);
-       if (!vp_dev->msix_affinity_masks)
-               goto out_free_msix_names;
+       vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL);
+       if (!vp_dev->vqs)
+               return -ENOMEM;
 
-       for (i = 0; i < nvectors; ++i) {
-               if (!alloc_cpumask_var(&vp_dev->msix_affinity_masks[i],
-                               GFP_KERNEL))
-                       goto out_free_msix_affinity_masks;
+       if (per_vq_vectors) {
+               /* Best option: one for change interrupt, one per vq. */
+               nvectors = 1;
+               for (i = 0; i < nvqs; ++i)
+                       if (callbacks[i])
+                               ++nvectors;
+       } else {
+               /* Second best: one for change, shared for all vqs. */
+               nvectors = 2;
        }
 
-       /* Set the vector used for configuration */
-       snprintf(vp_dev->msix_names[0], sizeof(*vp_dev->msix_names),
-                "%s-config", name);
-       err = request_irq(pci_irq_vector(vp_dev->pci_dev, 0), vp_config_changed,
-                       0, vp_dev->msix_names[0], vp_dev);
+       err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors,
+                                     per_vq_vectors ? desc : NULL);
        if (err)
-               goto out_free_msix_affinity_masks;
+               goto error_find;
 
-       /* Verify we had enough resources to assign the vector */
-       if (vp_dev->config_vector(vp_dev, 0) == VIRTIO_MSI_NO_VECTOR) {
-               err = -EBUSY;
-               goto out_free_config_irq;
-       }
-
-       vp_dev->msix_vector_map = kmalloc_array(nvqs,
-                       sizeof(*vp_dev->msix_vector_map), GFP_KERNEL);
-       if (!vp_dev->msix_vector_map)
-               goto out_disable_config_irq;
-
-       allocated_vectors = j = 1; /* vector 0 is the config interrupt */
+       vp_dev->per_vq_vectors = per_vq_vectors;
+       allocated_vectors = vp_dev->msix_used_vectors;
        for (i = 0; i < nvqs; ++i) {
                if (!names[i]) {
                        vqs[i] = NULL;
                        continue;
                }
 
-               if (callbacks[i])
-                       msix_vec = allocated_vectors;
-               else
+               if (!callbacks[i])
                        msix_vec = VIRTIO_MSI_NO_VECTOR;
-
-               vqs[i] = vp_dev->setup_vq(vp_dev, i, callbacks[i], names[i],
-                               msix_vec);
+               else if (vp_dev->per_vq_vectors)
+                       msix_vec = allocated_vectors++;
+               else
+                       msix_vec = VP_MSIX_VQ_VECTOR;
+               vqs[i] = vp_setup_vq(vdev, i, callbacks[i], names[i],
+                                    msix_vec);
                if (IS_ERR(vqs[i])) {
                        err = PTR_ERR(vqs[i]);
-                       goto out_remove_vqs;
+                       goto error_find;
                }
 
-               if (msix_vec == VIRTIO_MSI_NO_VECTOR) {
-                       vp_dev->msix_vector_map[i] = VIRTIO_MSI_NO_VECTOR;
+               if (!vp_dev->per_vq_vectors || msix_vec == VIRTIO_MSI_NO_VECTOR)
                        continue;
-               }
 
-               snprintf(vp_dev->msix_names[j],
-                        sizeof(*vp_dev->msix_names), "%s-%s",
+               /* allocate per-vq irq if available and necessary */
+               snprintf(vp_dev->msix_names[msix_vec],
+                        sizeof *vp_dev->msix_names,
+                        "%s-%s",
                         dev_name(&vp_dev->vdev.dev), names[i]);
                err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec),
-                                 vring_interrupt, IRQF_SHARED,
-                                 vp_dev->msix_names[j], vqs[i]);
-               if (err) {
-                       /* don't free this irq on error */
-                       vp_dev->msix_vector_map[i] = VIRTIO_MSI_NO_VECTOR;
-                       goto out_remove_vqs;
-               }
-               vp_dev->msix_vector_map[i] = msix_vec;
-               j++;
-
-               /*
-                * Use a different vector for each queue if they are available,
-                * else share the same vector for all VQs.
-                */
-               if (!shared)
-                       allocated_vectors++;
+                                 vring_interrupt, 0,
+                                 vp_dev->msix_names[msix_vec],
+                                 vqs[i]);
+               if (err)
+                       goto error_find;
        }
-
        return 0;
 
-out_remove_vqs:
-       vp_remove_vqs(vdev);
-       kfree(vp_dev->msix_vector_map);
-out_disable_config_irq:
-       vp_dev->config_vector(vp_dev, VIRTIO_MSI_NO_VECTOR);
-out_free_config_irq:
-       free_irq(pci_irq_vector(vp_dev->pci_dev, 0), vp_dev);
-out_free_msix_affinity_masks:
-       for (i = 0; i < nvectors; i++) {
-               if (vp_dev->msix_affinity_masks[i])
-                       free_cpumask_var(vp_dev->msix_affinity_masks[i]);
-       }
-       kfree(vp_dev->msix_affinity_masks);
-out_free_msix_names:
-       kfree(vp_dev->msix_names);
-out_free_irq_vectors:
-       pci_free_irq_vectors(vp_dev->pci_dev);
+error_find:
+       vp_del_vqs(vdev);
        return err;
 }
 
@@ -287,29 +350,33 @@ static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned nvqs,
        struct virtio_pci_device *vp_dev = to_vp_device(vdev);
        int i, err;
 
+       vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL);
+       if (!vp_dev->vqs)
+               return -ENOMEM;
+
        err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED,
                        dev_name(&vdev->dev), vp_dev);
        if (err)
-               return err;
+               goto out_del_vqs;
 
+       vp_dev->intx_enabled = 1;
+       vp_dev->per_vq_vectors = false;
        for (i = 0; i < nvqs; ++i) {
                if (!names[i]) {
                        vqs[i] = NULL;
                        continue;
                }
-               vqs[i] = vp_dev->setup_vq(vp_dev, i, callbacks[i], names[i],
-                               VIRTIO_MSI_NO_VECTOR);
+               vqs[i] = vp_setup_vq(vdev, i, callbacks[i], names[i],
+                                    VIRTIO_MSI_NO_VECTOR);
                if (IS_ERR(vqs[i])) {
                        err = PTR_ERR(vqs[i]);
-                       goto out_remove_vqs;
+                       goto out_del_vqs;
                }
        }
 
        return 0;
-
-out_remove_vqs:
-       vp_remove_vqs(vdev);
-       free_irq(pci_irq_vector(vp_dev->pci_dev, 0), vp_dev);
+out_del_vqs:
+       vp_del_vqs(vdev);
        return err;
 }
 
@@ -320,9 +387,15 @@ int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 {
        int err;
 
-       err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, desc);
+       /* Try MSI-X with one vector per queue. */
+       err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, true, desc);
        if (!err)
                return 0;
+       /* Fallback: MSI-X with one vector for config, one shared for queues. */
+       err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, false, desc);
+       if (!err)
+               return 0;
+       /* Finally fall back to regular interrupts. */
        return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names);
 }
 
@@ -342,15 +415,16 @@ int vp_set_vq_affinity(struct virtqueue *vq, int cpu)
 {
        struct virtio_device *vdev = vq->vdev;
        struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+       struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index];
+       struct cpumask *mask;
+       unsigned int irq;
 
        if (!vq->callback)
                return -EINVAL;
 
-       if (vp_dev->pci_dev->msix_enabled) {
-               int vec = vp_dev->msix_vector_map[vq->index];
-               struct cpumask *mask = vp_dev->msix_affinity_masks[vec];
-               unsigned int irq = pci_irq_vector(vp_dev->pci_dev, vec);
-
+       if (vp_dev->msix_enabled) {
+               mask = vp_dev->msix_affinity_masks[info->msix_vector];
+               irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector);
                if (cpu == -1)
                        irq_set_affinity_hint(irq, NULL);
                else {
@@ -365,12 +439,13 @@ int vp_set_vq_affinity(struct virtqueue *vq, int cpu)
 const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index)
 {
        struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-       unsigned int *map = vp_dev->msix_vector_map;
 
-       if (!map || map[index] == VIRTIO_MSI_NO_VECTOR)
+       if (!vp_dev->per_vq_vectors ||
+           vp_dev->vqs[index]->msix_vector == VIRTIO_MSI_NO_VECTOR)
                return NULL;
 
-       return pci_irq_get_affinity(vp_dev->pci_dev, map[index]);
+       return pci_irq_get_affinity(vp_dev->pci_dev,
+                                   vp_dev->vqs[index]->msix_vector);
 }
 
 #ifdef CONFIG_PM_SLEEP
@@ -441,6 +516,8 @@ static int virtio_pci_probe(struct pci_dev *pci_dev,
        vp_dev->vdev.dev.parent = &pci_dev->dev;
        vp_dev->vdev.dev.release = virtio_pci_release_dev;
        vp_dev->pci_dev = pci_dev;
+       INIT_LIST_HEAD(&vp_dev->virtqueues);
+       spin_lock_init(&vp_dev->lock);
 
        /* enable the device */
        rc = pci_enable_device(pci_dev);
index ac8c9d7..e96334a 100644 (file)
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
 
+struct virtio_pci_vq_info {
+       /* the actual virtqueue */
+       struct virtqueue *vq;
+
+       /* the list node for the virtqueues list */
+       struct list_head node;
+
+       /* MSI-X vector (or none) */
+       unsigned msix_vector;
+};
+
 /* Our device structure */
 struct virtio_pci_device {
        struct virtio_device vdev;
@@ -64,25 +75,47 @@ struct virtio_pci_device {
        /* the IO mapping for the PCI config space */
        void __iomem *ioaddr;
 
+       /* a list of queues so we can dispatch IRQs */
+       spinlock_t lock;
+       struct list_head virtqueues;
+
+       /* array of all queues for house-keeping */
+       struct virtio_pci_vq_info **vqs;
+
+       /* MSI-X support */
+       int msix_enabled;
+       int intx_enabled;
        cpumask_var_t *msix_affinity_masks;
        /* Name strings for interrupts. This size should be enough,
         * and I'm too lazy to allocate each name separately. */
        char (*msix_names)[256];
-       /* Total Number of MSI-X vectors (including per-VQ ones). */
-       int msix_vectors;
-       /* Map of per-VQ MSI-X vectors, may be NULL */
-       unsigned *msix_vector_map;
+       /* Number of available vectors */
+       unsigned msix_vectors;
+       /* Vectors allocated, excluding per-vq vectors if any */
+       unsigned msix_used_vectors;
+
+       /* Whether we have vector per vq */
+       bool per_vq_vectors;
 
        struct virtqueue *(*setup_vq)(struct virtio_pci_device *vp_dev,
+                                     struct virtio_pci_vq_info *info,
                                      unsigned idx,
                                      void (*callback)(struct virtqueue *vq),
                                      const char *name,
                                      u16 msix_vec);
-       void (*del_vq)(struct virtqueue *vq);
+       void (*del_vq)(struct virtio_pci_vq_info *info);
 
        u16 (*config_vector)(struct virtio_pci_device *vp_dev, u16 vector);
 };
 
+/* Constants for MSI-X */
+/* Use first vector for configuration changes, second and the rest for
+ * virtqueues. Thus, we need at least 2 vectors for MSI-X. */
+enum {
+       VP_MSIX_CONFIG_VECTOR = 0,
+       VP_MSIX_VQ_VECTOR = 1,
+};
+
 /* Convert a generic virtio device to our structure */
 static struct virtio_pci_device *to_vp_device(struct virtio_device *vdev)
 {
index f7362c5..4bfa48f 100644 (file)
@@ -112,6 +112,7 @@ static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector)
 }
 
 static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
+                                 struct virtio_pci_vq_info *info,
                                  unsigned index,
                                  void (*callback)(struct virtqueue *vq),
                                  const char *name,
@@ -129,6 +130,8 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
        if (!num || ioread32(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN))
                return ERR_PTR(-ENOENT);
 
+       info->msix_vector = msix_vec;
+
        /* create the vring */
        vq = vring_create_virtqueue(index, num,
                                    VIRTIO_PCI_VRING_ALIGN, &vp_dev->vdev,
@@ -159,13 +162,14 @@ out_deactivate:
        return ERR_PTR(err);
 }
 
-static void del_vq(struct virtqueue *vq)
+static void del_vq(struct virtio_pci_vq_info *info)
 {
+       struct virtqueue *vq = info->vq;
        struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
 
        iowrite16(vq->index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
 
-       if (vp_dev->pci_dev->msix_enabled) {
+       if (vp_dev->msix_enabled) {
                iowrite16(VIRTIO_MSI_NO_VECTOR,
                          vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
                /* Flush the write out to device */
index 7bc3004..8978f10 100644 (file)
@@ -293,6 +293,7 @@ static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector)
 }
 
 static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
+                                 struct virtio_pci_vq_info *info,
                                  unsigned index,
                                  void (*callback)(struct virtqueue *vq),
                                  const char *name,
@@ -322,6 +323,8 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
        /* get offset of notification word for this vq */
        off = vp_ioread16(&cfg->queue_notify_off);
 
+       info->msix_vector = msix_vec;
+
        /* create the vring */
        vq = vring_create_virtqueue(index, num,
                                    SMP_CACHE_BYTES, &vp_dev->vdev,
@@ -405,13 +408,14 @@ static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned nvqs,
        return 0;
 }
 
-static void del_vq(struct virtqueue *vq)
+static void del_vq(struct virtio_pci_vq_info *info)
 {
+       struct virtqueue *vq = info->vq;
        struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
 
        vp_iowrite16(vq->index, &vp_dev->common->queue_select);
 
-       if (vp_dev->pci_dev->msix_enabled) {
+       if (vp_dev->msix_enabled) {
                vp_iowrite16(VIRTIO_MSI_NO_VECTOR,
                             &vp_dev->common->queue_msix_vector);
                /* Flush the write out to device */
index 1f4733b..f3b089b 100644 (file)
@@ -442,8 +442,10 @@ static int xenbus_write_transaction(unsigned msg_type,
                return xenbus_command_reply(u, XS_ERROR, "ENOENT");
 
        rc = xenbus_dev_request_and_reply(&u->u.msg, u);
-       if (rc)
+       if (rc && trans) {
+               list_del(&trans->list);
                kfree(trans);
+       }
 
 out:
        return rc;
index a89f3cf..c202930 100644 (file)
@@ -333,10 +333,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
                goto err_names;
        init_rwsem(&v9ses->rename_sem);
 
-       rc = bdi_setup_and_register(&v9ses->bdi, "9p");
-       if (rc)
-               goto err_names;
-
        v9ses->uid = INVALID_UID;
        v9ses->dfltuid = V9FS_DEFUID;
        v9ses->dfltgid = V9FS_DEFGID;
@@ -345,7 +341,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
        if (IS_ERR(v9ses->clnt)) {
                rc = PTR_ERR(v9ses->clnt);
                p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n");
-               goto err_bdi;
+               goto err_names;
        }
 
        v9ses->flags = V9FS_ACCESS_USER;
@@ -415,8 +411,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 
 err_clnt:
        p9_client_destroy(v9ses->clnt);
-err_bdi:
-       bdi_destroy(&v9ses->bdi);
 err_names:
        kfree(v9ses->uname);
        kfree(v9ses->aname);
@@ -445,8 +439,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
        kfree(v9ses->uname);
        kfree(v9ses->aname);
 
-       bdi_destroy(&v9ses->bdi);
-
        spin_lock(&v9fs_sessionlist_lock);
        list_del(&v9ses->slist);
        spin_unlock(&v9fs_sessionlist_lock);
index 443d12e..76eaf49 100644 (file)
@@ -114,7 +114,6 @@ struct v9fs_session_info {
        kuid_t uid;             /* if ACCESS_SINGLE, the uid that has access */
        struct p9_client *clnt; /* 9p client */
        struct list_head slist; /* list of sessions registered with v9fs */
-       struct backing_dev_info bdi;
        struct rw_semaphore rename_sem;
 };
 
index de3ed86..a0965fb 100644 (file)
@@ -72,10 +72,12 @@ static int v9fs_set_super(struct super_block *s, void *data)
  *
  */
 
-static void
+static int
 v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
                int flags, void *data)
 {
+       int ret;
+
        sb->s_maxbytes = MAX_LFS_FILESIZE;
        sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
        sb->s_blocksize = 1 << sb->s_blocksize_bits;
@@ -85,7 +87,11 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
                sb->s_xattr = v9fs_xattr_handlers;
        } else
                sb->s_op = &v9fs_super_ops;
-       sb->s_bdi = &v9ses->bdi;
+
+       ret = super_setup_bdi(sb);
+       if (ret)
+               return ret;
+
        if (v9ses->cache)
                sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
 
@@ -99,6 +105,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 #endif
 
        save_mount_options(sb, data);
+       return 0;
 }
 
 /**
@@ -138,7 +145,9 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
                retval = PTR_ERR(sb);
                goto clunk_fid;
        }
-       v9fs_fill_super(sb, v9ses, flags, data);
+       retval = v9fs_fill_super(sb, v9ses, flags, data);
+       if (retval)
+               goto release_sb;
 
        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
                sb->s_d_op = &v9fs_cached_dentry_operations;
index a690136..3936729 100644 (file)
@@ -318,7 +318,6 @@ struct afs_volume {
        unsigned short          rjservers;      /* number of servers discarded due to -ENOMEDIUM */
        struct afs_server       *servers[8];    /* servers on which volume resides (ordered) */
        struct rw_semaphore     server_sem;     /* lock for accessing current server */
-       struct backing_dev_info bdi;
 };
 
 /*
index fbdb022..c79633e 100644 (file)
@@ -319,7 +319,10 @@ static int afs_fill_super(struct super_block *sb,
        sb->s_blocksize_bits    = PAGE_SHIFT;
        sb->s_magic             = AFS_FS_MAGIC;
        sb->s_op                = &afs_super_ops;
-       sb->s_bdi               = &as->volume->bdi;
+       ret = super_setup_bdi(sb);
+       if (ret)
+               return ret;
+       sb->s_bdi->ra_pages     = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
        strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id));
 
        /* allocate the root inode and dentry */
index 546f9d0..db73d6d 100644 (file)
@@ -106,11 +106,6 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
        volume->cell            = params->cell;
        volume->vid             = vlocation->vldb.vid[params->type];
 
-       volume->bdi.ra_pages    = VM_MAX_READAHEAD*1024/PAGE_SIZE; 
-       ret = bdi_setup_and_register(&volume->bdi, "afs");
-       if (ret)
-               goto error_bdi;
-
        init_rwsem(&volume->server_sem);
 
        /* look up all the applicable server records */
@@ -156,8 +151,6 @@ error:
        return ERR_PTR(ret);
 
 error_discard:
-       bdi_destroy(&volume->bdi);
-error_bdi:
        up_write(&params->cell->vl_sem);
 
        for (loop = volume->nservers - 1; loop >= 0; loop--)
@@ -207,7 +200,6 @@ void afs_put_volume(struct afs_volume *volume)
        for (loop = volume->nservers - 1; loop >= 0; loop--)
                afs_put_server(volume->servers[loop]);
 
-       bdi_destroy(&volume->bdi);
        kfree(volume);
 
        _leave(" [destroyed]");
index 2eca00e..9ccabe3 100644 (file)
@@ -885,6 +885,8 @@ static void bdev_evict_inode(struct inode *inode)
        spin_lock(&bdev_lock);
        list_del_init(&bdev->bd_list);
        spin_unlock(&bdev_lock);
+       /* Detach inode from wb early as bdi_put() may free bdi->wb */
+       inode_detach_wb(inode);
        if (bdev->bd_bdi != &noop_backing_dev_info) {
                bdi_put(bdev->bd_bdi);
                bdev->bd_bdi = &noop_backing_dev_info;
@@ -1451,7 +1453,6 @@ int revalidate_disk(struct gendisk *disk)
 
        if (disk->fops->revalidate_disk)
                ret = disk->fops->revalidate_disk(disk);
-       blk_integrity_revalidate(disk);
        bdev = bdget_disk(disk, 0);
        if (!bdev)
                return ret;
@@ -1556,8 +1557,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                bdev->bd_disk = disk;
                bdev->bd_queue = disk->queue;
                bdev->bd_contains = bdev;
-               if (bdev->bd_bdi == &noop_backing_dev_info)
-                       bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
 
                if (!partno) {
                        ret = -ENXIO;
@@ -1622,6 +1621,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                        }
                        bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
                }
+
+               if (bdev->bd_bdi == &noop_backing_dev_info)
+                       bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
        } else {
                if (bdev->bd_contains == bdev) {
                        ret = 0;
@@ -1653,8 +1655,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
        bdev->bd_disk = NULL;
        bdev->bd_part = NULL;
        bdev->bd_queue = NULL;
-       bdi_put(bdev->bd_bdi);
-       bdev->bd_bdi = &noop_backing_dev_info;
        if (bdev != bdev->bd_contains)
                __blkdev_put(bdev->bd_contains, mode, 1);
        bdev->bd_contains = NULL;
@@ -1876,12 +1876,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
                kill_bdev(bdev);
 
                bdev_write_inode(bdev);
-               /*
-                * Detaching bdev inode from its wb in __destroy_inode()
-                * is too late: the queue which embeds its bdi (along with
-                * root wb) can be gone as soon as we put_disk() below.
-                */
-               inode_detach_wb(bdev->bd_inode);
        }
        if (bdev->bd_contains == bdev) {
                if (disk->fops->release)
@@ -2074,7 +2068,6 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
                             loff_t len)
 {
        struct block_device *bdev = I_BDEV(bdev_file_inode(file));
-       struct request_queue *q = bdev_get_queue(bdev);
        struct address_space *mapping;
        loff_t end = start + len - 1;
        loff_t isize;
@@ -2110,18 +2103,13 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
        case FALLOC_FL_ZERO_RANGE:
        case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
                error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
-                                           GFP_KERNEL, false);
+                                           GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
                break;
        case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
-               /* Only punch if the device can do zeroing discard. */
-               if (!blk_queue_discard(q) || !q->limits.discard_zeroes_data)
-                       return -EOPNOTSUPP;
-               error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
-                                            GFP_KERNEL, 0);
+               error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
+                                            GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
                break;
        case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
-               if (!blk_queue_discard(q))
-                       return -EOPNOTSUPP;
                error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
                                             GFP_KERNEL, 0);
                break;
index c411590..3e21211 100644 (file)
@@ -810,7 +810,6 @@ struct btrfs_fs_info {
        struct btrfs_super_block *super_for_commit;
        struct super_block *sb;
        struct inode *btree_inode;
-       struct backing_dev_info bdi;
        struct mutex tree_log_mutex;
        struct mutex transaction_kthread_mutex;
        struct mutex cleaner_mutex;
index eb1ee7b..061c1d1 100644 (file)
@@ -1808,21 +1808,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
        return ret;
 }
 
-static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
-{
-       int err;
-
-       err = bdi_setup_and_register(bdi, "btrfs");
-       if (err)
-               return err;
-
-       bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
-       bdi->congested_fn       = btrfs_congested_fn;
-       bdi->congested_data     = info;
-       bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
-       return 0;
-}
-
 /*
  * called by the kthread helper functions to finally call the bio end_io
  * functions.  This is where read checksum verification actually happens
@@ -2601,16 +2586,10 @@ int open_ctree(struct super_block *sb,
                goto fail;
        }
 
-       ret = setup_bdi(fs_info, &fs_info->bdi);
-       if (ret) {
-               err = ret;
-               goto fail_srcu;
-       }
-
        ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
        if (ret) {
                err = ret;
-               goto fail_bdi;
+               goto fail_srcu;
        }
        fs_info->dirty_metadata_batch = PAGE_SIZE *
                                        (1 + ilog2(nr_cpu_ids));
@@ -2718,7 +2697,6 @@ int open_ctree(struct super_block *sb,
 
        sb->s_blocksize = 4096;
        sb->s_blocksize_bits = blksize_bits(4096);
-       sb->s_bdi = &fs_info->bdi;
 
        btrfs_init_btree_inode(fs_info);
 
@@ -2915,9 +2893,12 @@ int open_ctree(struct super_block *sb,
                goto fail_sb_buffer;
        }
 
-       fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
-       fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
-                                   SZ_4M / PAGE_SIZE);
+       sb->s_bdi->congested_fn = btrfs_congested_fn;
+       sb->s_bdi->congested_data = fs_info;
+       sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
+       sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+       sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
+       sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
 
        sb->s_blocksize = sectorsize;
        sb->s_blocksize_bits = blksize_bits(sectorsize);
@@ -3285,8 +3266,6 @@ fail_delalloc_bytes:
        percpu_counter_destroy(&fs_info->delalloc_bytes);
 fail_dirty_metadata_bytes:
        percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
-fail_bdi:
-       bdi_destroy(&fs_info->bdi);
 fail_srcu:
        cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
@@ -4007,7 +3986,6 @@ void close_ctree(struct btrfs_fs_info *fs_info)
        percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
        percpu_counter_destroy(&fs_info->delalloc_bytes);
        percpu_counter_destroy(&fs_info->bio_counter);
-       bdi_destroy(&fs_info->bdi);
        cleanup_srcu_struct(&fs_info->subvol_srcu);
 
        btrfs_free_stripe_hash_table(fs_info);
index a18510b..5e71f1e 100644 (file)
@@ -7910,7 +7910,6 @@ struct btrfs_retry_complete {
 static void btrfs_retry_endio_nocsum(struct bio *bio)
 {
        struct btrfs_retry_complete *done = bio->bi_private;
-       struct inode *inode;
        struct bio_vec *bvec;
        int i;
 
@@ -7918,12 +7917,12 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
                goto end;
 
        ASSERT(bio->bi_vcnt == 1);
-       inode = bio->bi_io_vec->bv_page->mapping->host;
-       ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode));
+       ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode));
 
        done->uptodate = 1;
        bio_for_each_segment_all(bvec, bio, i)
-       clean_io_failure(BTRFS_I(done->inode), done->start, bvec->bv_page, 0);
+               clean_io_failure(BTRFS_I(done->inode), done->start,
+                                bvec->bv_page, 0);
 end:
        complete(&done->done);
        bio_put(bio);
@@ -7973,8 +7972,10 @@ next_block_or_try_again:
 
                start += sectorsize;
 
-               if (nr_sectors--) {
+               nr_sectors--;
+               if (nr_sectors) {
                        pgoff += sectorsize;
+                       ASSERT(pgoff < PAGE_SIZE);
                        goto next_block_or_try_again;
                }
        }
@@ -7986,9 +7987,7 @@ static void btrfs_retry_endio(struct bio *bio)
 {
        struct btrfs_retry_complete *done = bio->bi_private;
        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
-       struct inode *inode;
        struct bio_vec *bvec;
-       u64 start;
        int uptodate;
        int ret;
        int i;
@@ -7998,11 +7997,8 @@ static void btrfs_retry_endio(struct bio *bio)
 
        uptodate = 1;
 
-       start = done->start;
-
        ASSERT(bio->bi_vcnt == 1);
-       inode = bio->bi_io_vec->bv_page->mapping->host;
-       ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode));
+       ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode));
 
        bio_for_each_segment_all(bvec, bio, i) {
                ret = __readpage_endio_check(done->inode, io_bio, i,
@@ -8080,8 +8076,10 @@ next:
 
                ASSERT(nr_sectors);
 
-               if (--nr_sectors) {
+               nr_sectors--;
+               if (nr_sectors) {
                        pgoff += sectorsize;
+                       ASSERT(pgoff < PAGE_SIZE);
                        goto next_block;
                }
        }
index a59801d..afbea61 100644 (file)
@@ -1042,9 +1042,12 @@ static void report_reserved_underflow(struct btrfs_fs_info *fs_info,
                                      struct btrfs_qgroup *qgroup,
                                      u64 num_bytes)
 {
-       btrfs_warn(fs_info,
+#ifdef CONFIG_BTRFS_DEBUG
+       WARN_ON(qgroup->reserved < num_bytes);
+       btrfs_debug(fs_info,
                "qgroup %llu reserved space underflow, have: %llu, to free: %llu",
                qgroup->qgroupid, qgroup->reserved, num_bytes);
+#endif
        qgroup->reserved = 0;
 }
 /*
@@ -1075,7 +1078,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
        qgroup->excl += sign * num_bytes;
        qgroup->excl_cmpr += sign * num_bytes;
        if (sign > 0) {
-               if (WARN_ON(qgroup->reserved < num_bytes))
+               if (qgroup->reserved < num_bytes)
                        report_reserved_underflow(fs_info, qgroup, num_bytes);
                else
                        qgroup->reserved -= num_bytes;
@@ -1100,7 +1103,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
                WARN_ON(sign < 0 && qgroup->excl < num_bytes);
                qgroup->excl += sign * num_bytes;
                if (sign > 0) {
-                       if (WARN_ON(qgroup->reserved < num_bytes))
+                       if (qgroup->reserved < num_bytes)
                                report_reserved_underflow(fs_info, qgroup,
                                                          num_bytes);
                        else
@@ -2469,7 +2472,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
 
                qg = unode_aux_to_qgroup(unode);
 
-               if (WARN_ON(qg->reserved < num_bytes))
+               if (qg->reserved < num_bytes)
                        report_reserved_underflow(fs_info, qg, num_bytes);
                else
                        qg->reserved -= num_bytes;
index da687dc..72a053c 100644 (file)
@@ -549,16 +549,19 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
                case Opt_ssd:
                        btrfs_set_and_info(info, SSD,
                                           "use ssd allocation scheme");
+                       btrfs_clear_opt(info->mount_opt, NOSSD);
                        break;
                case Opt_ssd_spread:
                        btrfs_set_and_info(info, SSD_SPREAD,
                                           "use spread ssd allocation scheme");
                        btrfs_set_opt(info->mount_opt, SSD);
+                       btrfs_clear_opt(info->mount_opt, NOSSD);
                        break;
                case Opt_nossd:
                        btrfs_set_and_info(info, NOSSD,
                                             "not using ssd allocation scheme");
                        btrfs_clear_opt(info->mount_opt, SSD);
+                       btrfs_clear_opt(info->mount_opt, SSD_SPREAD);
                        break;
                case Opt_barrier:
                        btrfs_clear_and_info(info, NOBARRIER,
@@ -1133,6 +1136,13 @@ static int btrfs_fill_super(struct super_block *sb,
 #endif
        sb->s_flags |= MS_I_VERSION;
        sb->s_iflags |= SB_I_CGROUPWB;
+
+       err = super_setup_bdi(sb);
+       if (err) {
+               btrfs_err(fs_info, "super_setup_bdi failed");
+               return err;
+       }
+
        err = open_ctree(sb, fs_devices, (char *)data);
        if (err) {
                btrfs_err(fs_info, "open_ctree failed");
index 73d56ee..ab8a66d 100644 (file)
@@ -6213,7 +6213,7 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
        for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
                dev = bbio->stripes[dev_nr].dev;
                if (!dev || !dev->bdev ||
-                   (bio_op(bio) == REQ_OP_WRITE && !dev->writeable)) {
+                   (bio_op(first_bio) == REQ_OP_WRITE && !dev->writeable)) {
                        bbio_error(bbio, first_bio, logical);
                        continue;
                }
index 1a3e1b4..9ecb2fd 100644 (file)
@@ -578,7 +578,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
        writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
        if (writeback_stat >
            CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
-               set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
+               set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
 
        set_page_writeback(page);
        err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -700,7 +700,7 @@ static void writepages_finish(struct ceph_osd_request *req)
                        if (atomic_long_dec_return(&fsc->writeback_count) <
                             CONGESTION_OFF_THRESH(
                                        fsc->mount_options->congestion_kb))
-                               clear_bdi_congested(&fsc->backing_dev_info,
+                               clear_bdi_congested(inode_to_bdi(inode),
                                                    BLK_RW_ASYNC);
 
                        if (rc < 0)
@@ -979,7 +979,7 @@ get_more_pages:
                        if (atomic_long_inc_return(&fsc->writeback_count) >
                            CONGESTION_ON_THRESH(
                                    fsc->mount_options->congestion_kb)) {
-                               set_bdi_congested(&fsc->backing_dev_info,
+                               set_bdi_congested(inode_to_bdi(inode),
                                                  BLK_RW_ASYNC);
                        }
 
index f2ae393..3ef11bc 100644 (file)
@@ -251,7 +251,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
                goto out;
 
        snprintf(name, sizeof(name), "../../bdi/%s",
-                dev_name(fsc->backing_dev_info.dev));
+                dev_name(fsc->sb->s_bdi->dev));
        fsc->debugfs_bdi =
                debugfs_create_symlink("bdi",
                                       fsc->client->debugfs_dir,
index d449e1c..d3119fe 100644 (file)
@@ -2071,11 +2071,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
        if (inode_dirty_flags)
                __mark_inode_dirty(inode, inode_dirty_flags);
 
-       if (ia_valid & ATTR_MODE) {
-               err = posix_acl_chmod(inode, attr->ia_mode);
-               if (err)
-                       goto out_put;
-       }
 
        if (mask) {
                req->r_inode = inode;
@@ -2089,13 +2084,11 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
             ceph_cap_string(dirtied), mask);
 
        ceph_mdsc_put_request(req);
-       if (mask & CEPH_SETATTR_SIZE)
-               __ceph_do_pending_vmtruncate(inode);
-       ceph_free_cap_flush(prealloc_cf);
-       return err;
-out_put:
-       ceph_mdsc_put_request(req);
        ceph_free_cap_flush(prealloc_cf);
+
+       if (err >= 0 && (mask & CEPH_SETATTR_SIZE))
+               __ceph_do_pending_vmtruncate(inode);
+
        return err;
 }
 
@@ -2114,7 +2107,12 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
        if (err != 0)
                return err;
 
-       return __ceph_setattr(inode, attr);
+       err = __ceph_setattr(inode, attr);
+
+       if (err >= 0 && (attr->ia_valid & ATTR_MODE))
+               err = posix_acl_chmod(inode, attr->ia_mode);
+
+       return err;
 }
 
 /*
index 0ec8d01..a8c81b2 100644 (file)
@@ -579,10 +579,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 
        atomic_long_set(&fsc->writeback_count, 0);
 
-       err = bdi_init(&fsc->backing_dev_info);
-       if (err < 0)
-               goto fail_client;
-
        err = -ENOMEM;
        /*
         * The number of concurrent works can be high but they don't need
@@ -590,7 +586,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
         */
        fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
        if (fsc->wb_wq == NULL)
-               goto fail_bdi;
+               goto fail_client;
        fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
        if (fsc->pg_inv_wq == NULL)
                goto fail_wb_wq;
@@ -624,8 +620,6 @@ fail_pg_inv_wq:
        destroy_workqueue(fsc->pg_inv_wq);
 fail_wb_wq:
        destroy_workqueue(fsc->wb_wq);
-fail_bdi:
-       bdi_destroy(&fsc->backing_dev_info);
 fail_client:
        ceph_destroy_client(fsc->client);
 fail:
@@ -643,8 +637,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
        destroy_workqueue(fsc->pg_inv_wq);
        destroy_workqueue(fsc->trunc_wq);
 
-       bdi_destroy(&fsc->backing_dev_info);
-
        mempool_destroy(fsc->wb_pagevec_pool);
 
        destroy_mount_options(fsc->mount_options);
@@ -937,33 +929,32 @@ static int ceph_compare_super(struct super_block *sb, void *data)
  */
 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
 
-static int ceph_register_bdi(struct super_block *sb,
-                            struct ceph_fs_client *fsc)
+static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc)
 {
        int err;
 
+       err = super_setup_bdi_name(sb, "ceph-%ld",
+                                  atomic_long_inc_return(&bdi_seq));
+       if (err)
+               return err;
+
        /* set ra_pages based on rasize mount option? */
        if (fsc->mount_options->rasize >= PAGE_SIZE)
-               fsc->backing_dev_info.ra_pages =
+               sb->s_bdi->ra_pages =
                        (fsc->mount_options->rasize + PAGE_SIZE - 1)
                        >> PAGE_SHIFT;
        else
-               fsc->backing_dev_info.ra_pages =
-                       VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+               sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
 
        if (fsc->mount_options->rsize > fsc->mount_options->rasize &&
            fsc->mount_options->rsize >= PAGE_SIZE)
-               fsc->backing_dev_info.io_pages =
+               sb->s_bdi->io_pages =
                        (fsc->mount_options->rsize + PAGE_SIZE - 1)
                        >> PAGE_SHIFT;
        else if (fsc->mount_options->rsize == 0)
-               fsc->backing_dev_info.io_pages = ULONG_MAX;
+               sb->s_bdi->io_pages = ULONG_MAX;
 
-       err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
-                          atomic_long_inc_return(&bdi_seq));
-       if (!err)
-               sb->s_bdi = &fsc->backing_dev_info;
-       return err;
+       return 0;
 }
 
 static struct dentry *ceph_mount(struct file_system_type *fs_type,
@@ -1018,7 +1009,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
                dout("get_sb got existing client %p\n", fsc);
        } else {
                dout("get_sb using new client %p\n", fsc);
-               err = ceph_register_bdi(sb, fsc);
+               err = ceph_setup_bdi(sb, fsc);
                if (err < 0) {
                        res = ERR_PTR(err);
                        goto out_splat;
index fe6b9cf..176186b 100644 (file)
@@ -92,8 +92,6 @@ struct ceph_fs_client {
        struct workqueue_struct *trunc_wq;
        atomic_long_t writeback_count;
 
-       struct backing_dev_info backing_dev_info;
-
 #ifdef CONFIG_DEBUG_FS
        struct dentry *debugfs_dentry_lru, *debugfs_caps;
        struct dentry *debugfs_congestion_kb;
index 07ed81c..cbd216b 100644 (file)
@@ -68,7 +68,6 @@ struct cifs_sb_info {
        umode_t mnt_dir_mode;
        unsigned int mnt_cifs_flags;
        char   *mountdata; /* options received at mount time or via DFS refs */
-       struct backing_dev_info bdi;
        struct delayed_work prune_tlinks;
        struct rcu_head rcu;
        char *prepath;
index 15e1db8..34fee9f 100644 (file)
@@ -138,7 +138,12 @@ cifs_read_super(struct super_block *sb)
        sb->s_magic = CIFS_MAGIC_NUMBER;
        sb->s_op = &cifs_super_ops;
        sb->s_xattr = cifs_xattr_handlers;
-       sb->s_bdi = &cifs_sb->bdi;
+       rc = super_setup_bdi(sb);
+       if (rc)
+               goto out_no_root;
+       /* tune readahead according to rsize */
+       sb->s_bdi->ra_pages = cifs_sb->rsize / PAGE_SIZE;
+
        sb->s_blocksize = CIFS_MAX_MSGSIZE;
        sb->s_blocksize_bits = 14;      /* default 2**14 = CIFS_MAX_MSGSIZE */
        inode = cifs_root_iget(sb);
@@ -972,6 +977,86 @@ out:
        return rc;
 }
 
+ssize_t cifs_file_copychunk_range(unsigned int xid,
+                               struct file *src_file, loff_t off,
+                               struct file *dst_file, loff_t destoff,
+                               size_t len, unsigned int flags) /* flags is not consulted below */
+{      /* copy len bytes from src_file@off to dst_file@destoff via the server's copychunk_range op */
+       struct inode *src_inode = file_inode(src_file);
+       struct inode *target_inode = file_inode(dst_file);
+       struct cifsFileInfo *smb_file_src;
+       struct cifsFileInfo *smb_file_target;
+       struct cifs_tcon *src_tcon;
+       struct cifs_tcon *target_tcon;
+       ssize_t rc;
+
+       cifs_dbg(FYI, "copychunk range\n");
+
+       if (src_inode == target_inode) {        /* same-inode copies are refused */
+               rc = -EINVAL;
+               goto out;
+       }
+
+       if (!src_file->private_data || !dst_file->private_data) {       /* either side; message names only src */
+               rc = -EBADF;
+               cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
+               goto out;
+       }
+
+       rc = -EXDEV;    /* returned if the session check below fails */
+       smb_file_target = dst_file->private_data;
+       smb_file_src = src_file->private_data;
+       src_tcon = tlink_tcon(smb_file_src->tlink);
+       target_tcon = tlink_tcon(smb_file_target->tlink);
+
+       if (src_tcon->ses != target_tcon->ses) {
+               cifs_dbg(VFS, "source and target of copy not on same server\n");
+               goto out;
+       }
+
+       /*
+        * Note: easier than btrfs: the server checks open modes and file type.
+        * A server could even copy a range where source == target, but the
+        * same-inode test at the top returns -EINVAL before we get here.
+        */
+       lock_two_nondirectories(target_inode, src_inode);
+
+       cifs_dbg(FYI, "about to flush pages\n");
+       /* TODO: should we flush first and last page first? */
+       truncate_inode_pages(&target_inode->i_data, 0); /* from offset 0, i.e. the whole target mapping */
+
+       if (target_tcon->ses->server->ops->copychunk_range)
+               rc = target_tcon->ses->server->ops->copychunk_range(xid,
+                       smb_file_src, smb_file_target, off, len, destoff);
+       else
+               rc = -EOPNOTSUPP;       /* this dialect provides no copychunk_range op */
+
+       /* force revalidation of the target's size and timestamps, since the
+        * server may have changed it (done unconditionally, even on error)
+        */
+       CIFS_I(target_inode)->time = 0;
+       /* although unlocking in the reverse order from locking is not
+        * strictly necessary here, it is a little cleaner to be consistent
+        */
+       unlock_two_nondirectories(src_inode, target_inode);
+
+out:
+       return rc;      /* the op's return value, or one of the -errno codes set above */
+}
+
+static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off,
+                               struct file *dst_file, loff_t destoff,
+                               size_t len, unsigned int flags)
+{      /* .copy_file_range handler installed in the cifs file_operations tables */
+       unsigned int xid = get_xid();
+       ssize_t rc;
+
+       rc = cifs_file_copychunk_range(xid, src_file, off, dst_file, destoff,
+                                       len, flags);
+       free_xid(xid);  /* released on every path; rc is passed through unchanged */
+       return rc;
+}
+
 const struct file_operations cifs_file_ops = {
        .read_iter = cifs_loose_read_iter,
        .write_iter = cifs_file_write_iter,
@@ -984,6 +1069,7 @@ const struct file_operations cifs_file_ops = {
        .splice_read = generic_file_splice_read,
        .llseek = cifs_llseek,
        .unlocked_ioctl = cifs_ioctl,
+       .copy_file_range = cifs_copy_file_range,
        .clone_file_range = cifs_clone_file_range,
        .setlease = cifs_setlease,
        .fallocate = cifs_fallocate,
@@ -1001,6 +1087,7 @@ const struct file_operations cifs_file_strict_ops = {
        .splice_read = generic_file_splice_read,
        .llseek = cifs_llseek,
        .unlocked_ioctl = cifs_ioctl,
+       .copy_file_range = cifs_copy_file_range,
        .clone_file_range = cifs_clone_file_range,
        .setlease = cifs_setlease,
        .fallocate = cifs_fallocate,
@@ -1018,6 +1105,7 @@ const struct file_operations cifs_file_direct_ops = {
        .mmap = cifs_file_mmap,
        .splice_read = generic_file_splice_read,
        .unlocked_ioctl  = cifs_ioctl,
+       .copy_file_range = cifs_copy_file_range,
        .clone_file_range = cifs_clone_file_range,
        .llseek = cifs_llseek,
        .setlease = cifs_setlease,
@@ -1035,6 +1123,7 @@ const struct file_operations cifs_file_nobrl_ops = {
        .splice_read = generic_file_splice_read,
        .llseek = cifs_llseek,
        .unlocked_ioctl = cifs_ioctl,
+       .copy_file_range = cifs_copy_file_range,
        .clone_file_range = cifs_clone_file_range,
        .setlease = cifs_setlease,
        .fallocate = cifs_fallocate,
@@ -1051,6 +1140,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
        .splice_read = generic_file_splice_read,
        .llseek = cifs_llseek,
        .unlocked_ioctl = cifs_ioctl,
+       .copy_file_range = cifs_copy_file_range,
        .clone_file_range = cifs_clone_file_range,
        .setlease = cifs_setlease,
        .fallocate = cifs_fallocate,
@@ -1067,6 +1157,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
        .mmap = cifs_file_mmap,
        .splice_read = generic_file_splice_read,
        .unlocked_ioctl  = cifs_ioctl,
+       .copy_file_range = cifs_copy_file_range,
        .clone_file_range = cifs_clone_file_range,
        .llseek = cifs_llseek,
        .setlease = cifs_setlease,
@@ -1078,6 +1169,7 @@ const struct file_operations cifs_dir_ops = {
        .release = cifs_closedir,
        .read    = generic_read_dir,
        .unlocked_ioctl  = cifs_ioctl,
+       .copy_file_range = cifs_copy_file_range,
        .clone_file_range = cifs_clone_file_range,
        .llseek = generic_file_llseek,
 };
index da717fe..30bf89b 100644 (file)
@@ -139,6 +139,11 @@ extern ssize_t     cifs_listxattr(struct dentry *, char *, size_t);
 # define cifs_listxattr NULL
 #endif
 
+extern ssize_t cifs_file_copychunk_range(unsigned int xid,
+                                       struct file *src_file, loff_t off,
+                                       struct file *dst_file, loff_t destoff,
+                                       size_t len, unsigned int flags);
+
 extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 #ifdef CONFIG_CIFS_NFSD_EXPORT
 extern const struct export_operations cifs_export_ops;
index d42dd32..37f5a41 100644 (file)
@@ -243,6 +243,7 @@ struct smb_version_operations {
        /* verify the message */
        int (*check_message)(char *, unsigned int, struct TCP_Server_Info *);
        bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
+       int (*handle_cancelled_mid)(char *, struct TCP_Server_Info *);
        void (*downgrade_oplock)(struct TCP_Server_Info *,
                                        struct cifsInodeInfo *, bool);
        /* process transaction2 response */
@@ -407,9 +408,10 @@ struct smb_version_operations {
        char * (*create_lease_buf)(u8 *, u8);
        /* parse lease context buffer and return oplock/epoch info */
        __u8 (*parse_lease_buf)(void *, unsigned int *);
-       int (*clone_range)(const unsigned int, struct cifsFileInfo *src_file,
-                       struct cifsFileInfo *target_file, u64 src_off, u64 len,
-                       u64 dest_off);
+       ssize_t (*copychunk_range)(const unsigned int,
+                       struct cifsFileInfo *src_file,
+                       struct cifsFileInfo *target_file,
+                       u64 src_off, u64 len, u64 dest_off);
        int (*duplicate_extents)(const unsigned int, struct cifsFileInfo *src,
                        struct cifsFileInfo *target_file, u64 src_off, u64 len,
                        u64 dest_off);
@@ -946,7 +948,6 @@ struct cifs_tcon {
        bool use_persistent:1; /* use persistent instead of durable handles */
 #ifdef CONFIG_CIFS_SMB2
        bool print:1;           /* set if connection to printer share */
-       bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */
        __le32 capabilities;
        __u32 share_flags;
        __u32 maximal_access;
@@ -1343,6 +1344,7 @@ struct mid_q_entry {
        void *callback_data;      /* general purpose pointer for callback */
        void *resp_buf;         /* pointer to received SMB header */
        int mid_state;  /* wish this were enum but can not pass to wait_event */
+       unsigned int mid_flags;
        __le16 command;         /* smb command code */
        bool large_buf:1;       /* if valid response, is pointer to large buf */
        bool multiRsp:1;        /* multiple trans2 responses for one request  */
@@ -1350,6 +1352,12 @@ struct mid_q_entry {
        bool decrypted:1;       /* decrypted entry */
 };
 
+struct close_cancelled_open {  /* one deferred close, built by smb2_handle_cancelled_mid() */
+       struct cifs_fid         fid;    /* persistent/volatile ids copied from the create response */
+       struct cifs_tcon        *tcon;  /* tcon looked up from the response's SessionId/TreeId */
+       struct work_struct      work;   /* runs smb2_cancelled_close_fid() on cifsiod_wq */
+};
+
 /*     Make code in transport.c a little cleaner by moving
        update of optional stats into function below */
 #ifdef CONFIG_CIFS_STATS2
@@ -1481,6 +1489,9 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
 #define   MID_RESPONSE_MALFORMED 0x10
 #define   MID_SHUTDOWN          0x20
 
+/* Flags */
+#define   MID_WAIT_CANCELLED    1 /* Cancelled while waiting for response */
+
 /* Types of response buffer returned from SendReceive2 */
 #define   CIFS_NO_BUFFER        0    /* Response buffer not returned */
 #define   CIFS_SMALL_BUFFER     1
index 0669506..5d21f00 100644 (file)
@@ -1428,6 +1428,8 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 
        length = cifs_discard_remaining_data(server);
        dequeue_mid(mid, rdata->result);
+       mid->resp_buf = server->smallbuf;
+       server->smallbuf = NULL;
        return length;
 }
 
@@ -1541,6 +1543,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
                return cifs_readv_discard(server, mid);
 
        dequeue_mid(mid, false);
+       mid->resp_buf = server->smallbuf;
+       server->smallbuf = NULL;
        return length;
 }
 
index 9ae695a..b3c9d8c 100644 (file)
@@ -904,10 +904,19 @@ cifs_demultiplex_thread(void *p)
 
                server->lstrp = jiffies;
                if (mid_entry != NULL) {
+                       if ((mid_entry->mid_flags & MID_WAIT_CANCELLED) &&
+                            mid_entry->mid_state == MID_RESPONSE_RECEIVED &&
+                                       server->ops->handle_cancelled_mid)
+                               server->ops->handle_cancelled_mid(
+                                                       mid_entry->resp_buf,
+                                                       server);
+
                        if (!mid_entry->multiRsp || mid_entry->multiEnd)
                                mid_entry->callback(mid_entry);
-               } else if (!server->ops->is_oplock_break ||
-                          !server->ops->is_oplock_break(buf, server)) {
+               } else if (server->ops->is_oplock_break &&
+                          server->ops->is_oplock_break(buf, server)) {
+                       cifs_dbg(FYI, "Received oplock break\n");
+               } else {
                        cifs_dbg(VFS, "No task to wake, unknown frame received! NumMids %d\n",
                                 atomic_read(&midCount));
                        cifs_dump_mem("Received Data is: ", buf,
@@ -3683,10 +3692,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
        int referral_walks_count = 0;
 #endif
 
-       rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs");
-       if (rc)
-               return rc;
-
 #ifdef CONFIG_CIFS_DFS_UPCALL
 try_mount_again:
        /* cleanup activities if we're chasing a referral */
@@ -3714,7 +3719,6 @@ try_mount_again:
        server = cifs_get_tcp_session(volume_info);
        if (IS_ERR(server)) {
                rc = PTR_ERR(server);
-               bdi_destroy(&cifs_sb->bdi);
                goto out;
        }
        if ((volume_info->max_credits < 20) ||
@@ -3744,6 +3748,9 @@ try_mount_again:
        if (IS_ERR(tcon)) {
                rc = PTR_ERR(tcon);
                tcon = NULL;
+               if (rc == -EACCES)
+                       goto mount_fail_check;
+
                goto remote_path_check;
        }
 
@@ -3768,9 +3775,6 @@ try_mount_again:
        cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info);
        cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info);
 
-       /* tune readahead according to rsize */
-       cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_SIZE;
-
 remote_path_check:
 #ifdef CONFIG_CIFS_DFS_UPCALL
        /*
@@ -3887,7 +3891,6 @@ mount_fail_check:
                        cifs_put_smb_ses(ses);
                else
                        cifs_put_tcp_session(server, 0);
-               bdi_destroy(&cifs_sb->bdi);
        }
 
 out:
@@ -4090,7 +4093,6 @@ cifs_umount(struct cifs_sb_info *cifs_sb)
        }
        spin_unlock(&cifs_sb->tlink_tree_lock);
 
-       bdi_destroy(&cifs_sb->bdi);
        kfree(cifs_sb->mountdata);
        kfree(cifs_sb->prepath);
        call_rcu(&cifs_sb->rcu, delayed_free);
index aa3debb..21d4045 100644 (file)
@@ -2597,7 +2597,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
                wdata->credits = credits;
 
                if (!wdata->cfile->invalidHandle ||
-                   !cifs_reopen_file(wdata->cfile, false))
+                   !(rc = cifs_reopen_file(wdata->cfile, false)))
                        rc = server->ops->async_writev(wdata,
                                        cifs_uncached_writedata_release);
                if (rc) {
@@ -3022,7 +3022,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
                rdata->credits = credits;
 
                if (!rdata->cfile->invalidHandle ||
-                   !cifs_reopen_file(rdata->cfile, true))
+                   !(rc = cifs_reopen_file(rdata->cfile, true)))
                        rc = server->ops->async_readv(rdata);
 error:
                if (rc) {
@@ -3617,7 +3617,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
                }
 
                if (!rdata->cfile->invalidHandle ||
-                   !cifs_reopen_file(rdata->cfile, true))
+                   !(rc = cifs_reopen_file(rdata->cfile, true)))
                        rc = server->ops->async_readv(rdata);
                if (rc) {
                        add_credits_and_wake_if(server, rdata->credits, 0);
index 0015287..265c45f 100644 (file)
 #include "cifs_ioctl.h"
 #include <linux/btrfs.h>
 
-static int cifs_file_clone_range(unsigned int xid, struct file *src_file,
-                         struct file *dst_file)
-{
-       struct inode *src_inode = file_inode(src_file);
-       struct inode *target_inode = file_inode(dst_file);
-       struct cifsFileInfo *smb_file_src;
-       struct cifsFileInfo *smb_file_target;
-       struct cifs_tcon *src_tcon;
-       struct cifs_tcon *target_tcon;
-       int rc;
-
-       cifs_dbg(FYI, "ioctl clone range\n");
-
-       if (!src_file->private_data || !dst_file->private_data) {
-               rc = -EBADF;
-               cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
-               goto out;
-       }
-
-       rc = -EXDEV;
-       smb_file_target = dst_file->private_data;
-       smb_file_src = src_file->private_data;
-       src_tcon = tlink_tcon(smb_file_src->tlink);
-       target_tcon = tlink_tcon(smb_file_target->tlink);
-
-       if (src_tcon->ses != target_tcon->ses) {
-               cifs_dbg(VFS, "source and target of copy not on same server\n");
-               goto out;
-       }
-
-       /*
-        * Note: cifs case is easier than btrfs since server responsible for
-        * checks for proper open modes and file type and if it wants
-        * server could even support copy of range where source = target
-        */
-       lock_two_nondirectories(target_inode, src_inode);
-
-       cifs_dbg(FYI, "about to flush pages\n");
-       /* should we flush first and last page first */
-       truncate_inode_pages(&target_inode->i_data, 0);
-
-       if (target_tcon->ses->server->ops->clone_range)
-               rc = target_tcon->ses->server->ops->clone_range(xid,
-                       smb_file_src, smb_file_target, 0, src_inode->i_size, 0);
-       else
-               rc = -EOPNOTSUPP;
-
-       /* force revalidate of size and timestamps of target file now
-          that target is updated on the server */
-       CIFS_I(target_inode)->time = 0;
-       /* although unlocking in the reverse order from locking is not
-          strictly necessary here it is a little cleaner to be consistent */
-       unlock_two_nondirectories(src_inode, target_inode);
-out:
-       return rc;
-}
-
-static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
+static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file,
                        unsigned long srcfd)
 {
        int rc;
        struct fd src_file;
        struct inode *src_inode;
 
-       cifs_dbg(FYI, "ioctl clone range\n");
+       cifs_dbg(FYI, "ioctl copychunk range\n");
        /* the destination must be opened for writing */
        if (!(dst_file->f_mode & FMODE_WRITE)) {
                cifs_dbg(FYI, "file target not open for write\n");
@@ -129,7 +72,8 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
        if (S_ISDIR(src_inode->i_mode))
                goto out_fput;
 
-       rc = cifs_file_clone_range(xid, src_file.file, dst_file);
+       rc = cifs_file_copychunk_range(xid, src_file.file, 0, dst_file, 0,
+                                       src_inode->i_size, 0);
 
 out_fput:
        fdput(src_file);
@@ -251,7 +195,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
                        }
                        break;
                case CIFS_IOC_COPYCHUNK_FILE:
-                       rc = cifs_ioctl_clone(xid, filep, arg);
+                       rc = cifs_ioctl_copychunk(xid, filep, arg);
                        break;
                case CIFS_IOC_SET_INTEGRITY:
                        if (pSMBFile == NULL)
index cc93ba4..27bc360 100644 (file)
@@ -1015,6 +1015,15 @@ cifs_dir_needs_close(struct cifsFileInfo *cfile)
        return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle;
 }
 
+static bool
+cifs_can_echo(struct TCP_Server_Info *server)
+{      /* .can_echo hook of smb1_operations */
+       if (server->tcpStatus == CifsGood)      /* true only while tcpStatus is CifsGood */
+               return true;
+
+       return false;
+}
+
 struct smb_version_operations smb1_operations = {
        .send_cancel = send_nt_cancel,
        .compare_fids = cifs_compare_fids,
@@ -1049,6 +1058,7 @@ struct smb_version_operations smb1_operations = {
        .get_dfs_refer = CIFSGetDFSRefer,
        .qfs_tcon = cifs_qfs_tcon,
        .is_path_accessible = cifs_is_path_accessible,
+       .can_echo = cifs_can_echo,
        .query_path_info = cifs_query_path_info,
        .query_file_info = cifs_query_file_info,
        .get_srv_inum = cifs_get_srv_inum,
index fd516ea..1a04b3a 100644 (file)
@@ -659,3 +659,49 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
        cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n");
        return false;
 }
+
+void
+smb2_cancelled_close_fid(struct work_struct *work)
+{      /* work handler installed by smb2_handle_cancelled_mid() via INIT_WORK() */
+       struct close_cancelled_open *cancelled = container_of(work,
+                                       struct close_cancelled_open, work);
+
+       cifs_dbg(VFS, "Close unmatched open\n");
+
+       SMB2_close(0, cancelled->tcon, cancelled->fid.persistent_fid,
+                  cancelled->fid.volatile_fid);        /* return value is ignored */
+       cifs_put_tcon(cancelled->tcon); /* presumably drops the ref from smb2_find_smb_tcon() -- confirm */
+       kfree(cancelled);       /* allocated in smb2_handle_cancelled_mid() */
+}
+
+int
+smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server)
+{      /* queue a close for the handle carried in a successful CREATE response */
+       struct smb2_sync_hdr *sync_hdr = get_sync_hdr(buffer);
+       struct smb2_create_rsp *rsp = (struct smb2_create_rsp *)buffer;
+       struct cifs_tcon *tcon;
+       struct close_cancelled_open *cancelled;
+
+       if (sync_hdr->Command != SMB2_CREATE ||
+           sync_hdr->Status != STATUS_SUCCESS)
+               return 0;       /* only successful CREATE responses are handled */
+
+       cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL);
+       if (!cancelled)
+               return -ENOMEM;
+
+       tcon = smb2_find_smb_tcon(server, sync_hdr->SessionId,
+                                 sync_hdr->TreeId);
+       if (!tcon) {
+               kfree(cancelled);
+               return -ENOENT; /* no tcon found for this SessionId/TreeId */
+       }
+
+       cancelled->fid.persistent_fid = rsp->PersistentFileId;
+       cancelled->fid.volatile_fid = rsp->VolatileFileId;
+       cancelled->tcon = tcon; /* handed to smb2_cancelled_close_fid(), which calls cifs_put_tcon() */
+       INIT_WORK(&cancelled->work, smb2_cancelled_close_fid);
+       queue_work(cifsiod_wq, &cancelled->work);       /* the worker also frees 'cancelled' */
+
+       return 0;
+}
index 0231108..152e37f 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/vfs.h>
 #include <linux/falloc.h>
 #include <linux/scatterlist.h>
+#include <linux/uuid.h>
 #include <crypto/aead.h>
 #include "cifsglob.h"
 #include "smb2pdu.h"
@@ -592,8 +593,8 @@ req_res_key_exit:
        return rc;
 }
 
-static int
-smb2_clone_range(const unsigned int xid,
+static ssize_t
+smb2_copychunk_range(const unsigned int xid,
                        struct cifsFileInfo *srcfile,
                        struct cifsFileInfo *trgtfile, u64 src_off,
                        u64 len, u64 dest_off)
@@ -605,13 +606,14 @@ smb2_clone_range(const unsigned int xid,
        struct cifs_tcon *tcon;
        int chunks_copied = 0;
        bool chunk_sizes_updated = false;
+       ssize_t bytes_written, total_bytes_written = 0;
 
        pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL);
 
        if (pcchunk == NULL)
                return -ENOMEM;
 
-       cifs_dbg(FYI, "in smb2_clone_range - about to call request res key\n");
+       cifs_dbg(FYI, "in smb2_copychunk_range - about to call request res key\n");
        /* Request a key from the server to identify the source of the copy */
        rc = SMB2_request_res_key(xid, tlink_tcon(srcfile->tlink),
                                srcfile->fid.persistent_fid,
@@ -669,14 +671,16 @@ smb2_clone_range(const unsigned int xid,
                        }
                        chunks_copied++;
 
-                       src_off += le32_to_cpu(retbuf->TotalBytesWritten);
-                       dest_off += le32_to_cpu(retbuf->TotalBytesWritten);
-                       len -= le32_to_cpu(retbuf->TotalBytesWritten);
+                       bytes_written = le32_to_cpu(retbuf->TotalBytesWritten);
+                       src_off += bytes_written;
+                       dest_off += bytes_written;
+                       len -= bytes_written;
+                       total_bytes_written += bytes_written;
 
-                       cifs_dbg(FYI, "Chunks %d PartialChunk %d Total %d\n",
+                       cifs_dbg(FYI, "Chunks %d PartialChunk %d Total %zu\n",
                                le32_to_cpu(retbuf->ChunksWritten),
                                le32_to_cpu(retbuf->ChunkBytesWritten),
-                               le32_to_cpu(retbuf->TotalBytesWritten));
+                               bytes_written);
                } else if (rc == -EINVAL) {
                        if (ret_data_len != sizeof(struct copychunk_ioctl_rsp))
                                goto cchunk_out;
@@ -713,7 +717,10 @@ smb2_clone_range(const unsigned int xid,
 cchunk_out:
        kfree(pcchunk);
        kfree(retbuf);
-       return rc;
+       if (rc)
+               return rc;
+       else
+               return total_bytes_written;
 }
 
 static int
@@ -2322,6 +2329,7 @@ struct smb_version_operations smb20_operations = {
        .clear_stats = smb2_clear_stats,
        .print_stats = smb2_print_stats,
        .is_oplock_break = smb2_is_valid_oplock_break,
+       .handle_cancelled_mid = smb2_handle_cancelled_mid,
        .downgrade_oplock = smb2_downgrade_oplock,
        .need_neg = smb2_need_neg,
        .negotiate = smb2_negotiate,
@@ -2377,7 +2385,7 @@ struct smb_version_operations smb20_operations = {
        .set_oplock_level = smb2_set_oplock_level,
        .create_lease_buf = smb2_create_lease_buf,
        .parse_lease_buf = smb2_parse_lease_buf,
-       .clone_range = smb2_clone_range,
+       .copychunk_range = smb2_copychunk_range,
        .wp_retry_size = smb2_wp_retry_size,
        .dir_needs_close = smb2_dir_needs_close,
        .get_dfs_refer = smb2_get_dfs_refer,
@@ -2404,6 +2412,7 @@ struct smb_version_operations smb21_operations = {
        .clear_stats = smb2_clear_stats,
        .print_stats = smb2_print_stats,
        .is_oplock_break = smb2_is_valid_oplock_break,
+       .handle_cancelled_mid = smb2_handle_cancelled_mid,
        .downgrade_oplock = smb2_downgrade_oplock,
        .need_neg = smb2_need_neg,
        .negotiate = smb2_negotiate,
@@ -2459,7 +2468,7 @@ struct smb_version_operations smb21_operations = {
        .set_oplock_level = smb21_set_oplock_level,
        .create_lease_buf = smb2_create_lease_buf,
        .parse_lease_buf = smb2_parse_lease_buf,
-       .clone_range = smb2_clone_range,
+       .copychunk_range = smb2_copychunk_range,
        .wp_retry_size = smb2_wp_retry_size,
        .dir_needs_close = smb2_dir_needs_close,
        .enum_snapshots = smb3_enum_snapshots,
@@ -2488,6 +2497,7 @@ struct smb_version_operations smb30_operations = {
        .print_stats = smb2_print_stats,
        .dump_share_caps = smb2_dump_share_caps,
        .is_oplock_break = smb2_is_valid_oplock_break,
+       .handle_cancelled_mid = smb2_handle_cancelled_mid,
        .downgrade_oplock = smb2_downgrade_oplock,
        .need_neg = smb2_need_neg,
        .negotiate = smb2_negotiate,
@@ -2545,7 +2555,7 @@ struct smb_version_operations smb30_operations = {
        .set_oplock_level = smb3_set_oplock_level,
        .create_lease_buf = smb3_create_lease_buf,
        .parse_lease_buf = smb3_parse_lease_buf,
-       .clone_range = smb2_clone_range,
+       .copychunk_range = smb2_copychunk_range,
        .duplicate_extents = smb2_duplicate_extents,
        .validate_negotiate = smb3_validate_negotiate,
        .wp_retry_size = smb2_wp_retry_size,
@@ -2582,6 +2592,7 @@ struct smb_version_operations smb311_operations = {
        .print_stats = smb2_print_stats,
        .dump_share_caps = smb2_dump_share_caps,
        .is_oplock_break = smb2_is_valid_oplock_break,
+       .handle_cancelled_mid = smb2_handle_cancelled_mid,
        .downgrade_oplock = smb2_downgrade_oplock,
        .need_neg = smb2_need_neg,
        .negotiate = smb2_negotiate,
@@ -2639,7 +2650,7 @@ struct smb_version_operations smb311_operations = {
        .set_oplock_level = smb3_set_oplock_level,
        .create_lease_buf = smb3_create_lease_buf,
        .parse_lease_buf = smb3_parse_lease_buf,
-       .clone_range = smb2_clone_range,
+       .copychunk_range = smb2_copychunk_range,
        .duplicate_extents = smb2_duplicate_extents,
 /*     .validate_negotiate = smb3_validate_negotiate, */ /* not used in 3.11 */
        .wp_retry_size = smb2_wp_retry_size,
index 7446496..02da648 100644 (file)
@@ -562,8 +562,10 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
         * but for time being this is our only auth choice so doesn't matter.
         * We just found a server which sets blob length to zero expecting raw.
         */
-       if (blob_length == 0)
+       if (blob_length == 0) {
                cifs_dbg(FYI, "missing security blob on negprot\n");
+               server->sec_ntlmssp = true;
+       }
 
        rc = cifs_enable_signing(server, ses->sign);
        if (rc)
@@ -1171,9 +1173,6 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
        else
                return -EIO;
 
-       if (tcon && tcon->bad_network_name)
-               return -ENOENT;
-
        unc_path = kmalloc(MAX_SHARENAME_LENGTH * 2, GFP_KERNEL);
        if (unc_path == NULL)
                return -ENOMEM;
@@ -1185,6 +1184,10 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
                return -EINVAL;
        }
 
+       /* SMB2 TREE_CONNECT request must be called with TreeId == 0 */
+       if (tcon)
+               tcon->tid = 0;
+
        rc = small_smb2_init(SMB2_TREE_CONNECT, tcon, (void **) &req);
        if (rc) {
                kfree(unc_path);
@@ -1273,8 +1276,6 @@ tcon_exit:
 tcon_error_exit:
        if (rsp->hdr.sync_hdr.Status == STATUS_BAD_NETWORK_NAME) {
                cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
-               if (tcon)
-                       tcon->bad_network_name = true;
        }
        goto tcon_exit;
 }
@@ -2177,6 +2178,9 @@ void smb2_reconnect_server(struct work_struct *work)
        struct cifs_tcon *tcon, *tcon2;
        struct list_head tmp_list;
        int tcon_exist = false;
+       int rc;
+       int resched = false;
+
 
        /* Prevent simultaneous reconnects that can corrupt tcon->rlist list */
        mutex_lock(&server->reconnect_mutex);
@@ -2204,13 +2208,18 @@ void smb2_reconnect_server(struct work_struct *work)
        spin_unlock(&cifs_tcp_ses_lock);
 
        list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) {
-               if (!smb2_reconnect(SMB2_INTERNAL_CMD, tcon))
+               rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon);
+               if (!rc)
                        cifs_reopen_persistent_handles(tcon);
+               else
+                       resched = true;
                list_del_init(&tcon->rlist);
                cifs_put_tcon(tcon);
        }
 
        cifs_dbg(FYI, "Reconnecting tcons finished\n");
+       if (resched)
+               queue_delayed_work(cifsiod_wq, &server->reconnect, 2 * HZ);
        mutex_unlock(&server->reconnect_mutex);
 
        /* now we can safely release srv struct */
index 69e3587..6853454 100644 (file)
@@ -48,6 +48,10 @@ extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses,
                              struct smb_rqst *rqst);
 extern struct mid_q_entry *smb2_setup_async_request(
                        struct TCP_Server_Info *server, struct smb_rqst *rqst);
+extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server,
+                                          __u64 ses_id);
+extern struct cifs_tcon *smb2_find_smb_tcon(struct TCP_Server_Info *server,
+                                               __u64 ses_id, __u32  tid);
 extern int smb2_calc_signature(struct smb_rqst *rqst,
                                struct TCP_Server_Info *server);
 extern int smb3_calc_signature(struct smb_rqst *rqst,
@@ -164,6 +168,9 @@ extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
 extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
                             const u64 persistent_fid, const u64 volatile_fid,
                             const __u8 oplock_level);
+extern int smb2_handle_cancelled_mid(char *buffer,
+                                       struct TCP_Server_Info *server);
+void smb2_cancelled_close_fid(struct work_struct *work);
 extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
                         u64 persistent_file_id, u64 volatile_file_id,
                         struct kstatfs *FSData);
index 7c3bb1b..506b67f 100644 (file)
@@ -115,23 +115,70 @@ smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
        return 0;
 }
 
-struct cifs_ses *
-smb2_find_smb_ses(struct TCP_Server_Info *server, __u64 ses_id)
+static struct cifs_ses *
+smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id)
 {
        struct cifs_ses *ses;
 
-       spin_lock(&cifs_tcp_ses_lock);
        list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
                if (ses->Suid != ses_id)
                        continue;
-               spin_unlock(&cifs_tcp_ses_lock);
                return ses;
        }
+
+       return NULL;
+}
+
+struct cifs_ses *
+smb2_find_smb_ses(struct TCP_Server_Info *server, __u64 ses_id)
+{
+       struct cifs_ses *ses;
+
+       spin_lock(&cifs_tcp_ses_lock);
+       ses = smb2_find_smb_ses_unlocked(server, ses_id);
        spin_unlock(&cifs_tcp_ses_lock);
 
+       return ses;
+}
+
+static struct cifs_tcon *
+smb2_find_smb_sess_tcon_unlocked(struct cifs_ses *ses, __u32  tid)
+{
+       struct cifs_tcon *tcon;
+
+       list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
+               if (tcon->tid != tid)
+                       continue;
+               ++tcon->tc_count;
+               return tcon;
+       }
+
        return NULL;
 }
 
+/*
+ * Obtain tcon corresponding to the tid in the given
+ * cifs_ses
+ */
+
+struct cifs_tcon *
+smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32  tid)
+{
+       struct cifs_ses *ses;
+       struct cifs_tcon *tcon;
+
+       spin_lock(&cifs_tcp_ses_lock);
+       ses = smb2_find_smb_ses_unlocked(server, ses_id);
+       if (!ses) {
+               spin_unlock(&cifs_tcp_ses_lock);
+               return NULL;
+       }
+       tcon = smb2_find_smb_sess_tcon_unlocked(ses, tid);
+       spin_unlock(&cifs_tcp_ses_lock);
+
+       return tcon;
+}
+
 int
 smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 {
index 526f053..f6e13a9 100644 (file)
@@ -752,9 +752,11 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses,
 
        rc = wait_for_response(ses->server, midQ);
        if (rc != 0) {
+               cifs_dbg(FYI, "Cancelling wait for mid %llu\n", midQ->mid);
                send_cancel(ses->server, rqst, midQ);
                spin_lock(&GlobalMid_Lock);
                if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
+                       midQ->mid_flags |= MID_WAIT_CANCELLED;
                        midQ->callback = DeleteMidQEntry;
                        spin_unlock(&GlobalMid_Lock);
                        add_credits(ses->server, 1, optype);
index 2dea594..6058df3 100644 (file)
@@ -183,10 +183,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
                goto unlock_out;
        }
 
-       error = bdi_setup_and_register(&vc->bdi, "coda");
-       if (error)
-               goto unlock_out;
-
        vc->vc_sb = sb;
        mutex_unlock(&vc->vc_mutex);
 
@@ -197,7 +193,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_magic = CODA_SUPER_MAGIC;
        sb->s_op = &coda_super_operations;
        sb->s_d_op = &coda_dentry_operations;
-       sb->s_bdi = &vc->bdi;
+
+       error = super_setup_bdi(sb);
+       if (error)
+               goto error;
 
        /* get root fid from Venus: this needs the root inode */
        error = venus_rootfid(sb, &fid);
@@ -228,7 +227,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 
 error:
        mutex_lock(&vc->vc_mutex);
-       bdi_destroy(&vc->bdi);
        vc->vc_sb = NULL;
        sb->s_fs_info = NULL;
 unlock_out:
@@ -240,7 +238,6 @@ static void coda_put_super(struct super_block *sb)
 {
        struct venus_comm *vcp = coda_vcp(sb);
        mutex_lock(&vcp->vc_mutex);
-       bdi_destroy(&vcp->bdi);
        vcp->vc_sb = NULL;
        sb->s_fs_info = NULL;
        mutex_unlock(&vcp->vc_mutex);
index de622d4..6433650 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -373,6 +373,22 @@ restart:
                }
                spin_lock_irq(&mapping->tree_lock);
 
+               if (!entry) {
+                       /*
+                        * We needed to drop the page_tree lock while calling
+                        * radix_tree_preload() and we didn't have an entry to
+                        * lock.  See if another thread inserted an entry at
+                        * our index during this time.
+                        */
+                       entry = __radix_tree_lookup(&mapping->page_tree, index,
+                                       NULL, &slot);
+                       if (entry) {
+                               radix_tree_preload_end();
+                               spin_unlock_irq(&mapping->tree_lock);
+                               goto restart;
+                       }
+               }
+
                if (pmd_downgrade) {
                        radix_tree_delete(&mapping->page_tree, index);
                        mapping->nrexceptional--;
@@ -388,19 +404,12 @@ restart:
                if (err) {
                        spin_unlock_irq(&mapping->tree_lock);
                        /*
-                        * Someone already created the entry?  This is a
-                        * normal failure when inserting PMDs in a range
-                        * that already contains PTEs.  In that case we want
-                        * to return -EEXIST immediately.
-                        */
-                       if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD))
-                               goto restart;
-                       /*
-                        * Our insertion of a DAX PMD entry failed, most
-                        * likely because it collided with a PTE sized entry
-                        * at a different index in the PMD range.  We haven't
-                        * inserted anything into the radix tree and have no
-                        * waiters to wake.
+                        * Our insertion of a DAX entry failed, most likely
+                        * because we were inserting a PMD entry and it
+                        * collided with a PTE sized entry at a different
+                        * index in the PMD range.  We haven't inserted
+                        * anything into the radix tree and have no waiters to
+                        * wake.
                         */
                        return ERR_PTR(err);
                }
@@ -982,7 +991,7 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
                sector_t start_sector = dax.sector + (offset >> 9);
 
                return blkdev_issue_zeroout(bdev, start_sector,
-                               length >> 9, GFP_NOFS, true);
+                               length >> 9, GFP_NOFS, 0);
        } else {
                if (dax_map_atomic(bdev, &dax) < 0)
                        return PTR_ERR(dax.addr);
index 95c1c8d..9c351bf 100644 (file)
@@ -349,7 +349,6 @@ struct ecryptfs_mount_crypt_stat {
 struct ecryptfs_sb_info {
        struct super_block *wsi_sb;
        struct ecryptfs_mount_crypt_stat mount_crypt_stat;
-       struct backing_dev_info bdi;
 };
 
 /* file private data. */
index 151872d..9014479 100644 (file)
@@ -519,12 +519,11 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
                goto out;
        }
 
-       rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs");
+       rc = super_setup_bdi(s);
        if (rc)
                goto out1;
 
        ecryptfs_set_superblock_private(s, sbi);
-       s->s_bdi = &sbi->bdi;
 
        /* ->kill_sb() will take care of sbi after that point */
        sbi = NULL;
@@ -633,7 +632,6 @@ static void ecryptfs_kill_block_super(struct super_block *sb)
        if (!sb_info)
                return;
        ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
-       bdi_destroy(&sb_info->bdi);
        kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
 }
 
index 65145a3..72934df 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1320,6 +1320,7 @@ void setup_new_exec(struct linux_binprm * bprm)
        else
                set_dumpable(current->mm, suid_dumpable);
 
+       arch_setup_new_exec();
        perf_event_exec();
        __set_task_comm(current, kbasename(bprm->filename), true);
 
index 2e86086..5dc3924 100644 (file)
@@ -64,7 +64,6 @@ struct exofs_dev {
  * our extension to the in-memory superblock
  */
 struct exofs_sb_info {
-       struct backing_dev_info bdi;            /* register our bdi with VFS  */
        struct exofs_sb_stats s_ess;            /* Written often, pre-allocate*/
        int             s_timeout;              /* timeout for OSD operations */
        uint64_t        s_nextid;               /* highest object ID used     */
index 1076a42..819624c 100644 (file)
@@ -464,7 +464,6 @@ static void exofs_put_super(struct super_block *sb)
                            sbi->one_comp.obj.partition);
 
        exofs_sysfs_sb_del(sbi);
-       bdi_destroy(&sbi->bdi);
        exofs_free_sbi(sbi);
        sb->s_fs_info = NULL;
 }
@@ -809,8 +808,12 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
        __sbi_read_stats(sbi);
 
        /* set up operation vectors */
-       sbi->bdi.ra_pages = __ra_pages(&sbi->layout);
-       sb->s_bdi = &sbi->bdi;
+       ret = super_setup_bdi(sb);
+       if (ret) {
+               EXOFS_DBGMSG("Failed to super_setup_bdi\n");
+               goto free_sbi;
+       }
+       sb->s_bdi->ra_pages = __ra_pages(&sbi->layout);
        sb->s_fs_info = sbi;
        sb->s_op = &exofs_sops;
        sb->s_export_op = &exofs_export_ops;
@@ -836,14 +839,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
                goto free_sbi;
        }
 
-       ret = bdi_setup_and_register(&sbi->bdi, "exofs");
-       if (ret) {
-               EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
-               dput(sb->s_root);
-               sb->s_root = NULL;
-               goto free_sbi;
-       }
-
        exofs_sysfs_dbg_print();
        _exofs_print_device("Mounting", opts->dev_name,
                            ore_comp_dev(&sbi->oc, 0),
index f493af6..fb69ee2 100644 (file)
@@ -2466,6 +2466,7 @@ extern int  ext4_setattr(struct dentry *, struct iattr *);
 extern int  ext4_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern void ext4_evict_inode(struct inode *);
 extern void ext4_clear_inode(struct inode *);
+extern int  ext4_file_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern int  ext4_sync_inode(handle_t *, struct inode *);
 extern void ext4_dirty_inode(struct inode *, int);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
index 8210c1f..cefa983 100644 (file)
@@ -744,7 +744,7 @@ const struct file_operations ext4_file_operations = {
 
 const struct inode_operations ext4_file_inode_operations = {
        .setattr        = ext4_setattr,
-       .getattr        = ext4_getattr,
+       .getattr        = ext4_file_getattr,
        .listxattr      = ext4_listxattr,
        .get_acl        = ext4_get_acl,
        .set_acl        = ext4_set_acl,
index 4247d8d..b9ffa9f 100644 (file)
@@ -5390,11 +5390,46 @@ err_out:
 int ext4_getattr(const struct path *path, struct kstat *stat,
                 u32 request_mask, unsigned int query_flags)
 {
-       struct inode *inode;
-       unsigned long long delalloc_blocks;
+       struct inode *inode = d_inode(path->dentry);
+       struct ext4_inode *raw_inode;
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       unsigned int flags;
+
+       if (EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
+               stat->result_mask |= STATX_BTIME;
+               stat->btime.tv_sec = ei->i_crtime.tv_sec;
+               stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
+       }
+
+       flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
+       if (flags & EXT4_APPEND_FL)
+               stat->attributes |= STATX_ATTR_APPEND;
+       if (flags & EXT4_COMPR_FL)
+               stat->attributes |= STATX_ATTR_COMPRESSED;
+       if (flags & EXT4_ENCRYPT_FL)
+               stat->attributes |= STATX_ATTR_ENCRYPTED;
+       if (flags & EXT4_IMMUTABLE_FL)
+               stat->attributes |= STATX_ATTR_IMMUTABLE;
+       if (flags & EXT4_NODUMP_FL)
+               stat->attributes |= STATX_ATTR_NODUMP;
+
+       stat->attributes_mask |= (STATX_ATTR_APPEND |
+                                 STATX_ATTR_COMPRESSED |
+                                 STATX_ATTR_ENCRYPTED |
+                                 STATX_ATTR_IMMUTABLE |
+                                 STATX_ATTR_NODUMP);
 
-       inode = d_inode(path->dentry);
        generic_fillattr(inode, stat);
+       return 0;
+}
+
+int ext4_file_getattr(const struct path *path, struct kstat *stat,
+                     u32 request_mask, unsigned int query_flags)
+{
+       struct inode *inode = d_inode(path->dentry);
+       u64 delalloc_blocks;
+
+       ext4_getattr(path, stat, request_mask, query_flags);
 
        /*
         * If there is inline data in the inode, the inode will normally not
index 6ad612c..07e5e14 100644 (file)
@@ -3912,6 +3912,7 @@ const struct inode_operations ext4_dir_inode_operations = {
        .tmpfile        = ext4_tmpfile,
        .rename         = ext4_rename2,
        .setattr        = ext4_setattr,
+       .getattr        = ext4_getattr,
        .listxattr      = ext4_listxattr,
        .get_acl        = ext4_get_acl,
        .set_acl        = ext4_set_acl,
@@ -3920,6 +3921,7 @@ const struct inode_operations ext4_dir_inode_operations = {
 
 const struct inode_operations ext4_special_inode_operations = {
        .setattr        = ext4_setattr,
+       .getattr        = ext4_getattr,
        .listxattr      = ext4_listxattr,
        .get_acl        = ext4_get_acl,
        .set_acl        = ext4_set_acl,
index 73b184d..5c8fc53 100644 (file)
@@ -85,17 +85,20 @@ errout:
 const struct inode_operations ext4_encrypted_symlink_inode_operations = {
        .get_link       = ext4_encrypted_get_link,
        .setattr        = ext4_setattr,
+       .getattr        = ext4_getattr,
        .listxattr      = ext4_listxattr,
 };
 
 const struct inode_operations ext4_symlink_inode_operations = {
        .get_link       = page_get_link,
        .setattr        = ext4_setattr,
+       .getattr        = ext4_getattr,
        .listxattr      = ext4_listxattr,
 };
 
 const struct inode_operations ext4_fast_symlink_inode_operations = {
        .get_link       = simple_get_link,
        .setattr        = ext4_setattr,
+       .getattr        = ext4_getattr,
        .listxattr      = ext4_listxattr,
 };
index b681b43..c2d7f3a 100644 (file)
@@ -382,9 +382,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
                        wake_up(&fc->blocked_waitq);
 
                if (fc->num_background == fc->congestion_threshold &&
-                   fc->connected && fc->bdi_initialized) {
-                       clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
-                       clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
+                   fc->connected && fc->sb) {
+                       clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+                       clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
                }
                fc->num_background--;
                fc->active_background--;
@@ -573,10 +573,9 @@ void fuse_request_send_background_locked(struct fuse_conn *fc,
        fc->num_background++;
        if (fc->num_background == fc->max_background)
                fc->blocked = 1;
-       if (fc->num_background == fc->congestion_threshold &&
-           fc->bdi_initialized) {
-               set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
-               set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
+       if (fc->num_background == fc->congestion_threshold && fc->sb) {
+               set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+               set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
        }
        list_add_tail(&req->list, &fc->bg_queue);
        flush_bg_queue(fc);
index 32ac2c9..f33341d 100644 (file)
@@ -527,9 +527,6 @@ struct fuse_conn {
        /** Filesystem supports NFS exporting.  Only set in INIT */
        unsigned export_support:1;
 
-       /** Set if bdi is valid */
-       unsigned bdi_initialized:1;
-
        /** write-back cache policy (default is write-through) */
        unsigned writeback_cache:1;
 
@@ -631,9 +628,6 @@ struct fuse_conn {
        /** Negotiated minor version */
        unsigned minor;
 
-       /** Backing dev info */
-       struct backing_dev_info bdi;
-
        /** Entry on the fuse_conn_list */
        struct list_head entry;
 
index 6fe6a88..73cf051 100644 (file)
@@ -386,12 +386,6 @@ static void fuse_send_destroy(struct fuse_conn *fc)
        }
 }
 
-static void fuse_bdi_destroy(struct fuse_conn *fc)
-{
-       if (fc->bdi_initialized)
-               bdi_destroy(&fc->bdi);
-}
-
 static void fuse_put_super(struct super_block *sb)
 {
        struct fuse_conn *fc = get_fuse_conn_super(sb);
@@ -403,7 +397,6 @@ static void fuse_put_super(struct super_block *sb)
        list_del(&fc->entry);
        fuse_ctl_remove_conn(fc);
        mutex_unlock(&fuse_mutex);
-       fuse_bdi_destroy(fc);
 
        fuse_conn_put(fc);
 }
@@ -928,7 +921,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
                        fc->no_flock = 1;
                }
 
-               fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
+               fc->sb->s_bdi->ra_pages =
+                               min(fc->sb->s_bdi->ra_pages, ra_pages);
                fc->minor = arg->minor;
                fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
                fc->max_write = max_t(unsigned, 4096, fc->max_write);
@@ -944,7 +938,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 
        arg->major = FUSE_KERNEL_VERSION;
        arg->minor = FUSE_KERNEL_MINOR_VERSION;
-       arg->max_readahead = fc->bdi.ra_pages * PAGE_SIZE;
+       arg->max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE;
        arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
                FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
                FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
@@ -976,27 +970,18 @@ static void fuse_free_conn(struct fuse_conn *fc)
 static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 {
        int err;
+       char *suffix = "";
 
-       fc->bdi.name = "fuse";
-       fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
-       /* fuse does it's own writeback accounting */
-       fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
-
-       err = bdi_init(&fc->bdi);
+       if (sb->s_bdev)
+               suffix = "-fuseblk";
+       err = super_setup_bdi_name(sb, "%u:%u%s", MAJOR(fc->dev),
+                                  MINOR(fc->dev), suffix);
        if (err)
                return err;
 
-       fc->bdi_initialized = 1;
-
-       if (sb->s_bdev) {
-               err =  bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
-                                   MAJOR(fc->dev), MINOR(fc->dev));
-       } else {
-               err = bdi_register_dev(&fc->bdi, fc->dev);
-       }
-
-       if (err)
-               return err;
+       sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+       /* fuse does it's own writeback accounting */
+       sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
 
        /*
         * For a single fuse filesystem use max 1% of dirty +
@@ -1010,7 +995,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
         *
         *    /sys/class/bdi/<bdi>/max_ratio
         */
-       bdi_set_max_ratio(&fc->bdi, 1);
+       bdi_set_max_ratio(sb->s_bdi, 1);
 
        return 0;
 }
@@ -1113,8 +1098,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
        if (err)
                goto err_dev_free;
 
-       sb->s_bdi = &fc->bdi;
-
        /* Handle umasking inside the fuse code */
        if (sb->s_flags & MS_POSIXACL)
                fc->dont_mask = 1;
@@ -1182,7 +1165,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
  err_dev_free:
        fuse_dev_free(fud);
  err_put_conn:
-       fuse_bdi_destroy(fc);
        fuse_conn_put(fc);
  err_fput:
        fput(file);
index b108e7b..ed67548 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/quotaops.h>
 #include <linux/lockdep.h>
 #include <linux/module.h>
+#include <linux/backing-dev.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -1222,12 +1223,7 @@ static int set_gfs2_super(struct super_block *s, void *data)
 {
        s->s_bdev = data;
        s->s_dev = s->s_bdev->bd_dev;
-
-       /*
-        * We set the bdi here to the queue backing, file systems can
-        * overwrite this in ->fill_super()
-        */
-       s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info;
+       s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
        return 0;
 }
 
index 7163fe0..dde8613 100644 (file)
@@ -136,17 +136,26 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
        vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
        vma->vm_ops = &hugetlb_vm_ops;
 
+       /*
+        * Offset passed to mmap (before page shift) could have been
+        * negative when represented as a (l)off_t.
+        */
+       if (((loff_t)vma->vm_pgoff << PAGE_SHIFT) < 0)
+               return -EINVAL;
+
        if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
                return -EINVAL;
 
        vma_len = (loff_t)(vma->vm_end - vma->vm_start);
+       len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+       /* check for overflow */
+       if (len < vma_len)
+               return -EINVAL;
 
        inode_lock(inode);
        file_accessed(file);
 
        ret = -ENOMEM;
-       len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
-
        if (hugetlb_reserve_pages(inode,
                                vma->vm_pgoff >> huge_page_order(h),
                                len >> huge_page_shift(h), vma,
@@ -155,7 +164,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 
        ret = 0;
        if (vma->vm_flags & VM_WRITE && inode->i_size < len)
-               inode->i_size = len;
+               i_size_write(inode, len);
 out:
        inode_unlock(inode);
 
index d41fab7..19dcf62 100644 (file)
@@ -2145,6 +2145,9 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
        int retval = 0;
        const char *s = nd->name->name;
 
+       if (!*s)
+               flags &= ~LOOKUP_RCU;
+
        nd->last_type = LAST_ROOT; /* if there are only slashes... */
        nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
        nd->depth = 0;
index d560609..6d0f14c 100644 (file)
@@ -554,12 +554,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
        sb->s_magic = NCP_SUPER_MAGIC;
        sb->s_op = &ncp_sops;
        sb->s_d_op = &ncp_dentry_operations;
-       sb->s_bdi = &server->bdi;
 
        server = NCP_SBP(sb);
        memset(server, 0, sizeof(*server));
 
-       error = bdi_setup_and_register(&server->bdi, "ncpfs");
+       error = super_setup_bdi(sb);
        if (error)
                goto out_fput;
 
@@ -568,7 +567,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
        if (data.info_fd != -1) {
                struct socket *info_sock = sockfd_lookup(data.info_fd, &error);
                if (!info_sock)
-                       goto out_bdi;
+                       goto out_fput;
                server->info_sock = info_sock;
                error = -EBADFD;
                if (info_sock->type != SOCK_STREAM)
@@ -746,8 +745,6 @@ out_nls:
 out_fput2:
        if (server->info_sock)
                sockfd_put(server->info_sock);
-out_bdi:
-       bdi_destroy(&server->bdi);
 out_fput:
        sockfd_put(sock);
 out:
@@ -788,7 +785,6 @@ static void ncp_put_super(struct super_block *sb)
        kill_pid(server->m.wdog_pid, SIGTERM, 1);
        put_pid(server->m.wdog_pid);
 
-       bdi_destroy(&server->bdi);
        kfree(server->priv.data);
        kfree(server->auth.object_name);
        vfree(server->rxbuf);
index 55e26fd..366fd63 100644 (file)
@@ -143,7 +143,6 @@ struct ncp_server {
                size_t len;
                __u8 data[128];
        } unexpected_packet;
-       struct backing_dev_info bdi;
 };
 
 extern void ncp_tcp_rcv_proc(struct work_struct *work);
index 390ada8..04d15a0 100644 (file)
@@ -761,9 +761,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
                server->rsize = NFS_MAX_FILE_IO_SIZE;
        server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
-       server->backing_dev_info.name = "nfs";
-       server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
-
        if (server->wsize > max_rpc_payload)
                server->wsize = max_rpc_payload;
        if (server->wsize > NFS_MAX_FILE_IO_SIZE)
@@ -917,12 +914,6 @@ struct nfs_server *nfs_alloc_server(void)
                return NULL;
        }
 
-       if (bdi_init(&server->backing_dev_info)) {
-               nfs_free_iostats(server->io_stats);
-               kfree(server);
-               return NULL;
-       }
-
        ida_init(&server->openowner_id);
        ida_init(&server->lockowner_id);
        pnfs_init_server(server);
@@ -953,7 +944,6 @@ void nfs_free_server(struct nfs_server *server)
        ida_destroy(&server->lockowner_id);
        ida_destroy(&server->openowner_id);
        nfs_free_iostats(server->io_stats);
-       bdi_destroy(&server->backing_dev_info);
        kfree(server);
        nfs_release_automount_timer();
        dprintk("<-- nfs_free_server()\n");
index aab32fc..c1b5fed 100644 (file)
@@ -537,7 +537,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 
        if (put_dreq(dreq))
                nfs_direct_complete(dreq);
-       return 0;
+       return requested_bytes;
 }
 
 /**
@@ -566,7 +566,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
        struct inode *inode = mapping->host;
        struct nfs_direct_req *dreq;
        struct nfs_lock_context *l_ctx;
-       ssize_t result = -EINVAL;
+       ssize_t result = -EINVAL, requested;
        size_t count = iov_iter_count(iter);
        nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
 
@@ -600,14 +600,19 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
        nfs_start_io_direct(inode);
 
        NFS_I(inode)->read_io += count;
-       result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
+       requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
 
        nfs_end_io_direct(inode);
 
-       if (!result) {
+       if (requested > 0) {
                result = nfs_direct_wait(dreq);
-               if (result > 0)
+               if (result > 0) {
+                       requested -= result;
                        iocb->ki_pos += result;
+               }
+               iov_iter_revert(iter, requested);
+       } else {
+               result = requested;
        }
 
 out_release:
@@ -954,7 +959,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 
        if (put_dreq(dreq))
                nfs_direct_write_complete(dreq);
-       return 0;
+       return requested_bytes;
 }
 
 /**
@@ -979,7 +984,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
  */
 ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 {
-       ssize_t result = -EINVAL;
+       ssize_t result = -EINVAL, requested;
        size_t count;
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
@@ -1022,7 +1027,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 
        nfs_start_io_direct(inode);
 
-       result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
+       requested = nfs_direct_write_schedule_iovec(dreq, iter, pos);
 
        if (mapping->nrpages) {
                invalidate_inode_pages2_range(mapping,
@@ -1031,13 +1036,17 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 
        nfs_end_io_direct(inode);
 
-       if (!result) {
+       if (requested > 0) {
                result = nfs_direct_wait(dreq);
                if (result > 0) {
+                       requested -= result;
                        iocb->ki_pos = pos + result;
                        /* XXX: should check the generic_write_sync retval */
                        generic_write_sync(iocb, result);
                }
+               iov_iter_revert(iter, requested);
+       } else {
+               result = requested;
        }
 out_release:
        nfs_direct_req_release(dreq);
index 7b38fed..9dc65d7 100644 (file)
@@ -139,7 +139,7 @@ struct nfs_mount_request {
 };
 
 struct nfs_mount_info {
-       void (*fill_super)(struct super_block *, struct nfs_mount_info *);
+       int (*fill_super)(struct super_block *, struct nfs_mount_info *);
        int (*set_security)(struct super_block *, struct dentry *, struct nfs_mount_info *);
        struct nfs_parsed_mount_data *parsed;
        struct nfs_clone_mount *cloned;
@@ -407,7 +407,7 @@ struct dentry *nfs_fs_mount(struct file_system_type *, int, const char *, void *
 struct dentry * nfs_xdev_mount_common(struct file_system_type *, int,
                const char *, struct nfs_mount_info *);
 void nfs_kill_super(struct super_block *);
-void nfs_fill_super(struct super_block *, struct nfs_mount_info *);
+int nfs_fill_super(struct super_block *, struct nfs_mount_info *);
 
 extern struct rpc_stat nfs_rpcstat;
 
@@ -458,7 +458,7 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
 
 /* super.c */
-void nfs_clone_super(struct super_block *, struct nfs_mount_info *);
+int nfs_clone_super(struct super_block *, struct nfs_mount_info *);
 void nfs_umount_begin(struct super_block *);
 int  nfs_statfs(struct dentry *, struct kstatfs *);
 int  nfs_show_options(struct seq_file *, struct dentry *);
index 54e0f9f..dc69314 100644 (file)
@@ -2315,18 +2315,17 @@ inline void nfs_initialise_sb(struct super_block *sb)
                sb->s_blocksize = nfs_block_bits(server->wsize,
                                                 &sb->s_blocksize_bits);
 
-       sb->s_bdi = &server->backing_dev_info;
-
        nfs_super_set_maxbytes(sb, server->maxfilesize);
 }
 
 /*
  * Finish setting up an NFS2/3 superblock
  */
-void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
+int nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
 {
        struct nfs_parsed_mount_data *data = mount_info->parsed;
        struct nfs_server *server = NFS_SB(sb);
+       int ret;
 
        sb->s_blocksize_bits = 0;
        sb->s_blocksize = 0;
@@ -2344,13 +2343,21 @@ void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
        }
 
        nfs_initialise_sb(sb);
+
+       ret = super_setup_bdi_name(sb, "%u:%u", MAJOR(server->s_dev),
+                                  MINOR(server->s_dev));
+       if (ret)
+               return ret;
+       sb->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD;
+       return 0;
+
 }
 EXPORT_SYMBOL_GPL(nfs_fill_super);
 
 /*
  * Finish setting up a cloned NFS2/3/4 superblock
  */
-void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
+int nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
 {
        const struct super_block *old_sb = mount_info->cloned->sb;
        struct nfs_server *server = NFS_SB(sb);
@@ -2370,6 +2377,10 @@ void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
        }
 
        nfs_initialise_sb(sb);
+
+       sb->s_bdi = bdi_get(old_sb->s_bdi);
+
+       return 0;
 }
 
 static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
@@ -2522,11 +2533,6 @@ static void nfs_get_cache_cookie(struct super_block *sb,
 }
 #endif
 
-static int nfs_bdi_register(struct nfs_server *server)
-{
-       return bdi_register_dev(&server->backing_dev_info, server->s_dev);
-}
-
 int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot,
                        struct nfs_mount_info *mount_info)
 {
@@ -2594,17 +2600,14 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
                nfs_free_server(server);
                server = NULL;
        } else {
-               error = nfs_bdi_register(server);
-               if (error) {
-                       mntroot = ERR_PTR(error);
-                       goto error_splat_super;
-               }
                server->super = s;
        }
 
        if (!s->s_root) {
                /* initial superblock/root creation */
-               mount_info->fill_super(s, mount_info);
+               error = mount_info->fill_super(s, mount_info);
+               if (error)
+                       goto error_splat_super;
                nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned);
        }
 
index abb2c8a..cc341fc 100644 (file)
@@ -263,16 +263,15 @@ int nfs_congestion_kb;
 
 static void nfs_set_page_writeback(struct page *page)
 {
-       struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host);
+       struct inode *inode = page_file_mapping(page)->host;
+       struct nfs_server *nfss = NFS_SERVER(inode);
        int ret = test_set_page_writeback(page);
 
        WARN_ON_ONCE(ret != 0);
 
        if (atomic_long_inc_return(&nfss->writeback) >
-                       NFS_CONGESTION_ON_THRESH) {
-               set_bdi_congested(&nfss->backing_dev_info,
-                                       BLK_RW_ASYNC);
-       }
+                       NFS_CONGESTION_ON_THRESH)
+               set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
 }
 
 static void nfs_end_page_writeback(struct nfs_page *req)
@@ -285,7 +284,7 @@ static void nfs_end_page_writeback(struct nfs_page *req)
 
        end_page_writeback(req->wb_page);
        if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
-               clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+               clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
 }
 
 
@@ -1808,7 +1807,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
        }
        nfss = NFS_SERVER(data->inode);
        if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
-               clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+               clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC);
 
        nfs_init_cinfo(&cinfo, data->inode, data->dreq);
        nfs_commit_end(cinfo.mds);
index 92b4b41..fb5213a 100644 (file)
@@ -242,10 +242,11 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev,
        req->cmd[4] = bufflen & 0xff;
        req->cmd_len = COMMAND_SIZE(INQUIRY);
 
-       error = blk_execute_rq(rq->q, NULL, rq, 1);
-       if (error) {
+       blk_execute_rq(rq->q, NULL, rq, 1);
+       if (req->result) {
                pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
-                       rq->errors);
+                       req->result);
+               error = -EIO;
                goto out_put_request;
        }
 
index dba2ff8..4523346 100644 (file)
@@ -358,6 +358,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 {
        unsigned int len, v, hdr, dlen;
        u32 max_blocksize = svc_max_payload(rqstp);
+       struct kvec *head = rqstp->rq_arg.head;
+       struct kvec *tail = rqstp->rq_arg.tail;
 
        p = decode_fh(p, &args->fh);
        if (!p)
@@ -367,6 +369,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
        args->count = ntohl(*p++);
        args->stable = ntohl(*p++);
        len = args->len = ntohl(*p++);
+       if ((void *)p > head->iov_base + head->iov_len)
+               return 0;
        /*
         * The count must equal the amount of data passed.
         */
@@ -377,9 +381,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
         * Check to make sure that we got the right number of
         * bytes.
         */
-       hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
-       dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
-               + rqstp->rq_arg.tail[0].iov_len - hdr;
+       hdr = (void*)p - head->iov_base;
+       dlen = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len - hdr;
        /*
         * Round the length of the data which was specified up to
         * the next multiple of XDR units and then compare that
@@ -396,7 +399,7 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
                len = args->len = max_blocksize;
        }
        rqstp->rq_vec[0].iov_base = (void*)p;
-       rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
+       rqstp->rq_vec[0].iov_len = head->iov_len - hdr;
        v = 0;
        while (len > rqstp->rq_vec[v].iov_len) {
                len -= rqstp->rq_vec[v].iov_len;
@@ -471,6 +474,8 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
        /* first copy and check from the first page */
        old = (char*)p;
        vec = &rqstp->rq_arg.head[0];
+       if ((void *)old > vec->iov_base + vec->iov_len)
+               return 0;
        avail = vec->iov_len - (old - (char*)vec->iov_base);
        while (len && avail && *old) {
                *new++ = *old++;
index cbeeda1..d86031b 100644 (file)
@@ -2489,7 +2489,7 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
 
 int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
 {
-       if (op->opnum == OP_ILLEGAL)
+       if (op->opnum == OP_ILLEGAL || op->status == nfserr_notsupp)
                return op_encode_hdr_size * sizeof(__be32);
 
        BUG_ON(OPDESC(op)->op_rsize_bop == NULL);
index 31e1f95..59979f0 100644 (file)
@@ -747,6 +747,37 @@ static __be32 map_new_errors(u32 vers, __be32 nfserr)
        return nfserr;
 }
 
+/*
+ * A write procedure can have a large argument, and a read procedure can
+ * have a large reply, but no NFSv2 or NFSv3 procedure has argument and
+ * reply that can both be larger than a page.  The xdr code has taken
+ * advantage of this assumption to be sloppy about bounds checking in
+ * some cases.  Pending a rewrite of the NFSv2/v3 xdr code to fix that
+ * problem, we enforce these assumptions here:
+ */
+static bool nfs_request_too_big(struct svc_rqst *rqstp,
+                               struct svc_procedure *proc)
+{
+       /*
+        * The ACL code has more careful bounds-checking and is not
+        * susceptible to this problem:
+        */
+       if (rqstp->rq_prog != NFS_PROGRAM)
+               return false;
+       /*
+        * Ditto NFSv4 (which can in theory have argument and reply both
+        * more than a page):
+        */
+       if (rqstp->rq_vers >= 4)
+               return false;
+       /* The reply will be small, we're OK: */
+       if (proc->pc_xdrressize > 0 &&
+           proc->pc_xdrressize < XDR_QUADLEN(PAGE_SIZE))
+               return false;
+
+       return rqstp->rq_arg.len > PAGE_SIZE;
+}
+
 int
 nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
 {
@@ -759,6 +790,11 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
                                rqstp->rq_vers, rqstp->rq_proc);
        proc = rqstp->rq_procinfo;
 
+       if (nfs_request_too_big(rqstp, proc)) {
+               dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers);
+               *statp = rpc_garbage_args;
+               return 1;
+       }
        /*
         * Give the xdr decoder a chance to change this if it wants
         * (necessary in the NFSv4.0 compound case)
index 41b468a..de07ff6 100644 (file)
@@ -280,6 +280,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
                                        struct nfsd_writeargs *args)
 {
        unsigned int len, hdr, dlen;
+       struct kvec *head = rqstp->rq_arg.head;
        int v;
 
        p = decode_fh(p, &args->fh);
@@ -300,9 +301,10 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
         * Check to make sure that we got the right number of
         * bytes.
         */
-       hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
-       dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
-               - hdr;
+       hdr = (void*)p - head->iov_base;
+       if (hdr > head->iov_len)
+               return 0;
+       dlen = head->iov_len + rqstp->rq_arg.page_len - hdr;
 
        /*
         * Round the length of the data which was specified up to
@@ -316,7 +318,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
                return 0;
 
        rqstp->rq_vec[0].iov_base = (void*)p;
-       rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
+       rqstp->rq_vec[0].iov_len = head->iov_len - hdr;
        v = 0;
        while (len > rqstp->rq_vec[v].iov_len) {
                len -= rqstp->rq_vec[v].iov_len;
index 19d50f6..9aaf6ca 100644 (file)
@@ -1004,7 +1004,7 @@ out_nfserr:
        else
                err = nfserrno(host_err);
        if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
-               tsk_restore_flags(current, pflags, PF_LESS_THROTTLE);
+               current_restore_flags(pflags, PF_LESS_THROTTLE);
        return err;
 }
 
index e1872f3..9266829 100644 (file)
@@ -1068,7 +1068,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_time_gran = 1;
        sb->s_max_links = NILFS_LINK_MAX;
 
-       sb->s_bdi = bdev_get_queue(sb->s_bdev)->backing_dev_info;
+       sb->s_bdi = bdi_get(sb->s_bdev->bd_bdi);
 
        err = load_nilfs(nilfs, sb);
        if (err)
index 1656843..323f492 100644 (file)
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -91,6 +91,7 @@ slow:
                return ERR_PTR(-ENOMEM);
        }
        d_instantiate(dentry, inode);
+       dentry->d_flags |= DCACHE_RCUACCESS;
        dentry->d_fsdata = (void *)ns->ops;
        d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
        if (d) {
index d0ab7e5..5b51c31 100644 (file)
@@ -1460,27 +1460,10 @@ static void o2net_rx_until_empty(struct work_struct *work)
 
 static int o2net_set_nodelay(struct socket *sock)
 {
-       int ret, val = 1;
-       mm_segment_t oldfs;
+       int val = 1;
 
-       oldfs = get_fs();
-       set_fs(KERNEL_DS);
-
-       /*
-        * Dear unsuspecting programmer,
-        *
-        * Don't use sock_setsockopt() for SOL_TCP.  It doesn't check its level
-        * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will
-        * silently turn into SO_DEBUG.
-        *
-        * Yours,
-        * Keeper of hilariously fragile interfaces.
-        */
-       ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY,
-                                   (char __user *)&val, sizeof(val));
-
-       set_fs(oldfs);
-       return ret;
+       return kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
+                                   (void *)&val, sizeof(val));
 }
 
 static int o2net_set_usertimeout(struct socket *sock)
@@ -1488,7 +1471,7 @@ static int o2net_set_usertimeout(struct socket *sock)
        int user_timeout = O2NET_TCP_USER_TIMEOUT;
 
        return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
-                               (char *)&user_timeout, sizeof(user_timeout));
+                               (void *)&user_timeout, sizeof(user_timeout));
 }
 
 static void o2net_initialize_handshake(void)
index c4ab6fd..e1534c9 100644 (file)
@@ -208,14 +208,19 @@ restart:
                                continue;
                        /*
                         * Skip ops whose filesystem we don't know about unless
-                        * it is being mounted.
+                        * it is being mounted or unmounted.  It is possible for
+                        * a filesystem we don't know about to be unmounted if
+                        * it fails to mount in the kernel after userspace has
+                        * been sent the mount request.
                         */
                        /* XXX: is there a better way to detect this? */
                        } else if (ret == -1 &&
                                   !(op->upcall.type ==
                                        ORANGEFS_VFS_OP_FS_MOUNT ||
                                     op->upcall.type ==
-                                       ORANGEFS_VFS_OP_GETATTR)) {
+                                       ORANGEFS_VFS_OP_GETATTR ||
+                                    op->upcall.type ==
+                                       ORANGEFS_VFS_OP_FS_UMOUNT)) {
                                gossip_debug(GOSSIP_DEV_DEBUG,
                                    "orangefs: skipping op tag %llu %s\n",
                                    llu(op->tag), get_opname_string(op));
index 6333cbb..83b5060 100644 (file)
@@ -521,13 +521,11 @@ int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter,
                size_t n = size;
                if (n > PAGE_SIZE)
                        n = PAGE_SIZE;
-               n = copy_page_from_iter(page, 0, n, iter);
-               if (!n)
+               if (copy_page_from_iter(page, 0, n, iter) != n)
                        return -EFAULT;
                size -= n;
        }
        return 0;
-
 }
 
 /*
index 5e48a0b..8afac46 100644 (file)
@@ -249,6 +249,7 @@ struct orangefs_sb_info_s {
        char devname[ORANGEFS_MAX_SERVER_ADDR_LEN];
        struct super_block *sb;
        int mount_pending;
+       int no_list;
        struct list_head list;
 };
 
index 67c2435..629d8c9 100644 (file)
@@ -263,8 +263,13 @@ int orangefs_remount(struct orangefs_sb_info_s *orangefs_sb)
                if (!new_op)
                        return -ENOMEM;
                new_op->upcall.req.features.features = 0;
-               ret = service_operation(new_op, "orangefs_features", 0);
-               orangefs_features = new_op->downcall.resp.features.features;
+               ret = service_operation(new_op, "orangefs_features",
+                   ORANGEFS_OP_PRIORITY | ORANGEFS_OP_NO_MUTEX);
+               if (!ret)
+                       orangefs_features =
+                           new_op->downcall.resp.features.features;
+               else
+                       orangefs_features = 0;
                op_release(new_op);
        } else {
                orangefs_features = 0;
@@ -488,7 +493,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
 
        if (ret) {
                d = ERR_PTR(ret);
-               goto free_op;
+               goto free_sb_and_op;
        }
 
        /*
@@ -514,6 +519,9 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
        spin_unlock(&orangefs_superblocks_lock);
        op_release(new_op);
 
+       /* Must be removed from the list now. */
+       ORANGEFS_SB(sb)->no_list = 0;
+
        if (orangefs_userspace_version >= 20906) {
                new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES);
                if (!new_op)
@@ -528,6 +536,10 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
 
        return dget(sb->s_root);
 
+free_sb_and_op:
+       /* Will call orangefs_kill_sb with sb not in list. */
+       ORANGEFS_SB(sb)->no_list = 1;
+       deactivate_locked_super(sb);
 free_op:
        gossip_err("orangefs_mount: mount request failed with %d\n", ret);
        if (ret == -EINVAL) {
@@ -553,12 +565,14 @@ void orangefs_kill_sb(struct super_block *sb)
         */
         orangefs_unmount_sb(sb);
 
-       /* remove the sb from our list of orangefs specific sb's */
-
-       spin_lock(&orangefs_superblocks_lock);
-       __list_del_entry(&ORANGEFS_SB(sb)->list);       /* not list_del_init */
-       ORANGEFS_SB(sb)->list.prev = NULL;
-       spin_unlock(&orangefs_superblocks_lock);
+       if (!ORANGEFS_SB(sb)->no_list) {
+               /* remove the sb from our list of orangefs specific sb's */
+               spin_lock(&orangefs_superblocks_lock);
+               /* not list_del_init */
+               __list_del_entry(&ORANGEFS_SB(sb)->list);
+               ORANGEFS_SB(sb)->list.prev = NULL;
+               spin_unlock(&orangefs_superblocks_lock);
+       }
 
        /*
         * make sure that ORANGEFS_DEV_REMOUNT_ALL loop that might've seen us
index 8f91ec6..d04ea43 100644 (file)
@@ -1074,6 +1074,7 @@ static int sysctl_check_table(const char *path, struct ctl_table *table)
 
                if ((table->proc_handler == proc_dostring) ||
                    (table->proc_handler == proc_dointvec) ||
+                   (table->proc_handler == proc_douintvec) ||
                    (table->proc_handler == proc_dointvec_minmax) ||
                    (table->proc_handler == proc_dointvec_jiffies) ||
                    (table->proc_handler == proc_dointvec_userhz_jiffies) ||
index f08bd31..3125780 100644 (file)
@@ -900,7 +900,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
                unsigned long addr, pmd_t *pmdp)
 {
-       pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
+       pmd_t pmd = *pmdp;
+
+       /* See comment in change_huge_pmd() */
+       pmdp_invalidate(vma, addr, pmdp);
+       if (pmd_dirty(*pmdp))
+               pmd = pmd_mkdirty(pmd);
+       if (pmd_young(*pmdp))
+               pmd = pmd_mkyoung(pmd);
 
        pmd = pmd_wrprotect(pmd);
        pmd = pmd_clear_soft_dirty(pmd);
index fa0be59..a257b87 100644 (file)
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -130,9 +130,13 @@ EXPORT_SYMBOL(vfs_getattr);
 int vfs_statx_fd(unsigned int fd, struct kstat *stat,
                 u32 request_mask, unsigned int query_flags)
 {
-       struct fd f = fdget_raw(fd);
+       struct fd f;
        int error = -EBADF;
 
+       if (query_flags & ~KSTAT_QUERY_FLAGS)
+               return -EINVAL;
+
+       f = fdget_raw(fd);
        if (f.file) {
                error = vfs_getattr(&f.file->f_path, stat,
                                    request_mask, query_flags);
@@ -155,9 +159,6 @@ EXPORT_SYMBOL(vfs_statx_fd);
  * Additionally, the use of AT_SYMLINK_NOFOLLOW in flags will prevent a symlink
  * at the given name from being referenced.
  *
- * The caller must have preset stat->request_mask as for vfs_getattr().  The
- * flags are also used to load up stat->query_flags.
- *
  * 0 will be returned on success, and a -ve error code if unsuccessful.
  */
 int vfs_statx(int dfd, const char __user *filename, int flags,
@@ -509,58 +510,50 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
 }
 #endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */
 
-static inline int __put_timestamp(struct timespec *kts,
-                                 struct statx_timestamp __user *uts)
-{
-       return (__put_user(kts->tv_sec,         &uts->tv_sec            ) ||
-               __put_user(kts->tv_nsec,        &uts->tv_nsec           ) ||
-               __put_user(0,                   &uts->__reserved        ));
-}
-
-/*
- * Set the statx results.
- */
-static long statx_set_result(struct kstat *stat, struct statx __user *buffer)
+static noinline_for_stack int
+cp_statx(const struct kstat *stat, struct statx __user *buffer)
 {
-       uid_t uid = from_kuid_munged(current_user_ns(), stat->uid);
-       gid_t gid = from_kgid_munged(current_user_ns(), stat->gid);
-
-       if (__put_user(stat->result_mask,       &buffer->stx_mask       ) ||
-           __put_user(stat->mode,              &buffer->stx_mode       ) ||
-           __clear_user(&buffer->__spare0, sizeof(buffer->__spare0))     ||
-           __put_user(stat->nlink,             &buffer->stx_nlink      ) ||
-           __put_user(uid,                     &buffer->stx_uid        ) ||
-           __put_user(gid,                     &buffer->stx_gid        ) ||
-           __put_user(stat->attributes,        &buffer->stx_attributes ) ||
-           __put_user(stat->blksize,           &buffer->stx_blksize    ) ||
-           __put_user(MAJOR(stat->rdev),       &buffer->stx_rdev_major ) ||
-           __put_user(MINOR(stat->rdev),       &buffer->stx_rdev_minor ) ||
-           __put_user(MAJOR(stat->dev),        &buffer->stx_dev_major  ) ||
-           __put_user(MINOR(stat->dev),        &buffer->stx_dev_minor  ) ||
-           __put_timestamp(&stat->atime,       &buffer->stx_atime      ) ||
-           __put_timestamp(&stat->btime,       &buffer->stx_btime      ) ||
-           __put_timestamp(&stat->ctime,       &buffer->stx_ctime      ) ||
-           __put_timestamp(&stat->mtime,       &buffer->stx_mtime      ) ||
-           __put_user(stat->ino,               &buffer->stx_ino        ) ||
-           __put_user(stat->size,              &buffer->stx_size       ) ||
-           __put_user(stat->blocks,            &buffer->stx_blocks     ) ||
-           __clear_user(&buffer->__spare1, sizeof(buffer->__spare1))     ||
-           __clear_user(&buffer->__spare2, sizeof(buffer->__spare2)))
-               return -EFAULT;
-
-       return 0;
+       struct statx tmp;
+
+       memset(&tmp, 0, sizeof(tmp));
+
+       tmp.stx_mask = stat->result_mask;
+       tmp.stx_blksize = stat->blksize;
+       tmp.stx_attributes = stat->attributes;
+       tmp.stx_nlink = stat->nlink;
+       tmp.stx_uid = from_kuid_munged(current_user_ns(), stat->uid);
+       tmp.stx_gid = from_kgid_munged(current_user_ns(), stat->gid);
+       tmp.stx_mode = stat->mode;
+       tmp.stx_ino = stat->ino;
+       tmp.stx_size = stat->size;
+       tmp.stx_blocks = stat->blocks;
+       tmp.stx_attributes_mask = stat->attributes_mask;
+       tmp.stx_atime.tv_sec = stat->atime.tv_sec;
+       tmp.stx_atime.tv_nsec = stat->atime.tv_nsec;
+       tmp.stx_btime.tv_sec = stat->btime.tv_sec;
+       tmp.stx_btime.tv_nsec = stat->btime.tv_nsec;
+       tmp.stx_ctime.tv_sec = stat->ctime.tv_sec;
+       tmp.stx_ctime.tv_nsec = stat->ctime.tv_nsec;
+       tmp.stx_mtime.tv_sec = stat->mtime.tv_sec;
+       tmp.stx_mtime.tv_nsec = stat->mtime.tv_nsec;
+       tmp.stx_rdev_major = MAJOR(stat->rdev);
+       tmp.stx_rdev_minor = MINOR(stat->rdev);
+       tmp.stx_dev_major = MAJOR(stat->dev);
+       tmp.stx_dev_minor = MINOR(stat->dev);
+
+       return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
 }
 
 /**
  * sys_statx - System call to get enhanced stats
  * @dfd: Base directory to pathwalk from *or* fd to stat.
- * @filename: File to stat *or* NULL.
+ * @filename: File to stat or "" with AT_EMPTY_PATH
  * @flags: AT_* flags to control pathwalk.
  * @mask: Parts of statx struct actually required.
  * @buffer: Result buffer.
  *
- * Note that if filename is NULL, then it does the equivalent of fstat() using
- * dfd to indicate the file of interest.
+ * Note that fstat() can be emulated by setting dfd to the fd of interest,
+ * supplying "" as the filename and setting AT_EMPTY_PATH in the flags.
  */
 SYSCALL_DEFINE5(statx,
                int, dfd, const char __user *, filename, unsigned, flags,
@@ -570,18 +563,16 @@ SYSCALL_DEFINE5(statx,
        struct kstat stat;
        int error;
 
+       if (mask & STATX__RESERVED)
+               return -EINVAL;
        if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE)
                return -EINVAL;
-       if (!access_ok(VERIFY_WRITE, buffer, sizeof(*buffer)))
-               return -EFAULT;
 
-       if (filename)
-               error = vfs_statx(dfd, filename, flags, &stat, mask);
-       else
-               error = vfs_statx_fd(dfd, &stat, mask, flags);
+       error = vfs_statx(dfd, filename, flags, &stat, mask);
        if (error)
                return error;
-       return statx_set_result(&stat, buffer);
+
+       return cp_statx(&stat, buffer);
 }
 
 /* Caller is here responsible for sufficient locking (ie. inode->i_lock) */
index b8b6a08..adb0c0d 100644 (file)
@@ -446,6 +446,10 @@ void generic_shutdown_super(struct super_block *sb)
        hlist_del_init(&sb->s_instances);
        spin_unlock(&sb_lock);
        up_write(&sb->s_umount);
+       if (sb->s_bdi != &noop_backing_dev_info) {
+               bdi_put(sb->s_bdi);
+               sb->s_bdi = &noop_backing_dev_info;
+       }
 }
 
 EXPORT_SYMBOL(generic_shutdown_super);
@@ -1049,12 +1053,8 @@ static int set_bdev_super(struct super_block *s, void *data)
 {
        s->s_bdev = data;
        s->s_dev = s->s_bdev->bd_dev;
+       s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
 
-       /*
-        * We set the bdi here to the queue backing, file systems can
-        * overwrite this in ->fill_super()
-        */
-       s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info;
        return 0;
 }
 
@@ -1256,6 +1256,49 @@ out:
 }
 
 /*
+ * Setup private BDI for given superblock. It gets automatically cleaned up
+ * in generic_shutdown_super().
+ */
+int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
+{
+       struct backing_dev_info *bdi;
+       int err;
+       va_list args;
+
+       bdi = bdi_alloc(GFP_KERNEL);
+       if (!bdi)
+               return -ENOMEM;
+
+       bdi->name = sb->s_type->name;
+
+       va_start(args, fmt);
+       err = bdi_register_va(bdi, fmt, args);
+       va_end(args);
+       if (err) {
+               bdi_put(bdi);
+               return err;
+       }
+       WARN_ON(sb->s_bdi != &noop_backing_dev_info);
+       sb->s_bdi = bdi;
+
+       return 0;
+}
+EXPORT_SYMBOL(super_setup_bdi_name);
+
+/*
+ * Setup private BDI for given superblock. It gets automatically cleaned up
+ * in generic_shutdown_super().
+ */
+int super_setup_bdi(struct super_block *sb)
+{
+       static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
+
+       return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name,
+                                   atomic_long_inc_return(&bdi_seq));
+}
+EXPORT_SYMBOL(super_setup_bdi);
+
+/*
  * This is an internal function, please use sb_end_{write,pagefault,intwrite}
  * instead.
  */
index b803213..39c75a8 100644 (file)
@@ -108,7 +108,7 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
 {
        const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
        struct kobject *kobj = of->kn->parent->priv;
-       size_t len;
+       ssize_t len;
 
        /*
         * If buf != of->prealloc_buf, we don't know how
@@ -117,13 +117,15 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
        if (WARN_ON_ONCE(buf != of->prealloc_buf))
                return 0;
        len = ops->show(kobj, of->kn->priv, buf);
+       if (len < 0)
+               return len;
        if (pos) {
                if (len <= pos)
                        return 0;
                len -= pos;
                memmove(buf, buf + pos, len);
        }
-       return min(count, len);
+       return min_t(ssize_t, count, len);
 }
 
 /* kernfs write callback for regular sysfs files */
index 1e712a3..718b749 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/math64.h>
 #include <linux/uaccess.h>
 #include <linux/random.h>
+#include <linux/ctype.h>
 #include "ubifs.h"
 
 static DEFINE_SPINLOCK(dbg_lock);
@@ -286,8 +287,10 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
                        break;
                }
 
-               pr_err("\t%d: %s (%s)\n",
-                      count++, dent->name, get_dent_type(dent->type));
+               pr_err("\t%d: inode %llu, type %s, len %d\n",
+                      count++, (unsigned long long) le64_to_cpu(dent->inum),
+                      get_dent_type(dent->type),
+                      le16_to_cpu(dent->nlen));
 
                fname_name(&nm) = dent->name;
                fname_len(&nm) = le16_to_cpu(dent->nlen);
@@ -464,7 +467,8 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node)
                        pr_err("(bad name length, not printing, bad or corrupted node)");
                else {
                        for (i = 0; i < nlen && dent->name[i]; i++)
-                               pr_cont("%c", dent->name[i]);
+                               pr_cont("%c", isprint(dent->name[i]) ?
+                                       dent->name[i] : '?');
                }
                pr_cont("\n");
 
index 30825d8..b777bdd 100644 (file)
@@ -606,8 +606,8 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
        }
 
        while (1) {
-               dbg_gen("feed '%s', ino %llu, new f_pos %#x",
-                       dent->name, (unsigned long long)le64_to_cpu(dent->inum),
+               dbg_gen("ino %llu, new f_pos %#x",
+                       (unsigned long long)le64_to_cpu(dent->inum),
                        key_hash_flash(c, &dent->key));
                ubifs_assert(le64_to_cpu(dent->ch.sqnum) >
                             ubifs_inode(dir)->creat_sqnum);
@@ -748,6 +748,11 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
                goto out_fname;
 
        lock_2_inodes(dir, inode);
+
+       /* Handle O_TMPFILE corner case, it is allowed to link an O_TMPFILE. */
+       if (inode->i_nlink == 0)
+               ubifs_delete_orphan(c, inode->i_ino);
+
        inc_nlink(inode);
        ihold(inode);
        inode->i_ctime = ubifs_current_time(inode);
@@ -768,6 +773,8 @@ out_cancel:
        dir->i_size -= sz_change;
        dir_ui->ui_size = dir->i_size;
        drop_nlink(inode);
+       if (inode->i_nlink == 0)
+               ubifs_add_orphan(c, inode->i_ino);
        unlock_2_inodes(dir, inode);
        ubifs_release_budget(c, &req);
        iput(inode);
@@ -1068,8 +1075,10 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
        }
 
        err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
-       if (err)
+       if (err) {
+               kfree(dev);
                goto out_budg;
+       }
 
        sz_change = CALC_DENT_SIZE(fname_len(&nm));
 
@@ -1316,9 +1325,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
        unsigned int uninitialized_var(saved_nlink);
        struct fscrypt_name old_nm, new_nm;
 
-       if (flags & ~RENAME_NOREPLACE)
-               return -EINVAL;
-
        /*
         * Budget request settings: deletion direntry, new direntry, removing
         * the old inode, and changing old and new parent directory inodes.
index b73811b..cf4cc99 100644 (file)
@@ -1827,7 +1827,6 @@ static void ubifs_put_super(struct super_block *sb)
        }
 
        ubifs_umount(c);
-       bdi_destroy(&c->bdi);
        ubi_close_volume(c->ubi);
        mutex_unlock(&c->umount_mutex);
 }
@@ -2019,29 +2018,25 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
                goto out;
        }
 
+       err = ubifs_parse_options(c, data, 0);
+       if (err)
+               goto out_close;
+
        /*
         * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For
         * UBIFS, I/O is not deferred, it is done immediately in readpage,
         * which means the user would have to wait not just for their own I/O
         * but the read-ahead I/O as well i.e. completely pointless.
         *
-        * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
+        * Read-ahead will be disabled because @sb->s_bdi->ra_pages is 0. Also
+        * @sb->s_bdi->capabilities are initialized to 0 so there won't be any
+        * writeback happening.
         */
-       c->bdi.name = "ubifs",
-       c->bdi.capabilities = 0;
-       err  = bdi_init(&c->bdi);
+       err = super_setup_bdi_name(sb, "ubifs_%d_%d", c->vi.ubi_num,
+                                  c->vi.vol_id);
        if (err)
                goto out_close;
-       err = bdi_register(&c->bdi, NULL, "ubifs_%d_%d",
-                          c->vi.ubi_num, c->vi.vol_id);
-       if (err)
-               goto out_bdi;
-
-       err = ubifs_parse_options(c, data, 0);
-       if (err)
-               goto out_bdi;
 
-       sb->s_bdi = &c->bdi;
        sb->s_fs_info = c;
        sb->s_magic = UBIFS_SUPER_MAGIC;
        sb->s_blocksize = UBIFS_BLOCK_SIZE;
@@ -2080,8 +2075,6 @@ out_umount:
        ubifs_umount(c);
 out_unlock:
        mutex_unlock(&c->umount_mutex);
-out_bdi:
-       bdi_destroy(&c->bdi);
 out_close:
        ubi_close_volume(c->ubi);
 out:
index 4d57e48..4da10a6 100644 (file)
@@ -972,7 +972,6 @@ struct ubifs_debug_info;
  * struct ubifs_info - UBIFS file-system description data structure
  * (per-superblock).
  * @vfs_sb: VFS @struct super_block object
- * @bdi: backing device info object to make VFS happy and disable read-ahead
  *
  * @highest_inum: highest used inode number
  * @max_sqnum: current global sequence number
@@ -1220,7 +1219,6 @@ struct ubifs_debug_info;
  */
 struct ubifs_info {
        struct super_block *vfs_sb;
-       struct backing_dev_info bdi;
 
        ino_t highest_inum;
        unsigned long long max_sqnum;
@@ -1461,7 +1459,6 @@ extern const struct inode_operations ubifs_file_inode_operations;
 extern const struct file_operations ubifs_dir_operations;
 extern const struct inode_operations ubifs_dir_inode_operations;
 extern const struct inode_operations ubifs_symlink_inode_operations;
-extern struct backing_dev_info ubifs_backing_dev_info;
 extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
 
 /* io.c */
index 1d227b0..f7555fc 100644 (file)
@@ -1756,7 +1756,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
         *      protocols: aa:... bb:...
         */
        seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
-                  pending, total, UFFD_API, UFFD_API_FEATURES,
+                  pending, total, UFFD_API, ctx->features,
                   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
 }
 #endif
index eb00bc1..39f8604 100644 (file)
@@ -125,8 +125,7 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
 extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
 extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
 extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
-extern int xfs_dir2_sf_verify(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *sfp,
-               int size);
+extern int xfs_dir2_sf_verify(struct xfs_inode *ip);
 
 /* xfs_dir2_readdir.c */
 extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx,
index 96b45cd..e84af09 100644 (file)
@@ -632,36 +632,49 @@ xfs_dir2_sf_check(
 /* Verify the consistency of an inline directory. */
 int
 xfs_dir2_sf_verify(
-       struct xfs_mount                *mp,
-       struct xfs_dir2_sf_hdr          *sfp,
-       int                             size)
+       struct xfs_inode                *ip)
 {
+       struct xfs_mount                *mp = ip->i_mount;
+       struct xfs_dir2_sf_hdr          *sfp;
        struct xfs_dir2_sf_entry        *sfep;
        struct xfs_dir2_sf_entry        *next_sfep;
        char                            *endp;
        const struct xfs_dir_ops        *dops;
+       struct xfs_ifork                *ifp;
        xfs_ino_t                       ino;
        int                             i;
        int                             i8count;
        int                             offset;
+       int                             size;
+       int                             error;
        __uint8_t                       filetype;
 
+       ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL);
+       /*
+        * xfs_iread calls us before xfs_setup_inode sets up ip->d_ops,
+        * so we can only trust the mountpoint to have the right pointer.
+        */
        dops = xfs_dir_get_ops(mp, NULL);
 
+       ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+       sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data;
+       size = ifp->if_bytes;
+
        /*
         * Give up if the directory is way too short.
         */
-       XFS_WANT_CORRUPTED_RETURN(mp, size >
-                       offsetof(struct xfs_dir2_sf_hdr, parent));
-       XFS_WANT_CORRUPTED_RETURN(mp, size >=
-                       xfs_dir2_sf_hdr_size(sfp->i8count));
+       if (size <= offsetof(struct xfs_dir2_sf_hdr, parent) ||
+           size < xfs_dir2_sf_hdr_size(sfp->i8count))
+               return -EFSCORRUPTED;
 
        endp = (char *)sfp + size;
 
        /* Check .. entry */
        ino = dops->sf_get_parent_ino(sfp);
        i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
-       XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino));
+       error = xfs_dir_ino_validate(mp, ino);
+       if (error)
+               return error;
        offset = dops->data_first_offset;
 
        /* Check all reported entries */
@@ -672,12 +685,12 @@ xfs_dir2_sf_verify(
                 * Check the fixed-offset parts of the structure are
                 * within the data buffer.
                 */
-               XFS_WANT_CORRUPTED_RETURN(mp,
-                               ((char *)sfep + sizeof(*sfep)) < endp);
+               if (((char *)sfep + sizeof(*sfep)) >= endp)
+                       return -EFSCORRUPTED;
 
                /* Don't allow names with known bad length. */
-               XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen > 0);
-               XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen < MAXNAMELEN);
+               if (sfep->namelen == 0)
+                       return -EFSCORRUPTED;
 
                /*
                 * Check that the variable-length part of the structure is
@@ -685,33 +698,39 @@ xfs_dir2_sf_verify(
                 * name component, so nextentry is an acceptable test.
                 */
                next_sfep = dops->sf_nextentry(sfp, sfep);
-               XFS_WANT_CORRUPTED_RETURN(mp, endp >= (char *)next_sfep);
+               if (endp < (char *)next_sfep)
+                       return -EFSCORRUPTED;
 
                /* Check that the offsets always increase. */
-               XFS_WANT_CORRUPTED_RETURN(mp,
-                               xfs_dir2_sf_get_offset(sfep) >= offset);
+               if (xfs_dir2_sf_get_offset(sfep) < offset)
+                       return -EFSCORRUPTED;
 
                /* Check the inode number. */
                ino = dops->sf_get_ino(sfp, sfep);
                i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
-               XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino));
+               error = xfs_dir_ino_validate(mp, ino);
+               if (error)
+                       return error;
 
                /* Check the file type. */
                filetype = dops->sf_get_ftype(sfep);
-               XFS_WANT_CORRUPTED_RETURN(mp, filetype < XFS_DIR3_FT_MAX);
+               if (filetype >= XFS_DIR3_FT_MAX)
+                       return -EFSCORRUPTED;
 
                offset = xfs_dir2_sf_get_offset(sfep) +
                                dops->data_entsize(sfep->namelen);
 
                sfep = next_sfep;
        }
-       XFS_WANT_CORRUPTED_RETURN(mp, i8count == sfp->i8count);
-       XFS_WANT_CORRUPTED_RETURN(mp, (void *)sfep == (void *)endp);
+       if (i8count != sfp->i8count)
+               return -EFSCORRUPTED;
+       if ((void *)sfep != (void *)endp)
+               return -EFSCORRUPTED;
 
        /* Make sure this whole thing ought to be in local format. */
-       XFS_WANT_CORRUPTED_RETURN(mp, offset +
-              (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
-              (uint)sizeof(xfs_dir2_block_tail_t) <= mp->m_dir_geo->blksize);
+       if (offset + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+           (uint)sizeof(xfs_dir2_block_tail_t) > mp->m_dir_geo->blksize)
+               return -EFSCORRUPTED;
 
        return 0;
 }
index 9653e96..8a37efe 100644 (file)
@@ -212,6 +212,16 @@ xfs_iformat_fork(
        if (error)
                return error;
 
+       /* Check inline dir contents. */
+       if (S_ISDIR(VFS_I(ip)->i_mode) &&
+           dip->di_format == XFS_DINODE_FMT_LOCAL) {
+               error = xfs_dir2_sf_verify(ip);
+               if (error) {
+                       xfs_idestroy_fork(ip, XFS_DATA_FORK);
+                       return error;
+               }
+       }
+
        if (xfs_is_reflink_inode(ip)) {
                ASSERT(ip->i_cowfp == NULL);
                xfs_ifork_init_cow(ip);
@@ -322,8 +332,6 @@ xfs_iformat_local(
        int             whichfork,
        int             size)
 {
-       int             error;
-
        /*
         * If the size is unreasonable, then something
         * is wrong and we just bail out rather than crash in
@@ -339,14 +347,6 @@ xfs_iformat_local(
                return -EFSCORRUPTED;
        }
 
-       if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) {
-               error = xfs_dir2_sf_verify(ip->i_mount,
-                               (struct xfs_dir2_sf_hdr *)XFS_DFORK_DPTR(dip),
-                               size);
-               if (error)
-                       return error;
-       }
-
        xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size);
        return 0;
 }
@@ -867,7 +867,7 @@ xfs_iextents_copy(
  * In these cases, the format always takes precedence, because the
  * format indicates the current state of the fork.
  */
-int
+void
 xfs_iflush_fork(
        xfs_inode_t             *ip,
        xfs_dinode_t            *dip,
@@ -877,7 +877,6 @@ xfs_iflush_fork(
        char                    *cp;
        xfs_ifork_t             *ifp;
        xfs_mount_t             *mp;
-       int                     error;
        static const short      brootflag[2] =
                { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
        static const short      dataflag[2] =
@@ -886,7 +885,7 @@ xfs_iflush_fork(
                { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
 
        if (!iip)
-               return 0;
+               return;
        ifp = XFS_IFORK_PTR(ip, whichfork);
        /*
         * This can happen if we gave up in iformat in an error path,
@@ -894,19 +893,12 @@ xfs_iflush_fork(
         */
        if (!ifp) {
                ASSERT(whichfork == XFS_ATTR_FORK);
-               return 0;
+               return;
        }
        cp = XFS_DFORK_PTR(dip, whichfork);
        mp = ip->i_mount;
        switch (XFS_IFORK_FORMAT(ip, whichfork)) {
        case XFS_DINODE_FMT_LOCAL:
-               if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) {
-                       error = xfs_dir2_sf_verify(mp,
-                                       (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data,
-                                       ifp->if_bytes);
-                       if (error)
-                               return error;
-               }
                if ((iip->ili_fields & dataflag[whichfork]) &&
                    (ifp->if_bytes > 0)) {
                        ASSERT(ifp->if_u1.if_data != NULL);
@@ -959,7 +951,6 @@ xfs_iflush_fork(
                ASSERT(0);
                break;
        }
-       return 0;
 }
 
 /*
index 132dc59..7fb8365 100644 (file)
@@ -140,7 +140,7 @@ typedef struct xfs_ifork {
 struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
 
 int            xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
-int            xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
+void           xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
                                struct xfs_inode_log_item *, int);
 void           xfs_idestroy_fork(struct xfs_inode *, int);
 void           xfs_idata_realloc(struct xfs_inode *, int, int);
index 8b75dce..8795e9c 100644 (file)
@@ -81,7 +81,7 @@ xfs_zero_extent(
        return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)),
                block << (mp->m_super->s_blocksize_bits - 9),
                count_fsb << (mp->m_super->s_blocksize_bits - 9),
-               GFP_NOFS, true);
+               GFP_NOFS, 0);
 }
 
 int
@@ -1311,8 +1311,16 @@ xfs_free_file_space(
        /*
         * Now that we've unmap all full blocks we'll have to zero out any
         * partial block at the beginning and/or end.  xfs_zero_range is
-        * smart enough to skip any holes, including those we just created.
+        * smart enough to skip any holes, including those we just created,
+        * but we must take care not to zero beyond EOF and enlarge i_size.
         */
+
+       if (offset >= XFS_ISIZE(ip))
+               return 0;
+
+       if (offset + len > XFS_ISIZE(ip))
+               len = XFS_ISIZE(ip) - offset;
+
        return xfs_zero_range(ip, offset, len, NULL);
 }
 
index c7fe2c2..7605d83 100644 (file)
@@ -50,6 +50,7 @@
 #include "xfs_log.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_reflink.h"
+#include "xfs_dir2_priv.h"
 
 kmem_zone_t *xfs_inode_zone;
 
@@ -3475,7 +3476,6 @@ xfs_iflush_int(
        struct xfs_inode_log_item *iip = ip->i_itemp;
        struct xfs_dinode       *dip;
        struct xfs_mount        *mp = ip->i_mount;
-       int                     error;
 
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
        ASSERT(xfs_isiflocked(ip));
@@ -3547,6 +3547,12 @@ xfs_iflush_int(
        if (ip->i_d.di_version < 3)
                ip->i_d.di_flushiter++;
 
+       /* Check the inline directory data. */
+       if (S_ISDIR(VFS_I(ip)->i_mode) &&
+           ip->i_d.di_format == XFS_DINODE_FMT_LOCAL &&
+           xfs_dir2_sf_verify(ip))
+               goto corrupt_out;
+
        /*
         * Copy the dirty parts of the inode into the on-disk inode.  We always
         * copy out the core of the inode, because if the inode is dirty at all
@@ -3558,14 +3564,9 @@ xfs_iflush_int(
        if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
                ip->i_d.di_flushiter = 0;
 
-       error = xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
-       if (error)
-               return error;
-       if (XFS_IFORK_Q(ip)) {
-               error = xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
-               if (error)
-                       return error;
-       }
+       xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
+       if (XFS_IFORK_Q(ip))
+               xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
        xfs_inobp_check(mp, bp);
 
        /*
index 229cc6a..ebfc133 100644 (file)
@@ -516,6 +516,20 @@ xfs_vn_getattr(
        stat->blocks =
                XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
 
+       if (ip->i_d.di_version == 3) {
+               if (request_mask & STATX_BTIME) {
+                       stat->result_mask |= STATX_BTIME;
+                       stat->btime.tv_sec = ip->i_d.di_crtime.t_sec;
+                       stat->btime.tv_nsec = ip->i_d.di_crtime.t_nsec;
+               }
+       }
+
+       if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
+               stat->attributes |= STATX_ATTR_IMMUTABLE;
+       if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
+               stat->attributes |= STATX_ATTR_APPEND;
+       if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP)
+               stat->attributes |= STATX_ATTR_NODUMP;
 
        switch (inode->i_mode & S_IFMT) {
        case S_IFBLK:
index 2a6d9b1..26d67ce 100644 (file)
@@ -583,7 +583,7 @@ xfs_inumbers(
                return error;
 
        bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer)));
-       buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP);
+       buffer = kmem_zalloc(bcount * sizeof(*buffer), KM_SLEEP);
        do {
                struct xfs_inobt_rec_incore     r;
                int                             stat;
index ef0ae8a..2fc678e 100644 (file)
@@ -88,6 +88,7 @@ acpi_evaluate_dsm_typed(acpi_handle handle, const u8 *uuid, u64 rev, u64 func,
        }
 
 bool acpi_dev_found(const char *hid);
+bool acpi_dev_present(const char *hid, const char *uid, s64 hrv);
 
 #ifdef CONFIG_ACPI
 
@@ -386,6 +387,7 @@ struct acpi_data_node {
        const char *name;
        acpi_handle handle;
        struct fwnode_handle fwnode;
+       struct fwnode_handle *parent;
        struct acpi_device_data data;
        struct list_head sibling;
        struct kobject kobj;
index 427a7c3..2010c05 100644 (file)
@@ -103,6 +103,7 @@ struct cppc_perf_caps {
        u32 highest_perf;
        u32 nominal_perf;
        u32 lowest_perf;
+       u32 lowest_nonlinear_perf;
 };
 
 struct cppc_perf_ctrls {
@@ -115,7 +116,7 @@ struct cppc_perf_fb_ctrs {
        u64 reference;
        u64 delivered;
        u64 reference_perf;
-       u64 ctr_wrap_time;
+       u64 wraparound_time;
 };
 
 /* Per CPU container for runtime CPPC management. */
diff --git a/include/asm-generic/extable.h b/include/asm-generic/extable.h
new file mode 100644 (file)
index 0000000..ca14c66
--- /dev/null
@@ -0,0 +1,26 @@
+#ifndef __ASM_GENERIC_EXTABLE_H
+#define __ASM_GENERIC_EXTABLE_H
+
+/*
+ * The exception table consists of pairs of addresses: the first is the
+ * address of an instruction that is allowed to fault, and the second is
+ * the address at which the program should continue.  No registers are
+ * modified, so it is entirely up to the continuation code to figure out
+ * what to do.
+ *
+ * All the routines below use bits of fixup code that are out of line
+ * with the main instruction path.  This means when everything is well,
+ * we don't even have to jump over them.  Further, they do not intrude
+ * on our cache or tlb entries.
+ */
+
+struct exception_table_entry
+{
+       unsigned long insn, fixup;
+};
+
+
+struct pt_regs;
+extern int fixup_exception(struct pt_regs *regs);
+
+#endif
index cc6bb31..bbe4bb4 100644 (file)
@@ -6,7 +6,6 @@
  * on any machine that has kernel and user data in the same
  * address space, e.g. all NOMMU machines.
  */
-#include <linux/sched.h>
 #include <linux/string.h>
 
 #include <asm/segment.h>
@@ -35,9 +34,6 @@ static inline void set_fs(mm_segment_t fs)
 #define segment_eq(a, b) ((a).seg == (b).seg)
 #endif
 
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
 #define access_ok(type, addr, size) __access_ok((unsigned long)(addr),(size))
 
 /*
@@ -52,87 +48,6 @@ static inline int __access_ok(unsigned long addr, unsigned long size)
 #endif
 
 /*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-
-struct exception_table_entry
-{
-       unsigned long insn, fixup;
-};
-
-/*
- * architectures with an MMU should override these two
- */
-#ifndef __copy_from_user
-static inline __must_check long __copy_from_user(void *to,
-               const void __user * from, unsigned long n)
-{
-       if (__builtin_constant_p(n)) {
-               switch(n) {
-               case 1:
-                       *(u8 *)to = *(u8 __force *)from;
-                       return 0;
-               case 2:
-                       *(u16 *)to = *(u16 __force *)from;
-                       return 0;
-               case 4:
-                       *(u32 *)to = *(u32 __force *)from;
-                       return 0;
-#ifdef CONFIG_64BIT
-               case 8:
-                       *(u64 *)to = *(u64 __force *)from;
-                       return 0;
-#endif
-               default:
-                       break;
-               }
-       }
-
-       memcpy(to, (const void __force *)from, n);
-       return 0;
-}
-#endif
-
-#ifndef __copy_to_user
-static inline __must_check long __copy_to_user(void __user *to,
-               const void *from, unsigned long n)
-{
-       if (__builtin_constant_p(n)) {
-               switch(n) {
-               case 1:
-                       *(u8 __force *)to = *(u8 *)from;
-                       return 0;
-               case 2:
-                       *(u16 __force *)to = *(u16 *)from;
-                       return 0;
-               case 4:
-                       *(u32 __force *)to = *(u32 *)from;
-                       return 0;
-#ifdef CONFIG_64BIT
-               case 8:
-                       *(u64 __force *)to = *(u64 *)from;
-                       return 0;
-#endif
-               default:
-                       break;
-               }
-       }
-
-       memcpy((void __force *)to, from, n);
-       return 0;
-}
-#endif
-
-/*
  * These are the main single-value transfer routines.  They automatically
  * use the right size if we just have the right pointer type.
  * This version just falls back to copy_{from,to}_user, which should
@@ -171,8 +86,7 @@ static inline __must_check long __copy_to_user(void __user *to,
 
 static inline int __put_user_fn(size_t size, void __user *ptr, void *x)
 {
-       size = __copy_to_user(ptr, x, size);
-       return size ? -EFAULT : size;
+       return unlikely(raw_copy_to_user(ptr, x, size)) ? -EFAULT : 0;
 }
 
 #define __put_user_fn(sz, u, k)        __put_user_fn(sz, u, k)
@@ -187,28 +101,28 @@ extern int __put_user_bad(void) __attribute__((noreturn));
        __chk_user_ptr(ptr);                                    \
        switch (sizeof(*(ptr))) {                               \
        case 1: {                                               \
-               unsigned char __x;                              \
+               unsigned char __x = 0;                          \
                __gu_err = __get_user_fn(sizeof (*(ptr)),       \
                                         ptr, &__x);            \
                (x) = *(__force __typeof__(*(ptr)) *) &__x;     \
                break;                                          \
        };                                                      \
        case 2: {                                               \
-               unsigned short __x;                             \
+               unsigned short __x = 0;                         \
                __gu_err = __get_user_fn(sizeof (*(ptr)),       \
                                         ptr, &__x);            \
                (x) = *(__force __typeof__(*(ptr)) *) &__x;     \
                break;                                          \
        };                                                      \
        case 4: {                                               \
-               unsigned int __x;                               \
+               unsigned int __x = 0;                           \
                __gu_err = __get_user_fn(sizeof (*(ptr)),       \
                                         ptr, &__x);            \
                (x) = *(__force __typeof__(*(ptr)) *) &__x;     \
                break;                                          \
        };                                                      \
        case 8: {                                               \
-               unsigned long long __x;                         \
+               unsigned long long __x = 0;                     \
                __gu_err = __get_user_fn(sizeof (*(ptr)),       \
                                         ptr, &__x);            \
                (x) = *(__force __typeof__(*(ptr)) *) &__x;     \
@@ -233,12 +147,7 @@ extern int __put_user_bad(void) __attribute__((noreturn));
 #ifndef __get_user_fn
 static inline int __get_user_fn(size_t size, const void __user *ptr, void *x)
 {
-       size_t n = __copy_from_user(x, ptr, size);
-       if (unlikely(n)) {
-               memset(x + (size - n), 0, n);
-               return -EFAULT;
-       }
-       return 0;
+       return unlikely(raw_copy_from_user(x, ptr, size)) ? -EFAULT : 0;
 }
 
 #define __get_user_fn(sz, u, k)        __get_user_fn(sz, u, k)
@@ -247,36 +156,6 @@ static inline int __get_user_fn(size_t size, const void __user *ptr, void *x)
 
 extern int __get_user_bad(void) __attribute__((noreturn));
 
-#ifndef __copy_from_user_inatomic
-#define __copy_from_user_inatomic __copy_from_user
-#endif
-
-#ifndef __copy_to_user_inatomic
-#define __copy_to_user_inatomic __copy_to_user
-#endif
-
-static inline long copy_from_user(void *to,
-               const void __user * from, unsigned long n)
-{
-       unsigned long res = n;
-       might_fault();
-       if (likely(access_ok(VERIFY_READ, from, n)))
-               res = __copy_from_user(to, from, n);
-       if (unlikely(res))
-               memset(to + (n - res), 0, res);
-       return res;
-}
-
-static inline long copy_to_user(void __user *to,
-               const void *from, unsigned long n)
-{
-       might_fault();
-       if (access_ok(VERIFY_WRITE, to, n))
-               return __copy_to_user(to, from, n);
-       else
-               return n;
-}
-
 /*
  * Copy a null terminated string from userspace.
  */
@@ -348,4 +227,6 @@ clear_user(void __user *to, unsigned long n)
        return __clear_user(to, n);
 }
 
+#include <asm/extable.h>
+
 #endif /* __ASM_GENERIC_UACCESS_H */
index 7cdfe16..143db9c 100644 (file)
  */
 #ifndef RO_AFTER_INIT_DATA
 #define RO_AFTER_INIT_DATA                                             \
-       __start_ro_after_init = .;                                      \
+       VMLINUX_SYMBOL(__start_ro_after_init) = .;                      \
        *(.data..ro_after_init)                                         \
-       __end_ro_after_init = .;
+       VMLINUX_SYMBOL(__end_ro_after_init) = .;
 #endif
 
 /*
index caedb74..cc805b7 100644 (file)
 #ifndef __CLKSOURCE_ARM_ARCH_TIMER_H
 #define __CLKSOURCE_ARM_ARCH_TIMER_H
 
+#include <linux/bitops.h>
 #include <linux/timecounter.h>
 #include <linux/types.h>
 
+#define ARCH_TIMER_TYPE_CP15           BIT(0)
+#define ARCH_TIMER_TYPE_MEM            BIT(1)
+
 #define ARCH_TIMER_CTRL_ENABLE         (1 << 0)
 #define ARCH_TIMER_CTRL_IT_MASK                (1 << 1)
 #define ARCH_TIMER_CTRL_IT_STAT                (1 << 2)
@@ -34,11 +38,27 @@ enum arch_timer_reg {
        ARCH_TIMER_REG_TVAL,
 };
 
+enum arch_timer_ppi_nr {
+       ARCH_TIMER_PHYS_SECURE_PPI,
+       ARCH_TIMER_PHYS_NONSECURE_PPI,
+       ARCH_TIMER_VIRT_PPI,
+       ARCH_TIMER_HYP_PPI,
+       ARCH_TIMER_MAX_TIMER_PPI
+};
+
+enum arch_timer_spi_nr {
+       ARCH_TIMER_PHYS_SPI,
+       ARCH_TIMER_VIRT_SPI,
+       ARCH_TIMER_MAX_TIMER_SPI
+};
+
 #define ARCH_TIMER_PHYS_ACCESS         0
 #define ARCH_TIMER_VIRT_ACCESS         1
 #define ARCH_TIMER_MEM_PHYS_ACCESS     2
 #define ARCH_TIMER_MEM_VIRT_ACCESS     3
 
+#define ARCH_TIMER_MEM_MAX_FRAMES      8
+
 #define ARCH_TIMER_USR_PCT_ACCESS_EN   (1 << 0) /* physical counter */
 #define ARCH_TIMER_USR_VCT_ACCESS_EN   (1 << 1) /* virtual counter */
 #define ARCH_TIMER_VIRT_EVT_EN         (1 << 2)
@@ -54,6 +74,20 @@ struct arch_timer_kvm_info {
        int virtual_irq;
 };
 
+struct arch_timer_mem_frame {
+       bool valid;
+       phys_addr_t cntbase;
+       size_t size;
+       int phys_irq;
+       int virt_irq;
+};
+
+struct arch_timer_mem {
+       phys_addr_t cntctlbase;
+       size_t size;
+       struct arch_timer_mem_frame frame[ARCH_TIMER_MEM_MAX_FRAMES];
+};
+
 #ifdef CONFIG_ARM_ARCH_TIMER
 
 extern u32 arch_timer_get_rate(void);
index 1d4f365..f6d9af3 100644 (file)
@@ -166,6 +166,16 @@ static inline struct ahash_instance *ahash_alloc_instance(
        return crypto_alloc_instance2(name, alg, ahash_instance_headroom());
 }
 
+static inline void ahash_request_complete(struct ahash_request *req, int err)
+{
+       req->base.complete(&req->base, err);
+}
+
+static inline u32 ahash_request_flags(struct ahash_request *req)
+{
+       return req->base.flags;
+}
+
 static inline struct crypto_ahash *crypto_spawn_ahash(
        struct crypto_ahash_spawn *spawn)
 {
index b72dd2a..c0b3d99 100644 (file)
@@ -295,6 +295,7 @@ void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu);
 void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
 int kvm_vgic_map_resources(struct kvm *kvm);
 int kvm_vgic_hyp_init(void);
+void kvm_vgic_init_cpu_hardware(void);
 
 int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
                        bool level);
index 9b05886..f729ada 100644 (file)
@@ -233,10 +233,6 @@ int acpi_numa_init (void);
 
 int acpi_table_init (void);
 int acpi_table_parse(char *id, acpi_tbl_table_handler handler);
-int __init acpi_parse_entries(char *id, unsigned long table_size,
-                             acpi_tbl_entry_handler handler,
-                             struct acpi_table_header *table_header,
-                             int entry_id, unsigned int max_entries);
 int __init acpi_table_parse_entries(char *id, unsigned long table_size,
                              int entry_id,
                              acpi_tbl_entry_handler handler,
@@ -595,6 +591,13 @@ enum acpi_reconfig_event  {
 int acpi_reconfig_notifier_register(struct notifier_block *nb);
 int acpi_reconfig_notifier_unregister(struct notifier_block *nb);
 
+#ifdef CONFIG_ACPI_GTDT
+int acpi_gtdt_init(struct acpi_table_header *table, int *platform_timer_count);
+int acpi_gtdt_map_ppi(int type);
+bool acpi_gtdt_c3stop(int type);
+int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count);
+#endif
+
 #else  /* !CONFIG_ACPI */
 
 #define acpi_disabled 1
@@ -611,6 +614,11 @@ static inline bool acpi_dev_found(const char *hid)
        return false;
 }
 
+static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv)
+{
+       return false;
+}
+
 static inline bool is_acpi_node(struct fwnode_handle *fwnode)
 {
        return false;
@@ -997,8 +1005,16 @@ int acpi_node_prop_read(struct fwnode_handle *fwnode, const char *propname,
 int acpi_dev_prop_read(struct acpi_device *adev, const char *propname,
                       enum dev_prop_type proptype, void *val, size_t nval);
 
-struct fwnode_handle *acpi_get_next_subnode(struct device *dev,
-                                           struct fwnode_handle *subnode);
+struct fwnode_handle *acpi_get_next_subnode(struct fwnode_handle *fwnode,
+                                           struct fwnode_handle *child);
+struct fwnode_handle *acpi_node_get_parent(struct fwnode_handle *fwnode);
+
+struct fwnode_handle *acpi_graph_get_next_endpoint(struct fwnode_handle *fwnode,
+                                                  struct fwnode_handle *prev);
+int acpi_graph_get_remote_endpoint(struct fwnode_handle *fwnode,
+                                  struct fwnode_handle **remote,
+                                  struct fwnode_handle **port,
+                                  struct fwnode_handle **endpoint);
 
 struct acpi_probe_entry;
 typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *,
@@ -1115,12 +1131,34 @@ static inline int acpi_dev_prop_read(struct acpi_device *adev,
        return -ENXIO;
 }
 
-static inline struct fwnode_handle *acpi_get_next_subnode(struct device *dev,
-                                               struct fwnode_handle *subnode)
+static inline struct fwnode_handle *
+acpi_get_next_subnode(struct fwnode_handle *fwnode, struct fwnode_handle *child)
+{
+       return NULL;
+}
+
+static inline struct fwnode_handle *
+acpi_node_get_parent(struct fwnode_handle *fwnode)
 {
        return NULL;
 }
 
+static inline struct fwnode_handle *
+acpi_graph_get_next_endpoint(struct fwnode_handle *fwnode,
+                            struct fwnode_handle *prev)
+{
+       return ERR_PTR(-ENXIO);
+}
+
+static inline int
+acpi_graph_get_remote_endpoint(struct fwnode_handle *fwnode,
+                              struct fwnode_handle **remote,
+                              struct fwnode_handle **port,
+                              struct fwnode_handle **endpoint)
+{
+       return -ENXIO;
+}
+
 #define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, valid, data, fn) \
        static const void * __acpi_table_##name[]                       \
                __attribute__((unused))                                 \
index af6859b..ad7d9ee 100644 (file)
@@ -817,11 +817,6 @@ static inline bool ata_id_sct_error_recovery_ctrl(const u16 *id)
        return id[ATA_ID_SCT_CMD_XPORT] & (1 << 3) ? true : false;
 }
 
-static inline bool ata_id_sct_write_same(const u16 *id)
-{
-       return id[ATA_ID_SCT_CMD_XPORT] & (1 << 2) ? true : false;
-}
-
 static inline bool ata_id_sct_long_sector_access(const u16 *id)
 {
        return id[ATA_ID_SCT_CMD_XPORT] & (1 << 1) ? true : false;
index e71835b..c56be74 100644 (file)
 #endif
 #endif /* atomic_cmpxchg_relaxed */
 
+#ifndef atomic_try_cmpxchg
+
+#define __atomic_try_cmpxchg(type, _p, _po, _n)                                \
+({                                                                     \
+       typeof(_po) __po = (_po);                                       \
+       typeof(*(_po)) __r, __o = *__po;                                \
+       __r = atomic_cmpxchg##type((_p), __o, (_n));                    \
+       if (unlikely(__r != __o))                                       \
+               *__po = __r;                                            \
+       likely(__r == __o);                                             \
+})
+
+#define atomic_try_cmpxchg(_p, _po, _n)                __atomic_try_cmpxchg(, _p, _po, _n)
+#define atomic_try_cmpxchg_relaxed(_p, _po, _n)        __atomic_try_cmpxchg(_relaxed, _p, _po, _n)
+#define atomic_try_cmpxchg_acquire(_p, _po, _n)        __atomic_try_cmpxchg(_acquire, _p, _po, _n)
+#define atomic_try_cmpxchg_release(_p, _po, _n)        __atomic_try_cmpxchg(_release, _p, _po, _n)
+
+#else /* atomic_try_cmpxchg */
+#define atomic_try_cmpxchg_relaxed     atomic_try_cmpxchg
+#define atomic_try_cmpxchg_acquire     atomic_try_cmpxchg
+#define atomic_try_cmpxchg_release     atomic_try_cmpxchg
+#endif /* atomic_try_cmpxchg */
+
 /* cmpxchg_relaxed */
 #ifndef cmpxchg_relaxed
 #define  cmpxchg_relaxed               cmpxchg
@@ -996,6 +1019,29 @@ static inline int atomic_dec_if_positive(atomic_t *v)
 #endif
 #endif /* atomic64_cmpxchg_relaxed */
 
+#ifndef atomic64_try_cmpxchg
+
+#define __atomic64_try_cmpxchg(type, _p, _po, _n)                      \
+({                                                                     \
+       typeof(_po) __po = (_po);                                       \
+       typeof(*(_po)) __r, __o = *__po;                                \
+       __r = atomic64_cmpxchg##type((_p), __o, (_n));                  \
+       if (unlikely(__r != __o))                                       \
+               *__po = __r;                                            \
+       likely(__r == __o);                                             \
+})
+
+#define atomic64_try_cmpxchg(_p, _po, _n)              __atomic64_try_cmpxchg(, _p, _po, _n)
+#define atomic64_try_cmpxchg_relaxed(_p, _po, _n)      __atomic64_try_cmpxchg(_relaxed, _p, _po, _n)
+#define atomic64_try_cmpxchg_acquire(_p, _po, _n)      __atomic64_try_cmpxchg(_acquire, _p, _po, _n)
+#define atomic64_try_cmpxchg_release(_p, _po, _n)      __atomic64_try_cmpxchg(_release, _p, _po, _n)
+
+#else /* atomic64_try_cmpxchg */
+#define atomic64_try_cmpxchg_relaxed   atomic64_try_cmpxchg
+#define atomic64_try_cmpxchg_acquire   atomic64_try_cmpxchg
+#define atomic64_try_cmpxchg_release   atomic64_try_cmpxchg
+#endif /* atomic64_try_cmpxchg */
+
 #ifndef atomic64_andnot
 static inline void atomic64_andnot(long long i, atomic64_t *v)
 {
index ad95581..866c433 100644 (file)
@@ -21,6 +21,7 @@ struct dentry;
  */
 enum wb_state {
        WB_registered,          /* bdi_register() was done */
+       WB_shutting_down,       /* wb_shutdown() in progress */
        WB_writeback_running,   /* Writeback is in progress */
        WB_has_dirty_io,        /* Dirty inodes on ->b_{dirty|io|more_io} */
 };
@@ -54,7 +55,9 @@ struct bdi_writeback_congested {
        atomic_t refcnt;                /* nr of attached wb's and blkg */
 
 #ifdef CONFIG_CGROUP_WRITEBACK
-       struct backing_dev_info *bdi;   /* the associated bdi */
+       struct backing_dev_info *__bdi; /* the associated bdi, set to NULL
+                                        * on bdi unregistration. For memcg-wb
+                                        * internal use only! */
        int blkcg_id;                   /* ID of the associated blkcg */
        struct rb_node rb_node;         /* on bdi->cgwb_congestion_tree */
 #endif
@@ -143,7 +146,7 @@ struct backing_dev_info {
        congested_fn *congested_fn; /* Function pointer if device is md/dm */
        void *congested_data;   /* Pointer to aux data for congested func */
 
-       char *name;
+       const char *name;
 
        struct kref refcnt;     /* Reference counter for the structure */
        unsigned int capabilities; /* Device capabilities */
@@ -161,7 +164,6 @@ struct backing_dev_info {
 #ifdef CONFIG_CGROUP_WRITEBACK
        struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
        struct rb_root cgwb_congested_tree; /* their congested states */
-       atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */
 #else
        struct bdi_writeback_congested *wb_congested;
 #endif
index c52a48c..557d840 100644 (file)
@@ -17,8 +17,6 @@
 #include <linux/backing-dev-defs.h>
 #include <linux/slab.h>
 
-int __must_check bdi_init(struct backing_dev_info *bdi);
-
 static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
 {
        kref_get(&bdi->refcnt);
@@ -27,16 +25,18 @@ static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
 
 void bdi_put(struct backing_dev_info *bdi);
 
-__printf(3, 4)
-int bdi_register(struct backing_dev_info *bdi, struct device *parent,
-               const char *fmt, ...);
-int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
+__printf(2, 3)
+int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...);
+int bdi_register_va(struct backing_dev_info *bdi, const char *fmt,
+                   va_list args);
 int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner);
 void bdi_unregister(struct backing_dev_info *bdi);
 
-int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
-void bdi_destroy(struct backing_dev_info *bdi);
 struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id);
+static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask)
+{
+       return bdi_alloc_node(gfp_mask, NUMA_NO_NODE);
+}
 
 void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
                        bool range_cyclic, enum wb_reason reason);
index 8e52119..4931756 100644 (file)
@@ -383,7 +383,7 @@ extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int);
 extern void bioset_free(struct bio_set *);
 extern mempool_t *biovec_create_pool(int pool_entries);
 
-extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
+extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *);
 extern void bio_put(struct bio *);
 
 extern void __bio_clone_fast(struct bio *, struct bio *);
index b296a90..f3e5e1d 100644 (file)
@@ -15,7 +15,7 @@ struct blk_mq_hw_ctx {
                unsigned long           state;          /* BLK_MQ_S_* flags */
        } ____cacheline_aligned_in_smp;
 
-       struct work_struct      run_work;
+       struct delayed_work     run_work;
        cpumask_var_t           cpumask;
        int                     next_cpu;
        int                     next_cpu_batch;
@@ -51,8 +51,6 @@ struct blk_mq_hw_ctx {
 
        atomic_t                nr_active;
 
-       struct delayed_work     delay_work;
-
        struct hlist_node       cpuhp_dead;
        struct kobject          kobj;
 
@@ -81,7 +79,6 @@ struct blk_mq_tag_set {
 
 struct blk_mq_queue_data {
        struct request *rq;
-       struct list_head *list;
        bool last;
 };
 
@@ -142,6 +139,14 @@ struct blk_mq_ops {
        reinit_request_fn       *reinit_request;
 
        map_queues_fn           *map_queues;
+
+#ifdef CONFIG_BLK_DEBUG_FS
+       /*
+        * Used by the debugfs implementation to show driver-specific
+        * information about a request.
+        */
+       void (*show_rq)(struct seq_file *m, struct request *rq);
+#endif
 };
 
 enum {
@@ -152,7 +157,6 @@ enum {
        BLK_MQ_F_SHOULD_MERGE   = 1 << 0,
        BLK_MQ_F_TAG_SHARED     = 1 << 1,
        BLK_MQ_F_SG_MERGE       = 1 << 2,
-       BLK_MQ_F_DEFER_ISSUE    = 1 << 4,
        BLK_MQ_F_BLOCKING       = 1 << 5,
        BLK_MQ_F_NO_SCHED       = 1 << 6,
        BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
@@ -162,6 +166,7 @@ enum {
        BLK_MQ_S_TAG_ACTIVE     = 1,
        BLK_MQ_S_SCHED_RESTART  = 2,
        BLK_MQ_S_TAG_WAITING    = 3,
+       BLK_MQ_S_START_ON_RUN   = 4,
 
        BLK_MQ_MAX_DEPTH        = 10240,
 
@@ -229,7 +234,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
 void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
 void blk_mq_abort_requeue_list(struct request_queue *q);
-void blk_mq_complete_request(struct request *rq, int error);
+void blk_mq_complete_request(struct request *rq);
 
 bool blk_mq_queue_stopped(struct request_queue *q);
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
@@ -238,13 +243,15 @@ void blk_mq_stop_hw_queues(struct request_queue *q);
 void blk_mq_start_hw_queues(struct request_queue *q);
 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
+void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
                busy_tag_iter_fn *fn, void *priv);
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_unfreeze_queue(struct request_queue *q);
-void blk_mq_freeze_queue_start(struct request_queue *q);
+void blk_freeze_queue_start(struct request_queue *q);
 void blk_mq_freeze_queue_wait(struct request_queue *q);
 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
                                     unsigned long timeout);
index d703acb..61339bc 100644 (file)
@@ -17,6 +17,10 @@ struct io_context;
 struct cgroup_subsys_state;
 typedef void (bio_end_io_t) (struct bio *);
 
+struct blk_issue_stat {
+       u64 stat;
+};
+
 /*
  * main unit of I/O for the block layer and lower layers (ie drivers and
  * stacking drivers)
@@ -29,7 +33,7 @@ struct bio {
                                                 * top bits REQ_OP. Use
                                                 * accessors.
                                                 */
-       unsigned short          bi_flags;       /* status, command, etc */
+       unsigned short          bi_flags;       /* status, etc and bvec pool number */
        unsigned short          bi_ioprio;
 
        struct bvec_iter        bi_iter;
@@ -58,6 +62,10 @@ struct bio {
         */
        struct io_context       *bi_ioc;
        struct cgroup_subsys_state *bi_css;
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+       void                    *bi_cg_private;
+       struct blk_issue_stat   bi_issue_stat;
+#endif
 #endif
        union {
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
@@ -102,12 +110,9 @@ struct bio {
 #define BIO_REFFED     8       /* bio has elevated ->bi_cnt */
 #define BIO_THROTTLED  9       /* This bio has already been subjected to
                                 * throttling rules. Don't do it again. */
-
-/*
- * Flags starting here get preserved by bio_reset() - this includes
- * BVEC_POOL_IDX()
- */
-#define BIO_RESET_BITS 10
+#define BIO_TRACE_COMPLETION 10        /* bio_endio() should trace the final completion
+                                * of this bio. */
+/* See BVEC_POOL_OFFSET below before adding new flags */
 
 /*
  * We support 6 different bvec pools, the last one is magic in that it
@@ -117,13 +122,22 @@ struct bio {
 #define BVEC_POOL_MAX          (BVEC_POOL_NR - 1)
 
 /*
- * Top 4 bits of bio flags indicate the pool the bvecs came from.  We add
+ * Top 3 bits of bio flags indicate the pool the bvecs came from.  We add
  * 1 to the actual index so that 0 indicates that there are no bvecs to be
  * freed.
  */
-#define BVEC_POOL_BITS         (4)
+#define BVEC_POOL_BITS         (3)
 #define BVEC_POOL_OFFSET       (16 - BVEC_POOL_BITS)
 #define BVEC_POOL_IDX(bio)     ((bio)->bi_flags >> BVEC_POOL_OFFSET)
+#if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1)
+# error "BVEC_POOL_BITS is too small"
+#endif
+
+/*
+ * Flags starting here get preserved by bio_reset() - this includes
+ * only BVEC_POOL_IDX()
+ */
+#define BIO_RESET_BITS BVEC_POOL_OFFSET
 
 /*
  * Operations and flags common to the bio and request structures.
@@ -160,7 +174,7 @@ enum req_opf {
        /* write the same sector many times */
        REQ_OP_WRITE_SAME       = 7,
        /* write the zero filled sector many times */
-       REQ_OP_WRITE_ZEROES     = 8,
+       REQ_OP_WRITE_ZEROES     = 9,
 
        /* SCSI passthrough using struct scsi_request */
        REQ_OP_SCSI_IN          = 32,
@@ -187,6 +201,10 @@ enum req_flag_bits {
        __REQ_PREFLUSH,         /* request for cache flush */
        __REQ_RAHEAD,           /* read ahead, can fail anytime */
        __REQ_BACKGROUND,       /* background IO */
+
+       /* command specific flags for REQ_OP_WRITE_ZEROES: */
+       __REQ_NOUNMAP,          /* do not free blocks when zeroing */
+
        __REQ_NR_BITS,          /* stops here */
 };
 
@@ -204,6 +222,8 @@ enum req_flag_bits {
 #define REQ_RAHEAD             (1ULL << __REQ_RAHEAD)
 #define REQ_BACKGROUND         (1ULL << __REQ_BACKGROUND)
 
+#define REQ_NOUNMAP            (1ULL << __REQ_NOUNMAP)
+
 #define REQ_FAILFAST_MASK \
        (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
 
@@ -283,12 +303,6 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
        return (cookie & BLK_QC_T_INTERNAL) != 0;
 }
 
-struct blk_issue_stat {
-       u64 time;
-};
-
-#define BLK_RQ_STAT_BATCH      64
-
 struct blk_rq_stat {
        s64 mean;
        u64 min;
@@ -296,7 +310,6 @@ struct blk_rq_stat {
        s32 nr_samples;
        s32 nr_batch;
        u64 batch;
-       s64 time;
 };
 
 #endif /* __LINUX_BLK_TYPES_H */
index 5a7da60..83d2862 100644 (file)
@@ -40,15 +40,20 @@ struct blkcg_gq;
 struct blk_flush_queue;
 struct pr_ops;
 struct rq_wb;
+struct blk_queue_stats;
+struct blk_stat_callback;
 
 #define BLKDEV_MIN_RQ  4
 #define BLKDEV_MAX_RQ  128     /* Default maximum */
 
+/* Must be consisitent with blk_mq_poll_stats_bkt() */
+#define BLK_MQ_POLL_STATS_BKTS 16
+
 /*
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
  */
-#define BLKCG_MAX_POLS         2
+#define BLKCG_MAX_POLS         3
 
 typedef void (rq_end_io_fn)(struct request *, int);
 
@@ -173,6 +178,7 @@ struct request {
                struct rb_node rb_node; /* sort/lookup */
                struct bio_vec special_vec;
                void *completion_data;
+               int error_count; /* for legacy drivers, don't use */
        };
 
        /*
@@ -213,16 +219,14 @@ struct request {
 
        unsigned short ioprio;
 
-       void *special;          /* opaque pointer available for LLD use */
+       unsigned int timeout;
 
-       int errors;
+       void *special;          /* opaque pointer available for LLD use */
 
        unsigned int extra_len; /* length of alignment and padding */
 
        unsigned long deadline;
        struct list_head timeout_list;
-       unsigned int timeout;
-       int retries;
 
        /*
         * completion callback.
@@ -337,7 +341,6 @@ struct queue_limits {
        unsigned char           misaligned;
        unsigned char           discard_misaligned;
        unsigned char           cluster;
-       unsigned char           discard_zeroes_data;
        unsigned char           raid_partial_stripes_expensive;
        enum blk_zoned_model    zoned;
 };
@@ -388,6 +391,7 @@ struct request_queue {
        int                     nr_rqs[2];      /* # allocated [a]sync rqs */
        int                     nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
 
+       struct blk_queue_stats  *stats;
        struct rq_wb            *rq_wb;
 
        /*
@@ -505,8 +509,6 @@ struct request_queue {
        unsigned int            nr_sorted;
        unsigned int            in_flight[2];
 
-       struct blk_rq_stat      rq_stats[2];
-
        /*
         * Number of active block driver functions for which blk_drain_queue()
         * must wait. Must be incremented around functions that unlock the
@@ -516,6 +518,10 @@ struct request_queue {
 
        unsigned int            rq_timeout;
        int                     poll_nsec;
+
+       struct blk_stat_callback        *poll_cb;
+       struct blk_rq_stat      poll_stat[BLK_MQ_POLL_STATS_BKTS];
+
        struct timer_list       timeout;
        struct work_struct      timeout_work;
        struct list_head        timeout_list;
@@ -610,7 +616,8 @@ struct request_queue {
 #define QUEUE_FLAG_FLUSH_NQ    25      /* flush not queueuable */
 #define QUEUE_FLAG_DAX         26      /* device supports DAX */
 #define QUEUE_FLAG_STATS       27      /* track rq completion times */
-#define QUEUE_FLAG_RESTART     28      /* queue needs restart at completion */
+#define QUEUE_FLAG_POLL_STATS  28      /* collecting stats for hybrid polling */
+#define QUEUE_FLAG_REGISTERED  29      /* queue has been registered to a disk */
 
 #define QUEUE_FLAG_DEFAULT     ((1 << QUEUE_FLAG_IO_STAT) |            \
                                 (1 << QUEUE_FLAG_STACKABLE)    |       \
@@ -919,6 +926,7 @@ extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
 extern blk_qc_t generic_make_request(struct bio *bio);
 extern void blk_rq_init(struct request_queue *q, struct request *rq);
+extern void blk_init_request_from_bio(struct request *req, struct bio *bio);
 extern void blk_put_request(struct request *);
 extern void __blk_put_request(struct request_queue *, struct request *);
 extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
@@ -964,7 +972,7 @@ extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, uns
 extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
                               struct rq_map_data *, const struct iov_iter *,
                               gfp_t);
-extern int blk_execute_rq(struct request_queue *, struct gendisk *,
+extern void blk_execute_rq(struct request_queue *, struct gendisk *,
                          struct request *, int);
 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
                                  struct request *, int, rq_end_io_fn *);
@@ -1082,20 +1090,6 @@ static inline unsigned int blk_rq_count_bios(struct request *rq)
 }
 
 /*
- * blk_rq_set_prio - associate a request with prio from ioc
- * @rq: request of interest
- * @ioc: target iocontext
- *
- * Assocate request prio with ioc prio so request based drivers
- * can leverage priority information.
- */
-static inline void blk_rq_set_prio(struct request *rq, struct io_context *ioc)
-{
-       if (ioc)
-               rq->ioprio = ioc->ioprio;
-}
-
-/*
  * Request issue related functions.
  */
 extern struct request *blk_peek_request(struct request_queue *q);
@@ -1121,13 +1115,10 @@ extern void blk_finish_request(struct request *rq, int error);
 extern bool blk_end_request(struct request *rq, int error,
                            unsigned int nr_bytes);
 extern void blk_end_request_all(struct request *rq, int error);
-extern bool blk_end_request_cur(struct request *rq, int error);
-extern bool blk_end_request_err(struct request *rq, int error);
 extern bool __blk_end_request(struct request *rq, int error,
                              unsigned int nr_bytes);
 extern void __blk_end_request_all(struct request *rq, int error);
 extern bool __blk_end_request_cur(struct request *rq, int error);
-extern bool __blk_end_request_err(struct request *rq, int error);
 
 extern void blk_complete_request(struct request *);
 extern void __blk_complete_request(struct request *);
@@ -1330,23 +1321,27 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
        return bqt->tag_index[tag];
 }
 
+extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
+extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
+               sector_t nr_sects, gfp_t gfp_mask, struct page *page);
 
 #define BLKDEV_DISCARD_SECURE  (1 << 0)        /* issue a secure erase */
-#define BLKDEV_DISCARD_ZERO    (1 << 1)        /* must reliably zero data */
 
-extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
 extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, int flags,
                struct bio **biop);
-extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
-               sector_t nr_sects, gfp_t gfp_mask, struct page *page);
+
+#define BLKDEV_ZERO_NOUNMAP    (1 << 0)  /* do not free blocks */
+#define BLKDEV_ZERO_NOFALLBACK (1 << 1)  /* don't write explicit zeroes */
+
 extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
-               bool discard);
+               unsigned flags);
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-               sector_t nr_sects, gfp_t gfp_mask, bool discard);
+               sector_t nr_sects, gfp_t gfp_mask, unsigned flags);
+
 static inline int sb_issue_discard(struct super_block *sb, sector_t block,
                sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
 {
@@ -1360,7 +1355,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
        return blkdev_issue_zeroout(sb->s_bdev,
                                    block << (sb->s_blocksize_bits - 9),
                                    nr_blocks << (sb->s_blocksize_bits - 9),
-                                   gfp_mask, true);
+                                   gfp_mask, 0);
 }
 
 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
@@ -1530,19 +1525,6 @@ static inline int bdev_discard_alignment(struct block_device *bdev)
        return q->limits.discard_alignment;
 }
 
-static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
-{
-       if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1)
-               return 1;
-
-       return 0;
-}
-
-static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev)
-{
-       return queue_discard_zeroes_data(bdev_get_queue(bdev));
-}
-
 static inline unsigned int bdev_write_same(struct block_device *bdev)
 {
        struct request_queue *q = bdev_get_queue(bdev);
@@ -1673,12 +1655,36 @@ static inline bool bios_segs_mergeable(struct request_queue *q,
        return true;
 }
 
-static inline bool bio_will_gap(struct request_queue *q, struct bio *prev,
-                        struct bio *next)
+static inline bool bio_will_gap(struct request_queue *q,
+                               struct request *prev_rq,
+                               struct bio *prev,
+                               struct bio *next)
 {
        if (bio_has_data(prev) && queue_virt_boundary(q)) {
                struct bio_vec pb, nb;
 
+               /*
+                * don't merge if the 1st bio starts with non-zero
+                * offset, otherwise it is quite difficult to respect
+                * sg gap limit. We work hard to merge a huge number of small
+                * single bios in case of mkfs.
+                */
+               if (prev_rq)
+                       bio_get_first_bvec(prev_rq->bio, &pb);
+               else
+                       bio_get_first_bvec(prev, &pb);
+               if (pb.bv_offset)
+                       return true;
+
+               /*
+                * We don't need to worry about the situation that the
+                * merged segment ends in unaligned virt boundary:
+                *
+                * - if 'pb' ends aligned, the merged segment ends aligned
+                * - if 'pb' ends unaligned, the next bio must include
+                *   one single bvec of 'nb', otherwise the 'nb' can't
+                *   merge with 'pb'
+                */
                bio_get_last_bvec(prev, &pb);
                bio_get_first_bvec(next, &nb);
 
@@ -1691,18 +1697,19 @@ static inline bool bio_will_gap(struct request_queue *q, struct bio *prev,
 
 static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
 {
-       return bio_will_gap(req->q, req->biotail, bio);
+       return bio_will_gap(req->q, req, req->biotail, bio);
 }
 
 static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
 {
-       return bio_will_gap(req->q, bio, req->bio);
+       return bio_will_gap(req->q, NULL, bio, req->bio);
 }
 
 int kblockd_schedule_work(struct work_struct *work);
 int kblockd_schedule_work_on(int cpu, struct work_struct *work);
 int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
 int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
+int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
 
 #ifdef CONFIG_BLK_CGROUP
 /*
index 6a3f850..2174594 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/wait.h>
 #include <linux/mutex.h>
 #include <linux/rcupdate.h>
+#include <linux/refcount.h>
 #include <linux/percpu-refcount.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/workqueue.h>
@@ -106,9 +107,6 @@ struct cgroup_subsys_state {
        /* reference count - access via css_[try]get() and css_put() */
        struct percpu_ref refcnt;
 
-       /* PI: the parent css */
-       struct cgroup_subsys_state *parent;
-
        /* siblings list anchored at the parent's ->children */
        struct list_head sibling;
        struct list_head children;
@@ -138,6 +136,12 @@ struct cgroup_subsys_state {
        /* percpu_ref killing and RCU release */
        struct rcu_head rcu_head;
        struct work_struct destroy_work;
+
+       /*
+        * PI: the parent css.  Placed here for cache proximity to following
+        * fields of the containing structure.
+        */
+       struct cgroup_subsys_state *parent;
 };
 
 /*
@@ -156,7 +160,7 @@ struct css_set {
        struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
 
        /* reference count */
-       atomic_t refcount;
+       refcount_t refcount;
 
        /* the default cgroup associated with this css_set */
        struct cgroup *dfl_cgrp;
index f6b43fb..ed2573e 100644 (file)
 #include <linux/seq_file.h>
 #include <linux/kernfs.h>
 #include <linux/jump_label.h>
-#include <linux/nsproxy.h>
 #include <linux/types.h>
 #include <linux/ns_common.h>
 #include <linux/nsproxy.h>
 #include <linux/user_namespace.h>
+#include <linux/refcount.h>
 
 #include <linux/cgroup-defs.h>
 
@@ -570,6 +570,25 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
        pr_cont_kernfs_path(cgrp->kn);
 }
 
+static inline void cgroup_init_kthreadd(void)
+{
+       /*
+        * kthreadd is inherited by all kthreads, keep it in the root so
+        * that the new kthreads are guaranteed to stay in the root until
+        * initialization is finished.
+        */
+       current->no_cgroup_migration = 1;
+}
+
+static inline void cgroup_kthread_ready(void)
+{
+       /*
+        * This kthread finished initialization.  The creator should have
+        * set PF_NO_SETAFFINITY if this kthread should stay in the root.
+        */
+       current->no_cgroup_migration = 0;
+}
+
 #else /* !CONFIG_CGROUPS */
 
 struct cgroup_subsys_state;
@@ -590,6 +609,8 @@ static inline void cgroup_free(struct task_struct *p) {}
 
 static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
+static inline void cgroup_init_kthreadd(void) {}
+static inline void cgroup_kthread_ready(void) {}
 
 static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
                                               struct cgroup *ancestor)
@@ -640,7 +661,7 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}
 #endif /* CONFIG_CGROUP_DATA */
 
 struct cgroup_namespace {
-       atomic_t                count;
+       refcount_t              count;
        struct ns_common        ns;
        struct user_namespace   *user_ns;
        struct ucounts          *ucounts;
@@ -675,12 +696,12 @@ copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
 static inline void get_cgroup_ns(struct cgroup_namespace *ns)
 {
        if (ns)
-               atomic_inc(&ns->count);
+               refcount_inc(&ns->count);
 }
 
 static inline void put_cgroup_ns(struct cgroup_namespace *ns)
 {
-       if (ns && atomic_dec_and_test(&ns->count))
+       if (ns && refcount_dec_and_test(&ns->count))
                free_cgroup_ns(ns);
 }
 
index 6d7edc3..acc9ce0 100644 (file)
@@ -182,7 +182,6 @@ extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *e
 extern void clockevents_register_device(struct clock_event_device *dev);
 extern int clockevents_unbind_device(struct clock_event_device *ced, int cpu);
 
-extern void clockevents_config(struct clock_event_device *dev, u32 freq);
 extern void clockevents_config_and_register(struct clock_event_device *dev,
                                            u32 freq, unsigned long min_delta,
                                            unsigned long max_delta);
index cfc7584..f2b10d9 100644 (file)
@@ -120,7 +120,7 @@ struct clocksource {
 #define CLOCK_SOURCE_RESELECT                  0x100
 
 /* simplify initialization of mask field */
-#define CLOCKSOURCE_MASK(bits) (u64)((bits) < 64 ? ((1ULL<<(bits))-1) : -1)
+#define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0)
 
 static inline u32 clocksource_freq2mult(u32 freq, u32 shift_constant, u64 from)
 {
index 5b8721e..31e4e1f 100644 (file)
@@ -15,7 +15,6 @@ struct venus_comm {
        struct list_head    vc_processing;
        int                 vc_inuse;
        struct super_block *vc_sb;
-       struct backing_dev_info bdi;
        struct mutex        vc_mutex;
 };
 
index aef47be..af9dbc4 100644 (file)
@@ -723,6 +723,8 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
 asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32,
                                            int, const char __user *);
 
+asmlinkage long compat_sys_arch_prctl(int option, unsigned long arg2);
+
 /*
  * For most but not all architectures, "am I in a compat syscall?" and
  * "am I a compat task?" are the same question.  For architectures on which
index 2a5982c..035c16c 100644 (file)
@@ -201,7 +201,7 @@ struct coresight_ops_sink {
                          void *sink_config);
        unsigned long (*reset_buffer)(struct coresight_device *csdev,
                                      struct perf_output_handle *handle,
-                                     void *sink_config, bool *lost);
+                                     void *sink_config);
        void (*update_buffer)(struct coresight_device *csdev,
                              struct perf_output_handle *handle,
                              void *sink_config);
index 87165f0..a5ce0bb 100644 (file)
@@ -120,6 +120,13 @@ struct cpufreq_policy {
        bool                    fast_switch_possible;
        bool                    fast_switch_enabled;
 
+       /*
+        * Preferred average time interval between consecutive invocations of
+        * the driver to set the frequency for this policy.  To be set by the
+        * scaling driver (0, which is the default, means no preference).
+        */
+       unsigned int            transition_delay_us;
+
         /* Cached frequency lookup from cpufreq_driver_resolve_freq. */
        unsigned int cached_target_freq;
        int cached_resolved_idx;
index 96f1e88..1a67560 100644 (file)
@@ -667,6 +667,11 @@ void alloc_bootmem_cpumask_var(cpumask_var_t *mask);
 void free_cpumask_var(cpumask_var_t mask);
 void free_bootmem_cpumask_var(cpumask_var_t mask);
 
+static inline bool cpumask_available(cpumask_var_t mask)
+{
+       return mask != NULL;
+}
+
 #else
 typedef struct cpumask cpumask_var_t[1];
 
@@ -708,6 +713,11 @@ static inline void free_cpumask_var(cpumask_var_t mask)
 static inline void free_bootmem_cpumask_var(cpumask_var_t mask)
 {
 }
+
+static inline bool cpumask_available(cpumask_var_t mask)
+{
+       return true;
+}
 #endif /* CONFIG_CPUMASK_OFFSTACK */
 
 /* It's common to want to use cpu_all_mask in struct member initializers,
index 611fce5..119a3f9 100644 (file)
@@ -42,7 +42,7 @@ static inline void cpuset_dec(void)
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
-extern void cpuset_update_active_cpus(bool cpu_online);
+extern void cpuset_update_active_cpus(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -155,7 +155,7 @@ static inline bool cpusets_enabled(void) { return false; }
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
-static inline void cpuset_update_active_cpus(bool cpu_online)
+static inline void cpuset_update_active_cpus(void)
 {
        partition_sched_domains(1, NULL, NULL);
 }
index 7009b8b..3f033c4 100644 (file)
@@ -1,10 +1,6 @@
 #ifndef __DELL_LED_H__
 #define __DELL_LED_H__
 
-enum {
-       DELL_LED_MICMUTE,
-};
-
-int dell_app_wmi_led_set(int whichled, int on);
+int dell_micmute_led_set(int on);
 
 #endif
index e0acb0e..6c220e4 100644 (file)
@@ -27,6 +27,7 @@
 #define DEVFREQ_POSTCHANGE             (1)
 
 struct devfreq;
+struct devfreq_governor;
 
 /**
  * struct devfreq_dev_status - Data given from devfreq user device to
@@ -101,35 +102,6 @@ struct devfreq_dev_profile {
 };
 
 /**
- * struct devfreq_governor - Devfreq policy governor
- * @node:              list node - contains registered devfreq governors
- * @name:              Governor's name
- * @immutable:         Immutable flag for governor. If the value is 1,
- *                     this govenror is never changeable to other governor.
- * @get_target_freq:   Returns desired operating frequency for the device.
- *                     Basically, get_target_freq will run
- *                     devfreq_dev_profile.get_dev_status() to get the
- *                     status of the device (load = busy_time / total_time).
- *                     If no_central_polling is set, this callback is called
- *                     only with update_devfreq() notified by OPP.
- * @event_handler:      Callback for devfreq core framework to notify events
- *                      to governors. Events include per device governor
- *                      init and exit, opp changes out of devfreq, suspend
- *                      and resume of per device devfreq during device idle.
- *
- * Note that the callbacks are called with devfreq->lock locked by devfreq.
- */
-struct devfreq_governor {
-       struct list_head node;
-
-       const char name[DEVFREQ_NAME_LEN];
-       const unsigned int immutable;
-       int (*get_target_freq)(struct devfreq *this, unsigned long *freq);
-       int (*event_handler)(struct devfreq *devfreq,
-                               unsigned int event, void *data);
-};
-
-/**
  * struct devfreq - Device devfreq structure
  * @node:      list node - contains the devices with devfreq that have been
  *             registered.
index a7e6903..c7ea33e 100644 (file)
@@ -255,6 +255,12 @@ struct dm_target {
        unsigned num_write_same_bios;
 
        /*
+        * The number of WRITE ZEROES bios that will be submitted to the target.
+        * The bio number can be accessed with dm_bio_get_target_bio_nr.
+        */
+       unsigned num_write_zeroes_bios;
+
+       /*
         * The minimum number of extra bytes allocated in each io for the
         * target to use.
         */
@@ -290,11 +296,6 @@ struct dm_target {
         * on max_io_len boundary.
         */
        bool split_discard_bios:1;
-
-       /*
-        * Set if this target does not return zeroes on discarded blocks.
-        */
-       bool discard_zeroes_data_unsupported:1;
 };
 
 /* Each target can link one of these into the table */
index 5b6adf9..8ae0f45 100644 (file)
@@ -28,12 +28,10 @@ struct device;
 #define EDAC_OPSTATE_INT       2
 
 extern int edac_op_state;
-extern int edac_err_assert;
-extern atomic_t edac_handlers;
 
-extern int edac_handler_set(void);
-extern void edac_atomic_assert_error(void);
-extern struct bus_type *edac_get_sysfs_subsys(void);
+struct bus_type *edac_get_sysfs_subsys(void);
+int edac_get_report_status(void);
+void edac_set_report_status(int new);
 
 enum {
        EDAC_REPORTING_ENABLED,
@@ -41,28 +39,6 @@ enum {
        EDAC_REPORTING_FORCE
 };
 
-extern int edac_report_status;
-#ifdef CONFIG_EDAC
-static inline int get_edac_report_status(void)
-{
-       return edac_report_status;
-}
-
-static inline void set_edac_report_status(int new)
-{
-       edac_report_status = new;
-}
-#else
-static inline int get_edac_report_status(void)
-{
-       return EDAC_REPORTING_DISABLED;
-}
-
-static inline void set_edac_report_status(int new)
-{
-}
-#endif
-
 static inline void opstate_init(void)
 {
        switch (edac_op_state) {
index 2fd3993..e6f624b 100644 (file)
@@ -6,6 +6,7 @@
 #ifdef CONFIG_ACPI_BGRT
 
 void efi_bgrt_init(struct acpi_table_header *table);
+int __init acpi_parse_bgrt(struct acpi_table_header *table);
 
 /* The BGRT data itself; only valid if bgrt_image != NULL. */
 extern size_t bgrt_image_size;
@@ -14,6 +15,10 @@ extern struct acpi_table_bgrt bgrt_tab;
 #else /* !CONFIG_ACPI_BGRT */
 
 static inline void efi_bgrt_init(struct acpi_table_header *table) {}
+static inline int __init acpi_parse_bgrt(struct acpi_table_header *table)
+{
+       return 0;
+}
 
 #endif /* !CONFIG_ACPI_BGRT */
 
index 94d34e0..ec36f42 100644 (file)
@@ -1435,9 +1435,6 @@ static inline int efi_runtime_map_copy(void *buf, size_t bufsz)
 
 /* prototypes shared between arch specific and generic stub code */
 
-#define pr_efi(sys_table, msg)     efi_printk(sys_table, "EFI stub: "msg)
-#define pr_efi_err(sys_table, msg) efi_printk(sys_table, "EFI stub: ERROR: "msg)
-
 void efi_printk(efi_system_table_t *sys_table_arg, char *str);
 
 void efi_free(efi_system_table_t *sys_table_arg, unsigned long size,
@@ -1471,7 +1468,7 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg,
                                  unsigned long *load_addr,
                                  unsigned long *load_size);
 
-efi_status_t efi_parse_options(char *cmdline);
+efi_status_t efi_parse_options(char const *cmdline);
 
 efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg,
                           struct screen_info *si, efi_guid_t *proto,
index aebecc4..3a21631 100644 (file)
@@ -93,6 +93,8 @@ struct blk_mq_hw_ctx;
 struct elevator_mq_ops {
        int (*init_sched)(struct request_queue *, struct elevator_type *);
        void (*exit_sched)(struct elevator_queue *);
+       int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int);
+       void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);
 
        bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
        bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *);
@@ -104,7 +106,7 @@ struct elevator_mq_ops {
        void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool);
        struct request *(*dispatch_request)(struct blk_mq_hw_ctx *);
        bool (*has_work)(struct blk_mq_hw_ctx *);
-       void (*completed_request)(struct blk_mq_hw_ctx *, struct request *);
+       void (*completed_request)(struct request *);
        void (*started_request)(struct request *);
        void (*requeue_request)(struct request *);
        struct request *(*former_request)(struct request_queue *, struct request *);
@@ -211,7 +213,7 @@ extern ssize_t elv_iosched_show(struct request_queue *, char *);
 extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t);
 
 extern int elevator_init(struct request_queue *, char *);
-extern void elevator_exit(struct elevator_queue *);
+extern void elevator_exit(struct request_queue *, struct elevator_queue *);
 extern int elevator_change(struct request_queue *, const char *);
 extern bool elv_bio_merge_ok(struct request *, struct bio *);
 extern struct elevator_queue *elevator_alloc(struct request_queue *,
index 7010fb0..7e206a9 100644 (file)
@@ -236,11 +236,11 @@ extern int extcon_set_property_capability(struct extcon_dev *edev,
                                unsigned int id, unsigned int prop);
 
 /*
- * Following APIs are to monitor every action of a notifier.
- * Registrar gets notified for every external port of a connection device.
- * Probably this could be used to debug an action of notifier; however,
- * we do not recommend to use this for normal 'notifiee' device drivers who
- * want to be notified by a specific external port of the notifier.
+ * Following APIs are to monitor the status change of the external connectors.
+ * extcon_register_notifier(*edev, id, *nb) : Register a notifier block
+ *                     for specific external connector of the extcon.
+ * extcon_register_notifier_all(*edev, *nb) : Register a notifier block
+ *                     for all supported external connectors of the extcon.
  */
 extern int extcon_register_notifier(struct extcon_dev *edev, unsigned int id,
                                    struct notifier_block *nb);
@@ -253,6 +253,17 @@ extern void devm_extcon_unregister_notifier(struct device *dev,
                                struct extcon_dev *edev, unsigned int id,
                                struct notifier_block *nb);
 
+extern int extcon_register_notifier_all(struct extcon_dev *edev,
+                               struct notifier_block *nb);
+extern int extcon_unregister_notifier_all(struct extcon_dev *edev,
+                               struct notifier_block *nb);
+extern int devm_extcon_register_notifier_all(struct device *dev,
+                               struct extcon_dev *edev,
+                               struct notifier_block *nb);
+extern void devm_extcon_unregister_notifier_all(struct device *dev,
+                               struct extcon_dev *edev,
+                               struct notifier_block *nb);
+
 /*
  * Following API get the extcon device from devicetree.
  * This function use phandle of devicetree to get extcon device directly.
index 7251f7b..30e5c14 100644 (file)
@@ -2121,6 +2121,9 @@ extern int vfs_ustat(dev_t, struct kstatfs *);
 extern int freeze_super(struct super_block *super);
 extern int thaw_super(struct super_block *super);
 extern bool our_mnt(struct vfsmount *mnt);
+extern __printf(2, 3)
+int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
+extern int super_setup_bdi(struct super_block *sb);
 
 extern int current_umask(void);
 
index 8bd28ce..3dff239 100644 (file)
@@ -27,4 +27,16 @@ struct fwnode_handle {
        struct fwnode_handle *secondary;
 };
 
+/**
+ * struct fwnode_endpoint - Fwnode graph endpoint
+ * @port: Port number
+ * @id: Endpoint id
+ * @local_fwnode: reference to the related fwnode
+ */
+struct fwnode_endpoint {
+       unsigned int port;
+       unsigned int id;
+       const struct fwnode_handle *local_fwnode;
+};
+
 #endif
index 76f3975..acff943 100644 (file)
@@ -159,11 +159,11 @@ struct badblocks;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 
 struct blk_integrity {
-       struct blk_integrity_profile    *profile;
-       unsigned char                   flags;
-       unsigned char                   tuple_size;
-       unsigned char                   interval_exp;
-       unsigned char                   tag_size;
+       const struct blk_integrity_profile      *profile;
+       unsigned char                           flags;
+       unsigned char                           tuple_size;
+       unsigned char                           interval_exp;
+       unsigned char                           tag_size;
 };
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
@@ -722,11 +722,9 @@ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size)
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 extern void blk_integrity_add(struct gendisk *);
 extern void blk_integrity_del(struct gendisk *);
-extern void blk_integrity_revalidate(struct gendisk *);
 #else  /* CONFIG_BLK_DEV_INTEGRITY */
 static inline void blk_integrity_add(struct gendisk *disk) { }
 static inline void blk_integrity_del(struct gendisk *disk) { }
-static inline void blk_integrity_revalidate(struct gendisk *disk) { }
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
 #else /* CONFIG_BLOCK */
index 249e579..8c5b10e 100644 (file)
@@ -276,8 +276,6 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
        return timer->base->cpu_base->hres_active;
 }
 
-extern void hrtimer_peek_ahead_timers(void);
-
 /*
  * The resolution of the clocks. The resolution value is returned in
  * the clock_getres() system call to give application programmers an
@@ -300,8 +298,6 @@ extern unsigned int hrtimer_resolution;
 
 #define hrtimer_resolution     (unsigned int)LOW_RES_NSEC
 
-static inline void hrtimer_peek_ahead_timers(void) { }
-
 static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 {
        return 0;
@@ -456,7 +452,7 @@ static inline u64 hrtimer_forward_now(struct hrtimer *timer,
 }
 
 /* Precise sleep: */
-extern long hrtimer_nanosleep(struct timespec *rqtp,
+extern long hrtimer_nanosleep(struct timespec64 *rqtp,
                              struct timespec __user *rmtp,
                              const enum hrtimer_mode mode,
                              const clockid_t clockid);
index 88b6737..ceb7519 100644 (file)
@@ -337,7 +337,7 @@ struct hwmon_ops {
        int (*read)(struct device *dev, enum hwmon_sensor_types type,
                    u32 attr, int channel, long *val);
        int (*read_string)(struct device *dev, enum hwmon_sensor_types type,
-                   u32 attr, int channel, char **str);
+                   u32 attr, int channel, const char **str);
        int (*write)(struct device *dev, enum hwmon_sensor_types type,
                     u32 attr, int channel, long val);
 };
index 2f51c17..6980ca3 100644 (file)
@@ -88,7 +88,7 @@ static inline bool ata_pm_request(struct request *rq)
                 ide_req(rq)->type == ATA_PRIV_PM_RESUME);
 }
 
-/* Error codes returned in rq->errors to the higher part of the driver. */
+/* Error codes returned in result to the higher part of the driver. */
 enum {
        IDE_DRV_ERROR_GENERAL   = 101,
        IDE_DRV_ERROR_FILEMARK  = 102,
index 4cca05c..636ebe8 100644 (file)
@@ -43,6 +43,8 @@
 #define _LINUX_INET_H
 
 #include <linux/types.h>
+#include <net/net_namespace.h>
+#include <linux/socket.h>
 
 /*
  * These mimic similar macros defined in user-space for inet_ntop(3).
@@ -54,4 +56,8 @@
 extern __be32 in_aton(const char *str);
 extern int in4_pton(const char *src, int srclen, u8 *dst, int delim, const char **end);
 extern int in6_pton(const char *src, int srclen, u8 *dst, int delim, const char **end);
+
+extern int inet_pton_with_scope(struct net *net, unsigned short af,
+               const char *src, const char *port, struct sockaddr_storage *addr);
+
 #endif /* _LINUX_INET_H */
index 91d9049..2c487e0 100644 (file)
@@ -181,6 +181,7 @@ extern struct cred init_cred;
 #ifdef CONFIG_RT_MUTEXES
 # define INIT_RT_MUTEXES(tsk)                                          \
        .pi_waiters = RB_ROOT,                                          \
+       .pi_top_task = NULL,                                            \
        .pi_waiters_leftmost = NULL,
 #else
 # define INIT_RT_MUTEXES(tsk)
index eafc965..dc30f3d 100644 (file)
@@ -96,6 +96,9 @@
 #define GICH_MISR_EOI                  (1 << 0)
 #define GICH_MISR_U                    (1 << 1)
 
+#define GICV_PMR_PRIORITY_SHIFT                3
+#define GICV_PMR_PRIORITY_MASK         (0x1f << GICV_PMR_PRIORITY_SHIFT)
+
 #ifndef __ASSEMBLY__
 
 #include <linux/irqdomain.h>
index 7b49c71..2b0e566 100644 (file)
@@ -258,7 +258,6 @@ extern unsigned int gic_present;
 extern void gic_init(unsigned long gic_base_addr,
        unsigned long gic_addrspace_size, unsigned int cpu_vec,
        unsigned int irqbase);
-extern void gic_clocksource_init(unsigned int);
 extern u64 gic_read_count(void);
 extern unsigned int gic_get_count_width(void);
 extern u64 gic_read_compare(void);
index 4c26dc3..7ae2567 100644 (file)
@@ -438,6 +438,7 @@ extern int get_option(char **str, int *pint);
 extern char *get_options(const char *str, int nints, int *ints);
 extern unsigned long long memparse(const char *ptr, char **retptr);
 extern bool parse_option_str(const char *str, const char *option);
+extern char *next_arg(char *args, char **param, char **val);
 
 extern int core_kernel_text(unsigned long addr);
 extern int core_kernel_data(unsigned long addr);
index e628459..ca85cb8 100644 (file)
@@ -108,6 +108,8 @@ extern int __must_check kobject_rename(struct kobject *, const char *new_name);
 extern int __must_check kobject_move(struct kobject *, struct kobject *);
 
 extern struct kobject *kobject_get(struct kobject *kobj);
+extern struct kobject * __must_check kobject_get_unless_zero(
+                                               struct kobject *kobj);
 extern void kobject_put(struct kobject *kobj);
 
 extern const void *kobject_namespace(struct kobject *kobj);
index c328e4f..47e4da5 100644 (file)
@@ -267,6 +267,8 @@ extern int arch_init_kprobes(void);
 extern void show_registers(struct pt_regs *regs);
 extern void kprobes_inc_nmissed_count(struct kprobe *p);
 extern bool arch_within_kprobe_blacklist(unsigned long addr);
+extern bool arch_function_offset_within_entry(unsigned long offset);
+extern bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);
 
 extern bool within_kprobe_blacklist(unsigned long addr);
 
index d215b45..5e240b2 100644 (file)
@@ -22,7 +22,8 @@ enum pca9532_state {
        PCA9532_OFF  = 0x0,
        PCA9532_ON   = 0x1,
        PCA9532_PWM0 = 0x2,
-       PCA9532_PWM1 = 0x3
+       PCA9532_PWM1 = 0x3,
+       PCA9532_KEEP = 0xff,
 };
 
 struct pca9532_led {
@@ -44,4 +45,3 @@ struct pca9532_platform_data {
 };
 
 #endif /* __LINUX_PCA9532_H */
-
index 38c0bd7..64c56d4 100644 (file)
@@ -122,10 +122,16 @@ struct led_classdev {
        struct mutex            led_access;
 };
 
-extern int led_classdev_register(struct device *parent,
-                                struct led_classdev *led_cdev);
-extern int devm_led_classdev_register(struct device *parent,
-                                     struct led_classdev *led_cdev);
+extern int of_led_classdev_register(struct device *parent,
+                                   struct device_node *np,
+                                   struct led_classdev *led_cdev);
+#define led_classdev_register(parent, led_cdev)                                \
+       of_led_classdev_register(parent, NULL, led_cdev)
+extern int devm_of_led_classdev_register(struct device *parent,
+                                        struct device_node *np,
+                                        struct led_classdev *led_cdev);
+#define devm_led_classdev_register(parent, led_cdev)                   \
+       devm_of_led_classdev_register(parent, NULL, led_cdev)
 extern void led_classdev_unregister(struct led_classdev *led_cdev);
 extern void devm_led_classdev_unregister(struct device *parent,
                                         struct led_classdev *led_cdev);
index ca45e4a..7dfa56e 100644 (file)
@@ -56,7 +56,6 @@ typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32,
 typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *);
 typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int);
 typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
-typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *);
 typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *);
 typedef void (nvm_destroy_dma_pool_fn)(void *);
 typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t,
@@ -70,7 +69,6 @@ struct nvm_dev_ops {
        nvm_op_set_bb_fn        *set_bb_tbl;
 
        nvm_submit_io_fn        *submit_io;
-       nvm_erase_blk_fn        *erase_block;
 
        nvm_create_dma_pool_fn  *create_dma_pool;
        nvm_destroy_dma_pool_fn *destroy_dma_pool;
@@ -125,7 +123,7 @@ enum {
        /* NAND Access Modes */
        NVM_IO_SUSPEND          = 0x80,
        NVM_IO_SLC_MODE         = 0x100,
-       NVM_IO_SCRAMBLE_DISABLE = 0x200,
+       NVM_IO_SCRAMBLE_ENABLE  = 0x200,
 
        /* Block Types */
        NVM_BLK_T_FREE          = 0x0,
@@ -438,7 +436,8 @@ static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2)
 
 typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *);
 typedef sector_t (nvm_tgt_capacity_fn)(void *);
-typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *);
+typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *,
+                               int flags);
 typedef void (nvm_tgt_exit_fn)(void *);
 typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *);
 typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *);
@@ -479,10 +478,10 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *,
                              int, int);
 extern int nvm_max_phys_sects(struct nvm_tgt_dev *);
 extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *);
-extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *,
+extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int);
+extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *,
                                        const struct ppa_addr *, int, int);
-extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *);
-extern int nvm_erase_blk(struct nvm_tgt_dev *, struct ppa_addr *, int);
+extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *);
 extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *,
                           void *);
 extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t);
index 1e327bb..fffe49f 100644 (file)
@@ -361,6 +361,8 @@ static inline void lock_set_subclass(struct lockdep_map *lock,
        lock_set_class(lock, lock->name, lock->key, subclass, ip);
 }
 
+extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip);
+
 extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask);
 extern void lockdep_clear_current_reclaim_state(void);
 extern void lockdep_trace_alloc(gfp_t mask);
@@ -411,6 +413,7 @@ static inline void lockdep_on(void)
 
 # define lock_acquire(l, s, t, r, c, n, i)     do { } while (0)
 # define lock_release(l, n, i)                 do { } while (0)
+# define lock_downgrade(l, i)                  do { } while (0)
 # define lock_set_class(l, n, k, s, i)         do { } while (0)
 # define lock_set_subclass(l, s, i)            do { } while (0)
 # define lockdep_set_current_reclaim_state(g)  do { } while (0)
index 6b55c93..c20b484 100644 (file)
@@ -16,6 +16,7 @@
 
 enum brcm_message_type {
        BRCM_MESSAGE_UNKNOWN = 0,
+       BRCM_MESSAGE_BATCH,
        BRCM_MESSAGE_SPU,
        BRCM_MESSAGE_SBA,
        BRCM_MESSAGE_MAX,
@@ -23,24 +24,29 @@ enum brcm_message_type {
 
 struct brcm_sba_command {
        u64 cmd;
+       u64 *cmd_dma;
+       dma_addr_t cmd_dma_addr;
 #define BRCM_SBA_CMD_TYPE_A            BIT(0)
 #define BRCM_SBA_CMD_TYPE_B            BIT(1)
 #define BRCM_SBA_CMD_TYPE_C            BIT(2)
 #define BRCM_SBA_CMD_HAS_RESP          BIT(3)
 #define BRCM_SBA_CMD_HAS_OUTPUT                BIT(4)
        u64 flags;
-       dma_addr_t input;
-       size_t input_len;
        dma_addr_t resp;
        size_t resp_len;
-       dma_addr_t output;
-       size_t output_len;
+       dma_addr_t data;
+       size_t data_len;
 };
 
 struct brcm_message {
        enum brcm_message_type type;
        union {
                struct {
+                       struct brcm_message *msgs;
+                       unsigned int msgs_queued;
+                       unsigned int msgs_count;
+               } batch;
+               struct {
                        struct scatterlist *src;
                        struct scatterlist *dst;
                } spu;
index 7a01c94..3eef9fb 100644 (file)
  * Max bus-specific overhead incurred by request/responses.
  * I2C requires 1 additional byte for requests.
  * I2C requires 2 additional bytes for responses.
+ * SPI requires up to 32 additional bytes for responses.
  * */
 #define EC_PROTO_VERSION_UNKNOWN       0
 #define EC_MAX_REQUEST_OVERHEAD                1
-#define EC_MAX_RESPONSE_OVERHEAD       2
+#define EC_MAX_RESPONSE_OVERHEAD       32
 
 /*
  * Command interface between EC and AP, for LPC, I2C and SPI interfaces.
index b4031c2..53758a7 100644 (file)
@@ -14,6 +14,9 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/device.h>
+#include <linux/regmap.h>
+
 #define CPCAP_VENDOR_ST                0
 #define CPCAP_VENDOR_TI                1
 
diff --git a/include/linux/mg_disk.h b/include/linux/mg_disk.h
deleted file mode 100644 (file)
index e11f4d9..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- *  include/linux/mg_disk.c
- *
- *  Private data for mflash platform driver
- *
- * (c) 2008 mGine Co.,LTD
- * (c) 2008 unsik Kim <donari75@gmail.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2 as
- *  published by the Free Software Foundation.
- */
-
-#ifndef __MG_DISK_H__
-#define __MG_DISK_H__
-
-/* name for platform device */
-#define MG_DEV_NAME "mg_disk"
-
-/* names of GPIO resource */
-#define MG_RST_PIN     "mg_rst"
-/* except MG_BOOT_DEV, reset-out pin should be assigned */
-#define MG_RSTOUT_PIN  "mg_rstout"
-
-/* device attribution */
-/* use mflash as boot device */
-#define MG_BOOT_DEV            (1 << 0)
-/* use mflash as storage device */
-#define MG_STORAGE_DEV         (1 << 1)
-/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */
-#define MG_STORAGE_DEV_SKIP_RST        (1 << 2)
-
-/* private driver data */
-struct mg_drv_data {
-       /* disk resource */
-       u32 use_polling;
-
-       /* device attribution */
-       u32 dev_attr;
-
-       /* internally used */
-       void *host;
-};
-
-#endif
index aab032a..97ca105 100644 (file)
@@ -53,7 +53,7 @@ struct sdio_func {
        unsigned int            state;          /* function state */
 #define SDIO_STATE_PRESENT     (1<<0)          /* present in sysfs */
 
-       u8                      tmpbuf[4];      /* DMA:able scratch buffer */
+       u8                      *tmpbuf;        /* DMA:able scratch buffer */
 
        unsigned                num_info;       /* number of info strings */
        const char              **info;         /* info strings */
index 51891fb..c91b3bc 100644 (file)
@@ -394,18 +394,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
        ___pud;                                                         \
 })
 
-#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd)           \
-({                                                                     \
-       unsigned long ___haddr = __haddr & HPAGE_PMD_MASK;              \
-       pmd_t ___pmd;                                                   \
-                                                                       \
-       ___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd);         \
-       mmu_notifier_invalidate_range(__mm, ___haddr,                   \
-                                     ___haddr + HPAGE_PMD_SIZE);       \
-                                                                       \
-       ___pmd;                                                         \
-})
-
 /*
  * set_pte_at_notify() sets the pte _after_ running the notifier.
  * This is safe to start by updating the secondary MMUs, because the primary MMU
@@ -489,7 +477,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 #define        ptep_clear_flush_notify ptep_clear_flush
 #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
 #define pudp_huge_clear_flush_notify pudp_huge_clear_flush
-#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear
 #define set_pte_at_notify set_pte_at
 
 #endif /* CONFIG_MMU_NOTIFIER */
index 0297c5c..9ad6856 100644 (file)
@@ -493,6 +493,7 @@ static inline int module_is_live(struct module *mod)
 struct module *__module_text_address(unsigned long addr);
 struct module *__module_address(unsigned long addr);
 bool is_module_address(unsigned long addr);
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr);
 bool is_module_percpu_address(unsigned long addr);
 bool is_module_text_address(unsigned long addr);
 
@@ -660,6 +661,11 @@ static inline bool is_module_percpu_address(unsigned long addr)
        return false;
 }
 
+static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+{
+       return false;
+}
+
 static inline bool is_module_text_address(unsigned long addr)
 {
        return false;
index eebdc63..79b176e 100644 (file)
@@ -334,11 +334,6 @@ struct mtd_info {
        int (*_get_device) (struct mtd_info *mtd);
        void (*_put_device) (struct mtd_info *mtd);
 
-       /* Backing device capabilities for this device
-        * - provides mmap capabilities
-        */
-       struct backing_dev_info *backing_dev_info;
-
        struct notifier_block reboot_notifier;  /* default mode before reboot */
 
        /* ECC status information */
index b34097c..e1502c5 100644 (file)
@@ -133,7 +133,6 @@ struct nfs_server {
        struct rpc_clnt *       client_acl;     /* ACL RPC client handle */
        struct nlm_host         *nlm_host;      /* NLM client handle */
        struct nfs_iostats __percpu *io_stats;  /* I/O statistics */
-       struct backing_dev_info backing_dev_info;
        atomic_long_t           writeback;      /* number of writeback pages */
        int                     flags;          /* various flags */
        unsigned int            caps;           /* server capabilities */
index f21471f..0db3715 100644 (file)
@@ -137,9 +137,9 @@ enum nvmefc_fcp_datadir {
  *             transferred. Should equal payload_length on success.
  * @rcv_rsplen: length, in bytes, of the FCP RSP IU received.
  * @status:    Completion status of the FCP operation. must be 0 upon success,
- *             NVME_SC_FC_xxx value upon failure. Note: this is NOT a
- *             reflection of the NVME CQE completion status. Only the status
- *             of the FCP operation at the NVME-FC level.
+ *             negative errno value upon failure (ex: -EIO). Note: this is
+ *             NOT a reflection of the NVME CQE completion status. Only the
+ *             status of the FCP operation at the NVME-FC level.
  */
 struct nvmefc_fcp_req {
        void                    *cmdaddr;
@@ -533,9 +533,6 @@ enum {
                                         * rsp as well
                                         */
        NVMET_FCOP_RSP          = 4,    /* send rsp frame */
-       NVMET_FCOP_ABORT        = 5,    /* abort exchange via ABTS */
-       NVMET_FCOP_BA_ACC       = 6,    /* send BA_ACC */
-       NVMET_FCOP_BA_RJT       = 7,    /* send BA_RJT */
 };
 
 /**
@@ -572,8 +569,6 @@ enum {
  *     upon compeletion of the operation.  The nvmet-fc layer will also set a
  *     private pointer for its own use in the done routine.
  *
- * Note: the LLDD must never fail a NVMET_FCOP_ABORT request !!
- *
  * Values set by the NVMET-FC layer prior to calling the LLDD fcp_op
  * entrypoint.
  * @op:       Indicates the FCP IU operation to perform (see NVMET_FCOP_xxx)
@@ -655,6 +650,22 @@ enum {
                 * on. The transport should pick a cpu to schedule the work
                 * on.
                 */
+       NVMET_FCTGTFEAT_CMD_IN_ISR = (1 << 2),
+               /* Bit 2: When 0, the LLDD is calling the cmd rcv handler
+                * in a non-isr context, allowing the transport to finish
+                * op completion in the calling context. When 1, the LLDD
+                * is calling the cmd rcv handler in an ISR context,
+                * requiring the transport to transition to a workqueue
+                * for op completion.
+                */
+       NVMET_FCTGTFEAT_OPDONE_IN_ISR = (1 << 3),
+               /* Bit 3: When 0, the LLDD is calling the op done handler
+                * in a non-isr context, allowing the transport to finish
+                * op completion in the calling context. When 1, the LLDD
+                * is calling the op done handler in an ISR context,
+                * requiring the transport to transition to a workqueue
+                * for op completion.
+                */
 };
 
 
@@ -725,12 +736,12 @@ struct nvmet_fc_target_port {
  *       be freed/released.
  *       Entrypoint is Mandatory.
  *
- * @fcp_op:  Called to perform a data transfer, transmit a response, or
- *       abort an FCP opertion. The nvmefc_tgt_fcp_req structure is the same
- *       LLDD-supplied exchange structure specified in the
- *       nvmet_fc_rcv_fcp_req() call made when the FCP CMD IU was received.
- *       The op field in the structure shall indicate the operation for
- *       the LLDD to perform relative to the io.
+ * @fcp_op:  Called to perform a data transfer or transmit a response.
+ *       The nvmefc_tgt_fcp_req structure is the same LLDD-supplied
+ *       exchange structure specified in the nvmet_fc_rcv_fcp_req() call
+ *       made when the FCP CMD IU was received. The op field in the
+ *       structure shall indicate the operation for the LLDD to perform
+ *       relative to the io.
  *         NVMET_FCOP_READDATA operation: the LLDD is to send the
  *           payload data (described by sglist) to the host in 1 or
  *           more FC sequences (preferrably 1).  Note: the fc-nvme layer
@@ -752,29 +763,31 @@ struct nvmet_fc_target_port {
  *           successfully, the LLDD is to update the nvmefc_tgt_fcp_req
  *           transferred_length field and may subsequently transmit the
  *           FCP_RSP iu payload (described by rspbuf, rspdma, rsplen).
- *           The LLDD is to await FCP_CONF reception to confirm the RSP
- *           reception by the host. The LLDD may retramsit the FCP_RSP iu
- *           if necessary per FC-NVME. Upon reception of FCP_CONF, or upon
- *           FCP_CONF failure, the LLDD is to set the nvmefc_tgt_fcp_req
- *           fcp_error field and consider the operation complete..
+ *           If FCP_CONF is supported, the LLDD is to await FCP_CONF
+ *           reception to confirm the RSP reception by the host. The LLDD
+ *           may retransmit the FCP_RSP iu if necessary per FC-NVME. Upon
+ *           transmission of the FCP_RSP iu if FCP_CONF is not supported,
+ *           or upon success/failure of FCP_CONF if it is supported, the
+ *           LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and
+ *           consider the operation complete.
  *         NVMET_FCOP_RSP: the LLDD is to transmit the FCP_RSP iu payload
- *           (described by rspbuf, rspdma, rsplen).  The LLDD is to await
- *           FCP_CONF reception to confirm the RSP reception by the host.
- *           The LLDD may retramsit the FCP_RSP iu if necessary per FC-NVME.
- *           Upon reception of FCP_CONF, or upon FCP_CONF failure, the
+ *           (described by rspbuf, rspdma, rsplen). If FCP_CONF is
+ *           supported, the LLDD is to await FCP_CONF reception to confirm
+ *           the RSP reception by the host. The LLDD may retransmit the
+ *           FCP_RSP iu if FCP_CONF is not received per FC-NVME. Upon
+ *           transmission of the FCP_RSP iu if FCP_CONF is not supported,
+ *           or upon success/failure of FCP_CONF if it is supported, the
  *           LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and
- *           consider the operation complete..
- *         NVMET_FCOP_ABORT: the LLDD is to terminate the exchange
- *           corresponding to the fcp operation. The LLDD shall send
- *           ABTS and follow FC exchange abort-multi rules, including
- *           ABTS retries and possible logout.
+ *           consider the operation complete.
  *       Upon completing the indicated operation, the LLDD is to set the
  *       status fields for the operation (tranferred_length and fcp_error
- *       status) in the request, then all the "done" routine
- *       indicated in the fcp request.  Upon return from the "done"
- *       routine for either a NVMET_FCOP_RSP or NVMET_FCOP_ABORT operation
- *       the fc-nvme layer will not longer reference the fcp request,
- *       allowing the LLDD to free/release the fcp request.
+ *       status) in the request, then call the "done" routine
+ *       indicated in the fcp request. After the operation completes,
+ *       regardless of whether the FCP_RSP iu was successfully transmitted,
+ *       the LLDD-supplied exchange structure must remain valid until the
+ *       transport calls the fcp_req_release() callback to return ownership
+ *       of the exchange structure back to the LLDD so that it may be used
+ *       for another fcp command.
  *       Note: when calling the done routine for READDATA or WRITEDATA
  *       operations, the fc-nvme layer may immediate convert, in the same
  *       thread and before returning to the LLDD, the fcp operation to
@@ -786,6 +799,22 @@ struct nvmet_fc_target_port {
  *       Returns 0 on success, -<errno> on failure (Ex: -EIO)
  *       Entrypoint is Mandatory.
  *
+ * @fcp_abort:  Called by the transport to abort an active command.
+ *       The command may be in-between operations (nothing active in LLDD)
+ *       or may have an active WRITEDATA operation pending. The LLDD is to
+ *       initiate the ABTS process for the command and return from the
+ *       callback. The ABTS does not need to be complete on the command.
+ *       The fcp_abort callback inherently cannot fail. After the
+ *       fcp_abort() callback completes, the transport will wait for any
+ *       outstanding operation (if there was one) to complete, then will
+ *       call the fcp_req_release() callback to return the command's
+ *       exchange context back to the LLDD.
+ *
+ * @fcp_req_release:  Called by the transport to return a nvmefc_tgt_fcp_req
+ *       to the LLDD after all operations on the fcp operation are complete.
+ *       This may be due to the command completing or upon completion of
+ *       abort cleanup.
+ *
  * @max_hw_queues:  indicates the maximum number of hw queues the LLDD
  *       supports for cpu affinitization.
  *       Value is Mandatory. Must be at least 1.
@@ -820,7 +849,11 @@ struct nvmet_fc_target_template {
        int (*xmt_ls_rsp)(struct nvmet_fc_target_port *tgtport,
                                struct nvmefc_tgt_ls_req *tls_req);
        int (*fcp_op)(struct nvmet_fc_target_port *tgtport,
-                               struct nvmefc_tgt_fcp_req *);
+                               struct nvmefc_tgt_fcp_req *fcpreq);
+       void (*fcp_abort)(struct nvmet_fc_target_port *tgtport,
+                               struct nvmefc_tgt_fcp_req *fcpreq);
+       void (*fcp_req_release)(struct nvmet_fc_target_port *tgtport,
+                               struct nvmefc_tgt_fcp_req *fcpreq);
 
        u32     max_hw_queues;
        u16     max_sgl_segments;
@@ -848,4 +881,7 @@ int nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *tgtport,
                        struct nvmefc_tgt_fcp_req *fcpreq,
                        void *cmdiubuf, u32 cmdiubuf_len);
 
+void nvmet_fc_rcv_fcp_abort(struct nvmet_fc_target_port *tgtport,
+                       struct nvmefc_tgt_fcp_req *fcpreq);
+
 #endif /* _NVME_FC_DRIVER_H */
index 4b45226..e997c4a 100644 (file)
@@ -16,8 +16,7 @@
  */
 
 /*
- * This file contains definitions relative to FC-NVME r1.11 and a few
- * newer items
+ * This file contains definitions relative to FC-NVME r1.14 (16-020vB).
  */
 
 #ifndef _NVME_FC_H
@@ -47,8 +46,15 @@ struct nvme_fc_cmd_iu {
 
 #define NVME_FC_SIZEOF_ZEROS_RSP       12
 
+enum {
+       FCNVME_SC_SUCCESS               = 0,
+       FCNVME_SC_INVALID_FIELD         = 1,
+       FCNVME_SC_INVALID_CONNID        = 2,
+};
+
 struct nvme_fc_ersp_iu {
-       __u8                    rsvd0[2];
+       __u8                    status_code;
+       __u8                    rsvd1;
        __be16                  iu_len;
        __be32                  rsn;
        __be32                  xfrd_len;
@@ -58,7 +64,7 @@ struct nvme_fc_ersp_iu {
 };
 
 
-/* FC-NVME r1.03/16-119v0 NVME Link Services */
+/* FC-NVME Link Services */
 enum {
        FCNVME_LS_RSVD                  = 0,
        FCNVME_LS_RJT                   = 1,
@@ -68,7 +74,7 @@ enum {
        FCNVME_LS_DISCONNECT            = 5,
 };
 
-/* FC-NVME r1.03/16-119v0 NVME Link Service Descriptors */
+/* FC-NVME Link Service Descriptors */
 enum {
        FCNVME_LSDESC_RSVD              = 0x0,
        FCNVME_LSDESC_RQST              = 0x1,
@@ -92,7 +98,6 @@ static inline __be32 fcnvme_lsdesc_len(size_t sz)
        return cpu_to_be32(sz - (2 * sizeof(u32)));
 }
 
-
 struct fcnvme_ls_rqst_w0 {
        u8      ls_cmd;                 /* FCNVME_LS_xxx */
        u8      zeros[3];
@@ -106,8 +111,53 @@ struct fcnvme_lsdesc_rqst {
        __be32  rsvd12;
 };
 
+/* FC-NVME LS RJT reason_code values */
+enum fcnvme_ls_rjt_reason {
+       FCNVME_RJT_RC_NONE              = 0,
+       /* no reason - not to be sent */
+
+       FCNVME_RJT_RC_INVAL             = 0x01,
+       /* invalid NVMe_LS command code */
+
+       FCNVME_RJT_RC_LOGIC             = 0x03,
+       /* logical error */
+
+       FCNVME_RJT_RC_UNAB              = 0x09,
+       /* unable to perform command request */
+
+       FCNVME_RJT_RC_UNSUP             = 0x0b,
+       /* command not supported */
+
+       FCNVME_RJT_RC_INPROG            = 0x0e,
+       /* command already in progress */
 
+       FCNVME_RJT_RC_INV_ASSOC         = 0x40,
+       /* Invalid Association ID */
 
+       FCNVME_RJT_RC_INV_CONN          = 0x41,
+       /* Invalid Connection ID */
+
+       FCNVME_RJT_RC_VENDOR            = 0xff,
+       /* vendor specific error */
+};
+
+/* FC-NVME LS RJT reason_explanation values */
+enum fcnvme_ls_rjt_explan {
+       FCNVME_RJT_EXP_NONE             = 0x00,
+       /* No additional explanation */
+
+       FCNVME_RJT_EXP_OXID_RXID        = 0x17,
+       /* invalid OX_ID-RX_ID combination */
+
+       FCNVME_RJT_EXP_INSUF_RES        = 0x29,
+       /* insufficient resources */
+
+       FCNVME_RJT_EXP_UNAB_DATA        = 0x2a,
+       /* unable to supply requested data */
+
+       FCNVME_RJT_EXP_INV_LEN          = 0x2d,
+       /* Invalid payload length */
+};
 
 /* FCNVME_LSDESC_RJT */
 struct fcnvme_lsdesc_rjt {
@@ -119,15 +169,15 @@ struct fcnvme_lsdesc_rjt {
         * Reject reason and explanaction codes are generic
         * to ELs's from LS-3.
         */
-       u8      reason_code;
-       u8      reason_explanation;
+       u8      reason_code;            /* fcnvme_ls_rjt_reason */
+       u8      reason_explanation;     /* fcnvme_ls_rjt_explan */
 
        u8      vendor;
        __be32  rsvd12;
 };
 
 
-#define FCNVME_ASSOC_HOSTID_LEN                64
+#define FCNVME_ASSOC_HOSTID_LEN                16
 #define FCNVME_ASSOC_HOSTNQN_LEN       256
 #define FCNVME_ASSOC_SUBNQN_LEN                256
 
index c43d435..b625bac 100644 (file)
@@ -64,26 +64,26 @@ enum {
  * RDMA_QPTYPE field
  */
 enum {
-       NVMF_RDMA_QPTYPE_CONNECTED      = 0, /* Reliable Connected */
-       NVMF_RDMA_QPTYPE_DATAGRAM       = 1, /* Reliable Datagram */
+       NVMF_RDMA_QPTYPE_CONNECTED      = 1, /* Reliable Connected */
+       NVMF_RDMA_QPTYPE_DATAGRAM       = 2, /* Reliable Datagram */
 };
 
 /* RDMA QP Service Type codes for Discovery Log Page entry TSAS
  * RDMA_QPTYPE field
  */
 enum {
-       NVMF_RDMA_PRTYPE_NOT_SPECIFIED  = 0, /* No Provider Specified */
-       NVMF_RDMA_PRTYPE_IB             = 1, /* InfiniBand */
-       NVMF_RDMA_PRTYPE_ROCE           = 2, /* InfiniBand RoCE */
-       NVMF_RDMA_PRTYPE_ROCEV2         = 3, /* InfiniBand RoCEV2 */
-       NVMF_RDMA_PRTYPE_IWARP          = 4, /* IWARP */
+       NVMF_RDMA_PRTYPE_NOT_SPECIFIED  = 1, /* No Provider Specified */
+       NVMF_RDMA_PRTYPE_IB             = 2, /* InfiniBand */
+       NVMF_RDMA_PRTYPE_ROCE           = 3, /* InfiniBand RoCE */
+       NVMF_RDMA_PRTYPE_ROCEV2         = 4, /* InfiniBand RoCEV2 */
+       NVMF_RDMA_PRTYPE_IWARP          = 5, /* IWARP */
 };
 
 /* RDMA Connection Management Service Type codes for Discovery Log Page
  * entry TSAS RDMA_CMS field
  */
 enum {
-       NVMF_RDMA_CMS_RDMA_CM   = 0, /* Sockets based enpoint addressing */
+       NVMF_RDMA_CMS_RDMA_CM   = 1, /* Sockets based endpoint addressing */
 };
 
 #define NVMF_AQ_DEPTH          32
@@ -245,6 +245,7 @@ enum {
        NVME_CTRL_ONCS_WRITE_ZEROES             = 1 << 3,
        NVME_CTRL_VWC_PRESENT                   = 1 << 0,
        NVME_CTRL_OACS_SEC_SUPP                 = 1 << 0,
+       NVME_CTRL_OACS_DBBUF_SUPP               = 1 << 7,
 };
 
 struct nvme_lbaf {
@@ -603,6 +604,7 @@ enum nvme_admin_opcode {
        nvme_admin_download_fw          = 0x11,
        nvme_admin_ns_attach            = 0x15,
        nvme_admin_keep_alive           = 0x18,
+       nvme_admin_dbbuf                = 0x7C,
        nvme_admin_format_nvm           = 0x80,
        nvme_admin_security_send        = 0x81,
        nvme_admin_security_recv        = 0x82,
@@ -874,6 +876,16 @@ struct nvmf_property_get_command {
        __u8            resv4[16];
 };
 
+struct nvme_dbbuf {
+       __u8                    opcode;
+       __u8                    flags;
+       __u16                   command_id;
+       __u32                   rsvd1[5];
+       __le64                  prp1;
+       __le64                  prp2;
+       __u32                   rsvd12[6];
+};
+
 struct nvme_command {
        union {
                struct nvme_common_command common;
@@ -893,6 +905,7 @@ struct nvme_command {
                struct nvmf_connect_command connect;
                struct nvmf_property_set_command prop_set;
                struct nvmf_property_get_command prop_get;
+               struct nvme_dbbuf dbbuf;
        };
 };
 
index 21e6323..e5d4225 100644 (file)
@@ -159,6 +159,8 @@ static inline struct device_node *to_of_node(struct fwnode_handle *fwnode)
                container_of(fwnode, struct device_node, fwnode) : NULL;
 }
 
+#define of_fwnode_handle(node) (&(node)->fwnode)
+
 static inline bool of_have_populated_dt(void)
 {
        return of_root != NULL;
@@ -602,6 +604,8 @@ static inline struct device_node *of_find_node_with_property(
        return NULL;
 }
 
+#define of_fwnode_handle(node) NULL
+
 static inline bool of_have_populated_dt(void)
 {
        return false;
index 56939d3..491b3f5 100644 (file)
@@ -110,6 +110,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size,
 #endif
 
 extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align);
+extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr);
 extern bool is_kernel_percpu_address(unsigned long addr);
 
 #if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
index 000fdb2..24a6358 100644 (file)
@@ -165,6 +165,13 @@ struct hw_perf_event {
                        struct list_head                bp_list;
                };
 #endif
+               struct { /* amd_iommu */
+                       u8      iommu_bank;
+                       u8      iommu_cntr;
+                       u16     padding;
+                       u64     conf;
+                       u64     conf1;
+               };
        };
        /*
         * If the event is a per task event, this will point to the task in
@@ -801,6 +808,7 @@ struct perf_output_handle {
        struct ring_buffer              *rb;
        unsigned long                   wakeup;
        unsigned long                   size;
+       u64                             aux_flags;
        union {
                void                    *addr;
                unsigned long           head;
@@ -849,10 +857,11 @@ perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx)
 extern void *perf_aux_output_begin(struct perf_output_handle *handle,
                                   struct perf_event *event);
 extern void perf_aux_output_end(struct perf_output_handle *handle,
-                               unsigned long size, bool truncated);
+                               unsigned long size);
 extern int perf_aux_output_skip(struct perf_output_handle *handle,
                                unsigned long size);
 extern void *perf_get_aux(struct perf_output_handle *handle);
+extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
 
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
 extern void perf_pmu_unregister(struct pmu *pmu);
@@ -1112,6 +1121,7 @@ extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks
 
 extern void perf_event_exec(void);
 extern void perf_event_comm(struct task_struct *tsk, bool exec);
+extern void perf_event_namespaces(struct task_struct *tsk);
 extern void perf_event_fork(struct task_struct *tsk);
 
 /* Callchains */
@@ -1267,8 +1277,8 @@ static inline void *
 perf_aux_output_begin(struct perf_output_handle *handle,
                      struct perf_event *event)                         { return NULL; }
 static inline void
-perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
-                   bool truncated)                                     { }
+perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
+                                                                       { }
 static inline int
 perf_aux_output_skip(struct perf_output_handle *handle,
                     unsigned long size)                                { return -EINVAL; }
@@ -1315,6 +1325,7 @@ static inline int perf_unregister_guest_info_callbacks
 static inline void perf_event_mmap(struct vm_area_struct *vma)         { }
 static inline void perf_event_exec(void)                               { }
 static inline void perf_event_comm(struct task_struct *tsk, bool exec) { }
+static inline void perf_event_namespaces(struct task_struct *tsk)      { }
 static inline void perf_event_fork(struct task_struct *tsk)            { }
 static inline void perf_event_init(void)                               { }
 static inline int  perf_swevent_get_recursion_context(void)            { return -1; }
index 43a7748..fb38573 100644 (file)
@@ -852,6 +852,7 @@ void phy_change_work(struct work_struct *work);
 void phy_mac_interrupt(struct phy_device *phydev, int new_link);
 void phy_start_machine(struct phy_device *phydev);
 void phy_stop_machine(struct phy_device *phydev);
+void phy_trigger_machine(struct phy_device *phydev, bool sync);
 int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd);
 int phy_ethtool_gset(struct phy_device *phydev, struct ethtool_cmd *cmd);
 int phy_ethtool_ksettings_get(struct phy_device *phydev,
index 8ce2d87..5e45385 100644 (file)
@@ -145,8 +145,9 @@ struct pinctrl_desc {
 extern int pinctrl_register_and_init(struct pinctrl_desc *pctldesc,
                                     struct device *dev, void *driver_data,
                                     struct pinctrl_dev **pctldev);
+extern int pinctrl_enable(struct pinctrl_dev *pctldev);
 
-/* Please use pinctrl_register_and_init() instead */
+/* Please use pinctrl_register_and_init() and pinctrl_enable() instead */
 extern struct pinctrl_dev *pinctrl_register(struct pinctrl_desc *pctldesc,
                                struct device *dev, void *driver_data);
 
index 5339ed5..9b6abe6 100644 (file)
@@ -20,6 +20,7 @@
 /* Defines used for the flags field in the struct generic_pm_domain */
 #define GENPD_FLAG_PM_CLK      (1U << 0) /* PM domain uses PM clk */
 #define GENPD_FLAG_IRQ_SAFE    (1U << 1) /* PM domain operates in atomic */
+#define GENPD_FLAG_ALWAYS_ON   (1U << 2) /* PM domain is always powered on */
 
 enum gpd_status {
        GPD_STATE_ACTIVE = 0,   /* PM domain is active */
index 34c4498..83b22ae 100644 (file)
@@ -59,23 +59,23 @@ struct posix_clock_operations {
 
        int  (*clock_adjtime)(struct posix_clock *pc, struct timex *tx);
 
-       int  (*clock_gettime)(struct posix_clock *pc, struct timespec *ts);
+       int  (*clock_gettime)(struct posix_clock *pc, struct timespec64 *ts);
 
-       int  (*clock_getres) (struct posix_clock *pc, struct timespec *ts);
+       int  (*clock_getres) (struct posix_clock *pc, struct timespec64 *ts);
 
        int  (*clock_settime)(struct posix_clock *pc,
-                             const struct timespec *ts);
+                             const struct timespec64 *ts);
 
        int  (*timer_create) (struct posix_clock *pc, struct k_itimer *kit);
 
        int  (*timer_delete) (struct posix_clock *pc, struct k_itimer *kit);
 
        void (*timer_gettime)(struct posix_clock *pc,
-                             struct k_itimer *kit, struct itimerspec *tsp);
+                             struct k_itimer *kit, struct itimerspec64 *tsp);
 
        int  (*timer_settime)(struct posix_clock *pc,
                              struct k_itimer *kit, int flags,
-                             struct itimerspec *tsp, struct itimerspec *old);
+                             struct itimerspec64 *tsp, struct itimerspec64 *old);
        /*
         * Optional character device methods:
         */
index 64aa189..8c1e43a 100644 (file)
@@ -87,22 +87,22 @@ struct k_itimer {
 };
 
 struct k_clock {
-       int (*clock_getres) (const clockid_t which_clock, struct timespec *tp);
+       int (*clock_getres) (const clockid_t which_clock, struct timespec64 *tp);
        int (*clock_set) (const clockid_t which_clock,
-                         const struct timespec *tp);
-       int (*clock_get) (const clockid_t which_clock, struct timespec * tp);
+                         const struct timespec64 *tp);
+       int (*clock_get) (const clockid_t which_clock, struct timespec64 *tp);
        int (*clock_adj) (const clockid_t which_clock, struct timex *tx);
        int (*timer_create) (struct k_itimer *timer);
        int (*nsleep) (const clockid_t which_clock, int flags,
-                      struct timespec *, struct timespec __user *);
+                      struct timespec64 *, struct timespec __user *);
        long (*nsleep_restart) (struct restart_block *restart_block);
-       int (*timer_set) (struct k_itimer * timr, int flags,
-                         struct itimerspec * new_setting,
-                         struct itimerspec * old_setting);
-       int (*timer_del) (struct k_itimer * timr);
+       int (*timer_set) (struct k_itimer *timr, int flags,
+                         struct itimerspec64 *new_setting,
+                         struct itimerspec64 *old_setting);
+       int (*timer_del) (struct k_itimer *timr);
 #define TIMER_RETRY 1
-       void (*timer_get) (struct k_itimer * timr,
-                          struct itimerspec * cur_setting);
+       void (*timer_get) (struct k_itimer *timr,
+                          struct itimerspec64 *cur_setting);
 };
 
 extern struct k_clock clock_posix_cpu;
diff --git a/include/linux/power/bq24190_charger.h b/include/linux/power/bq24190_charger.h
deleted file mode 100644 (file)
index 9f02837..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * Platform data for the TI bq24190 battery charger driver.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef _BQ24190_CHARGER_H_
-#define _BQ24190_CHARGER_H_
-
-struct bq24190_platform_data {
-       unsigned int    gpio_int;       /* GPIO pin that's connected to INT# */
-};
-
-#endif
index 64e3a9c..2f48261 100644 (file)
@@ -33,6 +33,8 @@ enum dev_dma_attr {
        DEV_DMA_COHERENT,
 };
 
+struct fwnode_handle *dev_fwnode(struct device *dev);
+
 bool device_property_present(struct device *dev, const char *propname);
 int device_property_read_u8_array(struct device *dev, const char *propname,
                                  u8 *val, size_t nval);
@@ -70,6 +72,15 @@ int fwnode_property_read_string(struct fwnode_handle *fwnode,
 int fwnode_property_match_string(struct fwnode_handle *fwnode,
                                 const char *propname, const char *string);
 
+struct fwnode_handle *fwnode_get_parent(struct fwnode_handle *fwnode);
+struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode);
+struct fwnode_handle *fwnode_get_next_child_node(struct fwnode_handle *fwnode,
+                                                struct fwnode_handle *child);
+
+#define fwnode_for_each_child_node(fwnode, child)                      \
+       for (child = fwnode_get_next_child_node(fwnode, NULL); child;   \
+            child = fwnode_get_next_child_node(fwnode, child))
+
 struct fwnode_handle *device_get_next_child_node(struct device *dev,
                                                 struct fwnode_handle *child);
 
@@ -77,9 +88,12 @@ struct fwnode_handle *device_get_next_child_node(struct device *dev,
        for (child = device_get_next_child_node(dev, NULL); child;      \
             child = device_get_next_child_node(dev, child))
 
+struct fwnode_handle *fwnode_get_named_child_node(struct fwnode_handle *fwnode,
+                                                 const char *childname);
 struct fwnode_handle *device_get_named_child_node(struct device *dev,
                                                  const char *childname);
 
+void fwnode_handle_get(struct fwnode_handle *fwnode);
 void fwnode_handle_put(struct fwnode_handle *fwnode);
 
 unsigned int device_get_child_node_count(struct device *dev);
@@ -258,4 +272,16 @@ int device_get_phy_mode(struct device *dev);
 
 void *device_get_mac_address(struct device *dev, char *addr, int alen);
 
+struct fwnode_handle *fwnode_graph_get_next_endpoint(
+       struct fwnode_handle *fwnode, struct fwnode_handle *prev);
+struct fwnode_handle *fwnode_graph_get_remote_port_parent(
+       struct fwnode_handle *fwnode);
+struct fwnode_handle *fwnode_graph_get_remote_port(
+       struct fwnode_handle *fwnode);
+struct fwnode_handle *fwnode_graph_get_remote_endpoint(
+       struct fwnode_handle *fwnode);
+
+int fwnode_graph_parse_endpoint(struct fwnode_handle *fwnode,
+                               struct fwnode_endpoint *endpoint);
+
 #endif /* _LINUX_PROPERTY_H_ */
index 2aceeaf..ffb1471 100644 (file)
@@ -1,14 +1,25 @@
 #ifndef __RAS_H__
 #define __RAS_H__
 
+#include <asm/errno.h>
+
 #ifdef CONFIG_DEBUG_FS
 int ras_userspace_consumers(void);
 void ras_debugfs_init(void);
 int ras_add_daemon_trace(void);
 #else
 static inline int ras_userspace_consumers(void) { return 0; }
-static inline void ras_debugfs_init(void) { return; }
+static inline void ras_debugfs_init(void) { }
 static inline int ras_add_daemon_trace(void) { return 0; }
 #endif
 
+#ifdef CONFIG_RAS_CEC
+void __init cec_init(void);
+int __init parse_cec_param(char *str);
+int cec_add_elem(u64 pfn);
+#else
+static inline void __init cec_init(void)       { }
+static inline int cec_add_elem(u64 pfn)                { return -ENODEV; }
 #endif
+
+#endif /* __RAS_H__ */
index 0023fee..b34aa64 100644 (file)
@@ -6,17 +6,36 @@
 #include <linux/spinlock.h>
 #include <linux/kernel.h>
 
+/**
+ * refcount_t - variant of atomic_t specialized for reference counts
+ * @refs: atomic_t counter field
+ *
+ * The counter saturates at UINT_MAX and will not move once
+ * there. This avoids wrapping the counter and causing 'spurious'
+ * use-after-free bugs.
+ */
 typedef struct refcount_struct {
        atomic_t refs;
 } refcount_t;
 
 #define REFCOUNT_INIT(n)       { .refs = ATOMIC_INIT(n), }
 
+/**
+ * refcount_set - set a refcount's value
+ * @r: the refcount
+ * @n: value to which the refcount will be set
+ */
 static inline void refcount_set(refcount_t *r, unsigned int n)
 {
        atomic_set(&r->refs, n);
 }
 
+/**
+ * refcount_read - get a refcount's value
+ * @r: the refcount
+ *
+ * Return: the refcount's value
+ */
 static inline unsigned int refcount_read(const refcount_t *r)
 {
        return atomic_read(&r->refs);
index 96fb139..13d8681 100644 (file)
@@ -15,6 +15,9 @@ int reset_control_status(struct reset_control *rstc);
 struct reset_control *__of_reset_control_get(struct device_node *node,
                                     const char *id, int index, bool shared,
                                     bool optional);
+struct reset_control *__reset_control_get(struct device *dev, const char *id,
+                                         int index, bool shared,
+                                         bool optional);
 void reset_control_put(struct reset_control *rstc);
 struct reset_control *__devm_reset_control_get(struct device *dev,
                                     const char *id, int index, bool shared,
@@ -72,6 +75,13 @@ static inline struct reset_control *__of_reset_control_get(
        return optional ? NULL : ERR_PTR(-ENOTSUPP);
 }
 
+static inline struct reset_control *__reset_control_get(
+                                       struct device *dev, const char *id,
+                                       int index, bool shared, bool optional)
+{
+       return optional ? NULL : ERR_PTR(-ENOTSUPP);
+}
+
 static inline struct reset_control *__devm_reset_control_get(
                                        struct device *dev, const char *id,
                                        int index, bool shared, bool optional)
@@ -102,8 +112,7 @@ __must_check reset_control_get_exclusive(struct device *dev, const char *id)
 #ifndef CONFIG_RESET_CONTROLLER
        WARN_ON(1);
 #endif
-       return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, false,
-                                                                       false);
+       return __reset_control_get(dev, id, 0, false, false);
 }
 
 /**
@@ -131,22 +140,19 @@ __must_check reset_control_get_exclusive(struct device *dev, const char *id)
 static inline struct reset_control *reset_control_get_shared(
                                        struct device *dev, const char *id)
 {
-       return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, true,
-                                                                       false);
+       return __reset_control_get(dev, id, 0, true, false);
 }
 
 static inline struct reset_control *reset_control_get_optional_exclusive(
                                        struct device *dev, const char *id)
 {
-       return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, false,
-                                                                       true);
+       return __reset_control_get(dev, id, 0, false, true);
 }
 
 static inline struct reset_control *reset_control_get_optional_shared(
                                        struct device *dev, const char *id)
 {
-       return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, true,
-                                                                       true);
+       return __reset_control_get(dev, id, 0, true, true);
 }
 
 /**
index d4e0a20..a1904aa 100644 (file)
@@ -176,6 +176,25 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth);
 int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin);
 
 /**
+ * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap,
+ * limiting the depth used from each word.
+ * @sb: Bitmap to allocate from.
+ * @alloc_hint: Hint for where to start searching for a free bit.
+ * @shallow_depth: The maximum number of bits to allocate from a single word.
+ *
+ * This rather specific operation allows for having multiple users with
+ * different allocation limits. E.g., there can be a high-priority class that
+ * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow()
+ * with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority
+ * class can only allocate half of the total bits in the bitmap, preventing it
+ * from starving out the high-priority class.
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
+                       unsigned long shallow_depth);
+
+/**
  * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap.
  * @sb: Bitmap to check.
  *
@@ -326,6 +345,19 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth);
 int __sbitmap_queue_get(struct sbitmap_queue *sbq);
 
 /**
+ * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
+ * sbitmap_queue, limiting the depth used from each word, with preemption
+ * already disabled.
+ * @sbq: Bitmap queue to allocate from.
+ * @shallow_depth: The maximum number of bits to allocate from a single word.
+ * See sbitmap_get_shallow().
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
+                               unsigned int shallow_depth);
+
+/**
  * sbitmap_queue_get() - Try to allocate a free bit from a &struct
  * sbitmap_queue.
  * @sbq: Bitmap queue to allocate from.
@@ -346,6 +378,29 @@ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq,
 }
 
 /**
+ * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
+ * sbitmap_queue, limiting the depth used from each word.
+ * @sbq: Bitmap queue to allocate from.
+ * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to
+ *       sbitmap_queue_clear()).
+ * @shallow_depth: The maximum number of bits to allocate from a single word.
+ * See sbitmap_get_shallow().
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
+                                           unsigned int *cpu,
+                                           unsigned int shallow_depth)
+{
+       int nr;
+
+       *cpu = get_cpu();
+       nr = __sbitmap_queue_get_shallow(sbq, shallow_depth);
+       put_cpu();
+       return nr;
+}
+
+/**
  * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a
  * &struct sbitmap_queue.
  * @sbq: Bitmap to free from.
index d67eee8..ba080e5 100644 (file)
@@ -604,6 +604,10 @@ struct task_struct {
 #ifdef CONFIG_COMPAT_BRK
        unsigned                        brk_randomized:1;
 #endif
+#ifdef CONFIG_CGROUPS
+       /* disallow userland-initiated cgroup migration */
+       unsigned                        no_cgroup_migration:1;
+#endif
 
        unsigned long                   atomic_flags; /* Flags requiring atomic access. */
 
@@ -775,6 +779,8 @@ struct task_struct {
        /* PI waiters blocked on a rt_mutex held by this task: */
        struct rb_root                  pi_waiters;
        struct rb_node                  *pi_waiters_leftmost;
+       /* Updated under owner's pi_lock and rq lock */
+       struct task_struct              *pi_top_task;
        /* Deadlock detection and priority inheritance handling: */
        struct rt_mutex_waiter          *pi_blocked_on;
 #endif
@@ -1286,10 +1292,10 @@ TASK_PFA_TEST(LMK_WAITING, lmk_waiting)
 TASK_PFA_SET(LMK_WAITING, lmk_waiting)
 
 static inline void
-tsk_restore_flags(struct task_struct *task, unsigned long orig_flags, unsigned long flags)
+current_restore_flags(unsigned long orig_flags, unsigned long flags)
 {
-       task->flags &= ~flags;
-       task->flags |= orig_flags & flags;
+       current->flags &= ~flags;
+       current->flags |= orig_flags & flags;
 }
 
 extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
index 3bd6684..f93329a 100644 (file)
@@ -18,27 +18,20 @@ static inline int rt_task(struct task_struct *p)
 }
 
 #ifdef CONFIG_RT_MUTEXES
-extern int rt_mutex_getprio(struct task_struct *p);
-extern void rt_mutex_setprio(struct task_struct *p, int prio);
-extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
-extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
+/*
+ * Must hold either p->pi_lock or task_rq(p)->lock.
+ */
+static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p)
+{
+       return p->pi_top_task;
+}
+extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
 extern void rt_mutex_adjust_pi(struct task_struct *p);
 static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
 {
        return tsk->pi_blocked_on != NULL;
 }
 #else
-static inline int rt_mutex_getprio(struct task_struct *p)
-{
-       return p->normal_prio;
-}
-
-static inline int rt_mutex_get_effective_prio(struct task_struct *task,
-                                             int newprio)
-{
-       return newprio;
-}
-
 static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
 {
        return NULL;
index 8e0cb7a..68123c1 100644 (file)
@@ -120,6 +120,13 @@ extern unsigned int setup_max_cpus;
 extern void __init setup_nr_cpu_ids(void);
 extern void __init smp_init(void);
 
+extern int __boot_cpu_id;
+
+static inline int get_boot_cpu_id(void)
+{
+       return __boot_cpu_id;
+}
+
 #else /* !SMP */
 
 static inline void smp_send_stop(void) { }
@@ -158,6 +165,11 @@ static inline void smp_init(void) { up_late_init(); }
 static inline void smp_init(void) { }
 #endif
 
+static inline int get_boot_cpu_id(void)
+{
+       return 0;
+}
+
 #endif /* !SMP */
 
 /*
index c76e524..64b6b3a 100644 (file)
@@ -26,6 +26,7 @@ struct kstat {
        unsigned int    nlink;
        uint32_t        blksize;        /* Preferred I/O size */
        u64             attributes;
+       u64             attributes_mask;
 #define KSTAT_ATTR_FS_IOC_FLAGS                                \
        (STATX_ATTR_COMPRESSED |                        \
         STATX_ATTR_IMMUTABLE |                         \
index 9fba9dd..9375d23 100644 (file)
@@ -34,9 +34,9 @@ struct t10_pi_tuple {
 };
 
 
-extern struct blk_integrity_profile t10_pi_type1_crc;
-extern struct blk_integrity_profile t10_pi_type1_ip;
-extern struct blk_integrity_profile t10_pi_type3_crc;
-extern struct blk_integrity_profile t10_pi_type3_ip;
+extern const struct blk_integrity_profile t10_pi_type1_crc;
+extern const struct blk_integrity_profile t10_pi_type1_ip;
+extern const struct blk_integrity_profile t10_pi_type3_crc;
+extern const struct blk_integrity_profile t10_pi_type3_ip;
 
 #endif
index 5837387..55125d6 100644 (file)
@@ -101,6 +101,10 @@ static inline void check_object_size(const void *ptr, unsigned long n,
 { }
 #endif /* CONFIG_HARDENED_USERCOPY */
 
+#ifndef arch_setup_new_exec
+static inline void arch_setup_new_exec(void) { }
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_THREAD_INFO_H */
index a04fea1..fe01e68 100644 (file)
@@ -117,6 +117,7 @@ extern void tick_nohz_idle_enter(void);
 extern void tick_nohz_idle_exit(void);
 extern void tick_nohz_irq_exit(void);
 extern ktime_t tick_nohz_get_sleep_length(void);
+extern unsigned long tick_nohz_get_idle_calls(void);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
 extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
 #else /* !CONFIG_NO_HZ_COMMON */
index b598cbc..ddc229f 100644 (file)
@@ -19,21 +19,6 @@ extern void do_gettimeofday(struct timeval *tv);
 extern int do_settimeofday64(const struct timespec64 *ts);
 extern int do_sys_settimeofday64(const struct timespec64 *tv,
                                 const struct timezone *tz);
-static inline int do_sys_settimeofday(const struct timespec *tv,
-                                     const struct timezone *tz)
-{
-       struct timespec64 ts64;
-
-       if (!tv)
-               return do_sys_settimeofday64(NULL, tz);
-
-       if (!timespec_valid(tv))
-               return -EINVAL;
-
-       ts64 = timespec_to_timespec64(*tv);
-       return do_sys_settimeofday64(&ts64, tz);
-}
-
 /*
  * Kernel time accessors
  */
@@ -273,6 +258,11 @@ static inline void timekeeping_clocktai(struct timespec *ts)
        *ts = ktime_to_timespec(ktime_get_clocktai());
 }
 
+static inline void timekeeping_clocktai64(struct timespec64 *ts)
+{
+       *ts = ktime_to_timespec64(ktime_get_clocktai());
+}
+
 /*
  * RTC specific
  */
index f30c187..e0cbfb0 100644 (file)
@@ -2,8 +2,199 @@
 #define __LINUX_UACCESS_H__
 
 #include <linux/sched.h>
+#include <linux/thread_info.h>
+#include <linux/kasan-checks.h>
+
+#define VERIFY_READ 0
+#define VERIFY_WRITE 1
+
+#define uaccess_kernel() segment_eq(get_fs(), KERNEL_DS)
+
 #include <asm/uaccess.h>
 
+/*
+ * Architectures should provide two primitives (raw_copy_{to,from}_user())
+ * and get rid of their private instances of copy_{to,from}_user() and
+ * __copy_{to,from}_user{,_inatomic}().
+ *
+ * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and
+ * return the amount left to copy.  They should assume that access_ok() has
+ * already been checked (and succeeded); they should *not* zero-pad anything.
+ * No KASAN or object size checks either - those belong here.
+ *
+ * Both of these functions should attempt to copy size bytes starting at from
+ * into the area starting at to.  They must not fetch or store anything
+ * outside of those areas.  Return value must be between 0 (everything
+ * copied successfully) and size (nothing copied).
+ *
+ * If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes starting
+ * at to must become equal to the bytes fetched from the corresponding area
+ * starting at from.  All data past to + size - N must be left unmodified.
+ *
+ * If copying succeeds, the return value must be 0.  If some data cannot be
+ * fetched, it is permitted to copy less than had been fetched; the only
+ * hard requirement is that not storing anything at all (i.e. returning size)
+ * should happen only when nothing could be copied.  In other words, you don't
+ * have to squeeze as much as possible - it is allowed, but not necessary.
+ *
+ * For raw_copy_from_user() to always points to kernel memory and no faults
+ * on store should happen.  Interpretation of from is affected by set_fs().
+ * For raw_copy_to_user() it's the other way round.
+ *
+ * Both can be inlined - it's up to architectures whether it wants to bother
+ * with that.  They should not be used directly; they are used to implement
+ * the 6 functions (copy_{to,from}_user(), __copy_{to,from}_user_inatomic())
+ * that are used instead.  Out of those, __... ones are inlined.  Plain
+ * copy_{to,from}_user() might or might not be inlined.  If you want them
+ * inlined, have asm/uaccess.h define INLINE_COPY_{TO,FROM}_USER.
+ *
+ * NOTE: only copy_from_user() zero-pads the destination in case of short copy.
+ * Neither __copy_from_user() nor __copy_from_user_inatomic() zero anything
+ * at all; their callers absolutely must check the return value.
+ *
+ * Biarch ones should also provide raw_copy_in_user() - similar to the above,
+ * but both source and destination are __user pointers (affected by set_fs()
+ * as usual) and both source and destination can trigger faults.
+ */
+
+static __always_inline unsigned long
+__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
+{
+       kasan_check_write(to, n);
+       check_object_size(to, n, false);
+       return raw_copy_from_user(to, from, n);
+}
+
+static __always_inline unsigned long
+__copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+       might_fault();
+       kasan_check_write(to, n);
+       check_object_size(to, n, false);
+       return raw_copy_from_user(to, from, n);
+}
+
+/**
+ * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking.
+ * @to:   Destination address, in user space.
+ * @from: Source address, in kernel space.
+ * @n:    Number of bytes to copy.
+ *
+ * Context: User context only.
+ *
+ * Copy data from kernel space to user space.  Caller must check
+ * the specified block with access_ok() before calling this function.
+ * The caller should also make sure he pins the user space address
+ * so that we don't result in page fault and sleep.
+ */
+static __always_inline unsigned long
+__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
+{
+       kasan_check_read(from, n);
+       check_object_size(from, n, true);
+       return raw_copy_to_user(to, from, n);
+}
+
+static __always_inline unsigned long
+__copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+       might_fault();
+       kasan_check_read(from, n);
+       check_object_size(from, n, true);
+       return raw_copy_to_user(to, from, n);
+}
+
+#ifdef INLINE_COPY_FROM_USER
+static inline unsigned long
+_copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+       unsigned long res = n;
+       if (likely(access_ok(VERIFY_READ, from, n)))
+               res = raw_copy_from_user(to, from, n);
+       if (unlikely(res))
+               memset(to + (n - res), 0, res);
+       return res;
+}
+#else
+extern unsigned long
+_copy_from_user(void *, const void __user *, unsigned long);
+#endif
+
+#ifdef INLINE_COPY_TO_USER
+static inline unsigned long
+_copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+       if (access_ok(VERIFY_WRITE, to, n))
+               n = raw_copy_to_user(to, from, n);
+       return n;
+}
+#else
+extern unsigned long
+_copy_to_user(void __user *, const void *, unsigned long);
+#endif
+
+extern void __compiletime_error("usercopy buffer size is too small")
+__bad_copy_user(void);
+
+static inline void copy_user_overflow(int size, unsigned long count)
+{
+       WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
+}
+
+static __always_inline unsigned long __must_check
+copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+       int sz = __compiletime_object_size(to);
+
+       might_fault();
+       kasan_check_write(to, n);
+
+       if (likely(sz < 0 || sz >= n)) {
+               check_object_size(to, n, false);
+               n = _copy_from_user(to, from, n);
+       } else if (!__builtin_constant_p(n))
+               copy_user_overflow(sz, n);
+       else
+               __bad_copy_user();
+
+       return n;
+}
+
+static __always_inline unsigned long __must_check
+copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+       int sz = __compiletime_object_size(from);
+
+       kasan_check_read(from, n);
+       might_fault();
+
+       if (likely(sz < 0 || sz >= n)) {
+               check_object_size(from, n, true);
+               n = _copy_to_user(to, from, n);
+       } else if (!__builtin_constant_p(n))
+               copy_user_overflow(sz, n);
+       else
+               __bad_copy_user();
+
+       return n;
+}
+#ifdef CONFIG_COMPAT
+static __always_inline unsigned long __must_check
+__copy_in_user(void __user *to, const void *from, unsigned long n)
+{
+       might_fault();
+       return raw_copy_in_user(to, from, n);
+}
+static __always_inline unsigned long __must_check
+copy_in_user(void __user *to, const void *from, unsigned long n)
+{
+       might_fault();
+       if (access_ok(VERIFY_WRITE, to, n) && access_ok(VERIFY_READ, from, n))
+               n = raw_copy_in_user(to, from, n);
+       return n;
+}
+#endif
+
 static __always_inline void pagefault_disabled_inc(void)
 {
        current->pagefault_disabled++;
@@ -67,12 +258,6 @@ static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
        return __copy_from_user_inatomic(to, from, n);
 }
 
-static inline unsigned long __copy_from_user_nocache(void *to,
-                               const void __user *from, unsigned long n)
-{
-       return __copy_from_user(to, from, n);
-}
-
 #endif         /* ARCH_HAS_NOCACHE_UACCESS */
 
 /*
index 804e34c..f2d36a3 100644 (file)
@@ -39,7 +39,10 @@ struct iov_iter {
        };
        union {
                unsigned long nr_segs;
-               int idx;
+               struct {
+                       int idx;
+                       int start_idx;
+               };
        };
 };
 
@@ -81,6 +84,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to);
 size_t iov_iter_copy_from_user_atomic(struct page *page,
                struct iov_iter *i, unsigned long offset, size_t bytes);
 void iov_iter_advance(struct iov_iter *i, size_t bytes);
+void iov_iter_revert(struct iov_iter *i, size_t bytes);
 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
 size_t iov_iter_single_seg_count(const struct iov_iter *i);
 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
index 04b0d3f..7edfbdb 100644 (file)
@@ -167,6 +167,7 @@ struct virtio_driver {
        unsigned int feature_table_size;
        const unsigned int *feature_table_legacy;
        unsigned int feature_table_size_legacy;
+       int (*validate)(struct virtio_device *dev);
        int (*probe)(struct virtio_device *dev);
        void (*scan)(struct virtio_device *dev);
        void (*remove)(struct virtio_device *dev);
index bde063c..c102ef6 100644 (file)
@@ -608,8 +608,13 @@ static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
 {
        return fn(arg);
 }
+static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
+{
+       return fn(arg);
+}
 #else
 long work_on_cpu(int cpu, long (*fn)(void *), void *arg);
+long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg);
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FREEZER
index a3c0cbd..d581579 100644 (file)
@@ -237,6 +237,7 @@ static inline void inode_attach_wb(struct inode *inode, struct page *page)
 static inline void inode_detach_wb(struct inode *inode)
 {
        if (inode->i_wb) {
+               WARN_ON_ONCE(!(inode->i_state & I_CLEAR));
                wb_put(inode->i_wb);
                inode->i_wb = NULL;
        }
index 1f71ee5..069582e 100644 (file)
@@ -448,10 +448,9 @@ static inline int sctp_frag_point(const struct sctp_association *asoc, int pmtu)
        return frag;
 }
 
-static inline void sctp_assoc_pending_pmtu(struct sock *sk, struct sctp_association *asoc)
+static inline void sctp_assoc_pending_pmtu(struct sctp_association *asoc)
 {
-
-       sctp_assoc_sync_pmtu(sk, asoc);
+       sctp_assoc_sync_pmtu(asoc);
        asoc->pmtu_pending = 0;
 }
 
@@ -596,12 +595,23 @@ static inline void sctp_v4_map_v6(union sctp_addr *addr)
  */
 static inline struct dst_entry *sctp_transport_dst_check(struct sctp_transport *t)
 {
-       if (t->dst && (!dst_check(t->dst, t->dst_cookie) ||
-                      t->pathmtu != max_t(size_t, SCTP_TRUNC4(dst_mtu(t->dst)),
-                                          SCTP_DEFAULT_MINSEGMENT)))
+       if (t->dst && !dst_check(t->dst, t->dst_cookie))
                sctp_transport_dst_release(t);
 
        return t->dst;
 }
 
+static inline bool sctp_transport_pmtu_check(struct sctp_transport *t)
+{
+       __u32 pmtu = max_t(size_t, SCTP_TRUNC4(dst_mtu(t->dst)),
+                          SCTP_DEFAULT_MINSEGMENT);
+
+       if (t->pathmtu == pmtu)
+               return true;
+
+       t->pathmtu = pmtu;
+
+       return false;
+}
+
 #endif /* __net_sctp_h__ */
index 592dece..138f861 100644 (file)
@@ -377,7 +377,8 @@ typedef struct sctp_sender_hb_info {
        __u64 hb_nonce;
 } sctp_sender_hb_info_t;
 
-struct sctp_stream *sctp_stream_new(__u16 incnt, __u16 outcnt, gfp_t gfp);
+int sctp_stream_new(struct sctp_association *asoc, gfp_t gfp);
+int sctp_stream_init(struct sctp_association *asoc, gfp_t gfp);
 void sctp_stream_free(struct sctp_stream *stream);
 void sctp_stream_clear(struct sctp_stream *stream);
 
@@ -499,7 +500,6 @@ struct sctp_datamsg {
        /* Did the messenge fail to send? */
        int send_error;
        u8 send_failed:1,
-          force_delay:1,
           can_delay;       /* should this message be Nagle delayed */
 };
 
@@ -952,8 +952,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *, sctp_lower_cwnd_t);
 void sctp_transport_burst_limited(struct sctp_transport *);
 void sctp_transport_burst_reset(struct sctp_transport *);
 unsigned long sctp_transport_timeout(struct sctp_transport *);
-void sctp_transport_reset(struct sctp_transport *);
-void sctp_transport_update_pmtu(struct sock *, struct sctp_transport *, u32);
+void sctp_transport_reset(struct sctp_transport *t);
+void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu);
 void sctp_transport_immediate_rtx(struct sctp_transport *);
 void sctp_transport_dst_release(struct sctp_transport *t);
 void sctp_transport_dst_confirm(struct sctp_transport *t);
@@ -1878,6 +1878,7 @@ struct sctp_association {
 
        __u8 need_ecne:1,       /* Need to send an ECNE Chunk? */
             temp:1,            /* Is it a temporary association? */
+            force_delay:1,
             prsctp_enable:1,
             reconf_enable:1;
 
@@ -1953,7 +1954,7 @@ void sctp_assoc_update(struct sctp_association *old,
 
 __u32 sctp_association_get_next_tsn(struct sctp_association *);
 
-void sctp_assoc_sync_pmtu(struct sock *, struct sctp_association *);
+void sctp_assoc_sync_pmtu(struct sctp_association *asoc);
 void sctp_assoc_rwnd_increase(struct sctp_association *, unsigned int);
 void sctp_assoc_rwnd_decrease(struct sctp_association *, unsigned int);
 void sctp_assoc_set_primary(struct sctp_association *,
index 9b4c22a..66dbed0 100644 (file)
@@ -100,7 +100,7 @@ struct sockaddr_ib {
  */
 static inline bool ib_safe_file_access(struct file *filp)
 {
-       return filp->f_cred == current_cred() && segment_eq(get_fs(), USER_DS);
+       return filp->f_cred == current_cred() && !uaccess_kernel();
 }
 
 #endif /* _RDMA_IB_H */
index ba0aeb9..f0c76f9 100644 (file)
@@ -9,8 +9,10 @@ struct scsi_request {
        unsigned char   __cmd[BLK_MAX_CDB];
        unsigned char   *cmd;
        unsigned short  cmd_len;
+       int             result;
        unsigned int    sense_len;
        unsigned int    resid_len;      /* residual count */
+       int             retries;
        void            *sense;
 };
 
index 4b784b6..ccfad0e 100644 (file)
@@ -117,6 +117,7 @@ enum transport_state_table {
        TRANSPORT_ISTATE_PROCESSING = 11,
        TRANSPORT_COMPLETE_QF_WP = 18,
        TRANSPORT_COMPLETE_QF_OK = 19,
+       TRANSPORT_COMPLETE_QF_ERR = 20,
 };
 
 /* Used for struct se_cmd->se_cmd_flags */
@@ -279,8 +280,6 @@ struct t10_alua_tg_pt_gp {
        u16     tg_pt_gp_id;
        int     tg_pt_gp_valid_id;
        int     tg_pt_gp_alua_supported_states;
-       int     tg_pt_gp_alua_pending_state;
-       int     tg_pt_gp_alua_previous_state;
        int     tg_pt_gp_alua_access_status;
        int     tg_pt_gp_alua_access_type;
        int     tg_pt_gp_nonop_delay_msecs;
@@ -289,18 +288,16 @@ struct t10_alua_tg_pt_gp {
        int     tg_pt_gp_pref;
        int     tg_pt_gp_write_metadata;
        u32     tg_pt_gp_members;
-       atomic_t tg_pt_gp_alua_access_state;
+       int     tg_pt_gp_alua_access_state;
        atomic_t tg_pt_gp_ref_cnt;
        spinlock_t tg_pt_gp_lock;
-       struct mutex tg_pt_gp_md_mutex;
+       struct mutex tg_pt_gp_transition_mutex;
        struct se_device *tg_pt_gp_dev;
        struct config_group tg_pt_gp_group;
        struct list_head tg_pt_gp_list;
        struct list_head tg_pt_gp_lun_list;
        struct se_lun *tg_pt_gp_alua_lun;
        struct se_node_acl *tg_pt_gp_alua_nacl;
-       struct work_struct tg_pt_gp_transition_work;
-       struct completion *tg_pt_gp_transition_complete;
 };
 
 struct t10_vpd {
@@ -705,6 +702,7 @@ struct se_lun {
        u64                     unpacked_lun;
 #define SE_LUN_LINK_MAGIC                      0xffff7771
        u32                     lun_link_magic;
+       bool                    lun_shutdown;
        bool                    lun_access_ro;
        u32                     lun_index;
 
index a88ed13..d0dbe60 100644 (file)
@@ -61,7 +61,16 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer,
        TP_ARGS(bh)
 );
 
-DECLARE_EVENT_CLASS(block_rq_with_error,
+/**
+ * block_rq_requeue - place block IO request back on a queue
+ * @q: queue holding operation
+ * @rq: block IO operation request
+ *
+ * The block operation request @rq is being placed back into queue
+ * @q.  For some reason the request was not completed and needs to be
+ * put back in the queue.
+ */
+TRACE_EVENT(block_rq_requeue,
 
        TP_PROTO(struct request_queue *q, struct request *rq),
 
@@ -71,7 +80,6 @@ DECLARE_EVENT_CLASS(block_rq_with_error,
                __field(  dev_t,        dev                     )
                __field(  sector_t,     sector                  )
                __field(  unsigned int, nr_sector               )
-               __field(  int,          errors                  )
                __array(  char,         rwbs,   RWBS_LEN        )
                __dynamic_array( char,  cmd,    1               )
        ),
@@ -80,7 +88,6 @@ DECLARE_EVENT_CLASS(block_rq_with_error,
                __entry->dev       = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
                __entry->sector    = blk_rq_trace_sector(rq);
                __entry->nr_sector = blk_rq_trace_nr_sectors(rq);
-               __entry->errors    = rq->errors;
 
                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
                __get_str(cmd)[0] = '\0';
@@ -90,46 +97,13 @@ DECLARE_EVENT_CLASS(block_rq_with_error,
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->rwbs, __get_str(cmd),
                  (unsigned long long)__entry->sector,
-                 __entry->nr_sector, __entry->errors)
-);
-
-/**
- * block_rq_abort - abort block operation request
- * @q: queue containing the block operation request
- * @rq: block IO operation request
- *
- * Called immediately after pending block IO operation request @rq in
- * queue @q is aborted. The fields in the operation request @rq
- * can be examined to determine which device and sectors the pending
- * operation would access.
- */
-DEFINE_EVENT(block_rq_with_error, block_rq_abort,
-
-       TP_PROTO(struct request_queue *q, struct request *rq),
-
-       TP_ARGS(q, rq)
-);
-
-/**
- * block_rq_requeue - place block IO request back on a queue
- * @q: queue holding operation
- * @rq: block IO operation request
- *
- * The block operation request @rq is being placed back into queue
- * @q.  For some reason the request was not completed and needs to be
- * put back in the queue.
- */
-DEFINE_EVENT(block_rq_with_error, block_rq_requeue,
-
-       TP_PROTO(struct request_queue *q, struct request *rq),
-
-       TP_ARGS(q, rq)
+                 __entry->nr_sector, 0)
 );
 
 /**
  * block_rq_complete - block IO operation completed by device driver
- * @q: queue containing the block operation request
  * @rq: block operations request
+ * @error: status code
  * @nr_bytes: number of completed bytes
  *
  * The block_rq_complete tracepoint event indicates that some portion
@@ -140,16 +114,15 @@ DEFINE_EVENT(block_rq_with_error, block_rq_requeue,
  */
 TRACE_EVENT(block_rq_complete,
 
-       TP_PROTO(struct request_queue *q, struct request *rq,
-                unsigned int nr_bytes),
+       TP_PROTO(struct request *rq, int error, unsigned int nr_bytes),
 
-       TP_ARGS(q, rq, nr_bytes),
+       TP_ARGS(rq, error, nr_bytes),
 
        TP_STRUCT__entry(
                __field(  dev_t,        dev                     )
                __field(  sector_t,     sector                  )
                __field(  unsigned int, nr_sector               )
-               __field(  int,          errors                  )
+               __field(  int,          error                   )
                __array(  char,         rwbs,   RWBS_LEN        )
                __dynamic_array( char,  cmd,    1               )
        ),
@@ -158,7 +131,7 @@ TRACE_EVENT(block_rq_complete,
                __entry->dev       = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
                __entry->sector    = blk_rq_pos(rq);
                __entry->nr_sector = nr_bytes >> 9;
-               __entry->errors    = rq->errors;
+               __entry->error     = error;
 
                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes);
                __get_str(cmd)[0] = '\0';
@@ -168,7 +141,7 @@ TRACE_EVENT(block_rq_complete,
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->rwbs, __get_str(cmd),
                  (unsigned long long)__entry->sector,
-                 __entry->nr_sector, __entry->errors)
+                 __entry->nr_sector, __entry->error)
 );
 
 DECLARE_EVENT_CLASS(block_rq,
index 9e3ef6c..ae1409f 100644 (file)
@@ -70,7 +70,7 @@ DECLARE_EVENT_CLASS(sched_wakeup_template,
        TP_fast_assign(
                memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
                __entry->pid            = p->pid;
-               __entry->prio           = p->prio;
+               __entry->prio           = p->prio; /* XXX SCHED_DEADLINE */
                __entry->success        = 1; /* rudiment, kill when possible */
                __entry->target_cpu     = task_cpu(p);
        ),
@@ -147,6 +147,7 @@ TRACE_EVENT(sched_switch,
                memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
                __entry->next_pid       = next->pid;
                __entry->next_prio      = next->prio;
+               /* XXX SCHED_DEADLINE */
        ),
 
        TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
@@ -181,7 +182,7 @@ TRACE_EVENT(sched_migrate_task,
        TP_fast_assign(
                memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
                __entry->pid            = p->pid;
-               __entry->prio           = p->prio;
+               __entry->prio           = p->prio; /* XXX SCHED_DEADLINE */
                __entry->orig_cpu       = task_cpu(p);
                __entry->dest_cpu       = dest_cpu;
        ),
@@ -206,7 +207,7 @@ DECLARE_EVENT_CLASS(sched_process_template,
        TP_fast_assign(
                memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
                __entry->pid            = p->pid;
-               __entry->prio           = p->prio;
+               __entry->prio           = p->prio; /* XXX SCHED_DEADLINE */
        ),
 
        TP_printk("comm=%s pid=%d prio=%d",
@@ -253,7 +254,7 @@ TRACE_EVENT(sched_process_wait,
        TP_fast_assign(
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
                __entry->pid            = pid_nr(pid);
-               __entry->prio           = current->prio;
+               __entry->prio           = current->prio; /* XXX SCHED_DEADLINE */
        ),
 
        TP_printk("comm=%s pid=%d prio=%d",
@@ -413,9 +414,9 @@ DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime,
  */
 TRACE_EVENT(sched_pi_setprio,
 
-       TP_PROTO(struct task_struct *tsk, int newprio),
+       TP_PROTO(struct task_struct *tsk, struct task_struct *pi_task),
 
-       TP_ARGS(tsk, newprio),
+       TP_ARGS(tsk, pi_task),
 
        TP_STRUCT__entry(
                __array( char,  comm,   TASK_COMM_LEN   )
@@ -428,7 +429,8 @@ TRACE_EVENT(sched_pi_setprio,
                memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
                __entry->pid            = tsk->pid;
                __entry->oldprio        = tsk->prio;
-               __entry->newprio        = newprio;
+               __entry->newprio        = pi_task ? pi_task->prio : tsk->prio;
+               /* XXX SCHED_DEADLINE bits missing */
        ),
 
        TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
index dd9820b..f8d9fed 100644 (file)
@@ -445,6 +445,7 @@ header-y += unistd.h
 header-y += unix_diag.h
 header-y += usbdevice_fs.h
 header-y += usbip.h
+header-y += userio.h
 header-y += utime.h
 header-y += utsname.h
 header-y += uuid.h
index cb5d1a5..9cd1de9 100644 (file)
@@ -42,7 +42,6 @@
 #define EM_TILEGX      191     /* Tilera TILE-Gx */
 #define EM_BPF         247     /* Linux BPF - in-kernel virtual machine */
 #define EM_FRV         0x5441  /* Fujitsu FR-V */
-#define EM_AVR32       0x18ad  /* Atmel AVR32 */
 
 /*
  * This is an interim value that we will use until the committee comes
index 85bbb17..d496c02 100644 (file)
@@ -35,7 +35,7 @@
 #define RTF_PREF(pref) ((pref) << 27)
 #define RTF_PREF_MASK  0x18000000
 
-#define RTF_PCPU       0x40000000
+#define RTF_PCPU       0x40000000      /* read-only: can not be set by user */
 #define RTF_LOCAL      0x80000000
 
 
index fd19f36..c8aec4b 100644 (file)
@@ -85,6 +85,10 @@ struct nvm_ioctl_create_conf {
        };
 };
 
+enum {
+       NVM_TARGET_FACTORY = 1 << 0,    /* Init target in factory mode */
+};
+
 struct nvm_ioctl_create {
        char dev[DISK_NAME_LEN];                /* open-channel SSD device */
        char tgttype[NVM_TTYPE_NAME_MAX];       /* target type name */
diff --git a/include/uapi/linux/nbd-netlink.h b/include/uapi/linux/nbd-netlink.h
new file mode 100644 (file)
index 0000000..6f7ca3d
--- /dev/null
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2017 Facebook.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#ifndef _UAPILINUX_NBD_NETLINK_H
+#define _UAPILINUX_NBD_NETLINK_H
+
+#define NBD_GENL_FAMILY_NAME           "nbd"
+#define NBD_GENL_VERSION               0x1
+#define NBD_GENL_MCAST_GROUP_NAME      "nbd_mc_group"
+
+/* Configuration policy attributes, used for CONNECT */
+enum {
+       NBD_ATTR_UNSPEC,
+       NBD_ATTR_INDEX,
+       NBD_ATTR_SIZE_BYTES,
+       NBD_ATTR_BLOCK_SIZE_BYTES,
+       NBD_ATTR_TIMEOUT,
+       NBD_ATTR_SERVER_FLAGS,
+       NBD_ATTR_CLIENT_FLAGS,
+       NBD_ATTR_SOCKETS,
+       NBD_ATTR_DEAD_CONN_TIMEOUT,
+       NBD_ATTR_DEVICE_LIST,
+       __NBD_ATTR_MAX,
+};
+#define NBD_ATTR_MAX (__NBD_ATTR_MAX - 1)
+
+/*
+ * This is the format for multiple devices with NBD_ATTR_DEVICE_LIST
+ *
+ * [NBD_ATTR_DEVICE_LIST]
+ *   [NBD_DEVICE_ITEM]
+ *     [NBD_DEVICE_INDEX]
+ *     [NBD_DEVICE_CONNECTED]
+ */
+enum {
+       NBD_DEVICE_ITEM_UNSPEC,
+       NBD_DEVICE_ITEM,
+       __NBD_DEVICE_ITEM_MAX,
+};
+#define NBD_DEVICE_ITEM_MAX (__NBD_DEVICE_ITEM_MAX - 1)
+
+enum {
+       NBD_DEVICE_UNSPEC,
+       NBD_DEVICE_INDEX,
+       NBD_DEVICE_CONNECTED,
+       __NBD_DEVICE_MAX,
+};
+#define NBD_DEVICE_ATTR_MAX (__NBD_DEVICE_MAX - 1)
+
+/*
+ * This is the format for multiple sockets with NBD_ATTR_SOCKETS
+ *
+ * [NBD_ATTR_SOCKETS]
+ *   [NBD_SOCK_ITEM]
+ *     [NBD_SOCK_FD]
+ *   [NBD_SOCK_ITEM]
+ *     [NBD_SOCK_FD]
+ */
+enum {
+       NBD_SOCK_ITEM_UNSPEC,
+       NBD_SOCK_ITEM,
+       __NBD_SOCK_ITEM_MAX,
+};
+#define NBD_SOCK_ITEM_MAX (__NBD_SOCK_ITEM_MAX - 1)
+
+enum {
+       NBD_SOCK_UNSPEC,
+       NBD_SOCK_FD,
+       __NBD_SOCK_MAX,
+};
+#define NBD_SOCK_MAX (__NBD_SOCK_MAX - 1)
+
+enum {
+       NBD_CMD_UNSPEC,
+       NBD_CMD_CONNECT,
+       NBD_CMD_DISCONNECT,
+       NBD_CMD_RECONFIGURE,
+       NBD_CMD_LINK_DEAD,
+       NBD_CMD_STATUS,
+       __NBD_CMD_MAX,
+};
+#define NBD_CMD_MAX    (__NBD_CMD_MAX - 1)
+
+#endif /* _UAPILINUX_NBD_NETLINK_H */
index c91c642..155e33f 100644 (file)
@@ -37,7 +37,7 @@ enum {
        NBD_CMD_TRIM = 4
 };
 
-/* values for flags field */
+/* values for flags field, these are server interaction specific. */
 #define NBD_FLAG_HAS_FLAGS     (1 << 0) /* nbd-server supports flags */
 #define NBD_FLAG_READ_ONLY     (1 << 1) /* device is read-only */
 #define NBD_FLAG_SEND_FLUSH    (1 << 2) /* can flush writeback cache */
@@ -45,6 +45,10 @@ enum {
 #define NBD_FLAG_SEND_TRIM     (1 << 5) /* send trim/discard */
 #define NBD_FLAG_CAN_MULTI_CONN        (1 << 8)        /* Server supports multiple connections per export. */
 
+/* These are client behavior specific flags. */
+#define NBD_CFLAG_DESTROY_ON_DISCONNECT        (1 << 0) /* delete the nbd device on
+                                                   disconnect. */
+
 /* userspace doesn't need the nbd_device structure */
 
 /* These are sent over the network in the request/reply magic fields */
index c66a485..d09a9cd 100644 (file)
@@ -344,7 +344,8 @@ struct perf_event_attr {
                                use_clockid    :  1, /* use @clockid for time fields */
                                context_switch :  1, /* context switch data */
                                write_backward :  1, /* Write ring buffer from end to beginning */
-                               __reserved_1   : 36;
+                               namespaces     :  1, /* include namespaces data */
+                               __reserved_1   : 35;
 
        union {
                __u32           wakeup_events;    /* wakeup every n events */
@@ -610,6 +611,23 @@ struct perf_event_header {
        __u16   size;
 };
 
+struct perf_ns_link_info {
+       __u64   dev;
+       __u64   ino;
+};
+
+enum {
+       NET_NS_INDEX            = 0,
+       UTS_NS_INDEX            = 1,
+       IPC_NS_INDEX            = 2,
+       PID_NS_INDEX            = 3,
+       USER_NS_INDEX           = 4,
+       MNT_NS_INDEX            = 5,
+       CGROUP_NS_INDEX         = 6,
+
+       NR_NAMESPACES,          /* number of available namespaces */
+};
+
 enum perf_event_type {
 
        /*
@@ -862,6 +880,18 @@ enum perf_event_type {
         */
        PERF_RECORD_SWITCH_CPU_WIDE             = 15,
 
+       /*
+        * struct {
+        *      struct perf_event_header        header;
+        *      u32                             pid;
+        *      u32                             tid;
+        *      u64                             nr_namespaces;
+        *      { u64                           dev, inode; } [nr_namespaces];
+        *      struct sample_id                sample_id;
+        * };
+        */
+       PERF_RECORD_NAMESPACES                  = 16,
+
        PERF_RECORD_MAX,                        /* non-ABI */
 };
 
@@ -885,6 +915,7 @@ enum perf_callchain_context {
  */
 #define PERF_AUX_FLAG_TRUNCATED                0x01    /* record was truncated to fit */
 #define PERF_AUX_FLAG_OVERWRITE                0x02    /* snapshot from overwrite mode */
+#define PERF_AUX_FLAG_PARTIAL          0x04    /* record contains gaps */
 
 #define PERF_FLAG_FD_NO_GROUP          (1UL << 0)
 #define PERF_FLAG_FD_OUTPUT            (1UL << 1)
index 51a6b86..17b1030 100644 (file)
  * tv_sec holds the number of seconds before (negative) or after (positive)
  * 00:00:00 1st January 1970 UTC.
  *
- * tv_nsec holds a number of nanoseconds before (0..-999,999,999 if tv_sec is
- * negative) or after (0..999,999,999 if tv_sec is positive) the tv_sec time.
- *
- * Note that if both tv_sec and tv_nsec are non-zero, then the two values must
- * either be both positive or both negative.
+ * tv_nsec holds a number of nanoseconds (0..999,999,999) after the tv_sec time.
  *
  * __reserved is held in case we need a yet finer resolution.
  */
 struct statx_timestamp {
        __s64   tv_sec;
-       __s32   tv_nsec;
+       __u32   tv_nsec;
        __s32   __reserved;
 };
 
@@ -114,7 +110,7 @@ struct statx {
        __u64   stx_ino;        /* Inode number */
        __u64   stx_size;       /* File size */
        __u64   stx_blocks;     /* Number of 512-byte blocks allocated */
-       __u64   __spare1[1];
+       __u64   stx_attributes_mask; /* Mask to show what's supported in stx_attributes */
        /* 0x40 */
        struct statx_timestamp  stx_atime;      /* Last access time */
        struct statx_timestamp  stx_btime;      /* File creation time */
@@ -152,9 +148,10 @@ struct statx {
 #define STATX_BASIC_STATS      0x000007ffU     /* The stuff in the normal stat struct */
 #define STATX_BTIME            0x00000800U     /* Want/got stx_btime */
 #define STATX_ALL              0x00000fffU     /* All currently supported flags */
+#define STATX__RESERVED                0x80000000U     /* Reserved for future struct statx expansion */
 
 /*
- * Attributes to be found in stx_attributes
+ * Attributes to be found in stx_attributes and masked in stx_attributes_mask.
  *
  * These give information about the features or the state of a file that might
  * be of use to ordinary userspace programs such as GUIs or ls rather than
index 15b4385..90007a1 100644 (file)
@@ -79,7 +79,7 @@
  * configuration space */
 #define VIRTIO_PCI_CONFIG_OFF(msix_enabled)    ((msix_enabled) ? 24 : 20)
 /* Deprecated: please use VIRTIO_PCI_CONFIG_OFF instead */
-#define VIRTIO_PCI_CONFIG(dev) VIRTIO_PCI_CONFIG_OFF((dev)->pci_dev->msix_enabled)
+#define VIRTIO_PCI_CONFIG(dev) VIRTIO_PCI_CONFIG_OFF((dev)->msix_enabled)
 
 /* Virtio ABI version, this must match exactly */
 #define VIRTIO_PCI_ABI_VERSION         0
index 9dc46cb..064194f 100644 (file)
@@ -38,7 +38,7 @@ struct xen_memory_region {
        unsigned long n_pfns;
 };
 
-#define XEN_EXTRA_MEM_MAX_REGIONS 128 /* == E820MAX */
+#define XEN_EXTRA_MEM_MAX_REGIONS 128 /* == E820_MAX_ENTRIES_ZEROPAGE */
 
 extern __initdata
 struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS];
index 2f4964c..a871bf8 100644 (file)
@@ -160,7 +160,6 @@ static LIST_HEAD(audit_freelist);
 
 /* queue msgs to send via kauditd_task */
 static struct sk_buff_head audit_queue;
-static void kauditd_hold_skb(struct sk_buff *skb);
 /* queue msgs due to temporary unicast send problems */
 static struct sk_buff_head audit_retry_queue;
 /* queue msgs waiting for new auditd connection */
@@ -454,30 +453,6 @@ static void auditd_set(int pid, u32 portid, struct net *net)
 }
 
 /**
- * auditd_reset - Disconnect the auditd connection
- *
- * Description:
- * Break the auditd/kauditd connection and move all the queued records into the
- * hold queue in case auditd reconnects.
- */
-static void auditd_reset(void)
-{
-       struct sk_buff *skb;
-
-       /* if it isn't already broken, break the connection */
-       rcu_read_lock();
-       if (auditd_conn.pid)
-               auditd_set(0, 0, NULL);
-       rcu_read_unlock();
-
-       /* flush all of the main and retry queues to the hold queue */
-       while ((skb = skb_dequeue(&audit_retry_queue)))
-               kauditd_hold_skb(skb);
-       while ((skb = skb_dequeue(&audit_queue)))
-               kauditd_hold_skb(skb);
-}
-
-/**
  * kauditd_print_skb - Print the audit record to the ring buffer
  * @skb: audit record
  *
@@ -505,9 +480,6 @@ static void kauditd_rehold_skb(struct sk_buff *skb)
 {
        /* put the record back in the queue at the same place */
        skb_queue_head(&audit_hold_queue, skb);
-
-       /* fail the auditd connection */
-       auditd_reset();
 }
 
 /**
@@ -544,9 +516,6 @@ static void kauditd_hold_skb(struct sk_buff *skb)
        /* we have no other options - drop the message */
        audit_log_lost("kauditd hold queue overflow");
        kfree_skb(skb);
-
-       /* fail the auditd connection */
-       auditd_reset();
 }
 
 /**
@@ -567,6 +536,30 @@ static void kauditd_retry_skb(struct sk_buff *skb)
 }
 
 /**
+ * auditd_reset - Disconnect the auditd connection
+ *
+ * Description:
+ * Break the auditd/kauditd connection and move all the queued records into the
+ * hold queue in case auditd reconnects.
+ */
+static void auditd_reset(void)
+{
+       struct sk_buff *skb;
+
+       /* if it isn't already broken, break the connection */
+       rcu_read_lock();
+       if (auditd_conn.pid)
+               auditd_set(0, 0, NULL);
+       rcu_read_unlock();
+
+       /* flush all of the main and retry queues to the hold queue */
+       while ((skb = skb_dequeue(&audit_retry_queue)))
+               kauditd_hold_skb(skb);
+       while ((skb = skb_dequeue(&audit_queue)))
+               kauditd_hold_skb(skb);
+}
+
+/**
  * auditd_send_unicast_skb - Send a record via unicast to auditd
  * @skb: audit record
  *
@@ -758,6 +751,7 @@ static int kauditd_thread(void *dummy)
                                        NULL, kauditd_rehold_skb);
                if (rc < 0) {
                        sk = NULL;
+                       auditd_reset();
                        goto main_queue;
                }
 
@@ -767,6 +761,7 @@ static int kauditd_thread(void *dummy)
                                        NULL, kauditd_hold_skb);
                if (rc < 0) {
                        sk = NULL;
+                       auditd_reset();
                        goto main_queue;
                }
 
@@ -775,16 +770,18 @@ main_queue:
                 * unicast, dump failed record sends to the retry queue; if
                 * sk == NULL due to previous failures we will just do the
                 * multicast send and move the record to the retry queue */
-               kauditd_send_queue(sk, portid, &audit_queue, 1,
-                                  kauditd_send_multicast_skb,
-                                  kauditd_retry_skb);
+               rc = kauditd_send_queue(sk, portid, &audit_queue, 1,
+                                       kauditd_send_multicast_skb,
+                                       kauditd_retry_skb);
+               if (sk == NULL || rc < 0)
+                       auditd_reset();
+               sk = NULL;
 
                /* drop our netns reference, no auditd sends past this line */
                if (net) {
                        put_net(net);
                        net = NULL;
                }
-               sk = NULL;
 
                /* we have processed all the queues so wake everyone */
                wake_up(&audit_backlog_wait);
index 0f1cf6d..0d87f8a 100644 (file)
@@ -333,13 +333,7 @@ extern u32 audit_sig_sid;
 extern int audit_filter(int msgtype, unsigned int listtype);
 
 #ifdef CONFIG_AUDITSYSCALL
-extern int __audit_signal_info(int sig, struct task_struct *t);
-static inline int audit_signal_info(int sig, struct task_struct *t)
-{
-       if (auditd_test_task(t) || (audit_signals && !audit_dummy_context()))
-               return __audit_signal_info(sig, t);
-       return 0;
-}
+extern int audit_signal_info(int sig, struct task_struct *t);
 extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
 extern struct list_head *audit_killed_trees(void);
 #else
index e59ffc7..1c23331 100644 (file)
@@ -2249,26 +2249,27 @@ void __audit_ptrace(struct task_struct *t)
  * If the audit subsystem is being terminated, record the task (pid)
  * and uid that is doing that.
  */
-int __audit_signal_info(int sig, struct task_struct *t)
+int audit_signal_info(int sig, struct task_struct *t)
 {
        struct audit_aux_data_pids *axp;
        struct task_struct *tsk = current;
        struct audit_context *ctx = tsk->audit_context;
        kuid_t uid = current_uid(), t_uid = task_uid(t);
 
-       if (auditd_test_task(t)) {
-               if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
-                       audit_sig_pid = task_tgid_nr(tsk);
-                       if (uid_valid(tsk->loginuid))
-                               audit_sig_uid = tsk->loginuid;
-                       else
-                               audit_sig_uid = uid;
-                       security_task_getsecid(tsk, &audit_sig_sid);
-               }
-               if (!audit_signals || audit_dummy_context())
-                       return 0;
+       if (auditd_test_task(t) &&
+           (sig == SIGTERM || sig == SIGHUP ||
+            sig == SIGUSR1 || sig == SIGUSR2)) {
+               audit_sig_pid = task_tgid_nr(tsk);
+               if (uid_valid(tsk->loginuid))
+                       audit_sig_uid = tsk->loginuid;
+               else
+                       audit_sig_uid = uid;
+               security_task_getsecid(tsk, &audit_sig_sid);
        }
 
+       if (!audit_signals || audit_dummy_context())
+               return 0;
+
        /* optimize the common case by putting first signal recipient directly
         * in audit_context */
        if (!ctx->target_pid) {
index f45827e..b4f1cb0 100644 (file)
@@ -1162,12 +1162,12 @@ out:
        LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
                off = IMM;
 load_word:
-               /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are
-                * only appearing in the programs where ctx ==
-                * skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
-                * == BPF_R6, bpf_convert_filter() saves it in BPF_R6,
-                * internal BPF verifier will check that BPF_R6 ==
-                * ctx.
+               /* BPF_LD + BPF_ABS and BPF_LD + BPF_IND insns are only
+                * appearing in the programs where ctx == skb
+                * (see may_access_skb() in the verifier). All programs
+                * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6,
+                * bpf_convert_filter() saves it in BPF_R6, internal BPF
+                * verifier will check that BPF_R6 == ctx.
                 *
                 * BPF_ABS and BPF_IND are wrappers of function calls,
                 * so they scratch BPF_R1-BPF_R5 registers, preserve
index 7af0dcc..821f9e8 100644 (file)
@@ -617,6 +617,14 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
                        if (insn->imm == BPF_FUNC_xdp_adjust_head)
                                prog->xdp_adjust_head = 1;
                        if (insn->imm == BPF_FUNC_tail_call) {
+                               /* If we tail call into other programs, we
+                                * cannot make any assumptions since they
+                                * can be replaced dynamically during runtime
+                                * in the program array.
+                                */
+                               prog->cb_access = 1;
+                               prog->xdp_adjust_head = 1;
+
                                /* mark bpf_tail_call as different opcode
                                 * to avoid conditional branch in
                                 * interpeter for every normal call
index 796b68d..a834068 100644 (file)
@@ -765,38 +765,56 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
        }
 }
 
-static int check_ptr_alignment(struct bpf_verifier_env *env,
-                              struct bpf_reg_state *reg, int off, int size)
+static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
+                                  int off, int size)
 {
-       if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) {
-               if (off % size != 0) {
-                       verbose("misaligned access off %d size %d\n",
-                               off, size);
-                       return -EACCES;
-               } else {
-                       return 0;
-               }
-       }
-
-       if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
-               /* misaligned access to packet is ok on x86,arm,arm64 */
-               return 0;
-
        if (reg->id && size != 1) {
-               verbose("Unknown packet alignment. Only byte-sized access allowed\n");
+               verbose("Unknown alignment. Only byte-sized access allowed in packet access.\n");
                return -EACCES;
        }
 
        /* skb->data is NET_IP_ALIGN-ed */
-       if (reg->type == PTR_TO_PACKET &&
-           (NET_IP_ALIGN + reg->off + off) % size != 0) {
+       if ((NET_IP_ALIGN + reg->off + off) % size != 0) {
                verbose("misaligned packet access off %d+%d+%d size %d\n",
                        NET_IP_ALIGN, reg->off, off, size);
                return -EACCES;
        }
+
        return 0;
 }
 
+static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
+                                  int size)
+{
+       if (size != 1) {
+               verbose("Unknown alignment. Only byte-sized access allowed in value access.\n");
+               return -EACCES;
+       }
+
+       return 0;
+}
+
+static int check_ptr_alignment(const struct bpf_reg_state *reg,
+                              int off, int size)
+{
+       switch (reg->type) {
+       case PTR_TO_PACKET:
+               return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 :
+                      check_pkt_ptr_alignment(reg, off, size);
+       case PTR_TO_MAP_VALUE_ADJ:
+               return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 :
+                      check_val_ptr_alignment(reg, size);
+       default:
+               if (off % size != 0) {
+                       verbose("misaligned access off %d size %d\n",
+                               off, size);
+                       return -EACCES;
+               }
+
+               return 0;
+       }
+}
+
 /* check whether memory at (regno + off) is accessible for t = (read | write)
  * if t==write, value_regno is a register which value is stored into memory
  * if t==read, value_regno is a register which will receive the value from memory
@@ -818,7 +836,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
        if (size < 0)
                return size;
 
-       err = check_ptr_alignment(env, reg, off, size);
+       err = check_ptr_alignment(reg, off, size);
        if (err)
                return err;
 
@@ -1925,6 +1943,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
                 * register as unknown.
                 */
                if (env->allow_ptr_leaks &&
+                   BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD &&
                    (dst_reg->type == PTR_TO_MAP_VALUE ||
                     dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
                        dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
@@ -1973,14 +1992,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 
        for (i = 0; i < MAX_BPF_REG; i++)
                if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
-                       regs[i].range = dst_reg->off;
+                       /* keep the maximum range already checked */
+                       regs[i].range = max(regs[i].range, dst_reg->off);
 
        for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
                if (state->stack_slot_type[i] != STACK_SPILL)
                        continue;
                reg = &state->spilled_regs[i / BPF_REG_SIZE];
                if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
-                       reg->range = dst_reg->off;
+                       reg->range = max(reg->range, dst_reg->off);
        }
 }
 
index 9203bfb..00f4d6b 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/kernfs.h>
 #include <linux/workqueue.h>
 #include <linux/list.h>
+#include <linux/refcount.h>
 
 /*
  * A cgroup can be associated with multiple css_sets as different tasks may
@@ -134,7 +135,7 @@ static inline void put_css_set(struct css_set *cset)
         * can see it. Similar to atomic_dec_and_lock(), but for an
         * rwlock
         */
-       if (atomic_add_unless(&cset->refcount, -1, 1))
+       if (refcount_dec_not_one(&cset->refcount))
                return;
 
        spin_lock_irqsave(&css_set_lock, flags);
@@ -147,7 +148,7 @@ static inline void put_css_set(struct css_set *cset)
  */
 static inline void get_css_set(struct css_set *cset)
 {
-       atomic_inc(&cset->refcount);
+       refcount_inc(&cset->refcount);
 }
 
 bool cgroup_ssid_enabled(int ssid);
@@ -163,7 +164,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
 
 void cgroup_free_root(struct cgroup_root *root);
 void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags);
 int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
 struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
                               struct cgroup_root *root, unsigned long magic,
index 1dc22f6..85d7515 100644 (file)
@@ -346,7 +346,7 @@ static int cgroup_task_count(const struct cgroup *cgrp)
 
        spin_lock_irq(&css_set_lock);
        list_for_each_entry(link, &cgrp->cset_links, cset_link)
-               count += atomic_read(&link->cset->refcount);
+               count += refcount_read(&link->cset->refcount);
        spin_unlock_irq(&css_set_lock);
        return count;
 }
@@ -1072,6 +1072,7 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
        struct cgroup_subsys *ss;
        struct dentry *dentry;
        int i, ret;
+       bool new_root = false;
 
        cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
 
@@ -1181,10 +1182,11 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
                ret = -ENOMEM;
                goto out_unlock;
        }
+       new_root = true;
 
        init_cgroup_root(root, &opts);
 
-       ret = cgroup_setup_root(root, opts.subsys_mask);
+       ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
        if (ret)
                cgroup_free_root(root);
 
@@ -1201,6 +1203,18 @@ out_free:
                                 CGROUP_SUPER_MAGIC, ns);
 
        /*
+        * There's a race window after we release cgroup_mutex and before
+        * allocating a superblock. Make sure a concurrent process won't
+        * be able to re-use the root during this window by delaying the
+        * initialization of root refcnt.
+        */
+       if (new_root) {
+               mutex_lock(&cgroup_mutex);
+               percpu_ref_reinit(&root->cgrp.self.refcnt);
+               mutex_unlock(&cgroup_mutex);
+       }
+
+       /*
         * If @pinned_sb, we're reusing an existing root and holding an
         * extra ref on its sb.  Mount is complete.  Put the extra ref.
         */
@@ -1286,7 +1300,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
        u64 count;
 
        rcu_read_lock();
-       count = atomic_read(&task_css_set(current)->refcount);
+       count = refcount_read(&task_css_set(current)->refcount);
        rcu_read_unlock();
        return count;
 }
index 4885132..c3c9a0e 100644 (file)
@@ -189,7 +189,7 @@ static u16 have_canfork_callback __read_mostly;
 
 /* cgroup namespace for init task */
 struct cgroup_namespace init_cgroup_ns = {
-       .count          = { .counter = 2, },
+       .count          = REFCOUNT_INIT(2),
        .user_ns        = &init_user_ns,
        .ns.ops         = &cgroupns_operations,
        .ns.inum        = PROC_CGROUP_INIT_INO,
@@ -436,7 +436,12 @@ out_unlock:
        return css;
 }
 
-static void cgroup_get(struct cgroup *cgrp)
+static void __maybe_unused cgroup_get(struct cgroup *cgrp)
+{
+       css_get(&cgrp->self);
+}
+
+static void cgroup_get_live(struct cgroup *cgrp)
 {
        WARN_ON_ONCE(cgroup_is_dead(cgrp));
        css_get(&cgrp->self);
@@ -554,7 +559,7 @@ EXPORT_SYMBOL_GPL(of_css);
  * haven't been created.
  */
 struct css_set init_css_set = {
-       .refcount               = ATOMIC_INIT(1),
+       .refcount               = REFCOUNT_INIT(1),
        .tasks                  = LIST_HEAD_INIT(init_css_set.tasks),
        .mg_tasks               = LIST_HEAD_INIT(init_css_set.mg_tasks),
        .task_iters             = LIST_HEAD_INIT(init_css_set.task_iters),
@@ -724,7 +729,7 @@ void put_css_set_locked(struct css_set *cset)
 
        lockdep_assert_held(&css_set_lock);
 
-       if (!atomic_dec_and_test(&cset->refcount))
+       if (!refcount_dec_and_test(&cset->refcount))
                return;
 
        /* This css_set is dead. unlink it and release cgroup and css refs */
@@ -932,7 +937,7 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
        list_add_tail(&link->cgrp_link, &cset->cgrp_links);
 
        if (cgroup_parent(cgrp))
-               cgroup_get(cgrp);
+               cgroup_get_live(cgrp);
 }
 
 /**
@@ -977,7 +982,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
                return NULL;
        }
 
-       atomic_set(&cset->refcount, 1);
+       refcount_set(&cset->refcount, 1);
        INIT_LIST_HEAD(&cset->tasks);
        INIT_LIST_HEAD(&cset->mg_tasks);
        INIT_LIST_HEAD(&cset->task_iters);
@@ -1640,7 +1645,7 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
                set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }
 
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
 {
        LIST_HEAD(tmp_links);
        struct cgroup *root_cgrp = &root->cgrp;
@@ -1656,8 +1661,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
        root_cgrp->id = ret;
        root_cgrp->ancestor_ids[0] = ret;
 
-       ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
-                             GFP_KERNEL);
+       ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
+                             ref_flags, GFP_KERNEL);
        if (ret)
                goto out;
 
@@ -1802,7 +1807,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                        return ERR_PTR(-EINVAL);
                }
                cgrp_dfl_visible = true;
-               cgroup_get(&cgrp_dfl_root.cgrp);
+               cgroup_get_live(&cgrp_dfl_root.cgrp);
 
                dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
                                         CGROUP2_SUPER_MAGIC, ns);
@@ -2425,11 +2430,12 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
                tsk = tsk->group_leader;
 
        /*
-        * Workqueue threads may acquire PF_NO_SETAFFINITY and become
-        * trapped in a cpuset, or RT worker may be born in a cgroup
-        * with no rt_runtime allocated.  Just say no.
+        * kthreads may acquire PF_NO_SETAFFINITY during initialization.
+        * If userland migrates such a kthread to a non-root cgroup, it can
+        * become trapped in a cpuset, or RT kthread may be born in a
+        * cgroup with no rt_runtime allocated.  Just say no.
         */
-       if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
+       if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
                ret = -EINVAL;
                goto out_unlock_rcu;
        }
@@ -2575,7 +2581,7 @@ restart:
                        if (!css || !percpu_ref_is_dying(&css->refcnt))
                                continue;
 
-                       cgroup_get(dsct);
+                       cgroup_get_live(dsct);
                        prepare_to_wait(&dsct->offline_waitq, &wait,
                                        TASK_UNINTERRUPTIBLE);
 
@@ -3946,7 +3952,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
 {
        lockdep_assert_held(&cgroup_mutex);
 
-       cgroup_get(cgrp);
+       cgroup_get_live(cgrp);
 
        memset(css, 0, sizeof(*css));
        css->cgroup = cgrp;
@@ -4122,7 +4128,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
        /* allocation complete, commit to creation */
        list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
        atomic_inc(&root->nr_cgrps);
-       cgroup_get(parent);
+       cgroup_get_live(parent);
 
        /*
         * @cgrp is now fully operational.  If something fails after this
@@ -4512,7 +4518,7 @@ int __init cgroup_init(void)
        hash_add(css_set_table, &init_css_set.hlist,
                 css_set_hash(init_css_set.subsys));
 
-       BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
+       BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));
 
        mutex_unlock(&cgroup_mutex);
 
@@ -4946,7 +4952,7 @@ struct cgroup *cgroup_get_from_path(const char *path)
        if (kn) {
                if (kernfs_type(kn) == KERNFS_DIR) {
                        cgrp = kn->priv;
-                       cgroup_get(cgrp);
+                       cgroup_get_live(cgrp);
                } else {
                        cgrp = ERR_PTR(-ENOTDIR);
                }
@@ -5026,6 +5032,11 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 
        /* Socket clone path */
        if (skcd->val) {
+               /*
+                * We might be cloning a socket which is left in an empty
+                * cgroup and the cgroup might have already been rmdir'd.
+                * Don't use cgroup_get_live().
+                */
                cgroup_get(sock_cgroup_ptr(skcd));
                return;
        }
index 0f41292..f6501f4 100644 (file)
@@ -2121,10 +2121,8 @@ int __init cpuset_init(void)
 {
        int err = 0;
 
-       if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
-               BUG();
-       if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
-               BUG();
+       BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
+       BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
 
        cpumask_setall(top_cpuset.cpus_allowed);
        nodes_setall(top_cpuset.mems_allowed);
@@ -2139,8 +2137,7 @@ int __init cpuset_init(void)
        if (err < 0)
                return err;
 
-       if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
-               BUG();
+       BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
 
        return 0;
 }
@@ -2354,7 +2351,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
                rebuild_sched_domains();
 }
 
-void cpuset_update_active_cpus(bool cpu_online)
+void cpuset_update_active_cpus(void)
 {
        /*
         * We're inside cpu hotplug critical region which usually nests
index 96d38da..66129eb 100644 (file)
@@ -31,7 +31,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
                kfree(new_ns);
                return ERR_PTR(ret);
        }
-       atomic_set(&new_ns->count, 1);
+       refcount_set(&new_ns->count, 1);
        new_ns->ns.ops = &cgroupns_operations;
        return new_ns;
 }
index 19aec5d..933bcb3 100644 (file)
@@ -108,8 +108,8 @@ COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv,
 COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
                       struct timezone __user *, tz)
 {
+       struct timespec64 new_ts;
        struct timeval user_tv;
-       struct timespec new_ts;
        struct timezone new_tz;
 
        if (tv) {
@@ -123,7 +123,7 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
                        return -EFAULT;
        }
 
-       return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
+       return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
 }
 
 static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
@@ -240,18 +240,20 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
                       struct compat_timespec __user *, rmtp)
 {
        struct timespec tu, rmt;
+       struct timespec64 tu64;
        mm_segment_t oldfs;
        long ret;
 
        if (compat_get_timespec(&tu, rqtp))
                return -EFAULT;
 
-       if (!timespec_valid(&tu))
+       tu64 = timespec_to_timespec64(tu);
+       if (!timespec64_valid(&tu64))
                return -EINVAL;
 
        oldfs = get_fs();
        set_fs(KERNEL_DS);
-       ret = hrtimer_nanosleep(&tu,
+       ret = hrtimer_nanosleep(&tu64,
                                rmtp ? (struct timespec __user *)&rmt : NULL,
                                HRTIMER_MODE_REL, CLOCK_MONOTONIC);
        set_fs(oldfs);
index 37b223e..9ae6fbe 100644 (file)
@@ -1125,6 +1125,8 @@ core_initcall(cpu_hotplug_pm_sync_init);
 
 #endif /* CONFIG_PM_SLEEP_SMP */
 
+int __boot_cpu_id;
+
 #endif /* CONFIG_SMP */
 
 /* Boot processor state steps */
@@ -1815,6 +1817,10 @@ void __init boot_cpu_init(void)
        set_cpu_active(cpu, true);
        set_cpu_present(cpu, true);
        set_cpu_possible(cpu, true);
+
+#ifdef CONFIG_SMP
+       __boot_cpu_id = cpu;
+#endif
 }
 
 /*
index ff01cba..6e75a5c 100644 (file)
@@ -48,6 +48,8 @@
 #include <linux/parser.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/mm.h>
+#include <linux/proc_ns.h>
+#include <linux/mount.h>
 
 #include "internal.h"
 
@@ -379,6 +381,7 @@ static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
+static atomic_t nr_namespaces_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
 static atomic_t nr_freq_events __read_mostly;
 static atomic_t nr_switch_events __read_mostly;
@@ -3991,6 +3994,8 @@ static void unaccount_event(struct perf_event *event)
                atomic_dec(&nr_mmap_events);
        if (event->attr.comm)
                atomic_dec(&nr_comm_events);
+       if (event->attr.namespaces)
+               atomic_dec(&nr_namespaces_events);
        if (event->attr.task)
                atomic_dec(&nr_task_events);
        if (event->attr.freq)
@@ -6491,6 +6496,7 @@ static void perf_event_task(struct task_struct *task,
 void perf_event_fork(struct task_struct *task)
 {
        perf_event_task(task, NULL, 1);
+       perf_event_namespaces(task);
 }
 
 /*
@@ -6593,6 +6599,132 @@ void perf_event_comm(struct task_struct *task, bool exec)
 }
 
 /*
+ * namespaces tracking
+ */
+
+struct perf_namespaces_event {
+       struct task_struct              *task;
+
+       struct {
+               struct perf_event_header        header;
+
+               u32                             pid;
+               u32                             tid;
+               u64                             nr_namespaces;
+               struct perf_ns_link_info        link_info[NR_NAMESPACES];
+       } event_id;
+};
+
+static int perf_event_namespaces_match(struct perf_event *event)
+{
+       return event->attr.namespaces;
+}
+
+static void perf_event_namespaces_output(struct perf_event *event,
+                                        void *data)
+{
+       struct perf_namespaces_event *namespaces_event = data;
+       struct perf_output_handle handle;
+       struct perf_sample_data sample;
+       int ret;
+
+       if (!perf_event_namespaces_match(event))
+               return;
+
+       perf_event_header__init_id(&namespaces_event->event_id.header,
+                                  &sample, event);
+       ret = perf_output_begin(&handle, event,
+                               namespaces_event->event_id.header.size);
+       if (ret)
+               return;
+
+       namespaces_event->event_id.pid = perf_event_pid(event,
+                                                       namespaces_event->task);
+       namespaces_event->event_id.tid = perf_event_tid(event,
+                                                       namespaces_event->task);
+
+       perf_output_put(&handle, namespaces_event->event_id);
+
+       perf_event__output_id_sample(event, &handle, &sample);
+
+       perf_output_end(&handle);
+}
+
+static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
+                                  struct task_struct *task,
+                                  const struct proc_ns_operations *ns_ops)
+{
+       struct path ns_path;
+       struct inode *ns_inode;
+       void *error;
+
+       error = ns_get_path(&ns_path, task, ns_ops);
+       if (!error) {
+               ns_inode = ns_path.dentry->d_inode;
+               ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
+               ns_link_info->ino = ns_inode->i_ino;
+       }
+}
+
+void perf_event_namespaces(struct task_struct *task)
+{
+       struct perf_namespaces_event namespaces_event;
+       struct perf_ns_link_info *ns_link_info;
+
+       if (!atomic_read(&nr_namespaces_events))
+               return;
+
+       namespaces_event = (struct perf_namespaces_event){
+               .task   = task,
+               .event_id  = {
+                       .header = {
+                               .type = PERF_RECORD_NAMESPACES,
+                               .misc = 0,
+                               .size = sizeof(namespaces_event.event_id),
+                       },
+                       /* .pid */
+                       /* .tid */
+                       .nr_namespaces = NR_NAMESPACES,
+                       /* .link_info[NR_NAMESPACES] */
+               },
+       };
+
+       ns_link_info = namespaces_event.event_id.link_info;
+
+       perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
+                              task, &mntns_operations);
+
+#ifdef CONFIG_USER_NS
+       perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
+                              task, &userns_operations);
+#endif
+#ifdef CONFIG_NET_NS
+       perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
+                              task, &netns_operations);
+#endif
+#ifdef CONFIG_UTS_NS
+       perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
+                              task, &utsns_operations);
+#endif
+#ifdef CONFIG_IPC_NS
+       perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
+                              task, &ipcns_operations);
+#endif
+#ifdef CONFIG_PID_NS
+       perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
+                              task, &pidns_operations);
+#endif
+#ifdef CONFIG_CGROUPS
+       perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
+                              task, &cgroupns_operations);
+#endif
+
+       perf_iterate_sb(perf_event_namespaces_output,
+                       &namespaces_event,
+                       NULL);
+}
+
+/*
  * mmap tracking
  */
 
@@ -9146,6 +9278,8 @@ static void account_event(struct perf_event *event)
                atomic_inc(&nr_mmap_events);
        if (event->attr.comm)
                atomic_inc(&nr_comm_events);
+       if (event->attr.namespaces)
+               atomic_inc(&nr_namespaces_events);
        if (event->attr.task)
                atomic_inc(&nr_task_events);
        if (event->attr.freq)
@@ -9691,6 +9825,11 @@ SYSCALL_DEFINE5(perf_event_open,
                        return -EACCES;
        }
 
+       if (attr.namespaces) {
+               if (!capable(CAP_SYS_ADMIN))
+                       return -EACCES;
+       }
+
        if (attr.freq) {
                if (attr.sample_freq > sysctl_perf_event_sample_rate)
                        return -EINVAL;
index 257fa46..2831480 100644 (file)
@@ -297,6 +297,19 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
                rb->paused = 1;
 }
 
+void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
+{
+       /*
+        * OVERWRITE is determined by perf_aux_output_end() and can't
+        * be passed in directly.
+        */
+       if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE))
+               return;
+
+       handle->aux_flags |= flags;
+}
+EXPORT_SYMBOL_GPL(perf_aux_output_flag);
+
 /*
  * This is called before hardware starts writing to the AUX area to
  * obtain an output handle and make sure there's room in the buffer.
@@ -360,6 +373,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
        handle->event = event;
        handle->head = aux_head;
        handle->size = 0;
+       handle->aux_flags = 0;
 
        /*
         * In overwrite mode, AUX data stores do not depend on aux_tail,
@@ -408,34 +422,32 @@ err:
  * of the AUX buffer management code is that after pmu::stop(), the AUX
  * transaction must be stopped and therefore drop the AUX reference count.
  */
-void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
-                        bool truncated)
+void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
 {
+       bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
        struct ring_buffer *rb = handle->rb;
-       bool wakeup = truncated;
        unsigned long aux_head;
-       u64 flags = 0;
-
-       if (truncated)
-               flags |= PERF_AUX_FLAG_TRUNCATED;
 
        /* in overwrite mode, driver provides aux_head via handle */
        if (rb->aux_overwrite) {
-               flags |= PERF_AUX_FLAG_OVERWRITE;
+               handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
 
                aux_head = handle->head;
                local_set(&rb->aux_head, aux_head);
        } else {
+               handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
+
                aux_head = local_read(&rb->aux_head);
                local_add(size, &rb->aux_head);
        }
 
-       if (size || flags) {
+       if (size || handle->aux_flags) {
                /*
                 * Only send RECORD_AUX if we have something useful to communicate
                 */
 
-               perf_event_aux_event(handle->event, aux_head, size, flags);
+               perf_event_aux_event(handle->event, aux_head, size,
+                                    handle->aux_flags);
        }
 
        aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
@@ -446,7 +458,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
        }
 
        if (wakeup) {
-               if (truncated)
+               if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
                        handle->event->pending_disable = 1;
                perf_output_wakeup(handle);
        }
index 6c463c8..3a4343c 100644 (file)
@@ -1438,6 +1438,7 @@ static void rt_mutex_init_task(struct task_struct *p)
 #ifdef CONFIG_RT_MUTEXES
        p->pi_waiters = RB_ROOT;
        p->pi_waiters_leftmost = NULL;
+       p->pi_top_task = NULL;
        p->pi_blocked_on = NULL;
 #endif
 }
@@ -2352,6 +2353,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
                }
        }
 
+       perf_event_namespaces(current);
+
 bad_unshare_cleanup_cred:
        if (new_cred)
                put_cred(new_cred);
index 45858ec..357348a 100644 (file)
@@ -802,7 +802,7 @@ static int refill_pi_state_cache(void)
        return 0;
 }
 
-static struct futex_pi_state * alloc_pi_state(void)
+static struct futex_pi_state *alloc_pi_state(void)
 {
        struct futex_pi_state *pi_state = current->pi_state_cache;
 
@@ -812,6 +812,11 @@ static struct futex_pi_state * alloc_pi_state(void)
        return pi_state;
 }
 
+static void get_pi_state(struct futex_pi_state *pi_state)
+{
+       WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
+}
+
 /*
  * Drops a reference to the pi_state object and frees or caches it
  * when the last reference is gone.
@@ -856,7 +861,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
  * Look up the task based on what TID userspace gave us.
  * We dont trust it.
  */
-static struct task_struct * futex_find_get_task(pid_t pid)
+static struct task_struct *futex_find_get_task(pid_t pid)
 {
        struct task_struct *p;
 
@@ -916,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr)
                pi_state->owner = NULL;
                raw_spin_unlock_irq(&curr->pi_lock);
 
-               rt_mutex_unlock(&pi_state->pi_mutex);
-
+               get_pi_state(pi_state);
                spin_unlock(&hb->lock);
 
+               rt_mutex_futex_unlock(&pi_state->pi_mutex);
+               put_pi_state(pi_state);
+
                raw_spin_lock_irq(&curr->pi_lock);
        }
        raw_spin_unlock_irq(&curr->pi_lock);
@@ -973,6 +980,39 @@ void exit_pi_state_list(struct task_struct *curr)
  *
  * [10] There is no transient state which leaves owner and user space
  *     TID out of sync.
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ *     hb -> futex_q, relation
+ *     futex_q -> pi_state, relation
+ *
+ *     (cannot be raw because hb can contain arbitrary amount
+ *      of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ *     {uval, pi_state}
+ *
+ *     (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ *     p->pi_state_list -> pi_state->list, relation
+ *
+ * pi_state->refcount:
+ *
+ *     pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ *   hb->lock
+ *     pi_mutex->wait_lock
+ *       p->pi_lock
+ *
  */
 
 /*
@@ -980,10 +1020,13 @@ void exit_pi_state_list(struct task_struct *curr)
  * the pi_state against the user space value. If correct, attach to
  * it.
  */
-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+                             struct futex_pi_state *pi_state,
                              struct futex_pi_state **ps)
 {
        pid_t pid = uval & FUTEX_TID_MASK;
+       u32 uval2;
+       int ret;
 
        /*
         * Userspace might have messed up non-PI and PI futexes [3]
@@ -991,9 +1034,39 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
        if (unlikely(!pi_state))
                return -EINVAL;
 
+       /*
+        * We get here with hb->lock held, and having found a
+        * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+        * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+        * which in turn means that futex_lock_pi() still has a reference on
+        * our pi_state.
+        *
+        * The waiter holding a reference on @pi_state also protects against
+        * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
+        * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
+        * free pi_state before we can take a reference ourselves.
+        */
        WARN_ON(!atomic_read(&pi_state->refcount));
 
        /*
+        * Now that we have a pi_state, we can acquire wait_lock
+        * and do the state validation.
+        */
+       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+       /*
+        * Since {uval, pi_state} is serialized by wait_lock, and our current
+        * uval was read without holding it, it can have changed. Verify it
+        * still is what we expect it to be, otherwise retry the entire
+        * operation.
+        */
+       if (get_futex_value_locked(&uval2, uaddr))
+               goto out_efault;
+
+       if (uval != uval2)
+               goto out_eagain;
+
+       /*
         * Handle the owner died case:
         */
        if (uval & FUTEX_OWNER_DIED) {
@@ -1008,11 +1081,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
                         * is not 0. Inconsistent state. [5]
                         */
                        if (pid)
-                               return -EINVAL;
+                               goto out_einval;
                        /*
                         * Take a ref on the state and return success. [4]
                         */
-                       goto out_state;
+                       goto out_attach;
                }
 
                /*
@@ -1024,14 +1097,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
                 * Take a ref on the state and return success. [6]
                 */
                if (!pid)
-                       goto out_state;
+                       goto out_attach;
        } else {
                /*
                 * If the owner died bit is not set, then the pi_state
                 * must have an owner. [7]
                 */
                if (!pi_state->owner)
-                       return -EINVAL;
+                       goto out_einval;
        }
 
        /*
@@ -1040,11 +1113,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
         * user space TID. [9/10]
         */
        if (pid != task_pid_vnr(pi_state->owner))
-               return -EINVAL;
-out_state:
-       atomic_inc(&pi_state->refcount);
+               goto out_einval;
+
+out_attach:
+       get_pi_state(pi_state);
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        *ps = pi_state;
        return 0;
+
+out_einval:
+       ret = -EINVAL;
+       goto out_error;
+
+out_eagain:
+       ret = -EAGAIN;
+       goto out_error;
+
+out_efault:
+       ret = -EFAULT;
+       goto out_error;
+
+out_error:
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+       return ret;
 }
 
 /*
@@ -1095,6 +1186,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
 
        /*
         * No existing pi state. First waiter. [2]
+        *
+        * This creates pi_state, we have hb->lock held, this means nothing can
+        * observe this state, wait_lock is irrelevant.
         */
        pi_state = alloc_pi_state();
 
@@ -1119,17 +1213,18 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
        return 0;
 }
 
-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+static int lookup_pi_state(u32 __user *uaddr, u32 uval,
+                          struct futex_hash_bucket *hb,
                           union futex_key *key, struct futex_pi_state **ps)
 {
-       struct futex_q *match = futex_top_waiter(hb, key);
+       struct futex_q *top_waiter = futex_top_waiter(hb, key);
 
        /*
         * If there is a waiter on that futex, validate it and
         * attach to the pi_state when the validation succeeds.
         */
-       if (match)
-               return attach_to_pi_state(uval, match->pi_state, ps);
+       if (top_waiter)
+               return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
 
        /*
         * We are the first waiter - try to look up the owner based on
@@ -1148,7 +1243,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
        if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
                return -EFAULT;
 
-       /*If user space value changed, let the caller retry */
+       /* If user space value changed, let the caller retry */
        return curval != uval ? -EAGAIN : 0;
 }
 
@@ -1176,7 +1271,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
                                struct task_struct *task, int set_waiters)
 {
        u32 uval, newval, vpid = task_pid_vnr(task);
-       struct futex_q *match;
+       struct futex_q *top_waiter;
        int ret;
 
        /*
@@ -1202,9 +1297,9 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
         * Lookup existing state first. If it exists, try to attach to
         * its pi_state.
         */
-       match = futex_top_waiter(hb, key);
-       if (match)
-               return attach_to_pi_state(uval, match->pi_state, ps);
+       top_waiter = futex_top_waiter(hb, key);
+       if (top_waiter)
+               return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
 
        /*
         * No waiter and user TID is 0. We are here because the
@@ -1285,50 +1380,44 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
        wake_q_add(wake_q, p);
        __unqueue_futex(q);
        /*
-        * The waiting task can free the futex_q as soon as
-        * q->lock_ptr = NULL is written, without taking any locks. A
-        * memory barrier is required here to prevent the following
-        * store to lock_ptr from getting ahead of the plist_del.
+        * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
+        * is written, without taking any locks. This is possible in the event
+        * of a spurious wakeup, for example. A memory barrier is required here
+        * to prevent the following store to lock_ptr from getting ahead of the
+        * plist_del in __unqueue_futex().
         */
-       smp_wmb();
-       q->lock_ptr = NULL;
+       smp_store_release(&q->lock_ptr, NULL);
 }
 
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
-                        struct futex_hash_bucket *hb)
+/*
+ * Caller must hold a reference on @pi_state.
+ */
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
 {
-       struct task_struct *new_owner;
-       struct futex_pi_state *pi_state = this->pi_state;
        u32 uninitialized_var(curval), newval;
+       struct task_struct *new_owner;
+       bool postunlock = false;
        DEFINE_WAKE_Q(wake_q);
-       bool deboost;
        int ret = 0;
 
-       if (!pi_state)
-               return -EINVAL;
-
-       /*
-        * If current does not own the pi_state then the futex is
-        * inconsistent and user space fiddled with the futex value.
-        */
-       if (pi_state->owner != current)
-               return -EINVAL;
-
-       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
        new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+       if (WARN_ON_ONCE(!new_owner)) {
+               /*
+                * As per the comment in futex_unlock_pi() this should not happen.
+                *
+                * When this happens, give up our locks and try again, giving
+                * the futex_lock_pi() instance time to complete, either by
+                * waiting on the rtmutex or removing itself from the futex
+                * queue.
+                */
+               ret = -EAGAIN;
+               goto out_unlock;
+       }
 
        /*
-        * It is possible that the next waiter (the one that brought
-        * this owner to the kernel) timed out and is no longer
-        * waiting on the lock.
-        */
-       if (!new_owner)
-               new_owner = this->task;
-
-       /*
-        * We pass it to the next owner. The WAITERS bit is always
-        * kept enabled while there is PI state around. We cleanup the
-        * owner died bit, because we are the owner.
+        * We pass it to the next owner. The WAITERS bit is always kept
+        * enabled while there is PI state around. We cleanup the owner
+        * died bit, because we are the owner.
         */
        newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
 
@@ -1337,6 +1426,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
 
        if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
                ret = -EFAULT;
+
        } else if (curval != uval) {
                /*
                 * If a unconditional UNLOCK_PI operation (user space did not
@@ -1349,10 +1439,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
                else
                        ret = -EINVAL;
        }
-       if (ret) {
-               raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-               return ret;
-       }
+
+       if (ret)
+               goto out_unlock;
+
+       /*
+        * This is a point of no return; once we modify the uval there is no
+        * going back and subsequent operations must not fail.
+        */
 
        raw_spin_lock(&pi_state->owner->pi_lock);
        WARN_ON(list_empty(&pi_state->list));
@@ -1365,22 +1459,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
        pi_state->owner = new_owner;
        raw_spin_unlock(&new_owner->pi_lock);
 
-       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+       postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
 
-       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+out_unlock:
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 
-       /*
-        * First unlock HB so the waiter does not spin on it once he got woken
-        * up. Second wake up the waiter before the priority is adjusted. If we
-        * deboost first (and lose our higher priority), then the task might get
-        * scheduled away before the wake up can take place.
-        */
-       spin_unlock(&hb->lock);
-       wake_up_q(&wake_q);
-       if (deboost)
-               rt_mutex_adjust_prio(current);
+       if (postunlock)
+               rt_mutex_postunlock(&wake_q);
 
-       return 0;
+       return ret;
 }
 
 /*
@@ -1826,7 +1913,7 @@ retry_private:
                         * If that call succeeds then we have pi_state and an
                         * initial refcount on it.
                         */
-                       ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
+                       ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
                }
 
                switch (ret) {
@@ -1909,7 +1996,7 @@ retry_private:
                         * refcount on the pi_state and store the pointer in
                         * the futex_q object of the waiter.
                         */
-                       atomic_inc(&pi_state->refcount);
+                       get_pi_state(pi_state);
                        this->pi_state = pi_state;
                        ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
                                                        this->rt_waiter,
@@ -2009,20 +2096,7 @@ queue_unlock(struct futex_hash_bucket *hb)
        hb_waiters_dec(hb);
 }
 
-/**
- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
- * @q: The futex_q to enqueue
- * @hb:        The destination hash bucket
- *
- * The hb->lock must be held by the caller, and is released here. A call to
- * queue_me() is typically paired with exactly one call to unqueue_me().  The
- * exceptions involve the PI related operations, which may use unqueue_me_pi()
- * or nothing if the unqueue is done as part of the wake process and the unqueue
- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
- * an example).
- */
-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
-       __releases(&hb->lock)
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 {
        int prio;
 
@@ -2039,6 +2113,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
        plist_node_init(&q->list, prio);
        plist_add(&q->list, &hb->chain);
        q->task = current;
+}
+
+/**
+ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+ * @q: The futex_q to enqueue
+ * @hb:        The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * queue_me() is typically paired with exactly one call to unqueue_me().  The
+ * exceptions involve the PI related operations, which may use unqueue_me_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+ * an example).
+ */
+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+       __releases(&hb->lock)
+{
+       __queue_me(q, hb);
        spin_unlock(&hb->lock);
 }
 
@@ -2125,10 +2217,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 {
        u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
        struct futex_pi_state *pi_state = q->pi_state;
-       struct task_struct *oldowner = pi_state->owner;
        u32 uval, uninitialized_var(curval), newval;
+       struct task_struct *oldowner;
        int ret;
 
+       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+       oldowner = pi_state->owner;
        /* Owner died? */
        if (!pi_state->owner)
                newtid |= FUTEX_OWNER_DIED;
@@ -2136,7 +2231,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
        /*
         * We are here either because we stole the rtmutex from the
         * previous highest priority waiter or we are the highest priority
-        * waiter but failed to get the rtmutex the first time.
+        * waiter but have failed to get the rtmutex the first time.
+        *
         * We have to replace the newowner TID in the user space variable.
         * This must be atomic as we have to preserve the owner died bit here.
         *
@@ -2144,17 +2240,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
         * because we can fault here. Imagine swapped out pages or a fork
         * that marked all the anonymous memory readonly for cow.
         *
-        * Modifying pi_state _before_ the user space value would
-        * leave the pi_state in an inconsistent state when we fault
-        * here, because we need to drop the hash bucket lock to
-        * handle the fault. This might be observed in the PID check
-        * in lookup_pi_state.
+        * Modifying pi_state _before_ the user space value would leave the
+        * pi_state in an inconsistent state when we fault here, because we
+        * need to drop the locks to handle the fault. This might be observed
+        * in the PID check in lookup_pi_state.
         */
 retry:
        if (get_futex_value_locked(&uval, uaddr))
                goto handle_fault;
 
-       while (1) {
+       for (;;) {
                newval = (uval & FUTEX_OWNER_DIED) | newtid;
 
                if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
@@ -2169,47 +2264,60 @@ retry:
         * itself.
         */
        if (pi_state->owner != NULL) {
-               raw_spin_lock_irq(&pi_state->owner->pi_lock);
+               raw_spin_lock(&pi_state->owner->pi_lock);
                WARN_ON(list_empty(&pi_state->list));
                list_del_init(&pi_state->list);
-               raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+               raw_spin_unlock(&pi_state->owner->pi_lock);
        }
 
        pi_state->owner = newowner;
 
-       raw_spin_lock_irq(&newowner->pi_lock);
+       raw_spin_lock(&newowner->pi_lock);
        WARN_ON(!list_empty(&pi_state->list));
        list_add(&pi_state->list, &newowner->pi_state_list);
-       raw_spin_unlock_irq(&newowner->pi_lock);
+       raw_spin_unlock(&newowner->pi_lock);
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+
        return 0;
 
        /*
-        * To handle the page fault we need to drop the hash bucket
-        * lock here. That gives the other task (either the highest priority
-        * waiter itself or the task which stole the rtmutex) the
-        * chance to try the fixup of the pi_state. So once we are
-        * back from handling the fault we need to check the pi_state
-        * after reacquiring the hash bucket lock and before trying to
-        * do another fixup. When the fixup has been done already we
-        * simply return.
+        * To handle the page fault we need to drop the locks here. That gives
+        * the other task (either the highest priority waiter itself or the
+        * task which stole the rtmutex) the chance to try the fixup of the
+        * pi_state. So once we are back from handling the fault we need to
+        * check the pi_state after reacquiring the locks and before trying to
+        * do another fixup. When the fixup has been done already we simply
+        * return.
+        *
+        * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+        * drop hb->lock since the caller owns the hb -> futex_q relation.
+        * Dropping the pi_mutex->wait_lock requires revalidating the state.
         */
 handle_fault:
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        spin_unlock(q->lock_ptr);
 
        ret = fault_in_user_writeable(uaddr);
 
        spin_lock(q->lock_ptr);
+       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 
        /*
         * Check if someone else fixed it for us:
         */
-       if (pi_state->owner != oldowner)
-               return 0;
+       if (pi_state->owner != oldowner) {
+               ret = 0;
+               goto out_unlock;
+       }
 
        if (ret)
-               return ret;
+               goto out_unlock;
 
        goto retry;
+
+out_unlock:
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+       return ret;
 }
 
 static long futex_wait_restart(struct restart_block *restart);
@@ -2231,13 +2339,16 @@ static long futex_wait_restart(struct restart_block *restart);
  */
 static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 {
-       struct task_struct *owner;
        int ret = 0;
 
        if (locked) {
                /*
                 * Got the lock. We might not be the anticipated owner if we
                 * did a lock-steal - fix up the PI-state in that case:
+                *
+                * We can safely read pi_state->owner without holding wait_lock
+                * because we now own the rt_mutex, only the owner will attempt
+                * to change it.
                 */
                if (q->pi_state->owner != current)
                        ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2245,43 +2356,15 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
        }
 
        /*
-        * Catch the rare case, where the lock was released when we were on the
-        * way back before we locked the hash bucket.
-        */
-       if (q->pi_state->owner == current) {
-               /*
-                * Try to get the rt_mutex now. This might fail as some other
-                * task acquired the rt_mutex after we removed ourself from the
-                * rt_mutex waiters list.
-                */
-               if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
-                       locked = 1;
-                       goto out;
-               }
-
-               /*
-                * pi_state is incorrect, some other task did a lock steal and
-                * we returned due to timeout or signal without taking the
-                * rt_mutex. Too late.
-                */
-               raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
-               owner = rt_mutex_owner(&q->pi_state->pi_mutex);
-               if (!owner)
-                       owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
-               raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
-               ret = fixup_pi_state_owner(uaddr, q, owner);
-               goto out;
-       }
-
-       /*
         * Paranoia check. If we did not take the lock, then we should not be
         * the owner of the rt_mutex.
         */
-       if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
+       if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
                printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
                                "pi-state %p\n", ret,
                                q->pi_state->pi_mutex.owner,
                                q->pi_state->owner);
+       }
 
 out:
        return ret ? ret : locked;
@@ -2505,6 +2588,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
                         ktime_t *time, int trylock)
 {
        struct hrtimer_sleeper timeout, *to = NULL;
+       struct futex_pi_state *pi_state = NULL;
+       struct rt_mutex_waiter rt_waiter;
        struct futex_hash_bucket *hb;
        struct futex_q q = futex_q_init;
        int res, ret;
@@ -2557,25 +2642,68 @@ retry_private:
                }
        }
 
+       WARN_ON(!q.pi_state);
+
        /*
         * Only actually queue now that the atomic ops are done:
         */
-       queue_me(&q, hb);
+       __queue_me(&q, hb);
 
-       WARN_ON(!q.pi_state);
-       /*
-        * Block on the PI mutex:
-        */
-       if (!trylock) {
-               ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
-       } else {
-               ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+       if (trylock) {
+               ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
                /* Fixup the trylock return value: */
                ret = ret ? 0 : -EWOULDBLOCK;
+               goto no_block;
        }
 
+       rt_mutex_init_waiter(&rt_waiter);
+
+       /*
+        * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
+        * hold it while doing rt_mutex_start_proxy(), because then it will
+        * include hb->lock in the blocking chain, even though we'll not in
+        * fact hold it while blocking. This will lead it to report -EDEADLK
+        * and BUG when futex_unlock_pi() interleaves with this.
+        *
+        * Therefore acquire wait_lock while holding hb->lock, but drop the
+        * latter before calling rt_mutex_start_proxy_lock(). This still fully
+        * serializes against futex_unlock_pi() as that does the exact same
+        * lock handoff sequence.
+        */
+       raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+       spin_unlock(q.lock_ptr);
+       ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+       raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+
+       if (ret) {
+               if (ret == 1)
+                       ret = 0;
+
+               spin_lock(q.lock_ptr);
+               goto no_block;
+       }
+
+
+       if (unlikely(to))
+               hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+
+       ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+
        spin_lock(q.lock_ptr);
        /*
+        * If we failed to acquire the lock (signal/timeout), we must
+        * first acquire the hb->lock before removing the lock from the
+        * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
+        * wait lists consistent.
+        *
+        * In particular; it is important that futex_unlock_pi() can not
+        * observe this inconsistency.
+        */
+       if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+               ret = 0;
+
+no_block:
+       /*
         * Fixup the pi_state owner and possibly acquire the lock if we
         * haven't already.
         */
@@ -2591,12 +2719,19 @@ retry_private:
         * If fixup_owner() faulted and was unable to handle the fault, unlock
         * it and return the fault to userspace.
         */
-       if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
-               rt_mutex_unlock(&q.pi_state->pi_mutex);
+       if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
+               pi_state = q.pi_state;
+               get_pi_state(pi_state);
+       }
 
        /* Unqueue and drop the lock */
        unqueue_me_pi(&q);
 
+       if (pi_state) {
+               rt_mutex_futex_unlock(&pi_state->pi_mutex);
+               put_pi_state(pi_state);
+       }
+
        goto out_put_key;
 
 out_unlock_put_key:
@@ -2605,8 +2740,10 @@ out_unlock_put_key:
 out_put_key:
        put_futex_key(&q.key);
 out:
-       if (to)
+       if (to) {
+               hrtimer_cancel(&to->timer);
                destroy_hrtimer_on_stack(&to->timer);
+       }
        return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
 uaddr_faulted:
@@ -2633,7 +2770,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
        u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
        union futex_key key = FUTEX_KEY_INIT;
        struct futex_hash_bucket *hb;
-       struct futex_q *match;
+       struct futex_q *top_waiter;
        int ret;
 
 retry:
@@ -2657,12 +2794,37 @@ retry:
         * all and we at least want to know if user space fiddled
         * with the futex value instead of blindly unlocking.
         */
-       match = futex_top_waiter(hb, &key);
-       if (match) {
-               ret = wake_futex_pi(uaddr, uval, match, hb);
+       top_waiter = futex_top_waiter(hb, &key);
+       if (top_waiter) {
+               struct futex_pi_state *pi_state = top_waiter->pi_state;
+
+               ret = -EINVAL;
+               if (!pi_state)
+                       goto out_unlock;
+
                /*
-                * In case of success wake_futex_pi dropped the hash
-                * bucket lock.
+                * If current does not own the pi_state then the futex is
+                * inconsistent and user space fiddled with the futex value.
+                */
+               if (pi_state->owner != current)
+                       goto out_unlock;
+
+               get_pi_state(pi_state);
+               /*
+                * By taking wait_lock while still holding hb->lock, we ensure
+                * there is no point where we hold neither; and therefore
+                * wake_futex_pi() must observe a state consistent with what we
+                * observed.
+                */
+               raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+               spin_unlock(&hb->lock);
+
+               ret = wake_futex_pi(uaddr, uval, pi_state);
+
+               put_pi_state(pi_state);
+
+               /*
+                * Success, we're done! No tricky corner cases.
                 */
                if (!ret)
                        goto out_putkey;
@@ -2677,7 +2839,6 @@ retry:
                 * setting the FUTEX_WAITERS bit. Try again.
                 */
                if (ret == -EAGAIN) {
-                       spin_unlock(&hb->lock);
                        put_futex_key(&key);
                        goto retry;
                }
@@ -2685,7 +2846,7 @@ retry:
                 * wake_futex_pi has detected invalid state. Tell user
                 * space.
                 */
-               goto out_unlock;
+               goto out_putkey;
        }
 
        /*
@@ -2695,8 +2856,10 @@ retry:
         * preserve the WAITERS bit not the OWNER_DIED one. We are the
         * owner.
         */
-       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
+       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
+               spin_unlock(&hb->lock);
                goto pi_faulted;
+       }
 
        /*
         * If uval has changed, let user space handle it.
@@ -2710,7 +2873,6 @@ out_putkey:
        return ret;
 
 pi_faulted:
-       spin_unlock(&hb->lock);
        put_futex_key(&key);
 
        ret = fault_in_user_writeable(uaddr);
@@ -2814,6 +2976,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                                 u32 __user *uaddr2)
 {
        struct hrtimer_sleeper timeout, *to = NULL;
+       struct futex_pi_state *pi_state = NULL;
        struct rt_mutex_waiter rt_waiter;
        struct futex_hash_bucket *hb;
        union futex_key key2 = FUTEX_KEY_INIT;
@@ -2840,10 +3003,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
         * The waiter is allocated on our stack, manipulated by the requeue
         * code while we sleep on uaddr.
         */
-       debug_rt_mutex_init_waiter(&rt_waiter);
-       RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
-       RB_CLEAR_NODE(&rt_waiter.tree_entry);
-       rt_waiter.task = NULL;
+       rt_mutex_init_waiter(&rt_waiter);
 
        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
        if (unlikely(ret != 0))
@@ -2898,8 +3058,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                if (q.pi_state && (q.pi_state->owner != current)) {
                        spin_lock(q.lock_ptr);
                        ret = fixup_pi_state_owner(uaddr2, &q, current);
-                       if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
-                               rt_mutex_unlock(&q.pi_state->pi_mutex);
+                       if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+                               pi_state = q.pi_state;
+                               get_pi_state(pi_state);
+                       }
                        /*
                         * Drop the reference to the pi state which
                         * the requeue_pi() code acquired for us.
@@ -2917,10 +3079,13 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                 */
                WARN_ON(!q.pi_state);
                pi_mutex = &q.pi_state->pi_mutex;
-               ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
-               debug_rt_mutex_free_waiter(&rt_waiter);
+               ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
 
                spin_lock(q.lock_ptr);
+               if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
+                       ret = 0;
+
+               debug_rt_mutex_free_waiter(&rt_waiter);
                /*
                 * Fixup the pi_state owner and possibly acquire the lock if we
                 * haven't already.
@@ -2938,13 +3103,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                 * the fault, unlock the rt_mutex and return the fault to
                 * userspace.
                 */
-               if (ret && rt_mutex_owner(pi_mutex) == current)
-                       rt_mutex_unlock(pi_mutex);
+               if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+                       pi_state = q.pi_state;
+                       get_pi_state(pi_state);
+               }
 
                /* Unqueue and drop the lock. */
                unqueue_me_pi(&q);
        }
 
+       if (pi_state) {
+               rt_mutex_futex_unlock(&pi_state->pi_mutex);
+               put_pi_state(pi_state);
+       }
+
        if (ret == -EINTR) {
                /*
                 * We've already been requeued, but cannot restart by calling
index 4544b11..e2d356d 100644 (file)
@@ -59,7 +59,7 @@ static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk)
 struct cpumask *
 irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 {
-       int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec;
+       int n, nodes, cpus_per_vec, extra_vecs, curvec;
        int affv = nvecs - affd->pre_vectors - affd->post_vectors;
        int last_affv = affv + affd->pre_vectors;
        nodemask_t nodemsk = NODE_MASK_NONE;
@@ -94,19 +94,21 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
                goto done;
        }
 
-       /* Spread the vectors per node */
-       vecs_per_node = affv / nodes;
-       /* Account for rounding errors */
-       extra_vecs = affv - (nodes * vecs_per_node);
-
        for_each_node_mask(n, nodemsk) {
-               int ncpus, v, vecs_to_assign = vecs_per_node;
+               int ncpus, v, vecs_to_assign, vecs_per_node;
+
+               /* Spread the vectors per node */
+               vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
 
                /* Get the cpus on this node which are in the mask */
                cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n));
 
                /* Calculate the number of cpus per vector */
                ncpus = cpumask_weight(nmsk);
+               vecs_to_assign = min(vecs_per_node, ncpus);
+
+               /* Account for rounding errors */
+               extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign);
 
                for (v = 0; curvec < last_affv && v < vecs_to_assign;
                     curvec++, v++) {
@@ -115,14 +117,14 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
                        /* Account for extra vectors to compensate rounding errors */
                        if (extra_vecs) {
                                cpus_per_vec++;
-                               if (!--extra_vecs)
-                                       vecs_per_node++;
+                               --extra_vecs;
                        }
                        irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);
                }
 
                if (curvec >= last_affv)
                        break;
+               --nodes;
        }
 
 done:
index be3c34e..686be4b 100644 (file)
@@ -348,7 +348,10 @@ void handle_nested_irq(unsigned int irq)
        irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
        raw_spin_unlock_irq(&desc->lock);
 
-       action_ret = action->thread_fn(action->irq, action->dev_id);
+       action_ret = IRQ_NONE;
+       for_each_action_of_desc(desc, action)
+               action_ret |= action->thread_fn(action->irq, action->dev_id);
+
        if (!noirqdebug)
                note_interrupt(desc, action_ret);
 
index a4afe5c..ae1c90f 100644 (file)
@@ -852,7 +852,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
         * This code is triggered unconditionally. Check the affinity
         * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
         */
-       if (desc->irq_common_data.affinity)
+       if (cpumask_available(desc->irq_common_data.affinity))
                cpumask_copy(mask, desc->irq_common_data.affinity);
        else
                valid = false;
@@ -1212,8 +1212,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
                 * set the trigger type must match. Also all must
                 * agree on ONESHOT.
                 */
+               unsigned int oldtype = irqd_get_trigger_type(&desc->irq_data);
+
                if (!((old->flags & new->flags) & IRQF_SHARED) ||
-                   ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
+                   (oldtype != (new->flags & IRQF_TRIGGER_MASK)) ||
                    ((old->flags ^ new->flags) & IRQF_ONESHOT))
                        goto mismatch;
 
index 699c5bc..d733479 100644 (file)
@@ -1391,21 +1391,19 @@ bool within_kprobe_blacklist(unsigned long addr)
  * This returns encoded errors if it fails to look up symbol or invalid
  * combination of parameters.
  */
-static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
+static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
+                       const char *symbol_name, unsigned int offset)
 {
-       kprobe_opcode_t *addr = p->addr;
-
-       if ((p->symbol_name && p->addr) ||
-           (!p->symbol_name && !p->addr))
+       if ((symbol_name && addr) || (!symbol_name && !addr))
                goto invalid;
 
-       if (p->symbol_name) {
-               kprobe_lookup_name(p->symbol_name, addr);
+       if (symbol_name) {
+               kprobe_lookup_name(symbol_name, addr);
                if (!addr)
                        return ERR_PTR(-ENOENT);
        }
 
-       addr = (kprobe_opcode_t *)(((char *)addr) + p->offset);
+       addr = (kprobe_opcode_t *)(((char *)addr) + offset);
        if (addr)
                return addr;
 
@@ -1413,6 +1411,11 @@ invalid:
        return ERR_PTR(-EINVAL);
 }
 
+static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
+{
+       return _kprobe_addr(p->addr, p->symbol_name, p->offset);
+}
+
 /* Check passed kprobe is valid and return kprobe in kprobe_table. */
 static struct kprobe *__get_valid_kprobe(struct kprobe *p)
 {
@@ -1740,11 +1743,12 @@ void unregister_kprobes(struct kprobe **kps, int num)
 }
 EXPORT_SYMBOL_GPL(unregister_kprobes);
 
-int __weak __kprobes kprobe_exceptions_notify(struct notifier_block *self,
-                                             unsigned long val, void *data)
+int __weak kprobe_exceptions_notify(struct notifier_block *self,
+                                       unsigned long val, void *data)
 {
        return NOTIFY_DONE;
 }
+NOKPROBE_SYMBOL(kprobe_exceptions_notify);
 
 static struct notifier_block kprobe_exceptions_nb = {
        .notifier_call = kprobe_exceptions_notify,
@@ -1875,6 +1879,25 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(pre_handler_kretprobe);
 
+bool __weak arch_function_offset_within_entry(unsigned long offset)
+{
+       return !offset;
+}
+
+bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
+{
+       kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
+
+       if (IS_ERR(kp_addr))
+               return false;
+
+       if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) ||
+                                               !arch_function_offset_within_entry(offset))
+               return false;
+
+       return true;
+}
+
 int register_kretprobe(struct kretprobe *rp)
 {
        int ret = 0;
@@ -1882,6 +1905,9 @@ int register_kretprobe(struct kretprobe *rp)
        int i;
        void *addr;
 
+       if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
+               return -EINVAL;
+
        if (kretprobe_blacklist_size) {
                addr = kprobe_addr(&rp->kp);
                if (IS_ERR(addr))
index 2f26ade..26db528 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/freezer.h>
 #include <linux/ptrace.h>
 #include <linux/uaccess.h>
+#include <linux/cgroup.h>
 #include <trace/events/sched.h>
 
 static DEFINE_SPINLOCK(kthread_create_lock);
@@ -225,6 +226,7 @@ static int kthread(void *_create)
 
        ret = -EINTR;
        if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
+               cgroup_kthread_ready();
                __kthread_parkme(self);
                ret = threadfn(data);
        }
@@ -538,6 +540,7 @@ int kthreadd(void *unused)
        set_mems_allowed(node_states[N_MEMORY]);
 
        current->flags |= PF_NOFREEZE;
+       cgroup_init_kthreadd();
 
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
index a95e5d1..98dd623 100644 (file)
@@ -660,6 +660,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
        struct lockdep_subclass_key *key;
        struct hlist_head *hash_head;
        struct lock_class *class;
+       bool is_static = false;
 
        if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
                debug_locks_off();
@@ -673,10 +674,23 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 
        /*
         * Static locks do not have their class-keys yet - for them the key
-        * is the lock object itself:
+        * is the lock object itself. If the lock is in the per cpu area,
+        * the canonical address of the lock (per cpu offset removed) is
+        * used.
         */
-       if (unlikely(!lock->key))
-               lock->key = (void *)lock;
+       if (unlikely(!lock->key)) {
+               unsigned long can_addr, addr = (unsigned long)lock;
+
+               if (__is_kernel_percpu_address(addr, &can_addr))
+                       lock->key = (void *)can_addr;
+               else if (__is_module_percpu_address(addr, &can_addr))
+                       lock->key = (void *)can_addr;
+               else if (static_obj(lock))
+                       lock->key = (void *)lock;
+               else
+                       return ERR_PTR(-EINVAL);
+               is_static = true;
+       }
 
        /*
         * NOTE: the class-key must be unique. For dynamic locks, a static
@@ -708,7 +722,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
                }
        }
 
-       return NULL;
+       return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
 }
 
 /*
@@ -726,19 +740,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
        DEBUG_LOCKS_WARN_ON(!irqs_disabled());
 
        class = look_up_lock_class(lock, subclass);
-       if (likely(class))
+       if (likely(!IS_ERR_OR_NULL(class)))
                goto out_set_class_cache;
 
        /*
         * Debug-check: all keys must be persistent!
-        */
-       if (!static_obj(lock->key)) {
+        */
+       if (IS_ERR(class)) {
                debug_locks_off();
                printk("INFO: trying to register non-static key.\n");
                printk("the code is fine but needs lockdep annotation.\n");
                printk("turning off the locking correctness validator.\n");
                dump_stack();
-
                return NULL;
        }
 
@@ -3419,7 +3432,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
                 * Clearly if the lock hasn't been acquired _ever_, we're not
                 * holding it either, so report failure.
                 */
-               if (!class)
+               if (IS_ERR_OR_NULL(class))
                        return 0;
 
                /*
@@ -3437,13 +3450,67 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
        return 0;
 }
 
+/* @depth must not be zero */
+static struct held_lock *find_held_lock(struct task_struct *curr,
+                                       struct lockdep_map *lock,
+                                       unsigned int depth, int *idx)
+{
+       struct held_lock *ret, *hlock, *prev_hlock;
+       int i;
+
+       i = depth - 1;
+       hlock = curr->held_locks + i;
+       ret = hlock;
+       if (match_held_lock(hlock, lock))
+               goto out;
+
+       ret = NULL;
+       for (i--, prev_hlock = hlock--;
+            i >= 0;
+            i--, prev_hlock = hlock--) {
+               /*
+                * We must not cross into another context:
+                */
+               if (prev_hlock->irq_context != hlock->irq_context) {
+                       ret = NULL;
+                       break;
+               }
+               if (match_held_lock(hlock, lock)) {
+                       ret = hlock;
+                       break;
+               }
+       }
+
+out:
+       *idx = i;
+       return ret;
+}
+
+static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
+                             int idx)
+{
+       struct held_lock *hlock;
+
+       for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) {
+               if (!__lock_acquire(hlock->instance,
+                                   hlock_class(hlock)->subclass,
+                                   hlock->trylock,
+                                   hlock->read, hlock->check,
+                                   hlock->hardirqs_off,
+                                   hlock->nest_lock, hlock->acquire_ip,
+                                   hlock->references, hlock->pin_count))
+                       return 1;
+       }
+       return 0;
+}
+
 static int
 __lock_set_class(struct lockdep_map *lock, const char *name,
                 struct lock_class_key *key, unsigned int subclass,
                 unsigned long ip)
 {
        struct task_struct *curr = current;
-       struct held_lock *hlock, *prev_hlock;
+       struct held_lock *hlock;
        struct lock_class *class;
        unsigned int depth;
        int i;
@@ -3456,21 +3523,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
        if (DEBUG_LOCKS_WARN_ON(!depth))
                return 0;
 
-       prev_hlock = NULL;
-       for (i = depth-1; i >= 0; i--) {
-               hlock = curr->held_locks + i;
-               /*
-                * We must not cross into another context:
-                */
-               if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
-                       break;
-               if (match_held_lock(hlock, lock))
-                       goto found_it;
-               prev_hlock = hlock;
-       }
-       return print_unlock_imbalance_bug(curr, lock, ip);
+       hlock = find_held_lock(curr, lock, depth, &i);
+       if (!hlock)
+               return print_unlock_imbalance_bug(curr, lock, ip);
 
-found_it:
        lockdep_init_map(lock, name, key, 0);
        class = register_lock_class(lock, subclass, 0);
        hlock->class_idx = class - lock_classes + 1;
@@ -3478,15 +3534,46 @@ found_it:
        curr->lockdep_depth = i;
        curr->curr_chain_key = hlock->prev_chain_key;
 
-       for (; i < depth; i++) {
-               hlock = curr->held_locks + i;
-               if (!__lock_acquire(hlock->instance,
-                       hlock_class(hlock)->subclass, hlock->trylock,
-                               hlock->read, hlock->check, hlock->hardirqs_off,
-                               hlock->nest_lock, hlock->acquire_ip,
-                               hlock->references, hlock->pin_count))
-                       return 0;
-       }
+       if (reacquire_held_locks(curr, depth, i))
+               return 0;
+
+       /*
+        * I took it apart and put it back together again, except now I have
+        * these 'spare' parts.. where shall I put them.
+        */
+       if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+               return 0;
+       return 1;
+}
+
+static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
+{
+       struct task_struct *curr = current;
+       struct held_lock *hlock;
+       unsigned int depth;
+       int i;
+
+       depth = curr->lockdep_depth;
+       /*
+        * This function is about downgrading a held write lock to a read
+        * lock, yet we're not actually holding any locks. Naughty user!
+        */
+       if (DEBUG_LOCKS_WARN_ON(!depth))
+               return 0;
+
+       hlock = find_held_lock(curr, lock, depth, &i);
+       if (!hlock)
+               return print_unlock_imbalance_bug(curr, lock, ip);
+
+       curr->lockdep_depth = i;
+       curr->curr_chain_key = hlock->prev_chain_key;
+
+       WARN(hlock->read, "downgrading a read lock");
+       hlock->read = 1;
+       hlock->acquire_ip = ip;
+
+       if (reacquire_held_locks(curr, depth, i))
+               return 0;
 
        /*
         * I took it apart and put it back together again, except now I have
@@ -3508,7 +3595,7 @@ static int
 __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
 {
        struct task_struct *curr = current;
-       struct held_lock *hlock, *prev_hlock;
+       struct held_lock *hlock;
        unsigned int depth;
        int i;
 
@@ -3527,21 +3614,10 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
         * Check whether the lock exists in the current stack
         * of held locks:
         */
-       prev_hlock = NULL;
-       for (i = depth-1; i >= 0; i--) {
-               hlock = curr->held_locks + i;
-               /*
-                * We must not cross into another context:
-                */
-               if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
-                       break;
-               if (match_held_lock(hlock, lock))
-                       goto found_it;
-               prev_hlock = hlock;
-       }
-       return print_unlock_imbalance_bug(curr, lock, ip);
+       hlock = find_held_lock(curr, lock, depth, &i);
+       if (!hlock)
+               return print_unlock_imbalance_bug(curr, lock, ip);
 
-found_it:
        if (hlock->instance == lock)
                lock_release_holdtime(hlock);
 
@@ -3568,15 +3644,8 @@ found_it:
        curr->lockdep_depth = i;
        curr->curr_chain_key = hlock->prev_chain_key;
 
-       for (i++; i < depth; i++) {
-               hlock = curr->held_locks + i;
-               if (!__lock_acquire(hlock->instance,
-                       hlock_class(hlock)->subclass, hlock->trylock,
-                               hlock->read, hlock->check, hlock->hardirqs_off,
-                               hlock->nest_lock, hlock->acquire_ip,
-                               hlock->references, hlock->pin_count))
-                       return 0;
-       }
+       if (reacquire_held_locks(curr, depth, i + 1))
+               return 0;
 
        /*
         * We had N bottles of beer on the wall, we drank one, but now
@@ -3741,6 +3810,23 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
 }
 EXPORT_SYMBOL_GPL(lock_set_class);
 
+void lock_downgrade(struct lockdep_map *lock, unsigned long ip)
+{
+       unsigned long flags;
+
+       if (unlikely(current->lockdep_recursion))
+               return;
+
+       raw_local_irq_save(flags);
+       current->lockdep_recursion = 1;
+       check_flags(flags);
+       if (__lock_downgrade(lock, ip))
+               check_chain_key(current);
+       current->lockdep_recursion = 0;
+       raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_downgrade);
+
 /*
  * We are not always called with irqs disabled - do that here,
  * and also avoid lockdep recursion:
@@ -3903,7 +3989,7 @@ static void
 __lock_contended(struct lockdep_map *lock, unsigned long ip)
 {
        struct task_struct *curr = current;
-       struct held_lock *hlock, *prev_hlock;
+       struct held_lock *hlock;
        struct lock_class_stats *stats;
        unsigned int depth;
        int i, contention_point, contending_point;
@@ -3916,22 +4002,12 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
        if (DEBUG_LOCKS_WARN_ON(!depth))
                return;
 
-       prev_hlock = NULL;
-       for (i = depth-1; i >= 0; i--) {
-               hlock = curr->held_locks + i;
-               /*
-                * We must not cross into another context:
-                */
-               if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
-                       break;
-               if (match_held_lock(hlock, lock))
-                       goto found_it;
-               prev_hlock = hlock;
+       hlock = find_held_lock(curr, lock, depth, &i);
+       if (!hlock) {
+               print_lock_contention_bug(curr, lock, ip);
+               return;
        }
-       print_lock_contention_bug(curr, lock, ip);
-       return;
 
-found_it:
        if (hlock->instance != lock)
                return;
 
@@ -3955,7 +4031,7 @@ static void
 __lock_acquired(struct lockdep_map *lock, unsigned long ip)
 {
        struct task_struct *curr = current;
-       struct held_lock *hlock, *prev_hlock;
+       struct held_lock *hlock;
        struct lock_class_stats *stats;
        unsigned int depth;
        u64 now, waittime = 0;
@@ -3969,22 +4045,12 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
        if (DEBUG_LOCKS_WARN_ON(!depth))
                return;
 
-       prev_hlock = NULL;
-       for (i = depth-1; i >= 0; i--) {
-               hlock = curr->held_locks + i;
-               /*
-                * We must not cross into another context:
-                */
-               if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
-                       break;
-               if (match_held_lock(hlock, lock))
-                       goto found_it;
-               prev_hlock = hlock;
+       hlock = find_held_lock(curr, lock, depth, &i);
+       if (!hlock) {
+               print_lock_contention_bug(curr, lock, _RET_IP_);
+               return;
        }
-       print_lock_contention_bug(curr, lock, _RET_IP_);
-       return;
 
-found_it:
        if (hlock->instance != lock)
                return;
 
@@ -4172,7 +4238,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
                 * If the class exists we look it up and zap it:
                 */
                class = look_up_lock_class(lock, j);
-               if (class)
+               if (!IS_ERR_OR_NULL(class))
                        zap_class(class);
        }
        /*
index c2b8849..c08fbd2 100644 (file)
@@ -46,13 +46,13 @@ enum {
                (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
 
 /*
- * CONFIG_PROVE_LOCKING_SMALL is defined for sparc. Sparc requires .text,
+ * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text,
  * .data and .bss to fit in required 32MB limit for the kernel. With
- * PROVE_LOCKING we could go over this limit and cause system boot-up problems.
+ * CONFIG_LOCKDEP we could go over this limit and cause system boot-up problems.
  * So, reduce the static allocations for lockdeps related structures so that
  * everything fits in current required size limit.
  */
-#ifdef CONFIG_PROVE_LOCKING_SMALL
+#ifdef CONFIG_LOCKDEP_SMALL
 /*
  * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
  * we track.
index 97ee9df..32fe775 100644 (file)
@@ -174,12 +174,3 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
        lock->name = name;
 }
 
-void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
-{
-}
-
-void rt_mutex_deadlock_account_unlock(struct task_struct *task)
-{
-}
-
index d0519c3..b585af9 100644 (file)
@@ -9,9 +9,6 @@
  * This file contains macros used solely by rtmutex.c. Debug version.
  */
 
-extern void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
-extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
 extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
 extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
 extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
index 6edc32e..b955094 100644 (file)
@@ -224,6 +224,12 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
 }
 #endif
 
+/*
+ * Only use with rt_mutex_waiter_{less,equal}()
+ */
+#define task_to_waiter(p)      \
+       &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
+
 static inline int
 rt_mutex_waiter_less(struct rt_mutex_waiter *left,
                     struct rt_mutex_waiter *right)
@@ -238,12 +244,30 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
         * then right waiter has a dl_prio() too.
         */
        if (dl_prio(left->prio))
-               return dl_time_before(left->task->dl.deadline,
-                                     right->task->dl.deadline);
+               return dl_time_before(left->deadline, right->deadline);
 
        return 0;
 }
 
+static inline int
+rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
+                     struct rt_mutex_waiter *right)
+{
+       if (left->prio != right->prio)
+               return 0;
+
+       /*
+        * If both waiters have dl_prio(), we compare the deadlines stored
+        * in the waiters themselves (->deadline).
+        * If left waiter has a dl_prio(), and we didn't return 0 above,
+        * then right waiter has a dl_prio() too.
+        */
+       if (dl_prio(left->prio))
+               return left->deadline == right->deadline;
+
+       return 1;
+}
+
 static void
 rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
 {
@@ -322,72 +346,16 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
        RB_CLEAR_NODE(&waiter->pi_tree_entry);
 }
 
-/*
- * Calculate task priority from the waiter tree priority
- *
- * Return task->normal_prio when the waiter tree is empty or when
- * the waiter is not allowed to do priority boosting
- */
-int rt_mutex_getprio(struct task_struct *task)
-{
-       if (likely(!task_has_pi_waiters(task)))
-               return task->normal_prio;
-
-       return min(task_top_pi_waiter(task)->prio,
-                  task->normal_prio);
-}
-
-struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
+static void rt_mutex_adjust_prio(struct task_struct *p)
 {
-       if (likely(!task_has_pi_waiters(task)))
-               return NULL;
-
-       return task_top_pi_waiter(task)->task;
-}
+       struct task_struct *pi_task = NULL;
 
-/*
- * Called by sched_setscheduler() to get the priority which will be
- * effective after the change.
- */
-int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
-{
-       if (!task_has_pi_waiters(task))
-               return newprio;
+       lockdep_assert_held(&p->pi_lock);
 
-       if (task_top_pi_waiter(task)->task->prio <= newprio)
-               return task_top_pi_waiter(task)->task->prio;
-       return newprio;
-}
+       if (task_has_pi_waiters(p))
+               pi_task = task_top_pi_waiter(p)->task;
 
-/*
- * Adjust the priority of a task, after its pi_waiters got modified.
- *
- * This can be both boosting and unboosting. task->pi_lock must be held.
- */
-static void __rt_mutex_adjust_prio(struct task_struct *task)
-{
-       int prio = rt_mutex_getprio(task);
-
-       if (task->prio != prio || dl_prio(prio))
-               rt_mutex_setprio(task, prio);
-}
-
-/*
- * Adjust task priority (undo boosting). Called from the exit path of
- * rt_mutex_slowunlock() and rt_mutex_slowlock().
- *
- * (Note: We do this outside of the protection of lock->wait_lock to
- * allow the lock to be taken while or before we readjust the priority
- * of task. We do not use the spin_xx_mutex() variants here as we are
- * outside of the debug path.)
- */
-void rt_mutex_adjust_prio(struct task_struct *task)
-{
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&task->pi_lock, flags);
-       __rt_mutex_adjust_prio(task);
-       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+       rt_mutex_setprio(p, pi_task);
 }
 
 /*
@@ -610,7 +578,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
         * enabled we continue, but stop the requeueing in the chain
         * walk.
         */
-       if (waiter->prio == task->prio) {
+       if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
                if (!detect_deadlock)
                        goto out_unlock_pi;
                else
@@ -706,7 +674,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 
        /* [7] Requeue the waiter in the lock waiter tree. */
        rt_mutex_dequeue(lock, waiter);
+
+       /*
+        * Update the waiter prio fields now that we're dequeued.
+        *
+        * These values can have changed through either:
+        *
+        *   sys_sched_set_scheduler() / sys_sched_setattr()
+        *
+        * or
+        *
+        *   DL CBS enforcement advancing the effective deadline.
+        *
+        * Even though pi_waiters also uses these fields, and that tree is only
+        * updated in [11], we can do this here, since we hold [L], which
+        * serializes all pi_waiters access and rb_erase() does not care about
+        * the values of the node being removed.
+        */
        waiter->prio = task->prio;
+       waiter->deadline = task->dl.deadline;
+
        rt_mutex_enqueue(lock, waiter);
 
        /* [8] Release the task */
@@ -747,7 +734,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
                 */
                rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
                rt_mutex_enqueue_pi(task, waiter);
-               __rt_mutex_adjust_prio(task);
+               rt_mutex_adjust_prio(task);
 
        } else if (prerequeue_top_waiter == waiter) {
                /*
@@ -763,7 +750,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
                rt_mutex_dequeue_pi(task, waiter);
                waiter = rt_mutex_top_waiter(lock);
                rt_mutex_enqueue_pi(task, waiter);
-               __rt_mutex_adjust_prio(task);
+               rt_mutex_adjust_prio(task);
        } else {
                /*
                 * Nothing changed. No need to do any priority
@@ -833,6 +820,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
                                struct rt_mutex_waiter *waiter)
 {
+       lockdep_assert_held(&lock->wait_lock);
+
        /*
         * Before testing whether we can acquire @lock, we set the
         * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
@@ -892,7 +881,8 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
                         * the top waiter priority (kernel view),
                         * @task lost.
                         */
-                       if (task->prio >= rt_mutex_top_waiter(lock)->prio)
+                       if (!rt_mutex_waiter_less(task_to_waiter(task),
+                                                 rt_mutex_top_waiter(lock)))
                                return 0;
 
                        /*
@@ -938,8 +928,6 @@ takeit:
         */
        rt_mutex_set_owner(lock, task);
 
-       rt_mutex_deadlock_account_lock(lock, task);
-
        return 1;
 }
 
@@ -960,6 +948,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
        struct rt_mutex *next_lock;
        int chain_walk = 0, res;
 
+       lockdep_assert_held(&lock->wait_lock);
+
        /*
         * Early deadlock detection. We really don't want the task to
         * enqueue on itself just to untangle the mess later. It's not
@@ -973,10 +963,11 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
                return -EDEADLK;
 
        raw_spin_lock(&task->pi_lock);
-       __rt_mutex_adjust_prio(task);
+       rt_mutex_adjust_prio(task);
        waiter->task = task;
        waiter->lock = lock;
        waiter->prio = task->prio;
+       waiter->deadline = task->dl.deadline;
 
        /* Get the top priority waiter on the lock */
        if (rt_mutex_has_waiters(lock))
@@ -995,7 +986,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
                rt_mutex_dequeue_pi(owner, top_waiter);
                rt_mutex_enqueue_pi(owner, waiter);
 
-               __rt_mutex_adjust_prio(owner);
+               rt_mutex_adjust_prio(owner);
                if (owner->pi_blocked_on)
                        chain_walk = 1;
        } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -1047,12 +1038,14 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
        waiter = rt_mutex_top_waiter(lock);
 
        /*
-        * Remove it from current->pi_waiters. We do not adjust a
-        * possible priority boost right now. We execute wakeup in the
-        * boosted mode and go back to normal after releasing
-        * lock->wait_lock.
+        * Remove it from current->pi_waiters and deboost.
+        *
+        * We must in fact deboost here in order to ensure we call
+        * rt_mutex_setprio() to update p->pi_top_task before the
+        * task unblocks.
         */
        rt_mutex_dequeue_pi(current, waiter);
+       rt_mutex_adjust_prio(current);
 
        /*
         * As we are waking up the top waiter, and the waiter stays
@@ -1064,9 +1057,19 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
         */
        lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
 
-       raw_spin_unlock(&current->pi_lock);
-
+       /*
+        * We deboosted before waking the top waiter task such that we don't
+        * run two tasks with the 'same' priority (and ensure the
+        * p->pi_top_task pointer points to a blocked task). This however can
+        * lead to priority inversion if we would get preempted after the
+        * deboost but before waking our donor task, hence the preempt_disable()
+        * before unlock.
+        *
+        * Pairs with preempt_enable() in rt_mutex_postunlock();
+        */
+       preempt_disable();
        wake_q_add(wake_q, waiter->task);
+       raw_spin_unlock(&current->pi_lock);
 }
 
 /*
@@ -1082,6 +1085,8 @@ static void remove_waiter(struct rt_mutex *lock,
        struct task_struct *owner = rt_mutex_owner(lock);
        struct rt_mutex *next_lock;
 
+       lockdep_assert_held(&lock->wait_lock);
+
        raw_spin_lock(&current->pi_lock);
        rt_mutex_dequeue(lock, waiter);
        current->pi_blocked_on = NULL;
@@ -1101,7 +1106,7 @@ static void remove_waiter(struct rt_mutex *lock,
        if (rt_mutex_has_waiters(lock))
                rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
 
-       __rt_mutex_adjust_prio(owner);
+       rt_mutex_adjust_prio(owner);
 
        /* Store the lock on which owner is blocked or NULL */
        next_lock = task_blocked_on_lock(owner);
@@ -1140,8 +1145,7 @@ void rt_mutex_adjust_pi(struct task_struct *task)
        raw_spin_lock_irqsave(&task->pi_lock, flags);
 
        waiter = task->pi_blocked_on;
-       if (!waiter || (waiter->prio == task->prio &&
-                       !dl_prio(task->prio))) {
+       if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
                raw_spin_unlock_irqrestore(&task->pi_lock, flags);
                return;
        }
@@ -1155,6 +1159,14 @@ void rt_mutex_adjust_pi(struct task_struct *task)
                                   next_lock, NULL, task);
 }
 
+void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+       debug_rt_mutex_init_waiter(waiter);
+       RB_CLEAR_NODE(&waiter->pi_tree_entry);
+       RB_CLEAR_NODE(&waiter->tree_entry);
+       waiter->task = NULL;
+}
+
 /**
  * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
  * @lock:               the rt_mutex to take
@@ -1237,9 +1249,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
        unsigned long flags;
        int ret = 0;
 
-       debug_rt_mutex_init_waiter(&waiter);
-       RB_CLEAR_NODE(&waiter.pi_tree_entry);
-       RB_CLEAR_NODE(&waiter.tree_entry);
+       rt_mutex_init_waiter(&waiter);
 
        /*
         * Technically we could use raw_spin_[un]lock_irq() here, but this can
@@ -1330,7 +1340,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
 
 /*
  * Slow path to release a rt-mutex.
- * Return whether the current task needs to undo a potential priority boosting.
+ *
+ * Return whether the current task needs to call rt_mutex_postunlock().
  */
 static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
                                        struct wake_q_head *wake_q)
@@ -1342,8 +1353,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
 
        debug_rt_mutex_unlock(lock);
 
-       rt_mutex_deadlock_account_unlock(current);
-
        /*
         * We must be careful here if the fast path is enabled. If we
         * have no waiters queued we cannot set owner to NULL here
@@ -1390,11 +1399,9 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
         * Queue the next waiter for wakeup once we release the wait_lock.
         */
        mark_wakeup_next_waiter(wake_q, lock);
-
        raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 
-       /* check PI boosting */
-       return true;
+       return true; /* call rt_mutex_postunlock() */
 }
 
 /*
@@ -1409,11 +1416,10 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
                                struct hrtimer_sleeper *timeout,
                                enum rtmutex_chainwalk chwalk))
 {
-       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
-               rt_mutex_deadlock_account_lock(lock, current);
+       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
                return 0;
-       } else
-               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
+
+       return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
 }
 
 static inline int
@@ -1425,24 +1431,33 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
                                      enum rtmutex_chainwalk chwalk))
 {
        if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
-           likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
-               rt_mutex_deadlock_account_lock(lock, current);
+           likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
                return 0;
-       } else
-               return slowfn(lock, state, timeout, chwalk);
+
+       return slowfn(lock, state, timeout, chwalk);
 }
 
 static inline int
 rt_mutex_fasttrylock(struct rt_mutex *lock,
                     int (*slowfn)(struct rt_mutex *lock))
 {
-       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
-               rt_mutex_deadlock_account_lock(lock, current);
+       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
                return 1;
-       }
+
        return slowfn(lock);
 }
 
+/*
+ * Performs the wakeup of the top-waiter and re-enables preemption.
+ */
+void rt_mutex_postunlock(struct wake_q_head *wake_q)
+{
+       wake_up_q(wake_q);
+
+       /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */
+       preempt_enable();
+}
+
 static inline void
 rt_mutex_fastunlock(struct rt_mutex *lock,
                    bool (*slowfn)(struct rt_mutex *lock,
@@ -1450,18 +1465,11 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
 {
        DEFINE_WAKE_Q(wake_q);
 
-       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
-               rt_mutex_deadlock_account_unlock(current);
-
-       } else {
-               bool deboost = slowfn(lock, &wake_q);
-
-               wake_up_q(&wake_q);
+       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+               return;
 
-               /* Undo pi boosting if necessary: */
-               if (deboost)
-                       rt_mutex_adjust_prio(current);
-       }
+       if (slowfn(lock, &wake_q))
+               rt_mutex_postunlock(&wake_q);
 }
 
 /**
@@ -1495,16 +1503,11 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
 EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
 
 /*
- * Futex variant with full deadlock detection.
+ * Futex variant, must not use fastpath.
  */
-int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
-                             struct hrtimer_sleeper *timeout)
+int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
 {
-       might_sleep();
-
-       return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
-                                      RT_MUTEX_FULL_CHAINWALK,
-                                      rt_mutex_slowlock);
+       return rt_mutex_slowtrylock(lock);
 }
 
 /**
@@ -1563,20 +1566,43 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
 EXPORT_SYMBOL_GPL(rt_mutex_unlock);
 
 /**
- * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
- * @lock: the rt_mutex to be unlocked
- *
- * Returns: true/false indicating whether priority adjustment is
- * required or not.
+ * Futex variant of rt_mutex_unlock(): since the futex variants do not use the
+ * fast-path, it can be simple and will not need to retry.
  */
-bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
-                                  struct wake_q_head *wqh)
+bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
+                                   struct wake_q_head *wake_q)
 {
-       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
-               rt_mutex_deadlock_account_unlock(current);
-               return false;
+       lockdep_assert_held(&lock->wait_lock);
+
+       debug_rt_mutex_unlock(lock);
+
+       if (!rt_mutex_has_waiters(lock)) {
+               lock->owner = NULL;
+               return false; /* done */
        }
-       return rt_mutex_slowunlock(lock, wqh);
+
+       /*
+        * We've already deboosted, mark_wakeup_next_waiter() will
+        * retain preempt_disabled when we drop the wait_lock, to
+        * avoid inversion prior to the wakeup.  preempt_disable()
+        * therein pairs with rt_mutex_postunlock().
+        */
+       mark_wakeup_next_waiter(wake_q, lock);
+
+       return true; /* call postunlock() */
+}
+
+void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+{
+       DEFINE_WAKE_Q(wake_q);
+       bool postunlock;
+
+       raw_spin_lock_irq(&lock->wait_lock);
+       postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
+       raw_spin_unlock_irq(&lock->wait_lock);
+
+       if (postunlock)
+               rt_mutex_postunlock(&wake_q);
 }
 
 /**
@@ -1637,7 +1663,6 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
        __rt_mutex_init(lock, NULL);
        debug_rt_mutex_proxy_lock(lock, proxy_owner);
        rt_mutex_set_owner(lock, proxy_owner);
-       rt_mutex_deadlock_account_lock(lock, proxy_owner);
 }
 
 /**
@@ -1657,34 +1682,16 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
 {
        debug_rt_mutex_proxy_unlock(lock);
        rt_mutex_set_owner(lock, NULL);
-       rt_mutex_deadlock_account_unlock(proxy_owner);
 }
 
-/**
- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock:              the rt_mutex to take
- * @waiter:            the pre-initialized rt_mutex_waiter
- * @task:              the task to prepare
- *
- * Returns:
- *  0 - task blocked on lock
- *  1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for FUTEX_REQUEUE_PI support.
- */
-int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
                              struct rt_mutex_waiter *waiter,
                              struct task_struct *task)
 {
        int ret;
 
-       raw_spin_lock_irq(&lock->wait_lock);
-
-       if (try_to_take_rt_mutex(lock, task, NULL)) {
-               raw_spin_unlock_irq(&lock->wait_lock);
+       if (try_to_take_rt_mutex(lock, task, NULL))
                return 1;
-       }
 
        /* We enforce deadlock detection for futexes */
        ret = task_blocks_on_rt_mutex(lock, waiter, task,
@@ -1703,14 +1710,38 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
        if (unlikely(ret))
                remove_waiter(lock, waiter);
 
-       raw_spin_unlock_irq(&lock->wait_lock);
-
        debug_rt_mutex_print_deadlock(waiter);
 
        return ret;
 }
 
 /**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock:              the rt_mutex to take
+ * @waiter:            the pre-initialized rt_mutex_waiter
+ * @task:              the task to prepare
+ *
+ * Returns:
+ *  0 - task blocked on lock
+ *  1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for FUTEX_REQUEUE_PI support.
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+                             struct rt_mutex_waiter *waiter,
+                             struct task_struct *task)
+{
+       int ret;
+
+       raw_spin_lock_irq(&lock->wait_lock);
+       ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+       raw_spin_unlock_irq(&lock->wait_lock);
+
+       return ret;
+}
+
+/**
  * rt_mutex_next_owner - return the next owner of the lock
  *
  * @lock: the rt lock query
@@ -1731,21 +1762,23 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
 }
 
 /**
- * rt_mutex_finish_proxy_lock() - Complete lock acquisition
+ * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
  * @lock:              the rt_mutex we were woken on
  * @to:                        the timeout, null if none. hrtimer should already have
  *                     been started.
  * @waiter:            the pre-initialized rt_mutex_waiter
  *
- * Complete the lock acquisition started our behalf by another thread.
+ * Wait for the lock acquisition started on our behalf by
+ * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
+ * rt_mutex_cleanup_proxy_lock().
  *
  * Returns:
  *  0 - success
  * <0 - error, one of -EINTR, -ETIMEDOUT
  *
- * Special API call for PI-futex requeue support
+ * Special API call for PI-futex support
  */
-int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
                               struct hrtimer_sleeper *to,
                               struct rt_mutex_waiter *waiter)
 {
@@ -1758,8 +1791,45 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
        /* sleep on the mutex */
        ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
 
-       if (unlikely(ret))
+       raw_spin_unlock_irq(&lock->wait_lock);
+
+       return ret;
+}
+
+/**
+ * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
+ * @lock:              the rt_mutex we were woken on
+ * @waiter:            the pre-initialized rt_mutex_waiter
+ *
+ * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
+ *
+ * Unless we acquired the lock; we're still enqueued on the wait-list and can
+ * in fact still be granted ownership until we're removed. Therefore we can
+ * find we are in fact the owner and must disregard the
+ * rt_mutex_wait_proxy_lock() failure.
+ *
+ * Returns:
+ *  true  - did the cleanup, we are done.
+ *  false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
+ *          caller should disregard its return value.
+ *
+ * Special API call for PI-futex support
+ */
+bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+                                struct rt_mutex_waiter *waiter)
+{
+       bool cleanup = false;
+
+       raw_spin_lock_irq(&lock->wait_lock);
+       /*
+        * Unless we're the owner; we're still enqueued on the wait_list.
+        * So check if we became owner, if not, take us off the wait_list.
+        */
+       if (rt_mutex_owner(lock) != current) {
                remove_waiter(lock, waiter);
+               fixup_rt_mutex_waiters(lock);
+               cleanup = true;
+       }
 
        /*
         * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
@@ -1769,5 +1839,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
 
        raw_spin_unlock_irq(&lock->wait_lock);
 
-       return ret;
+       return cleanup;
 }
index c406058..6607802 100644 (file)
@@ -11,8 +11,6 @@
  */
 
 #define rt_mutex_deadlock_check(l)                     (0)
-#define rt_mutex_deadlock_account_lock(m, t)           do { } while (0)
-#define rt_mutex_deadlock_account_unlock(l)            do { } while (0)
 #define debug_rt_mutex_init_waiter(w)                  do { } while (0)
 #define debug_rt_mutex_free_waiter(w)                  do { } while (0)
 #define debug_rt_mutex_lock(l)                         do { } while (0)
index 856dfff..72ad45a 100644 (file)
@@ -34,6 +34,7 @@ struct rt_mutex_waiter {
        struct rt_mutex         *deadlock_lock;
 #endif
        int prio;
+       u64 deadline;
 };
 
 /*
@@ -103,16 +104,26 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
                                       struct task_struct *proxy_owner);
 extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
                                  struct task_struct *proxy_owner);
+extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+                                    struct rt_mutex_waiter *waiter,
+                                    struct task_struct *task);
 extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
                                     struct rt_mutex_waiter *waiter,
                                     struct task_struct *task);
-extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
-                                     struct hrtimer_sleeper *to,
-                                     struct rt_mutex_waiter *waiter);
-extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
-extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
-                                 struct wake_q_head *wqh);
-extern void rt_mutex_adjust_prio(struct task_struct *task);
+extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
+                              struct hrtimer_sleeper *to,
+                              struct rt_mutex_waiter *waiter);
+extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+                                struct rt_mutex_waiter *waiter);
+
+extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+
+extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
+extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
+                                struct wake_q_head *wqh);
+
+extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
 
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 # include "rtmutex-debug.h"
index 90a74cc..4d48b1c 100644 (file)
@@ -124,10 +124,8 @@ EXPORT_SYMBOL(up_write);
  */
 void downgrade_write(struct rw_semaphore *sem)
 {
-       /*
-        * lockdep: a downgraded write will live on as a write
-        * dependency.
-        */
+       lock_downgrade(&sem->dep_map, _RET_IP_);
+
        rwsem_set_reader_owned(sem);
        __downgrade_write(sem);
 }
index 6b7abb3..39f56c8 100644 (file)
@@ -353,8 +353,8 @@ static int test_cycle(unsigned int ncpus)
 struct stress {
        struct work_struct work;
        struct ww_mutex *locks;
+       unsigned long timeout;
        int nlocks;
-       int nloops;
 };
 
 static int *get_random_order(int count)
@@ -398,12 +398,11 @@ static void stress_inorder_work(struct work_struct *work)
        if (!order)
                return;
 
-       ww_acquire_init(&ctx, &ww_class);
-
        do {
                int contended = -1;
                int n, err;
 
+               ww_acquire_init(&ctx, &ww_class);
 retry:
                err = 0;
                for (n = 0; n < nlocks; n++) {
@@ -433,9 +432,9 @@ retry:
                                    __func__, err);
                        break;
                }
-       } while (--stress->nloops);
 
-       ww_acquire_fini(&ctx);
+               ww_acquire_fini(&ctx);
+       } while (!time_after(jiffies, stress->timeout));
 
        kfree(order);
        kfree(stress);
@@ -470,9 +469,9 @@ static void stress_reorder_work(struct work_struct *work)
        kfree(order);
        order = NULL;
 
-       ww_acquire_init(&ctx, &ww_class);
-
        do {
+               ww_acquire_init(&ctx, &ww_class);
+
                list_for_each_entry(ll, &locks, link) {
                        err = ww_mutex_lock(ll->lock, &ctx);
                        if (!err)
@@ -495,9 +494,9 @@ static void stress_reorder_work(struct work_struct *work)
                dummy_load(stress);
                list_for_each_entry(ll, &locks, link)
                        ww_mutex_unlock(ll->lock);
-       } while (--stress->nloops);
 
-       ww_acquire_fini(&ctx);
+               ww_acquire_fini(&ctx);
+       } while (!time_after(jiffies, stress->timeout));
 
 out:
        list_for_each_entry_safe(ll, ln, &locks, link)
@@ -523,7 +522,7 @@ static void stress_one_work(struct work_struct *work)
                                    __func__, err);
                        break;
                }
-       } while (--stress->nloops);
+       } while (!time_after(jiffies, stress->timeout));
 
        kfree(stress);
 }
@@ -533,7 +532,7 @@ static void stress_one_work(struct work_struct *work)
 #define STRESS_ONE BIT(2)
 #define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE)
 
-static int stress(int nlocks, int nthreads, int nloops, unsigned int flags)
+static int stress(int nlocks, int nthreads, unsigned int flags)
 {
        struct ww_mutex *locks;
        int n;
@@ -575,7 +574,7 @@ static int stress(int nlocks, int nthreads, int nloops, unsigned int flags)
                INIT_WORK(&stress->work, fn);
                stress->locks = locks;
                stress->nlocks = nlocks;
-               stress->nloops = nloops;
+               stress->timeout = jiffies + 2*HZ;
 
                queue_work(wq, &stress->work);
                nthreads--;
@@ -619,15 +618,15 @@ static int __init test_ww_mutex_init(void)
        if (ret)
                return ret;
 
-       ret = stress(16, 2*ncpus, 1<<10, STRESS_INORDER);
+       ret = stress(16, 2*ncpus, STRESS_INORDER);
        if (ret)
                return ret;
 
-       ret = stress(16, 2*ncpus, 1<<10, STRESS_REORDER);
+       ret = stress(16, 2*ncpus, STRESS_REORDER);
        if (ret)
                return ret;
 
-       ret = stress(4095, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL);
+       ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
        if (ret)
                return ret;
 
index 7eba6de..6d99880 100644 (file)
@@ -665,16 +665,7 @@ static void percpu_modcopy(struct module *mod,
                memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
 }
 
-/**
- * is_module_percpu_address - test whether address is from module static percpu
- * @addr: address to test
- *
- * Test whether @addr belongs to module static percpu area.
- *
- * RETURNS:
- * %true if @addr is from module static percpu area
- */
-bool is_module_percpu_address(unsigned long addr)
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
 {
        struct module *mod;
        unsigned int cpu;
@@ -688,9 +679,15 @@ bool is_module_percpu_address(unsigned long addr)
                        continue;
                for_each_possible_cpu(cpu) {
                        void *start = per_cpu_ptr(mod->percpu, cpu);
-
-                       if ((void *)addr >= start &&
-                           (void *)addr < start + mod->percpu_size) {
+                       void *va = (void *)addr;
+
+                       if (va >= start && va < start + mod->percpu_size) {
+                               if (can_addr) {
+                                       *can_addr = (unsigned long) (va - start);
+                                       *can_addr += (unsigned long)
+                                               per_cpu_ptr(mod->percpu,
+                                                           get_boot_cpu_id());
+                               }
                                preempt_enable();
                                return true;
                        }
@@ -701,6 +698,20 @@ bool is_module_percpu_address(unsigned long addr)
        return false;
 }
 
+/**
+ * is_module_percpu_address - test whether address is from module static percpu
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to module static percpu area.
+ *
+ * RETURNS:
+ * %true if @addr is from module static percpu area
+ */
+bool is_module_percpu_address(unsigned long addr)
+{
+       return __is_module_percpu_address(addr, NULL);
+}
+
 #else /* ... !CONFIG_SMP */
 
 static inline void __percpu *mod_percpu(struct module *mod)
@@ -732,6 +743,11 @@ bool is_module_percpu_address(unsigned long addr)
        return false;
 }
 
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+{
+       return false;
+}
+
 #endif /* CONFIG_SMP */
 
 #define MODINFO_ATTR(field)    \
index 782102e..f6c5d33 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/file.h>
 #include <linux/syscalls.h>
 #include <linux/cgroup.h>
+#include <linux/perf_event.h>
 
 static struct kmem_cache *nsproxy_cachep;
 
@@ -262,6 +263,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
                goto out;
        }
        switch_task_namespaces(tsk, new_nsproxy);
+
+       perf_event_namespaces(tsk);
 out:
        fput(file);
        return err;
index a6d6149..60b2d81 100644 (file)
@@ -160,58 +160,6 @@ static int parse_one(char *param,
        return -ENOENT;
 }
 
-/* You can use " around spaces, but can't escape ". */
-/* Hyphens and underscores equivalent in parameter names. */
-static char *next_arg(char *args, char **param, char **val)
-{
-       unsigned int i, equals = 0;
-       int in_quote = 0, quoted = 0;
-       char *next;
-
-       if (*args == '"') {
-               args++;
-               in_quote = 1;
-               quoted = 1;
-       }
-
-       for (i = 0; args[i]; i++) {
-               if (isspace(args[i]) && !in_quote)
-                       break;
-               if (equals == 0) {
-                       if (args[i] == '=')
-                               equals = i;
-               }
-               if (args[i] == '"')
-                       in_quote = !in_quote;
-       }
-
-       *param = args;
-       if (!equals)
-               *val = NULL;
-       else {
-               args[equals] = '\0';
-               *val = args + equals + 1;
-
-               /* Don't include quotes in value. */
-               if (**val == '"') {
-                       (*val)++;
-                       if (args[i-1] == '"')
-                               args[i-1] = '\0';
-               }
-       }
-       if (quoted && args[i-1] == '"')
-               args[i-1] = '\0';
-
-       if (args[i]) {
-               args[i] = '\0';
-               next = args + i + 1;
-       } else
-               next = args + i;
-
-       /* Chew up trailing spaces. */
-       return skip_spaces(next);
-}
-
 /* Args looks like "foo=bar,bar2 baz=fuz wiz". */
 char *parse_args(const char *doing,
                 char *args,
index 0af9287..266ddcc 100644 (file)
@@ -184,11 +184,17 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
 
        WARN_ON(!task->ptrace || task->parent != current);
 
+       /*
+        * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely.
+        * Recheck state under the lock to close this race.
+        */
        spin_lock_irq(&task->sighand->siglock);
-       if (__fatal_signal_pending(task))
-               wake_up_state(task, __TASK_TRACED);
-       else
-               task->state = TASK_TRACED;
+       if (task->state == __TASK_TRACED) {
+               if (__fatal_signal_pending(task))
+                       wake_up_state(task, __TASK_TRACED);
+               else
+                       task->state = TASK_TRACED;
+       }
        spin_unlock_irq(&task->sighand->siglock);
 }
 
index 3b31fc0..c51147a 100644 (file)
@@ -86,21 +86,6 @@ int sysctl_sched_rt_runtime = 950000;
 cpumask_var_t cpu_isolated_map;
 
 /*
- * this_rq_lock - lock this runqueue and disable interrupts.
- */
-static struct rq *this_rq_lock(void)
-       __acquires(rq->lock)
-{
-       struct rq *rq;
-
-       local_irq_disable();
-       rq = this_rq();
-       raw_spin_lock(&rq->lock);
-
-       return rq;
-}
-
-/*
  * __task_rq_lock - lock the rq @p resides on.
  */
 struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
@@ -233,8 +218,11 @@ void update_rq_clock(struct rq *rq)
                return;
 
 #ifdef CONFIG_SCHED_DEBUG
+       if (sched_feat(WARN_DOUBLE_CLOCK))
+               SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
        rq->clock_update_flags |= RQCF_UPDATED;
 #endif
+
        delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
        if (delta < 0)
                return;
@@ -261,13 +249,14 @@ static void hrtick_clear(struct rq *rq)
 static enum hrtimer_restart hrtick(struct hrtimer *timer)
 {
        struct rq *rq = container_of(timer, struct rq, hrtick_timer);
+       struct rq_flags rf;
 
        WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
 
-       raw_spin_lock(&rq->lock);
+       rq_lock(rq, &rf);
        update_rq_clock(rq);
        rq->curr->sched_class->task_tick(rq, rq->curr, 1);
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, &rf);
 
        return HRTIMER_NORESTART;
 }
@@ -287,11 +276,12 @@ static void __hrtick_restart(struct rq *rq)
 static void __hrtick_start(void *arg)
 {
        struct rq *rq = arg;
+       struct rq_flags rf;
 
-       raw_spin_lock(&rq->lock);
+       rq_lock(rq, &rf);
        __hrtick_restart(rq);
        rq->hrtick_csd_pending = 0;
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, &rf);
 }
 
 /*
@@ -762,17 +752,23 @@ static void set_load_weight(struct task_struct *p)
 
 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
-       update_rq_clock(rq);
+       if (!(flags & ENQUEUE_NOCLOCK))
+               update_rq_clock(rq);
+
        if (!(flags & ENQUEUE_RESTORE))
                sched_info_queued(rq, p);
+
        p->sched_class->enqueue_task(rq, p, flags);
 }
 
 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
-       update_rq_clock(rq);
+       if (!(flags & DEQUEUE_NOCLOCK))
+               update_rq_clock(rq);
+
        if (!(flags & DEQUEUE_SAVE))
                sched_info_dequeued(rq, p);
+
        p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -946,18 +942,19 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
  *
  * Returns (locked) new rq. Old rq's lock is released.
  */
-static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
+static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
+                                  struct task_struct *p, int new_cpu)
 {
        lockdep_assert_held(&rq->lock);
 
        p->on_rq = TASK_ON_RQ_MIGRATING;
-       dequeue_task(rq, p, 0);
+       dequeue_task(rq, p, DEQUEUE_NOCLOCK);
        set_task_cpu(p, new_cpu);
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, rf);
 
        rq = cpu_rq(new_cpu);
 
-       raw_spin_lock(&rq->lock);
+       rq_lock(rq, rf);
        BUG_ON(task_cpu(p) != new_cpu);
        enqueue_task(rq, p, 0);
        p->on_rq = TASK_ON_RQ_QUEUED;
@@ -980,7 +977,8 @@ struct migration_arg {
  * So we race with normal scheduler movements, but that's OK, as long
  * as the task is no longer on this CPU.
  */
-static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
+static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
+                                struct task_struct *p, int dest_cpu)
 {
        if (unlikely(!cpu_active(dest_cpu)))
                return rq;
@@ -989,7 +987,8 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_
        if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
                return rq;
 
-       rq = move_queued_task(rq, p, dest_cpu);
+       update_rq_clock(rq);
+       rq = move_queued_task(rq, rf, p, dest_cpu);
 
        return rq;
 }
@@ -1004,6 +1003,7 @@ static int migration_cpu_stop(void *data)
        struct migration_arg *arg = data;
        struct task_struct *p = arg->task;
        struct rq *rq = this_rq();
+       struct rq_flags rf;
 
        /*
         * The original target CPU might have gone down and we might
@@ -1018,7 +1018,7 @@ static int migration_cpu_stop(void *data)
        sched_ttwu_pending();
 
        raw_spin_lock(&p->pi_lock);
-       raw_spin_lock(&rq->lock);
+       rq_lock(rq, &rf);
        /*
         * If task_rq(p) != rq, it cannot be migrated here, because we're
         * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
@@ -1026,11 +1026,11 @@ static int migration_cpu_stop(void *data)
         */
        if (task_rq(p) == rq) {
                if (task_on_rq_queued(p))
-                       rq = __migrate_task(rq, p, arg->dest_cpu);
+                       rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
                else
                        p->wake_cpu = arg->dest_cpu;
        }
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, &rf);
        raw_spin_unlock(&p->pi_lock);
 
        local_irq_enable();
@@ -1063,7 +1063,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
                 * holding rq->lock.
                 */
                lockdep_assert_held(&rq->lock);
-               dequeue_task(rq, p, DEQUEUE_SAVE);
+               dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
        }
        if (running)
                put_prev_task(rq, p);
@@ -1071,7 +1071,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
        p->sched_class->set_cpus_allowed(p, new_mask);
 
        if (queued)
-               enqueue_task(rq, p, ENQUEUE_RESTORE);
+               enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
        if (running)
                set_curr_task(rq, p);
 }
@@ -1150,9 +1150,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
                 * OK, since we're going to drop the lock immediately
                 * afterwards anyway.
                 */
-               rq_unpin_lock(rq, &rf);
-               rq = move_queued_task(rq, p, dest_cpu);
-               rq_repin_lock(rq, &rf);
+               rq = move_queued_task(rq, &rf, p, dest_cpu);
        }
 out:
        task_rq_unlock(rq, p, &rf);
@@ -1217,16 +1215,24 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
 {
        if (task_on_rq_queued(p)) {
                struct rq *src_rq, *dst_rq;
+               struct rq_flags srf, drf;
 
                src_rq = task_rq(p);
                dst_rq = cpu_rq(cpu);
 
+               rq_pin_lock(src_rq, &srf);
+               rq_pin_lock(dst_rq, &drf);
+
                p->on_rq = TASK_ON_RQ_MIGRATING;
                deactivate_task(src_rq, p, 0);
                set_task_cpu(p, cpu);
                activate_task(dst_rq, p, 0);
                p->on_rq = TASK_ON_RQ_QUEUED;
                check_preempt_curr(dst_rq, p, 0);
+
+               rq_unpin_lock(dst_rq, &drf);
+               rq_unpin_lock(src_rq, &srf);
+
        } else {
                /*
                 * Task isn't running anymore; make it appear like we migrated
@@ -1680,7 +1686,7 @@ static void
 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
                 struct rq_flags *rf)
 {
-       int en_flags = ENQUEUE_WAKEUP;
+       int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
 
        lockdep_assert_held(&rq->lock);
 
@@ -1726,14 +1732,13 @@ void sched_ttwu_pending(void)
        struct rq *rq = this_rq();
        struct llist_node *llist = llist_del_all(&rq->wake_list);
        struct task_struct *p;
-       unsigned long flags;
        struct rq_flags rf;
 
        if (!llist)
                return;
 
-       raw_spin_lock_irqsave(&rq->lock, flags);
-       rq_pin_lock(rq, &rf);
+       rq_lock_irqsave(rq, &rf);
+       update_rq_clock(rq);
 
        while (llist) {
                int wake_flags = 0;
@@ -1747,8 +1752,7 @@ void sched_ttwu_pending(void)
                ttwu_do_activate(rq, p, wake_flags, &rf);
        }
 
-       rq_unpin_lock(rq, &rf);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       rq_unlock_irqrestore(rq, &rf);
 }
 
 void scheduler_ipi(void)
@@ -1806,7 +1810,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
 void wake_up_if_idle(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
-       unsigned long flags;
+       struct rq_flags rf;
 
        rcu_read_lock();
 
@@ -1816,11 +1820,11 @@ void wake_up_if_idle(int cpu)
        if (set_nr_if_polling(rq->idle)) {
                trace_sched_wake_idle_without_ipi(cpu);
        } else {
-               raw_spin_lock_irqsave(&rq->lock, flags);
+               rq_lock_irqsave(rq, &rf);
                if (is_idle_task(rq->curr))
                        smp_send_reschedule(cpu);
                /* Else CPU is not idle, do nothing here: */
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
+               rq_unlock_irqrestore(rq, &rf);
        }
 
 out:
@@ -1846,11 +1850,10 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
        }
 #endif
 
-       raw_spin_lock(&rq->lock);
-       rq_pin_lock(rq, &rf);
+       rq_lock(rq, &rf);
+       update_rq_clock(rq);
        ttwu_do_activate(rq, p, wake_flags, &rf);
-       rq_unpin_lock(rq, &rf);
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, &rf);
 }
 
 /*
@@ -2097,11 +2100,9 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
                 * disabled avoiding further scheduler activity on it and we've
                 * not yet picked a replacement task.
                 */
-               rq_unpin_lock(rq, rf);
-               raw_spin_unlock(&rq->lock);
+               rq_unlock(rq, rf);
                raw_spin_lock(&p->pi_lock);
-               raw_spin_lock(&rq->lock);
-               rq_repin_lock(rq, rf);
+               rq_relock(rq, rf);
        }
 
        if (!(p->state & TASK_NORMAL))
@@ -2114,7 +2115,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
                        delayacct_blkio_end();
                        atomic_dec(&rq->nr_iowait);
                }
-               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+               ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
        }
 
        ttwu_do_wakeup(rq, p, 0, rf);
@@ -2555,7 +2556,7 @@ void wake_up_new_task(struct task_struct *p)
        update_rq_clock(rq);
        post_init_entity_util_avg(&p->se);
 
-       activate_task(rq, p, 0);
+       activate_task(rq, p, ENQUEUE_NOCLOCK);
        p->on_rq = TASK_ON_RQ_QUEUED;
        trace_sched_wakeup_new(p);
        check_preempt_curr(rq, p, WF_FORK);
@@ -3093,15 +3094,18 @@ void scheduler_tick(void)
        int cpu = smp_processor_id();
        struct rq *rq = cpu_rq(cpu);
        struct task_struct *curr = rq->curr;
+       struct rq_flags rf;
 
        sched_clock_tick();
 
-       raw_spin_lock(&rq->lock);
+       rq_lock(rq, &rf);
+
        update_rq_clock(rq);
        curr->sched_class->task_tick(rq, curr, 0);
        cpu_load_update_active(rq);
        calc_global_load_tick(rq);
-       raw_spin_unlock(&rq->lock);
+
+       rq_unlock(rq, &rf);
 
        perf_event_task_tick();
 
@@ -3386,18 +3390,18 @@ static void __sched notrace __schedule(bool preempt)
         * done by the caller to avoid the race with signal_wake_up().
         */
        smp_mb__before_spinlock();
-       raw_spin_lock(&rq->lock);
-       rq_pin_lock(rq, &rf);
+       rq_lock(rq, &rf);
 
        /* Promote REQ to ACT */
        rq->clock_update_flags <<= 1;
+       update_rq_clock(rq);
 
        switch_count = &prev->nivcsw;
        if (!preempt && prev->state) {
                if (unlikely(signal_pending_state(prev->state, prev))) {
                        prev->state = TASK_RUNNING;
                } else {
-                       deactivate_task(rq, prev, DEQUEUE_SLEEP);
+                       deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
                        prev->on_rq = 0;
 
                        if (prev->in_iowait) {
@@ -3421,9 +3425,6 @@ static void __sched notrace __schedule(bool preempt)
                switch_count = &prev->nvcsw;
        }
 
-       if (task_on_rq_queued(prev))
-               update_rq_clock(rq);
-
        next = pick_next_task(rq, prev, &rf);
        clear_tsk_need_resched(prev);
        clear_preempt_need_resched();
@@ -3439,8 +3440,7 @@ static void __sched notrace __schedule(bool preempt)
                rq = context_switch(rq, prev, next, &rf);
        } else {
                rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-               rq_unpin_lock(rq, &rf);
-               raw_spin_unlock_irq(&rq->lock);
+               rq_unlock_irq(rq, &rf);
        }
 
        balance_callback(rq);
@@ -3671,10 +3671,25 @@ EXPORT_SYMBOL(default_wake_function);
 
 #ifdef CONFIG_RT_MUTEXES
 
+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
+{
+       if (pi_task)
+               prio = min(prio, pi_task->prio);
+
+       return prio;
+}
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+       struct task_struct *pi_task = rt_mutex_get_top_task(p);
+
+       return __rt_effective_prio(pi_task, prio);
+}
+
 /*
  * rt_mutex_setprio - set the current priority of a task
- * @p: task
- * @prio: prio value (kernel-internal form)
+ * @p: task to boost
+ * @pi_task: donor task
  *
  * This function changes the 'effective' priority of a task. It does
  * not touch ->normal_prio like __setscheduler().
@@ -3682,17 +3697,42 @@ EXPORT_SYMBOL(default_wake_function);
  * Used by the rt_mutex code to implement priority inheritance
  * logic. Call site only calls if the priority of the task changed.
  */
-void rt_mutex_setprio(struct task_struct *p, int prio)
+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 {
-       int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
+       int prio, oldprio, queued, running, queue_flag =
+               DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
        const struct sched_class *prev_class;
        struct rq_flags rf;
        struct rq *rq;
 
-       BUG_ON(prio > MAX_PRIO);
+       /* XXX used to be waiter->prio, not waiter->task->prio */
+       prio = __rt_effective_prio(pi_task, p->normal_prio);
+
+       /*
+        * If nothing changed; bail early.
+        */
+       if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
+               return;
 
        rq = __task_rq_lock(p, &rf);
        update_rq_clock(rq);
+       /*
+        * Set under pi_lock && rq->lock, such that the value can be used under
+        * either lock.
+        *
+        * Note that there is loads of trickery to make this pointer cache work
+        * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
+        * ensure a task is de-boosted (pi_task is set to NULL) before the
+        * task is allowed to run again (and can exit). This ensures the pointer
+        * points to a blocked task -- which guarantees the task is present.
+        */
+       p->pi_top_task = pi_task;
+
+       /*
+        * For FIFO/RR we only need to set prio, if that matches we're done.
+        */
+       if (prio == p->prio && !dl_prio(prio))
+               goto out_unlock;
 
        /*
         * Idle task boosting is a nono in general. There is one
@@ -3712,7 +3752,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                goto out_unlock;
        }
 
-       trace_sched_pi_setprio(p, prio);
+       trace_sched_pi_setprio(p, pi_task);
        oldprio = p->prio;
 
        if (oldprio == prio)
@@ -3736,7 +3776,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         *          running task
         */
        if (dl_prio(prio)) {
-               struct task_struct *pi_task = rt_mutex_get_top_task(p);
                if (!dl_prio(p->normal_prio) ||
                    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                        p->dl.dl_boosted = 1;
@@ -3774,6 +3813,11 @@ out_unlock:
        balance_callback(rq);
        preempt_enable();
 }
+#else
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+       return prio;
+}
 #endif
 
 void set_user_nice(struct task_struct *p, long nice)
@@ -3805,7 +3849,7 @@ void set_user_nice(struct task_struct *p, long nice)
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
-               dequeue_task(rq, p, DEQUEUE_SAVE);
+               dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
        if (running)
                put_prev_task(rq, p);
 
@@ -3816,7 +3860,7 @@ void set_user_nice(struct task_struct *p, long nice)
        delta = p->prio - old_prio;
 
        if (queued) {
-               enqueue_task(rq, p, ENQUEUE_RESTORE);
+               enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@ -4020,10 +4064,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
         * Keep a potential priority boosting if called from
         * sched_setscheduler().
         */
+       p->prio = normal_prio(p);
        if (keep_boost)
-               p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
-       else
-               p->prio = normal_prio(p);
+               p->prio = rt_effective_prio(p, p->prio);
 
        if (dl_prio(p->prio))
                p->sched_class = &dl_sched_class;
@@ -4126,7 +4169,7 @@ static int __sched_setscheduler(struct task_struct *p,
        const struct sched_class *prev_class;
        struct rq_flags rf;
        int reset_on_fork;
-       int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+       int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
        struct rq *rq;
 
        /* May grab non-irq protected spin_locks: */
@@ -4310,7 +4353,7 @@ change:
                 * the runqueue. This will be done when the task deboost
                 * itself.
                 */
-               new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+               new_effective_prio = rt_effective_prio(p, newprio);
                if (new_effective_prio == oldprio)
                        queue_flags &= ~DEQUEUE_MOVE;
        }
@@ -4923,7 +4966,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
  */
 SYSCALL_DEFINE0(sched_yield)
 {
-       struct rq *rq = this_rq_lock();
+       struct rq_flags rf;
+       struct rq *rq;
+
+       local_irq_disable();
+       rq = this_rq();
+       rq_lock(rq, &rf);
 
        schedstat_inc(rq->yld_count);
        current->sched_class->yield_task(rq);
@@ -4932,9 +4980,8 @@ SYSCALL_DEFINE0(sched_yield)
         * Since we are going to call schedule() anyway, there's
         * no need to preempt or enable interrupts:
         */
-       __release(rq->lock);
-       spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
-       do_raw_spin_unlock(&rq->lock);
+       preempt_disable();
+       rq_unlock(rq, &rf);
        sched_preempt_enable_no_resched();
 
        schedule();
@@ -5514,7 +5561,7 @@ void sched_setnuma(struct task_struct *p, int nid)
        p->numa_preferred_nid = nid;
 
        if (queued)
-               enqueue_task(rq, p, ENQUEUE_RESTORE);
+               enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
        if (running)
                set_curr_task(rq, p);
        task_rq_unlock(rq, p, &rf);
@@ -5579,11 +5626,11 @@ static struct task_struct fake_task = {
  * there's no concurrency possible, we hold the required locks anyway
  * because of lock validation efforts.
  */
-static void migrate_tasks(struct rq *dead_rq)
+static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
 {
        struct rq *rq = dead_rq;
        struct task_struct *next, *stop = rq->stop;
-       struct rq_flags rf;
+       struct rq_flags orf = *rf;
        int dest_cpu;
 
        /*
@@ -5602,9 +5649,7 @@ static void migrate_tasks(struct rq *dead_rq)
         * class method both need to have an up-to-date
         * value of rq->clock[_task]
         */
-       rq_pin_lock(rq, &rf);
        update_rq_clock(rq);
-       rq_unpin_lock(rq, &rf);
 
        for (;;) {
                /*
@@ -5617,8 +5662,7 @@ static void migrate_tasks(struct rq *dead_rq)
                /*
                 * pick_next_task() assumes pinned rq->lock:
                 */
-               rq_repin_lock(rq, &rf);
-               next = pick_next_task(rq, &fake_task, &rf);
+               next = pick_next_task(rq, &fake_task, rf);
                BUG_ON(!next);
                next->sched_class->put_prev_task(rq, next);
 
@@ -5631,10 +5675,9 @@ static void migrate_tasks(struct rq *dead_rq)
                 * because !cpu_active at this point, which means load-balance
                 * will not interfere. Also, stop-machine.
                 */
-               rq_unpin_lock(rq, &rf);
-               raw_spin_unlock(&rq->lock);
+               rq_unlock(rq, rf);
                raw_spin_lock(&next->pi_lock);
-               raw_spin_lock(&rq->lock);
+               rq_relock(rq, rf);
 
                /*
                 * Since we're inside stop-machine, _nothing_ should have
@@ -5648,12 +5691,12 @@ static void migrate_tasks(struct rq *dead_rq)
 
                /* Find suitable destination for @next, with force if needed. */
                dest_cpu = select_fallback_rq(dead_rq->cpu, next);
-
-               rq = __migrate_task(rq, next, dest_cpu);
+               rq = __migrate_task(rq, rf, next, dest_cpu);
                if (rq != dead_rq) {
-                       raw_spin_unlock(&rq->lock);
+                       rq_unlock(rq, rf);
                        rq = dead_rq;
-                       raw_spin_lock(&rq->lock);
+                       *rf = orf;
+                       rq_relock(rq, rf);
                }
                raw_spin_unlock(&next->pi_lock);
        }
@@ -5732,7 +5775,7 @@ static void cpuset_cpu_active(void)
                 * cpuset configurations.
                 */
        }
-       cpuset_update_active_cpus(true);
+       cpuset_update_active_cpus();
 }
 
 static int cpuset_cpu_inactive(unsigned int cpu)
@@ -5755,7 +5798,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
 
                if (overflow)
                        return -EBUSY;
-               cpuset_update_active_cpus(false);
+               cpuset_update_active_cpus();
        } else {
                num_cpus_frozen++;
                partition_sched_domains(1, NULL, NULL);
@@ -5766,7 +5809,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
 int sched_cpu_activate(unsigned int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
-       unsigned long flags;
+       struct rq_flags rf;
 
        set_cpu_active(cpu, true);
 
@@ -5784,12 +5827,12 @@ int sched_cpu_activate(unsigned int cpu)
         * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
         *    domains.
         */
-       raw_spin_lock_irqsave(&rq->lock, flags);
+       rq_lock_irqsave(rq, &rf);
        if (rq->rd) {
                BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                set_rq_online(rq);
        }
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       rq_unlock_irqrestore(rq, &rf);
 
        update_max_interval();
 
@@ -5847,18 +5890,20 @@ int sched_cpu_starting(unsigned int cpu)
 int sched_cpu_dying(unsigned int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
-       unsigned long flags;
+       struct rq_flags rf;
 
        /* Handle pending wakeups and then migrate everything off */
        sched_ttwu_pending();
-       raw_spin_lock_irqsave(&rq->lock, flags);
+
+       rq_lock_irqsave(rq, &rf);
        if (rq->rd) {
                BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                set_rq_offline(rq);
        }
-       migrate_tasks(rq);
+       migrate_tasks(rq, &rf);
        BUG_ON(rq->nr_running != 1);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       rq_unlock_irqrestore(rq, &rf);
+
        calc_load_migrate(rq);
        update_max_interval();
        nohz_balance_exit_idle(cpu);
@@ -6412,7 +6457,8 @@ static void sched_change_group(struct task_struct *tsk, int type)
  */
 void sched_move_task(struct task_struct *tsk)
 {
-       int queued, running;
+       int queued, running, queue_flags =
+               DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
        struct rq_flags rf;
        struct rq *rq;
 
@@ -6423,14 +6469,14 @@ void sched_move_task(struct task_struct *tsk)
        queued = task_on_rq_queued(tsk);
 
        if (queued)
-               dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+               dequeue_task(rq, tsk, queue_flags);
        if (running)
                put_prev_task(rq, tsk);
 
        sched_change_group(tsk, TASK_MOVE_GROUP);
 
        if (queued)
-               enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
+               enqueue_task(rq, tsk, queue_flags);
        if (running)
                set_curr_task(rq, tsk);
 
@@ -7008,14 +7054,15 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
        for_each_online_cpu(i) {
                struct cfs_rq *cfs_rq = tg->cfs_rq[i];
                struct rq *rq = cfs_rq->rq;
+               struct rq_flags rf;
 
-               raw_spin_lock_irq(&rq->lock);
+               rq_lock_irq(rq, &rf);
                cfs_rq->runtime_enabled = runtime_enabled;
                cfs_rq->runtime_remaining = 0;
 
                if (cfs_rq->throttled)
                        unthrottle_cfs_rq(cfs_rq);
-               raw_spin_unlock_irq(&rq->lock);
+               rq_unlock_irq(rq, &rf);
        }
        if (runtime_was_enabled && !runtime_enabled)
                cfs_bandwidth_usage_dec();
index 54c5775..76877a6 100644 (file)
@@ -61,6 +61,11 @@ struct sugov_cpu {
        unsigned long util;
        unsigned long max;
        unsigned int flags;
+
+       /* The field below is for single-CPU policies only. */
+#ifdef CONFIG_NO_HZ_COMMON
+       unsigned long saved_idle_calls;
+#endif
 };
 
 static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
@@ -93,22 +98,23 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
 {
        struct cpufreq_policy *policy = sg_policy->policy;
 
+       if (sg_policy->next_freq == next_freq)
+               return;
+
+       if (sg_policy->next_freq > next_freq)
+               next_freq = (sg_policy->next_freq + next_freq) >> 1;
+
+       sg_policy->next_freq = next_freq;
        sg_policy->last_freq_update_time = time;
 
        if (policy->fast_switch_enabled) {
-               if (sg_policy->next_freq == next_freq) {
-                       trace_cpu_frequency(policy->cur, smp_processor_id());
-                       return;
-               }
-               sg_policy->next_freq = next_freq;
                next_freq = cpufreq_driver_fast_switch(policy, next_freq);
                if (next_freq == CPUFREQ_ENTRY_INVALID)
                        return;
 
                policy->cur = next_freq;
                trace_cpu_frequency(next_freq, smp_processor_id());
-       } else if (sg_policy->next_freq != next_freq) {
-               sg_policy->next_freq = next_freq;
+       } else {
                sg_policy->work_in_progress = true;
                irq_work_queue(&sg_policy->irq_work);
        }
@@ -192,6 +198,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
        sg_cpu->iowait_boost >>= 1;
 }
 
+#ifdef CONFIG_NO_HZ_COMMON
+static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
+{
+       unsigned long idle_calls = tick_nohz_get_idle_calls();
+       bool ret = idle_calls == sg_cpu->saved_idle_calls;
+
+       sg_cpu->saved_idle_calls = idle_calls;
+       return ret;
+}
+#else
+static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
+#endif /* CONFIG_NO_HZ_COMMON */
+
 static void sugov_update_single(struct update_util_data *hook, u64 time,
                                unsigned int flags)
 {
@@ -200,6 +219,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
        struct cpufreq_policy *policy = sg_policy->policy;
        unsigned long util, max;
        unsigned int next_f;
+       bool busy;
 
        sugov_set_iowait_boost(sg_cpu, time, flags);
        sg_cpu->last_update = time;
@@ -207,40 +227,37 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
        if (!sugov_should_update_freq(sg_policy, time))
                return;
 
+       busy = sugov_cpu_is_busy(sg_cpu);
+
        if (flags & SCHED_CPUFREQ_RT_DL) {
                next_f = policy->cpuinfo.max_freq;
        } else {
                sugov_get_util(&util, &max);
                sugov_iowait_boost(sg_cpu, &util, &max);
                next_f = get_next_freq(sg_policy, util, max);
+               /*
+                * Do not reduce the frequency if the CPU has not been idle
+                * recently, as the reduction is likely to be premature then.
+                */
+               if (busy && next_f < sg_policy->next_freq)
+                       next_f = sg_policy->next_freq;
        }
        sugov_update_commit(sg_policy, time, next_f);
 }
 
-static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
-                                          unsigned long util, unsigned long max,
-                                          unsigned int flags)
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu)
 {
        struct sugov_policy *sg_policy = sg_cpu->sg_policy;
        struct cpufreq_policy *policy = sg_policy->policy;
-       unsigned int max_f = policy->cpuinfo.max_freq;
        u64 last_freq_update_time = sg_policy->last_freq_update_time;
+       unsigned long util = 0, max = 1;
        unsigned int j;
 
-       if (flags & SCHED_CPUFREQ_RT_DL)
-               return max_f;
-
-       sugov_iowait_boost(sg_cpu, &util, &max);
-
        for_each_cpu(j, policy->cpus) {
-               struct sugov_cpu *j_sg_cpu;
+               struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
                unsigned long j_util, j_max;
                s64 delta_ns;
 
-               if (j == smp_processor_id())
-                       continue;
-
-               j_sg_cpu = &per_cpu(sugov_cpu, j);
                /*
                 * If the CPU utilization was last updated before the previous
                 * frequency update and the time elapsed between the last update
@@ -254,7 +271,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
                        continue;
                }
                if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
-                       return max_f;
+                       return policy->cpuinfo.max_freq;
 
                j_util = j_sg_cpu->util;
                j_max = j_sg_cpu->max;
@@ -289,7 +306,11 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
        sg_cpu->last_update = time;
 
        if (sugov_should_update_freq(sg_policy, time)) {
-               next_f = sugov_next_freq_shared(sg_cpu, util, max, flags);
+               if (flags & SCHED_CPUFREQ_RT_DL)
+                       next_f = sg_policy->policy->cpuinfo.max_freq;
+               else
+                       next_f = sugov_next_freq_shared(sg_cpu);
+
                sugov_update_commit(sg_policy, time, next_f);
        }
 
@@ -473,7 +494,6 @@ static int sugov_init(struct cpufreq_policy *policy)
 {
        struct sugov_policy *sg_policy;
        struct sugov_tunables *tunables;
-       unsigned int lat;
        int ret = 0;
 
        /* State should be equivalent to EXIT */
@@ -512,10 +532,16 @@ static int sugov_init(struct cpufreq_policy *policy)
                goto stop_kthread;
        }
 
-       tunables->rate_limit_us = LATENCY_MULTIPLIER;
-       lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
-       if (lat)
-               tunables->rate_limit_us *= lat;
+       if (policy->transition_delay_us) {
+               tunables->rate_limit_us = policy->transition_delay_us;
+       } else {
+               unsigned int lat;
+
+               tunables->rate_limit_us = LATENCY_MULTIPLIER;
+               lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
+               if (lat)
+                       tunables->rate_limit_us *= lat;
+       }
 
        policy->governor_data = sg_policy;
        sg_policy->tunables = tunables;
index f3778e2..aea3135 100644 (file)
@@ -34,6 +34,18 @@ void disable_sched_clock_irqtime(void)
        sched_clock_irqtime = 0;
 }
 
+static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
+                                 enum cpu_usage_stat idx)
+{
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+       u64_stats_update_begin(&irqtime->sync);
+       cpustat[idx] += delta;
+       irqtime->total += delta;
+       irqtime->tick_delta += delta;
+       u64_stats_update_end(&irqtime->sync);
+}
+
 /*
  * Called before incrementing preempt_count on {soft,}irq_enter
  * and before decrementing preempt_count on {soft,}irq_exit.
@@ -41,7 +53,6 @@ void disable_sched_clock_irqtime(void)
 void irqtime_account_irq(struct task_struct *curr)
 {
        struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
-       u64 *cpustat = kcpustat_this_cpu->cpustat;
        s64 delta;
        int cpu;
 
@@ -52,22 +63,16 @@ void irqtime_account_irq(struct task_struct *curr)
        delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
        irqtime->irq_start_time += delta;
 
-       u64_stats_update_begin(&irqtime->sync);
        /*
         * We do not account for softirq time from ksoftirqd here.
         * We want to continue accounting softirq time to ksoftirqd thread
         * in that case, so as not to confuse scheduler with a special task
         * that do not consume any time, but still wants to run.
         */
-       if (hardirq_count()) {
-               cpustat[CPUTIME_IRQ] += delta;
-               irqtime->tick_delta += delta;
-       } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) {
-               cpustat[CPUTIME_SOFTIRQ] += delta;
-               irqtime->tick_delta += delta;
-       }
-
-       u64_stats_update_end(&irqtime->sync);
+       if (hardirq_count())
+               irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
+       else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+               irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
 
index dea1389..a903276 100644 (file)
@@ -717,18 +717,12 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #ifdef CONFIG_SMP
+
+#include "sched-pelt.h"
+
 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
 static unsigned long task_h_load(struct task_struct *p);
 
-/*
- * We choose a half-life close to 1 scheduling period.
- * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
- * dependent on this value.
- */
-#define LOAD_AVG_PERIOD 32
-#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
-#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
-
 /* Give new sched_entity start runnable values to heavy its load in infant time */
 void init_entity_runnable_average(struct sched_entity *se)
 {
@@ -2733,47 +2727,15 @@ static inline void update_cfs_shares(struct sched_entity *se)
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_SMP
-/* Precomputed fixed inverse multiplies for multiplication by y^n */
-static const u32 runnable_avg_yN_inv[] = {
-       0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
-       0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
-       0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
-       0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
-       0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
-       0x85aac367, 0x82cd8698,
-};
-
-/*
- * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
- * over-estimates when re-combining.
- */
-static const u32 runnable_avg_yN_sum[] = {
-           0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
-        9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
-       17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
-};
-
-/*
- * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
- * lower integers. See Documentation/scheduler/sched-avg.txt how these
- * were generated:
- */
-static const u32 __accumulated_sum_N32[] = {
-           0, 23371, 35056, 40899, 43820, 45281,
-       46011, 46376, 46559, 46650, 46696, 46719,
-};
-
 /*
  * Approximate:
  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
  */
-static __always_inline u64 decay_load(u64 val, u64 n)
+static u64 decay_load(u64 val, u64 n)
 {
        unsigned int local_n;
 
-       if (!n)
-               return val;
-       else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+       if (unlikely(n > LOAD_AVG_PERIOD * 63))
                return 0;
 
        /* after bounds checking we can collapse to 32-bit */
@@ -2795,30 +2757,97 @@ static __always_inline u64 decay_load(u64 val, u64 n)
        return val;
 }
 
+static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
+{
+       u32 c1, c2, c3 = d3; /* y^0 == 1 */
+
+       /*
+        * c1 = d1 y^p
+        */
+       c1 = decay_load((u64)d1, periods);
+
+       /*
+        *            p-1
+        * c2 = 1024 \Sum y^n
+        *            n=1
+        *
+        *              inf        inf
+        *    = 1024 ( \Sum y^n - \Sum y^n - y^0 )
+        *              n=0        n=p
+        */
+       c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
+
+       return c1 + c2 + c3;
+}
+
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
 /*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be: \Sum 1024*y^n
+ * Accumulate the three separate parts of the sum; d1 the remainder
+ * of the last (incomplete) period, d2 the span of full periods and d3
+ * the remainder of the (incomplete) current period.
+ *
+ *           d1          d2           d3
+ *           ^           ^            ^
+ *           |           |            |
+ *         |<->|<----------------->|<--->|
+ * ... |---x---|------| ... |------|-----x (now)
+ *
+ *                           p-1
+ * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
+ *                           n=1
  *
- * We can compute this reasonably efficiently by combining:
- *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
+ *    = u y^p +                                        (Step 1)
+ *
+ *                     p-1
+ *      d1 y^p + 1024 \Sum y^n + d3 y^0                (Step 2)
+ *                     n=1
  */
-static u32 __compute_runnable_contrib(u64 n)
+static __always_inline u32
+accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+              unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-       u32 contrib = 0;
+       unsigned long scale_freq, scale_cpu;
+       u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
+       u64 periods;
 
-       if (likely(n <= LOAD_AVG_PERIOD))
-               return runnable_avg_yN_sum[n];
-       else if (unlikely(n >= LOAD_AVG_MAX_N))
-               return LOAD_AVG_MAX;
+       scale_freq = arch_scale_freq_capacity(NULL, cpu);
+       scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
 
-       /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
-       contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
-       n %= LOAD_AVG_PERIOD;
-       contrib = decay_load(contrib, n);
-       return contrib + runnable_avg_yN_sum[n];
-}
+       delta += sa->period_contrib;
+       periods = delta / 1024; /* A period is 1024us (~1ms) */
 
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+       /*
+        * Step 1: decay old *_sum if we crossed period boundaries.
+        */
+       if (periods) {
+               sa->load_sum = decay_load(sa->load_sum, periods);
+               if (cfs_rq) {
+                       cfs_rq->runnable_load_sum =
+                               decay_load(cfs_rq->runnable_load_sum, periods);
+               }
+               sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+
+               /*
+                * Step 2
+                */
+               delta %= 1024;
+               contrib = __accumulate_pelt_segments(periods,
+                               1024 - sa->period_contrib, delta);
+       }
+       sa->period_contrib = delta;
+
+       contrib = cap_scale(contrib, scale_freq);
+       if (weight) {
+               sa->load_sum += weight * contrib;
+               if (cfs_rq)
+                       cfs_rq->runnable_load_sum += weight * contrib;
+       }
+       if (running)
+               sa->util_sum += contrib * scale_cpu;
+
+       return periods;
+}
 
 /*
  * We can represent the historical contribution to runnable average as the
@@ -2849,13 +2878,10 @@ static u32 __compute_runnable_contrib(u64 n)
  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
  */
 static __always_inline int
-__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
                  unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-       u64 delta, scaled_delta, periods;
-       u32 contrib;
-       unsigned int delta_w, scaled_delta_w, decayed = 0;
-       unsigned long scale_freq, scale_cpu;
+       u64 delta;
 
        delta = now - sa->last_update_time;
        /*
@@ -2874,83 +2900,52 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
        delta >>= 10;
        if (!delta)
                return 0;
-       sa->last_update_time = now;
-
-       scale_freq = arch_scale_freq_capacity(NULL, cpu);
-       scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
-       /* delta_w is the amount already accumulated against our next period */
-       delta_w = sa->period_contrib;
-       if (delta + delta_w >= 1024) {
-               decayed = 1;
 
-               /* how much left for next period will start over, we don't know yet */
-               sa->period_contrib = 0;
+       sa->last_update_time += delta << 10;
 
-               /*
-                * Now that we know we're crossing a period boundary, figure
-                * out how much from delta we need to complete the current
-                * period and accrue it.
-                */
-               delta_w = 1024 - delta_w;
-               scaled_delta_w = cap_scale(delta_w, scale_freq);
-               if (weight) {
-                       sa->load_sum += weight * scaled_delta_w;
-                       if (cfs_rq) {
-                               cfs_rq->runnable_load_sum +=
-                                               weight * scaled_delta_w;
-                       }
-               }
-               if (running)
-                       sa->util_sum += scaled_delta_w * scale_cpu;
-
-               delta -= delta_w;
-
-               /* Figure out how many additional periods this update spans */
-               periods = delta / 1024;
-               delta %= 1024;
+       /*
+        * Now we know we crossed measurement unit boundaries. The *_avg
+        * accrues by two steps:
+        *
+        * Step 1: accumulate *_sum since last_update_time. If we haven't
+        * crossed period boundaries, finish.
+        */
+       if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+               return 0;
 
-               sa->load_sum = decay_load(sa->load_sum, periods + 1);
-               if (cfs_rq) {
-                       cfs_rq->runnable_load_sum =
-                               decay_load(cfs_rq->runnable_load_sum, periods + 1);
-               }
-               sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
-
-               /* Efficiently calculate \sum (1..n_period) 1024*y^i */
-               contrib = __compute_runnable_contrib(periods);
-               contrib = cap_scale(contrib, scale_freq);
-               if (weight) {
-                       sa->load_sum += weight * contrib;
-                       if (cfs_rq)
-                               cfs_rq->runnable_load_sum += weight * contrib;
-               }
-               if (running)
-                       sa->util_sum += contrib * scale_cpu;
+       /*
+        * Step 2: update *_avg.
+        */
+       sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+       if (cfs_rq) {
+               cfs_rq->runnable_load_avg =
+                       div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
        }
+       sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
 
-       /* Remainder of delta accrued against u_0` */
-       scaled_delta = cap_scale(delta, scale_freq);
-       if (weight) {
-               sa->load_sum += weight * scaled_delta;
-               if (cfs_rq)
-                       cfs_rq->runnable_load_sum += weight * scaled_delta;
-       }
-       if (running)
-               sa->util_sum += scaled_delta * scale_cpu;
+       return 1;
+}
 
-       sa->period_contrib += delta;
+static int
+__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
+{
+       return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
+}
 
-       if (decayed) {
-               sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
-               if (cfs_rq) {
-                       cfs_rq->runnable_load_avg =
-                               div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
-               }
-               sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
-       }
+static int
+__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       return ___update_load_avg(now, cpu, &se->avg,
+                                 se->on_rq * scale_load_down(se->load.weight),
+                                 cfs_rq->curr == se, NULL);
+}
 
-       return decayed;
+static int
+__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
+{
+       return ___update_load_avg(now, cpu, &cfs_rq->avg,
+                       scale_load_down(cfs_rq->load.weight),
+                       cfs_rq->curr != NULL, cfs_rq);
 }
 
 /*
@@ -3014,6 +3009,9 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
 void set_task_rq_fair(struct sched_entity *se,
                      struct cfs_rq *prev, struct cfs_rq *next)
 {
+       u64 p_last_update_time;
+       u64 n_last_update_time;
+
        if (!sched_feat(ATTACH_AGE_LOAD))
                return;
 
@@ -3024,11 +3022,11 @@ void set_task_rq_fair(struct sched_entity *se,
         * time. This will result in the wakee task is less decayed, but giving
         * the wakee more load sounds not bad.
         */
-       if (se->avg.last_update_time && prev) {
-               u64 p_last_update_time;
-               u64 n_last_update_time;
+       if (!(se->avg.last_update_time && prev))
+               return;
 
 #ifndef CONFIG_64BIT
+       {
                u64 p_last_update_time_copy;
                u64 n_last_update_time_copy;
 
@@ -3043,14 +3041,13 @@ void set_task_rq_fair(struct sched_entity *se,
 
                } while (p_last_update_time != p_last_update_time_copy ||
                         n_last_update_time != n_last_update_time_copy);
+       }
 #else
-               p_last_update_time = prev->avg.last_update_time;
-               n_last_update_time = next->avg.last_update_time;
+       p_last_update_time = prev->avg.last_update_time;
+       n_last_update_time = next->avg.last_update_time;
 #endif
-               __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
-                                 &se->avg, 0, 0, NULL);
-               se->avg.last_update_time = n_last_update_time;
-       }
+       __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
+       se->avg.last_update_time = n_last_update_time;
 }
 
 /* Take into account change of utilization of a child task group */
@@ -3173,6 +3170,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
        return 1;
 }
 
+/*
+ * Check if we need to update the load and the utilization of a blocked
+ * group_entity:
+ */
+static inline bool skip_blocked_update(struct sched_entity *se)
+{
+       struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+
+       /*
+        * If the sched_entity still has non-zero load or utilization, we have
+        * to decay it:
+        */
+       if (se->avg.load_avg || se->avg.util_avg)
+               return false;
+
+       /*
+        * If there is a pending propagation, we have to update the load and
+        * the utilization of the sched_entity:
+        */
+       if (gcfs_rq->propagate_avg)
+               return false;
+
+       /*
+        * Otherwise, the load and the utilization of the sched_entity are
+        * already zero and there is no pending propagation, so it would be a
+        * waste of time to try to decay it:
+        */
+       return true;
+}
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
@@ -3265,8 +3292,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
                set_tg_cfs_propagate(cfs_rq);
        }
 
-       decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
-               scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
+       decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
 
 #ifndef CONFIG_64BIT
        smp_wmb();
@@ -3298,11 +3324,8 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
         * Track task load average for carrying it to new CPU after migrated, and
         * track group sched_entity load average for task_h_load calc in migration
         */
-       if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
-               __update_load_avg(now, cpu, &se->avg,
-                         se->on_rq * scale_load_down(se->load.weight),
-                         cfs_rq->curr == se, NULL);
-       }
+       if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
+               __update_load_avg_se(now, cpu, cfs_rq, se);
 
        decayed  = update_cfs_rq_load_avg(now, cfs_rq, true);
        decayed |= propagate_entity_load_avg(se);
@@ -3407,7 +3430,7 @@ void sync_entity_load_avg(struct sched_entity *se)
        u64 last_update_time;
 
        last_update_time = cfs_rq_last_update_time(cfs_rq);
-       __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+       __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
 }
 
 /*
@@ -4271,8 +4294,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
        list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
                                throttled_list) {
                struct rq *rq = rq_of(cfs_rq);
+               struct rq_flags rf;
 
-               raw_spin_lock(&rq->lock);
+               rq_lock(rq, &rf);
                if (!cfs_rq_throttled(cfs_rq))
                        goto next;
 
@@ -4289,7 +4313,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
                        unthrottle_cfs_rq(cfs_rq);
 
 next:
-               raw_spin_unlock(&rq->lock);
+               rq_unlock(rq, &rf);
 
                if (!remaining)
                        break;
@@ -5097,15 +5121,16 @@ void cpu_load_update_nohz_stop(void)
        unsigned long curr_jiffies = READ_ONCE(jiffies);
        struct rq *this_rq = this_rq();
        unsigned long load;
+       struct rq_flags rf;
 
        if (curr_jiffies == this_rq->last_load_update_tick)
                return;
 
        load = weighted_cpuload(cpu_of(this_rq));
-       raw_spin_lock(&this_rq->lock);
+       rq_lock(this_rq, &rf);
        update_rq_clock(this_rq);
        cpu_load_update_nohz(this_rq, curr_jiffies, load);
-       raw_spin_unlock(&this_rq->lock);
+       rq_unlock(this_rq, &rf);
 }
 #else /* !CONFIG_NO_HZ_COMMON */
 static inline void cpu_load_update_nohz(struct rq *this_rq,
@@ -6769,7 +6794,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
        lockdep_assert_held(&env->src_rq->lock);
 
        p->on_rq = TASK_ON_RQ_MIGRATING;
-       deactivate_task(env->src_rq, p, 0);
+       deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
        set_task_cpu(p, env->dst_cpu);
 }
 
@@ -6902,7 +6927,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
        lockdep_assert_held(&rq->lock);
 
        BUG_ON(task_rq(p) != rq);
-       activate_task(rq, p, 0);
+       activate_task(rq, p, ENQUEUE_NOCLOCK);
        p->on_rq = TASK_ON_RQ_QUEUED;
        check_preempt_curr(rq, p, 0);
 }
@@ -6913,9 +6938,12 @@ static void attach_task(struct rq *rq, struct task_struct *p)
  */
 static void attach_one_task(struct rq *rq, struct task_struct *p)
 {
-       raw_spin_lock(&rq->lock);
+       struct rq_flags rf;
+
+       rq_lock(rq, &rf);
+       update_rq_clock(rq);
        attach_task(rq, p);
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, &rf);
 }
 
 /*
@@ -6926,8 +6954,10 @@ static void attach_tasks(struct lb_env *env)
 {
        struct list_head *tasks = &env->tasks;
        struct task_struct *p;
+       struct rq_flags rf;
 
-       raw_spin_lock(&env->dst_rq->lock);
+       rq_lock(env->dst_rq, &rf);
+       update_rq_clock(env->dst_rq);
 
        while (!list_empty(tasks)) {
                p = list_first_entry(tasks, struct task_struct, se.group_node);
@@ -6936,7 +6966,7 @@ static void attach_tasks(struct lb_env *env)
                attach_task(env->dst_rq, p);
        }
 
-       raw_spin_unlock(&env->dst_rq->lock);
+       rq_unlock(env->dst_rq, &rf);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6944,9 +6974,9 @@ static void update_blocked_averages(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
        struct cfs_rq *cfs_rq;
-       unsigned long flags;
+       struct rq_flags rf;
 
-       raw_spin_lock_irqsave(&rq->lock, flags);
+       rq_lock_irqsave(rq, &rf);
        update_rq_clock(rq);
 
        /*
@@ -6954,6 +6984,8 @@ static void update_blocked_averages(int cpu)
         * list_add_leaf_cfs_rq() for details.
         */
        for_each_leaf_cfs_rq(rq, cfs_rq) {
+               struct sched_entity *se;
+
                /* throttled entities do not contribute to load */
                if (throttled_hierarchy(cfs_rq))
                        continue;
@@ -6961,11 +6993,12 @@ static void update_blocked_averages(int cpu)
                if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
                        update_tg_load_avg(cfs_rq, 0);
 
-               /* Propagate pending load changes to the parent */
-               if (cfs_rq->tg->se[cpu])
-                       update_load_avg(cfs_rq->tg->se[cpu], 0);
+               /* Propagate pending load changes to the parent, if any: */
+               se = cfs_rq->tg->se[cpu];
+               if (se && !skip_blocked_update(se))
+                       update_load_avg(se, 0);
        }
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       rq_unlock_irqrestore(rq, &rf);
 }
 
 /*
@@ -7019,12 +7052,12 @@ static inline void update_blocked_averages(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
        struct cfs_rq *cfs_rq = &rq->cfs;
-       unsigned long flags;
+       struct rq_flags rf;
 
-       raw_spin_lock_irqsave(&rq->lock, flags);
+       rq_lock_irqsave(rq, &rf);
        update_rq_clock(rq);
        update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       rq_unlock_irqrestore(rq, &rf);
 }
 
 static unsigned long task_h_load(struct task_struct *p)
@@ -7525,6 +7558,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 {
        struct sched_domain *child = env->sd->child;
        struct sched_group *sg = env->sd->groups;
+       struct sg_lb_stats *local = &sds->local_stat;
        struct sg_lb_stats tmp_sgs;
        int load_idx, prefer_sibling = 0;
        bool overload = false;
@@ -7541,7 +7575,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
                if (local_group) {
                        sds->local = sg;
-                       sgs = &sds->local_stat;
+                       sgs = local;
 
                        if (env->idle != CPU_NEWLY_IDLE ||
                            time_after_eq(jiffies, sg->sgc->next_update))
@@ -7565,8 +7599,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                 * the tasks on the system).
                 */
                if (prefer_sibling && sds->local &&
-                   group_has_capacity(env, &sds->local_stat) &&
-                   (sgs->sum_nr_running > 1)) {
+                   group_has_capacity(env, local) &&
+                   (sgs->sum_nr_running > local->sum_nr_running + 1)) {
                        sgs->group_no_capacity = 1;
                        sgs->group_type = group_classify(sg, sgs);
                }
@@ -8042,7 +8076,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
        struct sched_domain *sd_parent = sd->parent;
        struct sched_group *group;
        struct rq *busiest;
-       unsigned long flags;
+       struct rq_flags rf;
        struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
 
        struct lb_env env = {
@@ -8105,7 +8139,7 @@ redo:
                env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
 more_balance:
-               raw_spin_lock_irqsave(&busiest->lock, flags);
+               rq_lock_irqsave(busiest, &rf);
                update_rq_clock(busiest);
 
                /*
@@ -8122,14 +8156,14 @@ more_balance:
                 * See task_rq_lock() family for the details.
                 */
 
-               raw_spin_unlock(&busiest->lock);
+               rq_unlock(busiest, &rf);
 
                if (cur_ld_moved) {
                        attach_tasks(&env);
                        ld_moved += cur_ld_moved;
                }
 
-               local_irq_restore(flags);
+               local_irq_restore(rf.flags);
 
                if (env.flags & LBF_NEED_BREAK) {
                        env.flags &= ~LBF_NEED_BREAK;
@@ -8207,6 +8241,8 @@ more_balance:
                        sd->nr_balance_failed++;
 
                if (need_active_balance(&env)) {
+                       unsigned long flags;
+
                        raw_spin_lock_irqsave(&busiest->lock, flags);
 
                        /* don't kick the active_load_balance_cpu_stop,
@@ -8444,8 +8480,9 @@ static int active_load_balance_cpu_stop(void *data)
        struct rq *target_rq = cpu_rq(target_cpu);
        struct sched_domain *sd;
        struct task_struct *p = NULL;
+       struct rq_flags rf;
 
-       raw_spin_lock_irq(&busiest_rq->lock);
+       rq_lock_irq(busiest_rq, &rf);
 
        /* make sure the requested cpu hasn't gone down in the meantime */
        if (unlikely(busiest_cpu != smp_processor_id() ||
@@ -8496,7 +8533,7 @@ static int active_load_balance_cpu_stop(void *data)
        rcu_read_unlock();
 out_unlock:
        busiest_rq->active_balance = 0;
-       raw_spin_unlock(&busiest_rq->lock);
+       rq_unlock(busiest_rq, &rf);
 
        if (p)
                attach_one_task(target_rq, p);
@@ -8794,10 +8831,13 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
                 * do the balance.
                 */
                if (time_after_eq(jiffies, rq->next_balance)) {
-                       raw_spin_lock_irq(&rq->lock);
+                       struct rq_flags rf;
+
+                       rq_lock_irq(rq, &rf);
                        update_rq_clock(rq);
                        cpu_load_update_idle(rq);
-                       raw_spin_unlock_irq(&rq->lock);
+                       rq_unlock_irq(rq, &rf);
+
                        rebalance_domains(rq, CPU_IDLE);
                }
 
@@ -8988,8 +9028,9 @@ static void task_fork_fair(struct task_struct *p)
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se, *curr;
        struct rq *rq = this_rq();
+       struct rq_flags rf;
 
-       raw_spin_lock(&rq->lock);
+       rq_lock(rq, &rf);
        update_rq_clock(rq);
 
        cfs_rq = task_cfs_rq(current);
@@ -9010,7 +9051,7 @@ static void task_fork_fair(struct task_struct *p)
        }
 
        se->vruntime -= cfs_rq->min_vruntime;
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, &rf);
 }
 
 /*
@@ -9372,7 +9413,6 @@ static DEFINE_MUTEX(shares_mutex);
 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 {
        int i;
-       unsigned long flags;
 
        /*
         * We can't change the weight of the root cgroup.
@@ -9389,19 +9429,17 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
        tg->shares = shares;
        for_each_possible_cpu(i) {
                struct rq *rq = cpu_rq(i);
-               struct sched_entity *se;
+               struct sched_entity *se = tg->se[i];
+               struct rq_flags rf;
 
-               se = tg->se[i];
                /* Propagate contribution to hierarchy */
-               raw_spin_lock_irqsave(&rq->lock, flags);
-
-               /* Possible calls to update_curr() need rq clock */
+               rq_lock_irqsave(rq, &rf);
                update_rq_clock(rq);
                for_each_sched_entity(se) {
                        update_load_avg(se, UPDATE_TG);
                        update_cfs_shares(se);
                }
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
+               rq_unlock_irqrestore(rq, &rf);
        }
 
 done:
index 1b3c818..11192e0 100644 (file)
@@ -56,6 +56,13 @@ SCHED_FEAT(TTWU_QUEUE, true)
  */
 SCHED_FEAT(SIS_AVG_CPU, false)
 
+/*
+ * Issue a WARN when we do multiple update_rq_clock() calls
+ * in a single rq->lock section. Default disabled because the
+ * annotations are not complete.
+ */
+SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
+
 #ifdef HAVE_RT_PUSH_IPI
 /*
  * In order to avoid a thundering herd attack of CPUs that are
index 9f3e402..979b734 100644 (file)
@@ -1927,6 +1927,87 @@ static int find_next_push_cpu(struct rq *rq)
 #define RT_PUSH_IPI_EXECUTING          1
 #define RT_PUSH_IPI_RESTART            2
 
+/*
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there are any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * On large CPU boxes, there's the case that several CPUs could schedule
+ * a lower priority task at the same time, in which case it will look for
+ * any overloaded CPUs that it could pull a task from. To do this, the runqueue
+ * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
+ * for a single overloaded CPU's runqueue lock can produce a large latency.
+ * (This has actually been observed on large boxes running cyclictest).
+ * Instead of taking the runqueue lock of the overloaded CPU, each of the
+ * CPUs that scheduled a lower priority task simply sends an IPI to the
+ * overloaded CPU. An IPI is much cheaper than taking a runqueue lock with
+ * lots of contention. The overloaded CPU will look to push its non-running
+ * RT task off, and if it does, it can then ignore the other IPIs coming
+ * in, and just pass those IPIs off to any other overloaded CPU.
+ *
+ * When a CPU schedules a lower priority task, it only sends an IPI to
+ * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
+ * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
+ * RT overloaded tasks, would cause 100 IPIs to go out at once.
+ *
+ * The overloaded RT CPU, when receiving an IPI, will try to push off its
+ * overloaded RT tasks and then send an IPI to the next CPU that has
+ * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
+ * have completed. Just because a CPU may have pushed off its own overloaded
+ * RT task does not mean it should stop sending the IPI around to other
+ * overloaded CPUs. There may be another RT task waiting to run on one of
+ * those CPUs that are of higher priority than the one that was just
+ * pushed.
+ *
+ * An optimization that could possibly be made is to make a CPU array similar
+ * to the cpupri array mask of all running RT tasks, but for the overloaded
+ * case, then the IPI could be sent to only the CPU with the highest priority
+ * RT task waiting, and that CPU could send off further IPIs to the CPU with
+ * the next highest waiting task. Since the overloaded case is much less likely
+ * to happen, the complexity of this implementation may not be worth it.
+ * Instead, just send an IPI around to all overloaded CPUs.
+ *
+ * The rq->rt.push_flags holds the status of the IPI that is going around.
+ * A run queue can only send out a single IPI at a time. The possible flags
+ * for rq->rt.push_flags are:
+ *
+ *    (None or zero):          No IPI is going around for the current rq
+ *    RT_PUSH_IPI_EXECUTING:   An IPI for the rq is being passed around
+ *    RT_PUSH_IPI_RESTART:     The priority of the running task for the rq
+ *                             has changed, and the IPI should restart
+ *                             circulating the overloaded CPUs again.
+ *
+ * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
+ * before sending to the next CPU.
+ *
+ * Instead of having all CPUs that schedule a lower priority task send
+ * an IPI to the same "first" CPU in the RT overload mask, they send it
+ * to the next overloaded CPU after their own CPU. This helps distribute
+ * the work when there's more than one overloaded CPU and multiple CPUs
+ * scheduling in lower priority tasks.
+ *
+ * When a rq schedules a lower priority task than what was currently
+ * running, the next CPU with overloaded RT tasks is examined first.
+ * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
+ * priority task, it will send an IPI first to CPU 5, then CPU 5 will
+ * send to CPU 1 if it is still overloaded. CPU 1 will clear the
+ * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
+ *
+ * The first CPU to notice IPI_RESTART is set, will clear that flag and then
+ * send an IPI to the next overloaded CPU after the rq->cpu and not the next
+ * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
+ * schedules a lower priority task, and the IPI_RESTART gets set while the
+ * handling is being done on CPU 5, it will clear the flag and send it back to
+ * CPU 4 instead of CPU 1.
+ *
+ * Note, the above logic can be disabled by turning off the sched_feature
+ * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
+ * taken by the CPU requesting a pull and the waiting RT task will be pulled
+ * by that CPU. This may be fine for machines with few CPUs.
+ */
 static void tell_cpu_to_push(struct rq *rq)
 {
        int cpu;
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
new file mode 100644 (file)
index 0000000..cd200d1
--- /dev/null
@@ -0,0 +1,13 @@
+/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
+
+static const u32 runnable_avg_yN_inv[] = {
+       0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
+       0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
+       0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
+       0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
+       0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
+       0x85aac367, 0x82cd8698,
+};
+
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742
index 5cbf922..7808ab0 100644 (file)
@@ -1331,15 +1331,17 @@ extern const u32 sched_prio_to_wmult[40];
 #define DEQUEUE_SLEEP          0x01
 #define DEQUEUE_SAVE           0x02 /* matches ENQUEUE_RESTORE */
 #define DEQUEUE_MOVE           0x04 /* matches ENQUEUE_MOVE */
+#define DEQUEUE_NOCLOCK                0x08 /* matches ENQUEUE_NOCLOCK */
 
 #define ENQUEUE_WAKEUP         0x01
 #define ENQUEUE_RESTORE                0x02
 #define ENQUEUE_MOVE           0x04
+#define ENQUEUE_NOCLOCK                0x08
 
-#define ENQUEUE_HEAD           0x08
-#define ENQUEUE_REPLENISH      0x10
+#define ENQUEUE_HEAD           0x10
+#define ENQUEUE_REPLENISH      0x20
 #ifdef CONFIG_SMP
-#define ENQUEUE_MIGRATED       0x20
+#define ENQUEUE_MIGRATED       0x40
 #else
 #define ENQUEUE_MIGRATED       0x00
 #endif
@@ -1624,6 +1626,7 @@ static inline void sched_avg_update(struct rq *rq) { }
 
 struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
        __acquires(rq->lock);
+
 struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
        __acquires(p->pi_lock)
        __acquires(rq->lock);
@@ -1645,6 +1648,62 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
        raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 }
 
+static inline void
+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
+       __acquires(rq->lock)
+{
+       raw_spin_lock_irqsave(&rq->lock, rf->flags);
+       rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock_irq(struct rq *rq, struct rq_flags *rf)
+       __acquires(rq->lock)
+{
+       raw_spin_lock_irq(&rq->lock);
+       rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock(struct rq *rq, struct rq_flags *rf)
+       __acquires(rq->lock)
+{
+       raw_spin_lock(&rq->lock);
+       rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_relock(struct rq *rq, struct rq_flags *rf)
+       __acquires(rq->lock)
+{
+       raw_spin_lock(&rq->lock);
+       rq_repin_lock(rq, rf);
+}
+
+static inline void
+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
+       __releases(rq->lock)
+{
+       rq_unpin_lock(rq, rf);
+       raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
+}
+
+static inline void
+rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
+       __releases(rq->lock)
+{
+       rq_unpin_lock(rq, rf);
+       raw_spin_unlock_irq(&rq->lock);
+}
+
+static inline void
+rq_unlock(struct rq *rq, struct rq_flags *rf)
+       __releases(rq->lock)
+{
+       rq_unpin_lock(rq, rf);
+       raw_spin_unlock(&rq->lock);
+}
+
 #ifdef CONFIG_SMP
 #ifdef CONFIG_PREEMPT
 
@@ -1869,6 +1928,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 struct irqtime {
+       u64                     total;
        u64                     tick_delta;
        u64                     irq_start_time;
        struct u64_stats_sync   sync;
@@ -1876,16 +1936,20 @@ struct irqtime {
 
 DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
 
+/*
+ * Returns the irqtime minus the softirq time computed by ksoftirqd.
+ * Otherwise ksoftirqd's own runtime would be subtracted from its
+ * sum_exec_runtime, which would then never move forward.
+ */
 static inline u64 irq_time_read(int cpu)
 {
        struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
-       u64 *cpustat = kcpustat_cpu(cpu).cpustat;
        unsigned int seq;
        u64 total;
 
        do {
                seq = __u64_stats_fetch_begin(&irqtime->sync);
-               total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ];
+               total = irqtime->total;
        } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
 
        return total;
index 744fa61..4e09821 100644 (file)
@@ -309,7 +309,7 @@ restart:
        account_irq_exit_time(current);
        __local_bh_enable(SOFTIRQ_OFFSET);
        WARN_ON_ONCE(in_interrupt());
-       tsk_restore_flags(current, old_flags, PF_MEMALLOC);
+       current_restore_flags(old_flags, PF_MEMALLOC);
 }
 
 asmlinkage __visible void do_softirq(void)
index acf0a5a..21343d1 100644 (file)
@@ -1176,6 +1176,8 @@ static struct ctl_table kern_table[] = {
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
                .proc_handler   = timer_migration_handler,
+               .extra1         = &zero,
+               .extra2         = &one,
        },
 #endif
 #ifdef CONFIG_BPF_SYSCALL
@@ -2133,9 +2135,12 @@ static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
        if (write) {
                if (*negp)
                        return -EINVAL;
+               if (*lvalp > UINT_MAX)
+                       return -EINVAL;
                *valp = *lvalp;
        } else {
                unsigned int val = *valp;
+               *negp = false;
                *lvalp = (unsigned long)val;
        }
        return 0;
index ce3a31e..5cb5b00 100644 (file)
@@ -541,7 +541,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
  *
  * Returns the granularity of underlying alarm base clock
  */
-static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
+static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
 {
        if (!alarmtimer_get_rtcdev())
                return -EINVAL;
@@ -558,14 +558,14 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
  *
  * Provides the underlying alarm base time.
  */
-static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
+static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp)
 {
        struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
 
        if (!alarmtimer_get_rtcdev())
                return -EINVAL;
 
-       *tp = ktime_to_timespec(base->gettime());
+       *tp = ktime_to_timespec64(base->gettime());
        return 0;
 }
 
@@ -598,19 +598,19 @@ static int alarm_timer_create(struct k_itimer *new_timer)
  * Copies out the current itimerspec data
  */
 static void alarm_timer_get(struct k_itimer *timr,
-                               struct itimerspec *cur_setting)
+                           struct itimerspec64 *cur_setting)
 {
        ktime_t relative_expiry_time =
                alarm_expires_remaining(&(timr->it.alarm.alarmtimer));
 
        if (ktime_to_ns(relative_expiry_time) > 0) {
-               cur_setting->it_value = ktime_to_timespec(relative_expiry_time);
+               cur_setting->it_value = ktime_to_timespec64(relative_expiry_time);
        } else {
                cur_setting->it_value.tv_sec = 0;
                cur_setting->it_value.tv_nsec = 0;
        }
 
-       cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval);
+       cur_setting->it_interval = ktime_to_timespec64(timr->it.alarm.interval);
 }
 
 /**
@@ -640,8 +640,8 @@ static int alarm_timer_del(struct k_itimer *timr)
  * Sets the timer to new_setting, and starts the timer.
  */
 static int alarm_timer_set(struct k_itimer *timr, int flags,
-                               struct itimerspec *new_setting,
-                               struct itimerspec *old_setting)
+                          struct itimerspec64 *new_setting,
+                          struct itimerspec64 *old_setting)
 {
        ktime_t exp;
 
@@ -659,8 +659,8 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
                return TIMER_RETRY;
 
        /* start the timer */
-       timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
-       exp = timespec_to_ktime(new_setting->it_value);
+       timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval);
+       exp = timespec64_to_ktime(new_setting->it_value);
        /* Convert (if necessary) to absolute time */
        if (flags != TIMER_ABSTIME) {
                ktime_t now;
@@ -790,13 +790,14 @@ out:
  * Handles clock_nanosleep calls against _ALARM clockids
  */
 static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
-                    struct timespec *tsreq, struct timespec __user *rmtp)
+                             struct timespec64 *tsreq,
+                             struct timespec __user *rmtp)
 {
        enum  alarmtimer_type type = clock2alarm(which_clock);
+       struct restart_block *restart;
        struct alarm alarm;
        ktime_t exp;
        int ret = 0;
-       struct restart_block *restart;
 
        if (!alarmtimer_get_rtcdev())
                return -ENOTSUPP;
@@ -809,7 +810,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
 
        alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
 
-       exp = timespec_to_ktime(*tsreq);
+       exp = timespec64_to_ktime(*tsreq);
        /* Convert (if necessary) to absolute time */
        if (flags != TIMER_ABSTIME) {
                ktime_t now = alarm_bases[type].gettime();
index 97ac095..4237e07 100644 (file)
@@ -468,7 +468,7 @@ void clockevents_register_device(struct clock_event_device *dev)
 }
 EXPORT_SYMBOL_GPL(clockevents_register_device);
 
-void clockevents_config(struct clock_event_device *dev, u32 freq)
+static void clockevents_config(struct clock_event_device *dev, u32 freq)
 {
        u64 sec;
 
index ec08f52..a756012 100644 (file)
@@ -1368,10 +1368,7 @@ retry:
                    ktime_to_ns(delta));
 }
 
-/*
- * local version of hrtimer_peek_ahead_timers() called with interrupts
- * disabled.
- */
+/* called with interrupts disabled */
 static inline void __hrtimer_peek_ahead_timers(void)
 {
        struct tick_device *td;
@@ -1506,7 +1503,7 @@ out:
        return ret;
 }
 
-long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
+long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp,
                       const enum hrtimer_mode mode, const clockid_t clockid)
 {
        struct restart_block *restart;
@@ -1519,7 +1516,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
                slack = 0;
 
        hrtimer_init_on_stack(&t.timer, clockid, mode);
-       hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
+       hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
        if (do_nanosleep(&t, mode))
                goto out;
 
@@ -1550,15 +1547,17 @@ out:
 SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
                struct timespec __user *, rmtp)
 {
+       struct timespec64 tu64;
        struct timespec tu;
 
        if (copy_from_user(&tu, rqtp, sizeof(tu)))
                return -EFAULT;
 
-       if (!timespec_valid(&tu))
+       tu64 = timespec_to_timespec64(tu);
+       if (!timespec64_valid(&tu64))
                return -EINVAL;
 
-       return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+       return hrtimer_nanosleep(&tu64, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
 }
 
 /*
index 9cff0ab..31d588d 100644 (file)
@@ -297,7 +297,7 @@ out:
        return err;
 }
 
-static int pc_clock_gettime(clockid_t id, struct timespec *ts)
+static int pc_clock_gettime(clockid_t id, struct timespec64 *ts)
 {
        struct posix_clock_desc cd;
        int err;
@@ -316,7 +316,7 @@ static int pc_clock_gettime(clockid_t id, struct timespec *ts)
        return err;
 }
 
-static int pc_clock_getres(clockid_t id, struct timespec *ts)
+static int pc_clock_getres(clockid_t id, struct timespec64 *ts)
 {
        struct posix_clock_desc cd;
        int err;
@@ -335,7 +335,7 @@ static int pc_clock_getres(clockid_t id, struct timespec *ts)
        return err;
 }
 
-static int pc_clock_settime(clockid_t id, const struct timespec *ts)
+static int pc_clock_settime(clockid_t id, const struct timespec64 *ts)
 {
        struct posix_clock_desc cd;
        int err;
@@ -399,7 +399,7 @@ static int pc_timer_delete(struct k_itimer *kit)
        return err;
 }
 
-static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
+static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec64 *ts)
 {
        clockid_t id = kit->it_clock;
        struct posix_clock_desc cd;
@@ -414,7 +414,7 @@ static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
 }
 
 static int pc_timer_settime(struct k_itimer *kit, int flags,
-                           struct itimerspec *ts, struct itimerspec *old)
+                           struct itimerspec64 *ts, struct itimerspec64 *old)
 {
        clockid_t id = kit->it_clock;
        struct posix_clock_desc cd;
index 4513ad1..949e434 100644 (file)
@@ -116,7 +116,7 @@ static inline u64 virt_ticks(struct task_struct *p)
 }
 
 static int
-posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
+posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
 {
        int error = check_clock(which_clock);
        if (!error) {
@@ -135,7 +135,7 @@ posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
 }
 
 static int
-posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
+posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp)
 {
        /*
         * You can never reset a CPU clock, but we check for other errors
@@ -261,7 +261,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
 
 static int posix_cpu_clock_get_task(struct task_struct *tsk,
                                    const clockid_t which_clock,
-                                   struct timespec *tp)
+                                   struct timespec64 *tp)
 {
        int err = -EINVAL;
        u64 rtn;
@@ -275,13 +275,13 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
        }
 
        if (!err)
-               *tp = ns_to_timespec(rtn);
+               *tp = ns_to_timespec64(rtn);
 
        return err;
 }
 
 
-static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
+static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp)
 {
        const pid_t pid = CPUCLOCK_PID(which_clock);
        int err = -EINVAL;
@@ -562,7 +562,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
  * and try again.  (This happens when the timer is in the middle of firing.)
  */
 static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
-                              struct itimerspec *new, struct itimerspec *old)
+                              struct itimerspec64 *new, struct itimerspec64 *old)
 {
        unsigned long flags;
        struct sighand_struct *sighand;
@@ -572,7 +572,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
 
        WARN_ON_ONCE(p == NULL);
 
-       new_expires = timespec_to_ns(&new->it_value);
+       new_expires = timespec64_to_ns(&new->it_value);
 
        /*
         * Protect against sighand release/switch in exit/exec and p->cpu_timers
@@ -633,7 +633,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
                        bump_cpu_timer(timer, val);
                        if (val < timer->it.cpu.expires) {
                                old_expires = timer->it.cpu.expires - val;
-                               old->it_value = ns_to_timespec(old_expires);
+                               old->it_value = ns_to_timespec64(old_expires);
                        } else {
                                old->it_value.tv_nsec = 1;
                                old->it_value.tv_sec = 0;
@@ -671,7 +671,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
         * Install the new reload setting, and
         * set up the signal and overrun bookkeeping.
         */
-       timer->it.cpu.incr = timespec_to_ns(&new->it_interval);
+       timer->it.cpu.incr = timespec64_to_ns(&new->it_interval);
 
        /*
         * This acts as a modification timestamp for the timer,
@@ -695,12 +695,12 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
        ret = 0;
  out:
        if (old)
-               old->it_interval = ns_to_timespec(old_incr);
+               old->it_interval = ns_to_timespec64(old_incr);
 
        return ret;
 }
 
-static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
+static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp)
 {
        u64 now;
        struct task_struct *p = timer->it.cpu.task;
@@ -710,7 +710,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
        /*
         * Easy part: convert the reload time.
         */
-       itp->it_interval = ns_to_timespec(timer->it.cpu.incr);
+       itp->it_interval = ns_to_timespec64(timer->it.cpu.incr);
 
        if (timer->it.cpu.expires == 0) {       /* Timer not armed at all.  */
                itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
@@ -739,7 +739,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
                         * Call the timer disarmed, nothing else to do.
                         */
                        timer->it.cpu.expires = 0;
-                       itp->it_value = ns_to_timespec(timer->it.cpu.expires);
+                       itp->it_value = ns_to_timespec64(timer->it.cpu.expires);
                        return;
                } else {
                        cpu_timer_sample_group(timer->it_clock, p, &now);
@@ -748,7 +748,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
        }
 
        if (now < timer->it.cpu.expires) {
-               itp->it_value = ns_to_timespec(timer->it.cpu.expires - now);
+               itp->it_value = ns_to_timespec64(timer->it.cpu.expires - now);
        } else {
                /*
                 * The timer should have expired already, but the firing
@@ -825,6 +825,8 @@ static void check_thread_timers(struct task_struct *tsk,
                         * At the hard limit, we just die.
                         * No need to calculate anything else now.
                         */
+                       pr_info("CPU Watchdog Timeout (hard): %s[%d]\n",
+                               tsk->comm, task_pid_nr(tsk));
                        __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
                        return;
                }
@@ -836,8 +838,7 @@ static void check_thread_timers(struct task_struct *tsk,
                                soft += USEC_PER_SEC;
                                sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
                        }
-                       printk(KERN_INFO
-                               "RT Watchdog Timeout: %s[%d]\n",
+                       pr_info("RT Watchdog Timeout (soft): %s[%d]\n",
                                tsk->comm, task_pid_nr(tsk));
                        __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
                }
@@ -935,6 +936,8 @@ static void check_process_timers(struct task_struct *tsk,
                         * At the hard limit, we just die.
                         * No need to calculate anything else now.
                         */
+                       pr_info("RT Watchdog Timeout (hard): %s[%d]\n",
+                               tsk->comm, task_pid_nr(tsk));
                        __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
                        return;
                }
@@ -942,6 +945,8 @@ static void check_process_timers(struct task_struct *tsk,
                        /*
                         * At the soft limit, send a SIGXCPU every second.
                         */
+                       pr_info("CPU Watchdog Timeout (soft): %s[%d]\n",
+                               tsk->comm, task_pid_nr(tsk));
                        __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
                        if (soft < hard) {
                                soft++;
@@ -1214,7 +1219,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 }
 
 static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
-                           struct timespec *rqtp, struct itimerspec *it)
+                           struct timespec64 *rqtp, struct itimerspec64 *it)
 {
        struct k_itimer timer;
        int error;
@@ -1229,7 +1234,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
        error = posix_cpu_timer_create(&timer);
        timer.it_process = current;
        if (!error) {
-               static struct itimerspec zero_it;
+               static struct itimerspec64 zero_it;
 
                memset(it, 0, sizeof *it);
                it->it_value = *rqtp;
@@ -1264,7 +1269,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
                /*
                 * We were interrupted by a signal.
                 */
-               *rqtp = ns_to_timespec(timer.it.cpu.expires);
+               *rqtp = ns_to_timespec64(timer.it.cpu.expires);
                error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
                if (!error) {
                        /*
@@ -1301,10 +1306,11 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
 
 static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
-                           struct timespec *rqtp, struct timespec __user *rmtp)
+                           struct timespec64 *rqtp, struct timespec __user *rmtp)
 {
        struct restart_block *restart_block = &current->restart_block;
-       struct itimerspec it;
+       struct itimerspec64 it;
+       struct timespec ts;
        int error;
 
        /*
@@ -1324,13 +1330,14 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
                /*
                 * Report back to the user the time still remaining.
                 */
-               if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+               ts = timespec64_to_timespec(it.it_value);
+               if (rmtp && copy_to_user(rmtp, &ts, sizeof(*rmtp)))
                        return -EFAULT;
 
                restart_block->fn = posix_cpu_nsleep_restart;
                restart_block->nanosleep.clockid = which_clock;
                restart_block->nanosleep.rmtp = rmtp;
-               restart_block->nanosleep.expires = timespec_to_ns(rqtp);
+               restart_block->nanosleep.expires = timespec64_to_ns(rqtp);
        }
        return error;
 }
@@ -1338,11 +1345,12 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
 {
        clockid_t which_clock = restart_block->nanosleep.clockid;
-       struct timespec t;
-       struct itimerspec it;
+       struct itimerspec64 it;
+       struct timespec64 t;
+       struct timespec tmp;
        int error;
 
-       t = ns_to_timespec(restart_block->nanosleep.expires);
+       t = ns_to_timespec64(restart_block->nanosleep.expires);
 
        error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
 
@@ -1351,10 +1359,11 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
                /*
                 * Report back to the user the time still remaining.
                 */
-               if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+                tmp = timespec64_to_timespec(it.it_value);
+               if (rmtp && copy_to_user(rmtp, &tmp, sizeof(*rmtp)))
                        return -EFAULT;
 
-               restart_block->nanosleep.expires = timespec_to_ns(&t);
+               restart_block->nanosleep.expires = timespec64_to_ns(&t);
        }
        return error;
 
@@ -1364,12 +1373,12 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
 #define THREAD_CLOCK   MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
 
 static int process_cpu_clock_getres(const clockid_t which_clock,
-                                   struct timespec *tp)
+                                   struct timespec64 *tp)
 {
        return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
 }
 static int process_cpu_clock_get(const clockid_t which_clock,
-                                struct timespec *tp)
+                                struct timespec64 *tp)
 {
        return posix_cpu_clock_get(PROCESS_CLOCK, tp);
 }
@@ -1379,7 +1388,7 @@ static int process_cpu_timer_create(struct k_itimer *timer)
        return posix_cpu_timer_create(timer);
 }
 static int process_cpu_nsleep(const clockid_t which_clock, int flags,
-                             struct timespec *rqtp,
+                             struct timespec64 *rqtp,
                              struct timespec __user *rmtp)
 {
        return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
@@ -1389,12 +1398,12 @@ static long process_cpu_nsleep_restart(struct restart_block *restart_block)
        return -EINVAL;
 }
 static int thread_cpu_clock_getres(const clockid_t which_clock,
-                                  struct timespec *tp)
+                                  struct timespec64 *tp)
 {
        return posix_cpu_clock_getres(THREAD_CLOCK, tp);
 }
 static int thread_cpu_clock_get(const clockid_t which_clock,
-                               struct timespec *tp)
+                               struct timespec64 *tp)
 {
        return posix_cpu_clock_get(THREAD_CLOCK, tp);
 }
index cd6716e..c0cd53e 100644 (file)
@@ -49,26 +49,32 @@ SYS_NI(alarm);
 SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
                const struct timespec __user *, tp)
 {
+       struct timespec64 new_tp64;
        struct timespec new_tp;
 
        if (which_clock != CLOCK_REALTIME)
                return -EINVAL;
        if (copy_from_user(&new_tp, tp, sizeof (*tp)))
                return -EFAULT;
-       return do_sys_settimeofday(&new_tp, NULL);
+
+       new_tp64 = timespec_to_timespec64(new_tp);
+       return do_sys_settimeofday64(&new_tp64, NULL);
 }
 
 SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
                struct timespec __user *,tp)
 {
+       struct timespec64 kernel_tp64;
        struct timespec kernel_tp;
 
        switch (which_clock) {
-       case CLOCK_REALTIME: ktime_get_real_ts(&kernel_tp); break;
-       case CLOCK_MONOTONIC: ktime_get_ts(&kernel_tp); break;
-       case CLOCK_BOOTTIME: get_monotonic_boottime(&kernel_tp); break;
+       case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break;
+       case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break;
+       case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break;
        default: return -EINVAL;
        }
+
+       kernel_tp = timespec64_to_timespec(kernel_tp64);
        if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
                return -EFAULT;
        return 0;
@@ -97,6 +103,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
                const struct timespec __user *, rqtp,
                struct timespec __user *, rmtp)
 {
+       struct timespec64 t64;
        struct timespec t;
 
        switch (which_clock) {
@@ -105,9 +112,10 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
        case CLOCK_BOOTTIME:
                if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
                        return -EFAULT;
-               if (!timespec_valid(&t))
+               t64 = timespec_to_timespec64(t);
+               if (!timespec64_valid(&t64))
                        return -EINVAL;
-               return hrtimer_nanosleep(&t, rmtp, flags & TIMER_ABSTIME ?
+               return hrtimer_nanosleep(&t64, rmtp, flags & TIMER_ABSTIME ?
                                         HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
                                         which_clock);
        default:
index 50a6a47..4d7b2ce 100644 (file)
@@ -130,12 +130,12 @@ static struct k_clock posix_clocks[MAX_CLOCKS];
 /*
  * These ones are defined below.
  */
-static int common_nsleep(const clockid_t, int flags, struct timespec *t,
+static int common_nsleep(const clockid_t, int flags, struct timespec64 *t,
                         struct timespec __user *rmtp);
 static int common_timer_create(struct k_itimer *new_timer);
-static void common_timer_get(struct k_itimer *, struct itimerspec *);
+static void common_timer_get(struct k_itimer *, struct itimerspec64 *);
 static int common_timer_set(struct k_itimer *, int,
-                           struct itimerspec *, struct itimerspec *);
+                           struct itimerspec64 *, struct itimerspec64 *);
 static int common_timer_del(struct k_itimer *timer);
 
 static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
@@ -204,17 +204,17 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
 }
 
 /* Get clock_realtime */
-static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
+static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp)
 {
-       ktime_get_real_ts(tp);
+       ktime_get_real_ts64(tp);
        return 0;
 }
 
 /* Set clock_realtime */
 static int posix_clock_realtime_set(const clockid_t which_clock,
-                                   const struct timespec *tp)
+                                   const struct timespec64 *tp)
 {
-       return do_sys_settimeofday(tp, NULL);
+       return do_sys_settimeofday64(tp, NULL);
 }
 
 static int posix_clock_realtime_adj(const clockid_t which_clock,
@@ -226,54 +226,54 @@ static int posix_clock_realtime_adj(const clockid_t which_clock,
 /*
  * Get monotonic time for posix timers
  */
-static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
+static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp)
 {
-       ktime_get_ts(tp);
+       ktime_get_ts64(tp);
        return 0;
 }
 
 /*
  * Get monotonic-raw time for posix timers
  */
-static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
+static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
 {
-       getrawmonotonic(tp);
+       getrawmonotonic64(tp);
        return 0;
 }
 
 
-static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
+static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp)
 {
-       *tp = current_kernel_time();
+       *tp = current_kernel_time64();
        return 0;
 }
 
 static int posix_get_monotonic_coarse(clockid_t which_clock,
-                                               struct timespec *tp)
+                                               struct timespec64 *tp)
 {
-       *tp = get_monotonic_coarse();
+       *tp = get_monotonic_coarse64();
        return 0;
 }
 
-static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
+static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *tp)
 {
-       *tp = ktime_to_timespec(KTIME_LOW_RES);
+       *tp = ktime_to_timespec64(KTIME_LOW_RES);
        return 0;
 }
 
-static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
+static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp)
 {
-       get_monotonic_boottime(tp);
+       get_monotonic_boottime64(tp);
        return 0;
 }
 
-static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
+static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp)
 {
-       timekeeping_clocktai(tp);
+       timekeeping_clocktai64(tp);
        return 0;
 }
 
-static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp)
+static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
 {
        tp->tv_sec = 0;
        tp->tv_nsec = hrtimer_resolution;
@@ -734,18 +734,18 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
  * report.
  */
 static void
-common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
+common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
 {
        ktime_t now, remaining, iv;
        struct hrtimer *timer = &timr->it.real.timer;
 
-       memset(cur_setting, 0, sizeof(struct itimerspec));
+       memset(cur_setting, 0, sizeof(*cur_setting));
 
        iv = timr->it.real.interval;
 
        /* interval timer ? */
        if (iv)
-               cur_setting->it_interval = ktime_to_timespec(iv);
+               cur_setting->it_interval = ktime_to_timespec64(iv);
        else if (!hrtimer_active(timer) &&
                 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
                return;
@@ -771,13 +771,14 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
                if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
                        cur_setting->it_value.tv_nsec = 1;
        } else
-               cur_setting->it_value = ktime_to_timespec(remaining);
+               cur_setting->it_value = ktime_to_timespec64(remaining);
 }
 
 /* Get the time remaining on a POSIX.1b interval timer. */
 SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
                struct itimerspec __user *, setting)
 {
+       struct itimerspec64 cur_setting64;
        struct itimerspec cur_setting;
        struct k_itimer *timr;
        struct k_clock *kc;
@@ -792,10 +793,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
        if (WARN_ON_ONCE(!kc || !kc->timer_get))
                ret = -EINVAL;
        else
-               kc->timer_get(timr, &cur_setting);
+               kc->timer_get(timr, &cur_setting64);
 
        unlock_timer(timr, flags);
 
+       cur_setting = itimerspec64_to_itimerspec(&cur_setting64);
        if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
                return -EFAULT;
 
@@ -831,7 +833,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
 /* timr->it_lock is taken. */
 static int
 common_timer_set(struct k_itimer *timr, int flags,
-                struct itimerspec *new_setting, struct itimerspec *old_setting)
+                struct itimerspec64 *new_setting, struct itimerspec64 *old_setting)
 {
        struct hrtimer *timer = &timr->it.real.timer;
        enum hrtimer_mode mode;
@@ -860,10 +862,10 @@ common_timer_set(struct k_itimer *timr, int flags,
        hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
        timr->it.real.timer.function = posix_timer_fn;
 
-       hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value));
+       hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value));
 
        /* Convert interval */
-       timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
+       timr->it.real.interval = timespec64_to_ktime(new_setting->it_interval);
 
        /* SIGEV_NONE timers are not queued ! See common_timer_get */
        if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
@@ -883,21 +885,23 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
                const struct itimerspec __user *, new_setting,
                struct itimerspec __user *, old_setting)
 {
-       struct k_itimer *timr;
+       struct itimerspec64 new_spec64, old_spec64;
+       struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL;
        struct itimerspec new_spec, old_spec;
-       int error = 0;
+       struct k_itimer *timr;
        unsigned long flag;
-       struct itimerspec *rtn = old_setting ? &old_spec : NULL;
        struct k_clock *kc;
+       int error = 0;
 
        if (!new_setting)
                return -EINVAL;
 
        if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
                return -EFAULT;
+       new_spec64 = itimerspec_to_itimerspec64(&new_spec);
 
-       if (!timespec_valid(&new_spec.it_interval) ||
-           !timespec_valid(&new_spec.it_value))
+       if (!timespec64_valid(&new_spec64.it_interval) ||
+           !timespec64_valid(&new_spec64.it_value))
                return -EINVAL;
 retry:
        timr = lock_timer(timer_id, &flag);
@@ -908,7 +912,7 @@ retry:
        if (WARN_ON_ONCE(!kc || !kc->timer_set))
                error = -EINVAL;
        else
-               error = kc->timer_set(timr, flags, &new_spec, rtn);
+               error = kc->timer_set(timr, flags, &new_spec64, rtn);
 
        unlock_timer(timr, flag);
        if (error == TIMER_RETRY) {
@@ -916,6 +920,7 @@ retry:
                goto retry;
        }
 
+       old_spec = itimerspec64_to_itimerspec(&old_spec64);
        if (old_setting && !error &&
            copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
                error = -EFAULT;
@@ -1014,6 +1019,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
                const struct timespec __user *, tp)
 {
        struct k_clock *kc = clockid_to_kclock(which_clock);
+       struct timespec64 new_tp64;
        struct timespec new_tp;
 
        if (!kc || !kc->clock_set)
@@ -1021,21 +1027,24 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
 
        if (copy_from_user(&new_tp, tp, sizeof (*tp)))
                return -EFAULT;
+       new_tp64 = timespec_to_timespec64(new_tp);
 
-       return kc->clock_set(which_clock, &new_tp);
+       return kc->clock_set(which_clock, &new_tp64);
 }
 
 SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
                struct timespec __user *,tp)
 {
        struct k_clock *kc = clockid_to_kclock(which_clock);
+       struct timespec64 kernel_tp64;
        struct timespec kernel_tp;
        int error;
 
        if (!kc)
                return -EINVAL;
 
-       error = kc->clock_get(which_clock, &kernel_tp);
+       error = kc->clock_get(which_clock, &kernel_tp64);
+       kernel_tp = timespec64_to_timespec(kernel_tp64);
 
        if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
                error = -EFAULT;
@@ -1070,13 +1079,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
                struct timespec __user *, tp)
 {
        struct k_clock *kc = clockid_to_kclock(which_clock);
+       struct timespec64 rtn_tp64;
        struct timespec rtn_tp;
        int error;
 
        if (!kc)
                return -EINVAL;
 
-       error = kc->clock_getres(which_clock, &rtn_tp);
+       error = kc->clock_getres(which_clock, &rtn_tp64);
+       rtn_tp = timespec64_to_timespec(rtn_tp64);
 
        if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
                error = -EFAULT;
@@ -1088,7 +1099,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
  * nanosleep for monotonic and realtime clocks
  */
 static int common_nsleep(const clockid_t which_clock, int flags,
-                        struct timespec *tsave, struct timespec __user *rmtp)
+                        struct timespec64 *tsave, struct timespec __user *rmtp)
 {
        return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
                                 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
@@ -1100,6 +1111,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
                struct timespec __user *, rmtp)
 {
        struct k_clock *kc = clockid_to_kclock(which_clock);
+       struct timespec64 t64;
        struct timespec t;
 
        if (!kc)
@@ -1110,10 +1122,11 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
        if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
                return -EFAULT;
 
-       if (!timespec_valid(&t))
+       t64 = timespec_to_timespec64(t);
+       if (!timespec64_valid(&t64))
                return -EINVAL;
 
-       return kc->nsleep(which_clock, flags, &t, rmtp);
+       return kc->nsleep(which_clock, flags, &t64, rmtp);
 }
 
 /*
index ea6b610..2d8f05a 100644 (file)
@@ -206,6 +206,11 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
 
        update_clock_read_data(&rd);
 
+       if (sched_clock_timer.function != NULL) {
+               /* update timeout for clock wrap */
+               hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+       }
+
        r = rate;
        if (r >= 4000000) {
                r /= 1000000;
index 7fe53be..64c97fc 100644 (file)
@@ -993,6 +993,18 @@ ktime_t tick_nohz_get_sleep_length(void)
        return ts->sleep_length;
 }
 
+/**
+ * tick_nohz_get_idle_calls - return the current idle calls counter value
+ *
+ * Called from the schedutil frequency scaling governor in scheduler context.
+ */
+unsigned long tick_nohz_get_idle_calls(void)
+{
+       struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+       return ts->idle_calls;
+}
+
 static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
 {
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
index 25bdd25..6574bba 100644 (file)
@@ -193,8 +193,8 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz
 SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
                struct timezone __user *, tz)
 {
+       struct timespec64 new_ts;
        struct timeval user_tv;
-       struct timespec new_ts;
        struct timezone new_tz;
 
        if (tv) {
@@ -212,7 +212,7 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
                        return -EFAULT;
        }
 
-       return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
+       return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
 }
 
 SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
index 5b63a21..9652bc5 100644 (file)
@@ -996,8 +996,7 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history,
                return 0;
 
        /* Interpolate shortest distance from beginning or end of history */
-       interp_forward = partial_history_cycles > total_history_cycles/2 ?
-               true : false;
+       interp_forward = partial_history_cycles > total_history_cycles / 2;
        partial_history_cycles = interp_forward ?
                total_history_cycles - partial_history_cycles :
                partial_history_cycles;
index 1dc0256..cc6b6bd 100644 (file)
@@ -241,7 +241,7 @@ int timer_migration_handler(struct ctl_table *table, int write,
        int ret;
 
        mutex_lock(&mutex);
-       ret = proc_dointvec(table, write, buffer, lenp, ppos);
+       ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (!ret && write)
                timers_update_migration(false);
        mutex_unlock(&mutex);
index ff8d5c1..0e7f542 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/kallsyms.h>
+#include <linux/nmi.h>
 
 #include <linux/uaccess.h>
 
@@ -86,6 +87,9 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
 
 next_one:
        i = 0;
+
+       touch_nmi_watchdog();
+
        raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
 
        curr = timerqueue_getnext(&base->active);
@@ -197,6 +201,8 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
 {
        struct clock_event_device *dev = td->evtdev;
 
+       touch_nmi_watchdog();
+
        SEQ_printf(m, "Tick Device: mode:     %d\n", td->mode);
        if (cpu < 0)
                SEQ_printf(m, "Broadcast device\n");
index d4a06e7..9619b57 100644 (file)
@@ -455,7 +455,7 @@ config UPROBE_EVENTS
        select UPROBES
        select PROBE_EVENTS
        select TRACING
-       default n
+       default y
        help
          This allows the user to add tracing events on top of userspace
          dynamic events (similar to tracepoints) on the fly via the trace
index b2058a7..bd8ae8d 100644 (file)
@@ -690,8 +690,8 @@ void blk_trace_shutdown(struct request_queue *q)
 
 /**
  * blk_add_trace_rq - Add a trace for a request oriented action
- * @q:         queue the io is for
  * @rq:                the source request
+ * @error:     return status to log
  * @nr_bytes:  number of completed bytes
  * @what:      the action
  *
@@ -699,10 +699,10 @@ void blk_trace_shutdown(struct request_queue *q)
  *     Records an action against a request. Will log the bio offset + size.
  *
  **/
-static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+static void blk_add_trace_rq(struct request *rq, int error,
                             unsigned int nr_bytes, u32 what)
 {
-       struct blk_trace *bt = q->blk_trace;
+       struct blk_trace *bt = rq->q->blk_trace;
 
        if (likely(!bt))
                return;
@@ -713,40 +713,32 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
                what |= BLK_TC_ACT(BLK_TC_FS);
 
        __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
-                       rq->cmd_flags, what, rq->errors, 0, NULL);
-}
-
-static void blk_add_trace_rq_abort(void *ignore,
-                                  struct request_queue *q, struct request *rq)
-{
-       blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT);
+                       rq->cmd_flags, what, error, 0, NULL);
 }
 
 static void blk_add_trace_rq_insert(void *ignore,
                                    struct request_queue *q, struct request *rq)
 {
-       blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT);
+       blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT);
 }
 
 static void blk_add_trace_rq_issue(void *ignore,
                                   struct request_queue *q, struct request *rq)
 {
-       blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE);
+       blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE);
 }
 
 static void blk_add_trace_rq_requeue(void *ignore,
                                     struct request_queue *q,
                                     struct request *rq)
 {
-       blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE);
+       blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE);
 }
 
-static void blk_add_trace_rq_complete(void *ignore,
-                                     struct request_queue *q,
-                                     struct request *rq,
-                                     unsigned int nr_bytes)
+static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
+                       int error, unsigned int nr_bytes)
 {
-       blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE);
+       blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE);
 }
 
 /**
@@ -941,7 +933,7 @@ static void blk_add_trace_rq_remap(void *ignore,
        r.sector_from = cpu_to_be64(from);
 
        __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
-                       rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors,
+                       rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
                        sizeof(r), &r);
 }
 
@@ -966,7 +958,7 @@ void blk_add_driver_data(struct request_queue *q,
                return;
 
        __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
-                               BLK_TA_DRV_DATA, rq->errors, len, data);
+                               BLK_TA_DRV_DATA, 0, len, data);
 }
 EXPORT_SYMBOL_GPL(blk_add_driver_data);
 
@@ -974,8 +966,6 @@ static void blk_register_tracepoints(void)
 {
        int ret;
 
-       ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
-       WARN_ON(ret);
        ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
        WARN_ON(ret);
        ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
@@ -1028,7 +1018,6 @@ static void blk_unregister_tracepoints(void)
        unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
        unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
        unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
-       unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
 
        tracepoint_synchronize_unregister();
 }
index cee9802..f806dbd 100644 (file)
@@ -96,7 +96,7 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
        if (unlikely(in_interrupt() ||
                     current->flags & (PF_KTHREAD | PF_EXITING)))
                return -EPERM;
-       if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
+       if (unlikely(uaccess_kernel()))
                return -EPERM;
        if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
                return -EPERM;
index b9691ee..dd3e91d 100644 (file)
@@ -3755,23 +3755,24 @@ static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash)
        ftrace_probe_registered = 1;
 }
 
-static void __disable_ftrace_function_probe(void)
+static bool __disable_ftrace_function_probe(void)
 {
        int i;
 
        if (!ftrace_probe_registered)
-               return;
+               return false;
 
        for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
                struct hlist_head *hhd = &ftrace_func_hash[i];
                if (hhd->first)
-                       return;
+                       return false;
        }
 
        /* no more funcs left */
        ftrace_shutdown(&trace_probe_ops, 0);
 
        ftrace_probe_registered = 0;
+       return true;
 }
 
 
@@ -3901,6 +3902,7 @@ static void
 __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
                                  void *data, int flags)
 {
+       struct ftrace_ops_hash old_hash_ops;
        struct ftrace_func_entry *rec_entry;
        struct ftrace_func_probe *entry;
        struct ftrace_func_probe *p;
@@ -3912,6 +3914,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
        struct hlist_node *tmp;
        char str[KSYM_SYMBOL_LEN];
        int i, ret;
+       bool disabled;
 
        if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
                func_g.search = NULL;
@@ -3930,6 +3933,10 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 
        mutex_lock(&trace_probe_ops.func_hash->regex_lock);
 
+       old_hash_ops.filter_hash = old_hash;
+       /* Probes only have filters */
+       old_hash_ops.notrace_hash = NULL;
+
        hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
        if (!hash)
                /* Hmm, should report this somehow */
@@ -3967,12 +3974,17 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
                }
        }
        mutex_lock(&ftrace_lock);
-       __disable_ftrace_function_probe();
+       disabled = __disable_ftrace_function_probe();
        /*
         * Remove after the disable is called. Otherwise, if the last
         * probe is removed, a null hash means *all enabled*.
         */
        ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+
+       /* still need to update the function call sites */
+       if (ftrace_enabled && !disabled)
+               ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS,
+                                      &old_hash_ops);
        synchronize_sched();
        if (!ret)
                free_ftrace_hash_rcu(old_hash);
@@ -5554,6 +5566,15 @@ static void clear_ftrace_pids(struct trace_array *tr)
        trace_free_pid_list(pid_list);
 }
 
+void ftrace_clear_pids(struct trace_array *tr)
+{
+       mutex_lock(&ftrace_lock);
+
+       clear_ftrace_pids(tr);
+
+       mutex_unlock(&ftrace_lock);
+}
+
 static void ftrace_pid_reset(struct trace_array *tr)
 {
        mutex_lock(&ftrace_lock);
index 96fc3c0..ca47a4f 100644 (file)
@@ -3405,11 +3405,23 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
 int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
 {
        struct ring_buffer_per_cpu *cpu_buffer;
+       struct buffer_page *reader;
+       struct buffer_page *head_page;
+       struct buffer_page *commit_page;
+       unsigned commit;
 
        cpu_buffer = iter->cpu_buffer;
 
-       return iter->head_page == cpu_buffer->commit_page &&
-               iter->head == rb_commit_index(cpu_buffer);
+       /* Remember, trace recording is off when iterator is in use */
+       reader = cpu_buffer->reader_page;
+       head_page = cpu_buffer->head_page;
+       commit_page = cpu_buffer->commit_page;
+       commit = rb_page_commit(commit_page);
+
+       return ((iter->head_page == commit_page && iter->head == commit) ||
+               (iter->head_page == reader && commit_page == head_page &&
+                head_page->read == commit &&
+                iter->head == rb_page_commit(cpu_buffer->reader_page)));
 }
 EXPORT_SYMBOL_GPL(ring_buffer_iter_empty);
 
@@ -4826,9 +4838,9 @@ static __init int test_ringbuffer(void)
                rb_data[cpu].cnt = cpu;
                rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
                                                 "rbtester/%d", cpu);
-               if (WARN_ON(!rb_threads[cpu])) {
+               if (WARN_ON(IS_ERR(rb_threads[cpu]))) {
                        pr_cont("FAILED\n");
-                       ret = -1;
+                       ret = PTR_ERR(rb_threads[cpu]);
                        goto out_free;
                }
 
@@ -4838,9 +4850,9 @@ static __init int test_ringbuffer(void)
 
        /* Now create the rb hammer! */
        rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer");
-       if (WARN_ON(!rb_hammer)) {
+       if (WARN_ON(IS_ERR(rb_hammer))) {
                pr_cont("FAILED\n");
-               ret = -1;
+               ret = PTR_ERR(rb_hammer);
                goto out_free;
        }
 
index f351095..b253d59 100644 (file)
@@ -4355,6 +4355,7 @@ static const char readme_msg[] =
        "\t           -:[<group>/]<event>\n"
 #ifdef CONFIG_KPROBE_EVENTS
        "\t    place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
+  "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n"
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
        "\t    place: <path>:<offset>\n"
@@ -6733,11 +6734,13 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash,
                return ret;
 
  out_reg:
-       ret = register_ftrace_function_probe(glob, ops, count);
+       ret = alloc_snapshot(&global_trace);
+       if (ret < 0)
+               goto out;
 
-       if (ret >= 0)
-               alloc_snapshot(&global_trace);
+       ret = register_ftrace_function_probe(glob, ops, count);
 
+ out:
        return ret < 0 ? ret : 0;
 }
 
@@ -7402,6 +7405,7 @@ static int instance_rmdir(const char *name)
 
        tracing_set_nop(tr);
        event_trace_del_tracer(tr);
+       ftrace_clear_pids(tr);
        ftrace_destroy_function_files(tr);
        tracefs_remove_recursive(tr->dir);
        free_trace_buffers(tr);
index ae1cce9..d19d52d 100644 (file)
@@ -896,6 +896,7 @@ int using_ftrace_ops_list_func(void);
 void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
 void ftrace_init_tracefs_toplevel(struct trace_array *tr,
                                  struct dentry *d_tracer);
+void ftrace_clear_pids(struct trace_array *tr);
 #else
 static inline int ftrace_trace_task(struct trace_array *tr)
 {
@@ -914,6 +915,7 @@ ftrace_init_global_array_ops(struct trace_array *tr) { }
 static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
 static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }
 static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { }
+static inline void ftrace_clear_pids(struct trace_array *tr) { }
 /* ftace_func_t type is not defined, use macro instead of static inline */
 #define ftrace_init_array_ops(tr, func) do { } while (0)
 #endif /* CONFIG_FUNCTION_TRACER */
index 5f688cc..013f4e7 100644 (file)
@@ -681,10 +681,6 @@ static int create_trace_kprobe(int argc, char **argv)
                return -EINVAL;
        }
        if (isdigit(argv[1][0])) {
-               if (is_return) {
-                       pr_info("Return probe point must be a symbol.\n");
-                       return -EINVAL;
-               }
                /* an address specified */
                ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);
                if (ret) {
@@ -700,8 +696,9 @@ static int create_trace_kprobe(int argc, char **argv)
                        pr_info("Failed to parse symbol.\n");
                        return ret;
                }
-               if (offset && is_return) {
-                       pr_info("Return probe must be used without offset.\n");
+               if (offset && is_return &&
+                   !function_offset_within_entry(NULL, symbol, offset)) {
+                       pr_info("Given offset is not valid for return probe.\n");
                        return -EINVAL;
                }
        }
index c0168b7..c74bf39 100644 (file)
@@ -3209,9 +3209,8 @@ static int init_worker_pool(struct worker_pool *pool)
        INIT_LIST_HEAD(&pool->idle_list);
        hash_init(pool->busy_hash);
 
-       init_timer_deferrable(&pool->idle_timer);
-       pool->idle_timer.function = idle_worker_timeout;
-       pool->idle_timer.data = (unsigned long)pool;
+       setup_deferrable_timer(&pool->idle_timer, idle_worker_timeout,
+                              (unsigned long)pool);
 
        setup_timer(&pool->mayday_timer, pool_mayday_timeout,
                    (unsigned long)pool);
@@ -4735,6 +4734,29 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
        return wfc.ret;
 }
 EXPORT_SYMBOL_GPL(work_on_cpu);
+
+/**
+ * work_on_cpu_safe - run a function in thread context on a particular cpu
+ * @cpu: the cpu to run on
+ * @fn:  the function to run
+ * @arg: the function argument
+ *
+ * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
+ * any locks which would prevent @fn from completing.
+ *
+ * Return: The value @fn returns.
+ */
+long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
+{
+       long ret = -ENODEV;
+
+       get_online_cpus();
+       if (cpu_online(cpu))
+               ret = work_on_cpu(cpu, fn, arg);
+       put_online_cpus();
+       return ret;
+}
+EXPORT_SYMBOL_GPL(work_on_cpu_safe);
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FREEZER
index 97d62c2..2598a32 100644 (file)
@@ -356,7 +356,7 @@ config FRAME_POINTER
        bool "Compile the kernel with frame pointers"
        depends on DEBUG_KERNEL && \
                (CRIS || M68K || FRV || UML || \
-                AVR32 || SUPERH || BLACKFIN || MN10300 || METAG) || \
+                SUPERH || BLACKFIN || MN10300 || METAG) || \
                ARCH_WANT_FRAME_POINTERS
        default y if (DEBUG_INFO && UML) || ARCH_WANT_FRAME_POINTERS
        help
@@ -1103,9 +1103,6 @@ config PROVE_LOCKING
 
         For more details, see Documentation/locking/lockdep-design.txt.
 
-config PROVE_LOCKING_SMALL
-       bool
-
 config LOCKDEP
        bool
        depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
@@ -1114,6 +1111,9 @@ config LOCKDEP
        select KALLSYMS
        select KALLSYMS_ALL
 
+config LOCKDEP_SMALL
+       bool
+
 config LOCK_STAT
        bool "Lock usage statistics"
        depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
index 320ac46..b47cf97 100644 (file)
@@ -41,7 +41,7 @@ obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \
         gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
         bsearch.o find_bit.o llist.o memweight.o kfifo.o \
         percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \
-        once.o refcount.o
+        once.o refcount.o usercopy.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
 obj-y += hexdump.o
index 8f13cf7..3c6432d 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
+#include <linux/ctype.h>
 
 /*
  *     If a hyphen was found in get_option, this will handle the
@@ -189,3 +190,59 @@ bool parse_option_str(const char *str, const char *option)
 
        return false;
 }
+
+/*
+ * Parse a string to get a param value pair.
+ * You can use " around spaces, but can't escape ".
+ * Hyphens and underscores equivalent in parameter names.
+ */
+char *next_arg(char *args, char **param, char **val)
+{
+       unsigned int i, equals = 0;
+       int in_quote = 0, quoted = 0;
+       char *next;
+
+       if (*args == '"') {
+               args++;
+               in_quote = 1;
+               quoted = 1;
+       }
+
+       for (i = 0; args[i]; i++) {
+               if (isspace(args[i]) && !in_quote)
+                       break;
+               if (equals == 0) {
+                       if (args[i] == '=')
+                               equals = i;
+               }
+               if (args[i] == '"')
+                       in_quote = !in_quote;
+       }
+
+       *param = args;
+       if (!equals)
+               *val = NULL;
+       else {
+               args[equals] = '\0';
+               *val = args + equals + 1;
+
+               /* Don't include quotes in value. */
+               if (**val == '"') {
+                       (*val)++;
+                       if (args[i-1] == '"')
+                               args[i-1] = '\0';
+               }
+       }
+       if (quoted && args[i-1] == '"')
+               args[i-1] = '\0';
+
+       if (args[i]) {
+               args[i] = '\0';
+               next = args + i + 1;
+       } else
+               next = args + i;
+
+       /* Chew up trailing spaces. */
+       return skip_spaces(next);
+       //return next;
+}
index e68604a..4952311 100644 (file)
@@ -413,7 +413,7 @@ void iov_iter_init(struct iov_iter *i, int direction,
                        size_t count)
 {
        /* It will get better.  Eventually... */
-       if (segment_eq(get_fs(), KERNEL_DS)) {
+       if (uaccess_kernel()) {
                direction |= ITER_KVEC;
                i->type = direction;
                i->kvec = (struct kvec *)iov;
@@ -604,7 +604,7 @@ size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
                return 0;
        }
        iterate_and_advance(i, bytes, v,
-               __copy_from_user_nocache((to += v.iov_len) - v.iov_len,
+               __copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len,
                                         v.iov_base, v.iov_len),
                memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
                                 v.bv_offset, v.bv_len),
@@ -625,7 +625,7 @@ bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
        if (unlikely(i->count < bytes))
                return false;
        iterate_all_kinds(i, bytes, v, ({
-               if (__copy_from_user_nocache((to += v.iov_len) - v.iov_len,
+               if (__copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len,
                                             v.iov_base, v.iov_len))
                        return false;
                0;}),
@@ -786,6 +786,68 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
 }
 EXPORT_SYMBOL(iov_iter_advance);
 
+void iov_iter_revert(struct iov_iter *i, size_t unroll)
+{
+       if (!unroll)
+               return;
+       i->count += unroll;
+       if (unlikely(i->type & ITER_PIPE)) {
+               struct pipe_inode_info *pipe = i->pipe;
+               int idx = i->idx;
+               size_t off = i->iov_offset;
+               while (1) {
+                       size_t n = off - pipe->bufs[idx].offset;
+                       if (unroll < n) {
+                               off -= unroll;
+                               break;
+                       }
+                       unroll -= n;
+                       if (!unroll && idx == i->start_idx) {
+                               off = 0;
+                               break;
+                       }
+                       if (!idx--)
+                               idx = pipe->buffers - 1;
+                       off = pipe->bufs[idx].offset + pipe->bufs[idx].len;
+               }
+               i->iov_offset = off;
+               i->idx = idx;
+               pipe_truncate(i);
+               return;
+       }
+       if (unroll <= i->iov_offset) {
+               i->iov_offset -= unroll;
+               return;
+       }
+       unroll -= i->iov_offset;
+       if (i->type & ITER_BVEC) {
+               const struct bio_vec *bvec = i->bvec;
+               while (1) {
+                       size_t n = (--bvec)->bv_len;
+                       i->nr_segs++;
+                       if (unroll <= n) {
+                               i->bvec = bvec;
+                               i->iov_offset = n - unroll;
+                               return;
+                       }
+                       unroll -= n;
+               }
+       } else { /* same logics for iovec and kvec */
+               const struct iovec *iov = i->iov;
+               while (1) {
+                       size_t n = (--iov)->iov_len;
+                       i->nr_segs++;
+                       if (unroll <= n) {
+                               i->iov = iov;
+                               i->iov_offset = n - unroll;
+                               return;
+                       }
+                       unroll -= n;
+               }
+       }
+}
+EXPORT_SYMBOL(iov_iter_revert);
+
 /*
  * Return the count of just the current iov_iter segment.
  */
@@ -839,6 +901,7 @@ void iov_iter_pipe(struct iov_iter *i, int direction,
        i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
        i->iov_offset = 0;
        i->count = count;
+       i->start_idx = i->idx;
 }
 EXPORT_SYMBOL(iov_iter_pipe);
 
index 445dcae..763d70a 100644 (file)
@@ -601,12 +601,15 @@ struct kobject *kobject_get(struct kobject *kobj)
 }
 EXPORT_SYMBOL(kobject_get);
 
-static struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj)
+struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj)
 {
+       if (!kobj)
+               return NULL;
        if (!kref_get_unless_zero(&kobj->kref))
                kobj = NULL;
        return kobj;
 }
+EXPORT_SYMBOL(kobject_get_unless_zero);
 
 /*
  * kobject_cleanup - free kobject resources.
index aa09ad3..f42124c 100644 (file)
 #include <linux/refcount.h>
 #include <linux/bug.h>
 
+/**
+ * refcount_add_not_zero - add a value to a refcount unless it is 0
+ * @i: the value to add to the refcount
+ * @r: the refcount
+ *
+ * Will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller has guaranteed the
+ * object memory to be stable (RCU, etc.). It does provide a control dependency
+ * and thereby orders future stores. See the comment on top.
+ *
+ * Use of this function is not recommended for the normal reference counting
+ * use case in which references are taken and released one at a time.  In these
+ * cases, refcount_inc(), or one of its variants, should instead be used to
+ * increment a reference count.
+ *
+ * Return: false if the passed refcount is 0, true otherwise
+ */
 bool refcount_add_not_zero(unsigned int i, refcount_t *r)
 {
-       unsigned int old, new, val = atomic_read(&r->refs);
+       unsigned int new, val = atomic_read(&r->refs);
 
-       for (;;) {
+       do {
                if (!val)
                        return false;
 
@@ -51,12 +69,8 @@ bool refcount_add_not_zero(unsigned int i, refcount_t *r)
                new = val + i;
                if (new < val)
                        new = UINT_MAX;
-               old = atomic_cmpxchg_relaxed(&r->refs, val, new);
-               if (old == val)
-                       break;
 
-               val = old;
-       }
+       } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new));
 
        WARN_ONCE(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
 
@@ -64,24 +78,45 @@ bool refcount_add_not_zero(unsigned int i, refcount_t *r)
 }
 EXPORT_SYMBOL_GPL(refcount_add_not_zero);
 
+/**
+ * refcount_add - add a value to a refcount
+ * @i: the value to add to the refcount
+ * @r: the refcount
+ *
+ * Similar to atomic_add(), but will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller has guaranteed the
+ * object memory to be stable (RCU, etc.). It does provide a control dependency
+ * and thereby orders future stores. See the comment on top.
+ *
+ * Use of this function is not recommended for the normal reference counting
+ * use case in which references are taken and released one at a time.  In these
+ * cases, refcount_inc(), or one of its variants, should instead be used to
+ * increment a reference count.
+ */
 void refcount_add(unsigned int i, refcount_t *r)
 {
        WARN_ONCE(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n");
 }
 EXPORT_SYMBOL_GPL(refcount_add);
 
-/*
- * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN.
+/**
+ * refcount_inc_not_zero - increment a refcount unless it is 0
+ * @r: the refcount to increment
+ *
+ * Similar to atomic_inc_not_zero(), but will saturate at UINT_MAX and WARN.
  *
  * Provides no memory ordering, it is assumed the caller has guaranteed the
  * object memory to be stable (RCU, etc.). It does provide a control dependency
  * and thereby orders future stores. See the comment on top.
+ *
+ * Return: true if the increment was successful, false otherwise
  */
 bool refcount_inc_not_zero(refcount_t *r)
 {
-       unsigned int old, new, val = atomic_read(&r->refs);
+       unsigned int new, val = atomic_read(&r->refs);
 
-       for (;;) {
+       do {
                new = val + 1;
 
                if (!val)
@@ -90,12 +125,7 @@ bool refcount_inc_not_zero(refcount_t *r)
                if (unlikely(!new))
                        return true;
 
-               old = atomic_cmpxchg_relaxed(&r->refs, val, new);
-               if (old == val)
-                       break;
-
-               val = old;
-       }
+       } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new));
 
        WARN_ONCE(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
 
@@ -103,11 +133,17 @@ bool refcount_inc_not_zero(refcount_t *r)
 }
 EXPORT_SYMBOL_GPL(refcount_inc_not_zero);
 
-/*
- * Similar to atomic_inc(), will saturate at UINT_MAX and WARN.
+/**
+ * refcount_inc - increment a refcount
+ * @r: the refcount to increment
+ *
+ * Similar to atomic_inc(), but will saturate at UINT_MAX and WARN.
  *
  * Provides no memory ordering, it is assumed the caller already has a
- * reference on the object, will WARN when this is not so.
+ * reference on the object.
+ *
+ * Will WARN if the refcount is 0, as this represents a possible use-after-free
+ * condition.
  */
 void refcount_inc(refcount_t *r)
 {
@@ -115,11 +151,31 @@ void refcount_inc(refcount_t *r)
 }
 EXPORT_SYMBOL_GPL(refcount_inc);
 
+/**
+ * refcount_sub_and_test - subtract from a refcount and test if it is 0
+ * @i: amount to subtract from the refcount
+ * @r: the refcount
+ *
+ * Similar to atomic_dec_and_test(), but it will WARN, return false and
+ * ultimately leak on underflow and will fail to decrement when saturated
+ * at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ *
+ * Use of this function is not recommended for the normal reference counting
+ * use case in which references are taken and released one at a time.  In these
+ * cases, refcount_dec(), or one of its variants, should instead be used to
+ * decrement a reference count.
+ *
+ * Return: true if the resulting refcount is 0, false otherwise
+ */
 bool refcount_sub_and_test(unsigned int i, refcount_t *r)
 {
-       unsigned int old, new, val = atomic_read(&r->refs);
+       unsigned int new, val = atomic_read(&r->refs);
 
-       for (;;) {
+       do {
                if (unlikely(val == UINT_MAX))
                        return false;
 
@@ -129,24 +185,24 @@ bool refcount_sub_and_test(unsigned int i, refcount_t *r)
                        return false;
                }
 
-               old = atomic_cmpxchg_release(&r->refs, val, new);
-               if (old == val)
-                       break;
-
-               val = old;
-       }
+       } while (!atomic_try_cmpxchg_release(&r->refs, &val, new));
 
        return !new;
 }
 EXPORT_SYMBOL_GPL(refcount_sub_and_test);
 
-/*
+/**
+ * refcount_dec_and_test - decrement a refcount and test if it is 0
+ * @r: the refcount
+ *
  * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
  * decrement when saturated at UINT_MAX.
  *
  * Provides release memory ordering, such that prior loads and stores are done
  * before, and provides a control dependency such that free() must come after.
  * See the comment on top.
+ *
+ * Return: true if the resulting refcount is 0, false otherwise
  */
 bool refcount_dec_and_test(refcount_t *r)
 {
@@ -154,21 +210,26 @@ bool refcount_dec_and_test(refcount_t *r)
 }
 EXPORT_SYMBOL_GPL(refcount_dec_and_test);
 
-/*
+/**
+ * refcount_dec - decrement a refcount
+ * @r: the refcount
+ *
  * Similar to atomic_dec(), it will WARN on underflow and fail to decrement
  * when saturated at UINT_MAX.
  *
  * Provides release memory ordering, such that prior loads and stores are done
  * before.
  */
-
 void refcount_dec(refcount_t *r)
 {
        WARN_ONCE(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n");
 }
 EXPORT_SYMBOL_GPL(refcount_dec);
 
-/*
+/**
+ * refcount_dec_if_one - decrement a refcount if it is 1
+ * @r: the refcount
+ *
  * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the
  * success thereof.
  *
@@ -178,24 +239,33 @@ EXPORT_SYMBOL_GPL(refcount_dec);
  * It can be used like a try-delete operator; this explicit case is provided
  * and not cmpxchg in generic, because that would allow implementing unsafe
  * operations.
+ *
+ * Return: true if the resulting refcount is 0, false otherwise
  */
 bool refcount_dec_if_one(refcount_t *r)
 {
-       return atomic_cmpxchg_release(&r->refs, 1, 0) == 1;
+       int val = 1;
+
+       return atomic_try_cmpxchg_release(&r->refs, &val, 0);
 }
 EXPORT_SYMBOL_GPL(refcount_dec_if_one);
 
-/*
+/**
+ * refcount_dec_not_one - decrement a refcount if it is not 1
+ * @r: the refcount
+ *
  * No atomic_t counterpart, it decrements unless the value is 1, in which case
  * it will return false.
  *
  * Was often done like: atomic_add_unless(&var, -1, 1)
+ *
+ * Return: true if the decrement operation was successful, false otherwise
  */
 bool refcount_dec_not_one(refcount_t *r)
 {
-       unsigned int old, new, val = atomic_read(&r->refs);
+       unsigned int new, val = atomic_read(&r->refs);
 
-       for (;;) {
+       do {
                if (unlikely(val == UINT_MAX))
                        return true;
 
@@ -208,24 +278,27 @@ bool refcount_dec_not_one(refcount_t *r)
                        return true;
                }
 
-               old = atomic_cmpxchg_release(&r->refs, val, new);
-               if (old == val)
-                       break;
-
-               val = old;
-       }
+       } while (!atomic_try_cmpxchg_release(&r->refs, &val, new));
 
        return true;
 }
 EXPORT_SYMBOL_GPL(refcount_dec_not_one);
 
-/*
+/**
+ * refcount_dec_and_mutex_lock - return holding mutex if able to decrement
+ *                               refcount to 0
+ * @r: the refcount
+ * @lock: the mutex to be locked
+ *
  * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail
  * to decrement when saturated at UINT_MAX.
  *
  * Provides release memory ordering, such that prior loads and stores are done
  * before, and provides a control dependency such that free() must come after.
  * See the comment on top.
+ *
+ * Return: true and hold mutex if able to decrement refcount to 0, false
+ *         otherwise
  */
 bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock)
 {
@@ -242,13 +315,21 @@ bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock)
 }
 EXPORT_SYMBOL_GPL(refcount_dec_and_mutex_lock);
 
-/*
+/**
+ * refcount_dec_and_lock - return holding spinlock if able to decrement
+ *                         refcount to 0
+ * @r: the refcount
+ * @lock: the spinlock to be locked
+ *
  * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to
  * decrement when saturated at UINT_MAX.
  *
  * Provides release memory ordering, such that prior loads and stores are done
  * before, and provides a control dependency such that free() must come after.
  * See the comment on top.
+ *
+ * Return: true and hold spinlock if able to decrement refcount to 0, false
+ *         otherwise
  */
 bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock)
 {
index 60e800e..80aa8d5 100644 (file)
@@ -79,15 +79,15 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
 }
 EXPORT_SYMBOL_GPL(sbitmap_resize);
 
-static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint,
-                             bool wrap)
+static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
+                             unsigned int hint, bool wrap)
 {
        unsigned int orig_hint = hint;
        int nr;
 
        while (1) {
-               nr = find_next_zero_bit(&word->word, word->depth, hint);
-               if (unlikely(nr >= word->depth)) {
+               nr = find_next_zero_bit(word, depth, hint);
+               if (unlikely(nr >= depth)) {
                        /*
                         * We started with an offset, and we didn't reset the
                         * offset to 0 in a failure case, so start from 0 to
@@ -100,11 +100,11 @@ static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint,
                        return -1;
                }
 
-               if (!test_and_set_bit(nr, &word->word))
+               if (!test_and_set_bit(nr, word))
                        break;
 
                hint = nr + 1;
-               if (hint >= word->depth - 1)
+               if (hint >= depth - 1)
                        hint = 0;
        }
 
@@ -119,7 +119,8 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
        index = SB_NR_TO_INDEX(sb, alloc_hint);
 
        for (i = 0; i < sb->map_nr; i++) {
-               nr = __sbitmap_get_word(&sb->map[index],
+               nr = __sbitmap_get_word(&sb->map[index].word,
+                                       sb->map[index].depth,
                                        SB_NR_TO_BIT(sb, alloc_hint),
                                        !round_robin);
                if (nr != -1) {
@@ -141,6 +142,37 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 }
 EXPORT_SYMBOL_GPL(sbitmap_get);
 
+int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
+                       unsigned long shallow_depth)
+{
+       unsigned int i, index;
+       int nr = -1;
+
+       index = SB_NR_TO_INDEX(sb, alloc_hint);
+
+       for (i = 0; i < sb->map_nr; i++) {
+               nr = __sbitmap_get_word(&sb->map[index].word,
+                                       min(sb->map[index].depth, shallow_depth),
+                                       SB_NR_TO_BIT(sb, alloc_hint), true);
+               if (nr != -1) {
+                       nr += index << sb->shift;
+                       break;
+               }
+
+               /* Jump to next index. */
+               index++;
+               alloc_hint = index << sb->shift;
+
+               if (index >= sb->map_nr) {
+                       index = 0;
+                       alloc_hint = 0;
+               }
+       }
+
+       return nr;
+}
+EXPORT_SYMBOL_GPL(sbitmap_get_shallow);
+
 bool sbitmap_any_bit_set(const struct sbitmap *sb)
 {
        unsigned int i;
@@ -342,6 +374,35 @@ int __sbitmap_queue_get(struct sbitmap_queue *sbq)
 }
 EXPORT_SYMBOL_GPL(__sbitmap_queue_get);
 
+int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
+                               unsigned int shallow_depth)
+{
+       unsigned int hint, depth;
+       int nr;
+
+       hint = this_cpu_read(*sbq->alloc_hint);
+       depth = READ_ONCE(sbq->sb.depth);
+       if (unlikely(hint >= depth)) {
+               hint = depth ? prandom_u32() % depth : 0;
+               this_cpu_write(*sbq->alloc_hint, hint);
+       }
+       nr = sbitmap_get_shallow(&sbq->sb, hint, shallow_depth);
+
+       if (nr == -1) {
+               /* If the map is full, a hint won't do us much good. */
+               this_cpu_write(*sbq->alloc_hint, 0);
+       } else if (nr == hint || unlikely(sbq->round_robin)) {
+               /* Only update the hint if we used it. */
+               hint = nr + 1;
+               if (hint >= depth - 1)
+                       hint = 0;
+               this_cpu_write(*sbq->alloc_hint, hint);
+       }
+
+       return nr;
+}
+EXPORT_SYMBOL_GPL(__sbitmap_queue_get_shallow);
+
 static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
 {
        int i, wake_index;
index 1a8d71a..4621db8 100644 (file)
@@ -31,7 +31,6 @@
  * their capability at compile-time, we just have to opt-out certain archs.
  */
 #if BITS_PER_LONG == 64 || (!(defined(CONFIG_ARM) && !defined(MMU)) && \
-                           !defined(CONFIG_AVR32) &&           \
                            !defined(CONFIG_BLACKFIN) &&        \
                            !defined(CONFIG_M32R) &&            \
                            !defined(CONFIG_M68K) &&            \
diff --git a/lib/usercopy.c b/lib/usercopy.c
new file mode 100644 (file)
index 0000000..1b6010a
--- /dev/null
@@ -0,0 +1,26 @@
+#include <linux/uaccess.h>
+
+/* out-of-line parts */
+
+#ifndef INLINE_COPY_FROM_USER
+unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+       unsigned long res = n;
+       if (likely(access_ok(VERIFY_READ, from, n)))
+               res = raw_copy_from_user(to, from, n);
+       if (unlikely(res))
+               memset(to + (n - res), 0, res);
+       return res;
+}
+EXPORT_SYMBOL(_copy_from_user);
+#endif
+
+#ifndef INLINE_COPY_TO_USER
+unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+       if (likely(access_ok(VERIFY_WRITE, to, n)))
+               n = raw_copy_to_user(to, from, n);
+       return n;       /* still the full n if access_ok() rejected 'to' */
+}
+EXPORT_SYMBOL(_copy_to_user);
+#endif
index 9b8fccb..beb7a45 100644 (file)
@@ -312,7 +312,6 @@ config NEED_BOUNCE_POOL
 config NR_QUICK
        int
        depends on QUICKLIST
-       default "2" if AVR32
        default "1"
 
 config VIRT_TO_BUS
index c6f2a37..f028a9a 100644 (file)
@@ -12,8 +12,6 @@
 #include <linux/device.h>
 #include <trace/events/writeback.h>
 
-static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-
 struct backing_dev_info noop_backing_dev_info = {
        .name           = "noop",
        .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK,
@@ -242,6 +240,8 @@ static __init int bdi_class_init(void)
 }
 postcore_initcall(bdi_class_init);
 
+static int bdi_init(struct backing_dev_info *bdi);
+
 static int __init default_bdi_init(void)
 {
        int err;
@@ -294,6 +294,8 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
 
        memset(wb, 0, sizeof(*wb));
 
+       if (wb != &bdi->wb)
+               bdi_get(bdi);
        wb->bdi = bdi;
        wb->last_old_flush = jiffies;
        INIT_LIST_HEAD(&wb->b_dirty);
@@ -314,8 +316,10 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
        wb->dirty_sleep = jiffies;
 
        wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
-       if (!wb->congested)
-               return -ENOMEM;
+       if (!wb->congested) {
+               err = -ENOMEM;
+               goto out_put_bdi;
+       }
 
        err = fprop_local_init_percpu(&wb->completions, gfp);
        if (err)
@@ -335,9 +339,14 @@ out_destroy_stat:
        fprop_local_destroy_percpu(&wb->completions);
 out_put_cong:
        wb_congested_put(wb->congested);
+out_put_bdi:
+       if (wb != &bdi->wb)
+               bdi_put(bdi);
        return err;
 }
 
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);
+
 /*
  * Remove bdi from the global list and shutdown any threads we have running
  */
@@ -347,10 +356,18 @@ static void wb_shutdown(struct bdi_writeback *wb)
        spin_lock_bh(&wb->work_lock);
        if (!test_and_clear_bit(WB_registered, &wb->state)) {
                spin_unlock_bh(&wb->work_lock);
+               /*
+                * Wait for wb shutdown to finish if someone else is just
+                * running wb_shutdown(). Otherwise we could proceed to wb /
+                * bdi destruction before wb_shutdown() is finished.
+                */
+               wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE);
                return;
        }
+       set_bit(WB_shutting_down, &wb->state);
        spin_unlock_bh(&wb->work_lock);
 
+       cgwb_remove_from_bdi_list(wb);
        /*
         * Drain work list and shutdown the delayed_work.  !WB_registered
         * tells wb_workfn() that @wb is dying and its work_list needs to
@@ -359,6 +376,12 @@ static void wb_shutdown(struct bdi_writeback *wb)
        mod_delayed_work(bdi_wq, &wb->dwork, 0);
        flush_delayed_work(&wb->dwork);
        WARN_ON(!list_empty(&wb->work_list));
+       /*
+        * Make sure bit gets cleared after shutdown is finished. Matches with
+        * the barrier provided by test_and_clear_bit() above.
+        */
+       smp_wmb();
+       clear_bit(WB_shutting_down, &wb->state);
 }
 
 static void wb_exit(struct bdi_writeback *wb)
@@ -372,6 +395,8 @@ static void wb_exit(struct bdi_writeback *wb)
 
        fprop_local_destroy_percpu(&wb->completions);
        wb_congested_put(wb->congested);
+       if (wb != &wb->bdi->wb)
+               bdi_put(wb->bdi);
 }
 
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -381,11 +406,9 @@ static void wb_exit(struct bdi_writeback *wb)
 /*
  * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
  * blkcg->cgwb_list, and memcg->cgwb_list.  bdi->cgwb_tree is also RCU
- * protected.  cgwb_release_wait is used to wait for the completion of cgwb
- * releases from bdi destruction path.
+ * protected.
  */
 static DEFINE_SPINLOCK(cgwb_lock);
-static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
 
 /**
  * wb_congested_get_create - get or create a wb_congested
@@ -438,7 +461,7 @@ retry:
                return NULL;
 
        atomic_set(&new_congested->refcnt, 0);
-       new_congested->bdi = bdi;
+       new_congested->__bdi = bdi;
        new_congested->blkcg_id = blkcg_id;
        goto retry;
 
@@ -466,10 +489,10 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
        }
 
        /* bdi might already have been destroyed leaving @congested unlinked */
-       if (congested->bdi) {
+       if (congested->__bdi) {
                rb_erase(&congested->rb_node,
-                        &congested->bdi->cgwb_congested_tree);
-               congested->bdi = NULL;
+                        &congested->__bdi->cgwb_congested_tree);
+               congested->__bdi = NULL;
        }
 
        spin_unlock_irqrestore(&cgwb_lock, flags);
@@ -480,11 +503,6 @@ static void cgwb_release_workfn(struct work_struct *work)
 {
        struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
                                                release_work);
-       struct backing_dev_info *bdi = wb->bdi;
-
-       spin_lock_irq(&cgwb_lock);
-       list_del_rcu(&wb->bdi_node);
-       spin_unlock_irq(&cgwb_lock);
 
        wb_shutdown(wb);
 
@@ -495,9 +513,6 @@ static void cgwb_release_workfn(struct work_struct *work)
        percpu_ref_exit(&wb->refcnt);
        wb_exit(wb);
        kfree_rcu(wb, rcu);
-
-       if (atomic_dec_and_test(&bdi->usage_cnt))
-               wake_up_all(&cgwb_release_wait);
 }
 
 static void cgwb_release(struct percpu_ref *refcnt)
@@ -517,6 +532,13 @@ static void cgwb_kill(struct bdi_writeback *wb)
        percpu_ref_kill(&wb->refcnt);
 }
 
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
+{
+       spin_lock_irq(&cgwb_lock);
+       list_del_rcu(&wb->bdi_node);
+       spin_unlock_irq(&cgwb_lock);
+}
+
 static int cgwb_create(struct backing_dev_info *bdi,
                       struct cgroup_subsys_state *memcg_css, gfp_t gfp)
 {
@@ -580,7 +602,6 @@ static int cgwb_create(struct backing_dev_info *bdi,
                /* we might have raced another instance of this function */
                ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
                if (!ret) {
-                       atomic_inc(&bdi->usage_cnt);
                        list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
                        list_add(&wb->memcg_node, memcg_cgwb_list);
                        list_add(&wb->blkcg_node, blkcg_cgwb_list);
@@ -670,7 +691,6 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
 
        INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
        bdi->cgwb_congested_tree = RB_ROOT;
-       atomic_set(&bdi->usage_cnt, 1);
 
        ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
        if (!ret) {
@@ -680,29 +700,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
        return ret;
 }
 
-static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
+static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
 {
        struct radix_tree_iter iter;
        void **slot;
+       struct bdi_writeback *wb;
 
        WARN_ON(test_bit(WB_registered, &bdi->wb.state));
 
        spin_lock_irq(&cgwb_lock);
        radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
                cgwb_kill(*slot);
-       spin_unlock_irq(&cgwb_lock);
 
-       /*
-        * All cgwb's must be shutdown and released before returning.  Drain
-        * the usage counter to wait for all cgwb's ever created on @bdi.
-        */
-       atomic_dec(&bdi->usage_cnt);
-       wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
-       /*
-        * Grab back our reference so that we hold it when @bdi gets
-        * re-registered.
-        */
-       atomic_inc(&bdi->usage_cnt);
+       while (!list_empty(&bdi->wb_list)) {
+               wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
+                                     bdi_node);
+               spin_unlock_irq(&cgwb_lock);
+               wb_shutdown(wb);
+               spin_lock_irq(&cgwb_lock);
+       }
+       spin_unlock_irq(&cgwb_lock);
 }
 
 /**
@@ -752,11 +769,18 @@ static void cgwb_bdi_exit(struct backing_dev_info *bdi)
                        rb_entry(rbn, struct bdi_writeback_congested, rb_node);
 
                rb_erase(rbn, &bdi->cgwb_congested_tree);
-               congested->bdi = NULL;  /* mark @congested unlinked */
+               congested->__bdi = NULL;        /* mark @congested unlinked */
        }
        spin_unlock_irq(&cgwb_lock);
 }
 
+static void cgwb_bdi_register(struct backing_dev_info *bdi)
+{
+       spin_lock_irq(&cgwb_lock);
+       list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+       spin_unlock_irq(&cgwb_lock);
+}
+
 #else  /* CONFIG_CGROUP_WRITEBACK */
 
 static int cgwb_bdi_init(struct backing_dev_info *bdi)
@@ -777,16 +801,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
        return 0;
 }
 
-static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
+static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }
 
 static void cgwb_bdi_exit(struct backing_dev_info *bdi)
 {
        wb_congested_put(bdi->wb_congested);
 }
 
+static void cgwb_bdi_register(struct backing_dev_info *bdi)
+{
+       list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+}
+
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
+{
+       list_del_rcu(&wb->bdi_node);
+}
+
 #endif /* CONFIG_CGROUP_WRITEBACK */
 
-int bdi_init(struct backing_dev_info *bdi)
+static int bdi_init(struct backing_dev_info *bdi)
 {
        int ret;
 
@@ -802,11 +836,8 @@ int bdi_init(struct backing_dev_info *bdi)
 
        ret = cgwb_bdi_init(bdi);
 
-       list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
-
        return ret;
 }
-EXPORT_SYMBOL(bdi_init);
 
 struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
 {
@@ -823,22 +854,20 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
        }
        return bdi;
 }
+EXPORT_SYMBOL(bdi_alloc_node);
 
-int bdi_register(struct backing_dev_info *bdi, struct device *parent,
-               const char *fmt, ...)
+int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
 {
-       va_list args;
        struct device *dev;
 
        if (bdi->dev)   /* The driver needs to use separate queues per device */
                return 0;
 
-       va_start(args, fmt);
-       dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
-       va_end(args);
+       dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args);
        if (IS_ERR(dev))
                return PTR_ERR(dev);
 
+       cgwb_bdi_register(bdi);
        bdi->dev = dev;
 
        bdi_debug_register(bdi, dev_name(dev));
@@ -851,20 +880,25 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
        trace_writeback_bdi_register(bdi);
        return 0;
 }
-EXPORT_SYMBOL(bdi_register);
+EXPORT_SYMBOL(bdi_register_va);
 
-int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
+int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
 {
-       return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+       va_list args;
+       int ret;
+
+       va_start(args, fmt);
+       ret = bdi_register_va(bdi, fmt, args);
+       va_end(args);
+       return ret;
 }
-EXPORT_SYMBOL(bdi_register_dev);
+EXPORT_SYMBOL(bdi_register);
 
 int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner)
 {
        int rc;
 
-       rc = bdi_register(bdi, NULL, "%u:%u", MAJOR(owner->devt),
-                       MINOR(owner->devt));
+       rc = bdi_register(bdi, "%u:%u", MAJOR(owner->devt), MINOR(owner->devt));
        if (rc)
                return rc;
        /* Leaking owner reference... */
@@ -892,7 +926,7 @@ void bdi_unregister(struct backing_dev_info *bdi)
        /* make sure nobody finds us on the bdi_list anymore */
        bdi_remove_from_list(bdi);
        wb_shutdown(&bdi->wb);
-       cgwb_bdi_destroy(bdi);
+       cgwb_bdi_unregister(bdi);
 
        if (bdi->dev) {
                bdi_debug_unregister(bdi);
@@ -906,19 +940,16 @@ void bdi_unregister(struct backing_dev_info *bdi)
        }
 }
 
-static void bdi_exit(struct backing_dev_info *bdi)
-{
-       WARN_ON_ONCE(bdi->dev);
-       wb_exit(&bdi->wb);
-       cgwb_bdi_exit(bdi);
-}
-
 static void release_bdi(struct kref *ref)
 {
        struct backing_dev_info *bdi =
                        container_of(ref, struct backing_dev_info, refcnt);
 
-       bdi_exit(bdi);
+       if (test_bit(WB_registered, &bdi->wb.state))
+               bdi_unregister(bdi);
+       WARN_ON_ONCE(bdi->dev);
+       wb_exit(&bdi->wb);
+       cgwb_bdi_exit(bdi);
        kfree(bdi);
 }
 
@@ -926,38 +957,7 @@ void bdi_put(struct backing_dev_info *bdi)
 {
        kref_put(&bdi->refcnt, release_bdi);
 }
-
-void bdi_destroy(struct backing_dev_info *bdi)
-{
-       bdi_unregister(bdi);
-       bdi_exit(bdi);
-}
-EXPORT_SYMBOL(bdi_destroy);
-
-/*
- * For use from filesystems to quickly init and register a bdi associated
- * with dirty writeback
- */
-int bdi_setup_and_register(struct backing_dev_info *bdi, char *name)
-{
-       int err;
-
-       bdi->name = name;
-       bdi->capabilities = 0;
-       err = bdi_init(bdi);
-       if (err)
-               return err;
-
-       err = bdi_register(bdi, NULL, "%.28s-%ld", name,
-                          atomic_long_inc_return(&bdi_seq));
-       if (err) {
-               bdi_destroy(bdi);
-               return err;
-       }
-
-       return 0;
-}
-EXPORT_SYMBOL(bdi_setup_and_register);
+EXPORT_SYMBOL(bdi_put);
 
 static wait_queue_head_t congestion_wqh[2] = {
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
index 1ebc93e..f3c4f9d 100644 (file)
@@ -240,18 +240,18 @@ static ssize_t defrag_store(struct kobject *kobj,
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
-       } else if (!memcmp("defer", buf,
-                   min(sizeof("defer")-1, count))) {
-               clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
-               clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
-               clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
-               set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
        } else if (!memcmp("defer+madvise", buf,
                    min(sizeof("defer+madvise")-1, count))) {
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+       } else if (!memcmp("defer", buf,
+                   min(sizeof("defer")-1, count))) {
+               clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+               clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+               clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+               set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
        } else if (!memcmp("madvise", buf,
                           min(sizeof("madvise")-1, count))) {
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
@@ -1568,8 +1568,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                deactivate_page(page);
 
        if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
-               orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
-                       tlb->fullmm);
+               pmdp_invalidate(vma, addr, pmd);
                orig_pmd = pmd_mkold(orig_pmd);
                orig_pmd = pmd_mkclean(orig_pmd);
 
@@ -1724,37 +1723,69 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 {
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
-       int ret = 0;
+       pmd_t entry;
+       bool preserve_write;
+       int ret;
 
        ptl = __pmd_trans_huge_lock(pmd, vma);
-       if (ptl) {
-               pmd_t entry;
-               bool preserve_write = prot_numa && pmd_write(*pmd);
-               ret = 1;
+       if (!ptl)
+               return 0;
 
-               /*
-                * Avoid trapping faults against the zero page. The read-only
-                * data is likely to be read-cached on the local CPU and
-                * local/remote hits to the zero page are not interesting.
-                */
-               if (prot_numa && is_huge_zero_pmd(*pmd)) {
-                       spin_unlock(ptl);
-                       return ret;
-               }
+       preserve_write = prot_numa && pmd_write(*pmd);
+       ret = 1;
 
-               if (!prot_numa || !pmd_protnone(*pmd)) {
-                       entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
-                       entry = pmd_modify(entry, newprot);
-                       if (preserve_write)
-                               entry = pmd_mk_savedwrite(entry);
-                       ret = HPAGE_PMD_NR;
-                       set_pmd_at(mm, addr, pmd, entry);
-                       BUG_ON(vma_is_anonymous(vma) && !preserve_write &&
-                                       pmd_write(entry));
-               }
-               spin_unlock(ptl);
-       }
+       /*
+        * Avoid trapping faults against the zero page. The read-only
+        * data is likely to be read-cached on the local CPU and
+        * local/remote hits to the zero page are not interesting.
+        */
+       if (prot_numa && is_huge_zero_pmd(*pmd))
+               goto unlock;
+
+       if (prot_numa && pmd_protnone(*pmd))
+               goto unlock;
+
+       /*
+        * In case prot_numa, we are under down_read(mmap_sem). It's critical
+        * to not clear pmd intermittently to avoid race with MADV_DONTNEED
+        * which is also under down_read(mmap_sem):
+        *
+        *      CPU0:                           CPU1:
+        *                              change_huge_pmd(prot_numa=1)
+        *                               pmdp_huge_get_and_clear_notify()
+        * madvise_dontneed()
+        *  zap_pmd_range()
+        *   pmd_trans_huge(*pmd) == 0 (without ptl)
+        *   // skip the pmd
+        *                               set_pmd_at();
+        *                               // pmd is re-established
+        *
+        * The race makes MADV_DONTNEED miss the huge pmd and not clear it,
+        * which may break userspace.
+        *
+        * pmdp_invalidate() is required to make sure we don't miss
+        * dirty/young flags set by hardware.
+        */
+       entry = *pmd;
+       pmdp_invalidate(vma, addr, pmd);
 
+       /*
+        * Recover dirty/young flags.  It relies on pmdp_invalidate to not
+        * corrupt them.
+        */
+       if (pmd_dirty(*pmd))
+               entry = pmd_mkdirty(entry);
+       if (pmd_young(*pmd))
+               entry = pmd_mkyoung(entry);
+
+       entry = pmd_modify(entry, newprot);
+       if (preserve_write)
+               entry = pmd_mk_savedwrite(entry);
+       ret = HPAGE_PMD_NR;
+       set_pmd_at(mm, addr, pmd, entry);
+       BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
+unlock:
+       spin_unlock(ptl);
        return ret;
 }
 
index ccfc2a2..266efae 100644 (file)
@@ -481,6 +481,13 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 enum ttu_flags;
 struct tlbflush_unmap_batch;
 
+
+/*
+ * only for MM internal work items which do not depend on
+ * any allocations or locks which might depend on allocations
+ */
+extern struct workqueue_struct *mm_percpu_wq;
+
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 void try_to_unmap_flush(void);
 void try_to_unmap_flush_dirty(void);
index 235ba51..6ff5d72 100644 (file)
@@ -4298,7 +4298,7 @@ void __might_fault(const char *file, int line)
         * get paged out, therefore we'll never actually fault, and the
         * below annotations will generate false positives.
         */
-       if (segment_eq(get_fs(), KERNEL_DS))
+       if (uaccess_kernel())
                return;
        if (pagefault_disabled())
                return;
index 75b2745..37d0b33 100644 (file)
@@ -1529,7 +1529,6 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
                       compat_ulong_t, maxnode)
 {
-       long err = 0;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);
@@ -1538,14 +1537,13 @@ COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 
        if (nmask) {
-               err = compat_get_bitmap(bm, nmask, nr_bits);
+               if (compat_get_bitmap(bm, nmask, nr_bits))
+                       return -EFAULT;
                nm = compat_alloc_user_space(alloc_size);
-               err |= copy_to_user(nm, bm, alloc_size);
+               if (copy_to_user(nm, bm, alloc_size))
+                       return -EFAULT;
        }
 
-       if (err)
-               return -EFAULT;
-
        return sys_set_mempolicy(mode, nm, nr_bits+1);
 }
 
@@ -1553,7 +1551,6 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
                       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
                       compat_ulong_t, maxnode, compat_ulong_t, flags)
 {
-       long err = 0;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        nodemask_t bm;
@@ -1562,14 +1559,13 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 
        if (nmask) {
-               err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
+               if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
+                       return -EFAULT;
                nm = compat_alloc_user_space(alloc_size);
-               err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
+               if (copy_to_user(nm, nodes_addr(bm), alloc_size))
+                       return -EFAULT;
        }
 
-       if (err)
-               return -EFAULT;
-
        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
 }
 
index ed97c2c..738f1d5 100644 (file)
@@ -184,9 +184,9 @@ void putback_movable_pages(struct list_head *l)
                        unlock_page(page);
                        put_page(page);
                } else {
-                       putback_lru_page(page);
                        dec_node_page_state(page, NR_ISOLATED_ANON +
                                        page_is_file_cache(page));
+                       putback_lru_page(page);
                }
        }
 }
index 6cbde31..07efbc3 100644 (file)
@@ -1090,10 +1090,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 {
        int migratetype = 0;
        int batch_free = 0;
-       unsigned long nr_scanned, flags;
+       unsigned long nr_scanned;
        bool isolated_pageblocks;
 
-       spin_lock_irqsave(&zone->lock, flags);
+       spin_lock(&zone->lock);
        isolated_pageblocks = has_isolate_pageblock(zone);
        nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
        if (nr_scanned)
@@ -1142,7 +1142,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                        trace_mm_page_pcpu_drain(page, 0, mt);
                } while (--count && --batch_free && !list_empty(list));
        }
-       spin_unlock_irqrestore(&zone->lock, flags);
+       spin_unlock(&zone->lock);
 }
 
 static void free_one_page(struct zone *zone,
@@ -1150,9 +1150,8 @@ static void free_one_page(struct zone *zone,
                                unsigned int order,
                                int migratetype)
 {
-       unsigned long nr_scanned, flags;
-       spin_lock_irqsave(&zone->lock, flags);
-       __count_vm_events(PGFREE, 1 << order);
+       unsigned long nr_scanned;
+       spin_lock(&zone->lock);
        nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
        if (nr_scanned)
                __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
@@ -1162,7 +1161,7 @@ static void free_one_page(struct zone *zone,
                migratetype = get_pfnblock_migratetype(page, pfn);
        }
        __free_one_page(page, pfn, zone, order, migratetype);
-       spin_unlock_irqrestore(&zone->lock, flags);
+       spin_unlock(&zone->lock);
 }
 
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -1240,6 +1239,7 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
 
 static void __free_pages_ok(struct page *page, unsigned int order)
 {
+       unsigned long flags;
        int migratetype;
        unsigned long pfn = page_to_pfn(page);
 
@@ -1247,7 +1247,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
                return;
 
        migratetype = get_pfnblock_migratetype(page, pfn);
+       local_irq_save(flags);
+       __count_vm_events(PGFREE, 1 << order);
        free_one_page(page_zone(page), page, pfn, order, migratetype);
+       local_irq_restore(flags);
 }
 
 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
@@ -2219,9 +2222,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
                        int migratetype, bool cold)
 {
        int i, alloced = 0;
-       unsigned long flags;
 
-       spin_lock_irqsave(&zone->lock, flags);
+       spin_lock(&zone->lock);
        for (i = 0; i < count; ++i) {
                struct page *page = __rmqueue(zone, order, migratetype);
                if (unlikely(page == NULL))
@@ -2257,7 +2259,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
         * pages added to the pcp list.
         */
        __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
-       spin_unlock_irqrestore(&zone->lock, flags);
+       spin_unlock(&zone->lock);
        return alloced;
 }
 
@@ -2373,6 +2375,13 @@ void drain_all_pages(struct zone *zone)
         */
        static cpumask_t cpus_with_pcps;
 
+       /*
+        * Make sure nobody triggers this path before mm_percpu_wq is fully
+        * initialized.
+        */
+       if (WARN_ON_ONCE(!mm_percpu_wq))
+               return;
+
        /* Workqueues cannot recurse */
        if (current->flags & PF_WQ_WORKER)
                return;
@@ -2422,7 +2431,7 @@ void drain_all_pages(struct zone *zone)
        for_each_cpu(cpu, &cpus_with_pcps) {
                struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
                INIT_WORK(work, drain_local_pages_wq);
-               schedule_work_on(cpu, work);
+               queue_work_on(cpu, mm_percpu_wq, work);
        }
        for_each_cpu(cpu, &cpus_with_pcps)
                flush_work(per_cpu_ptr(&pcpu_drain, cpu));
@@ -2478,20 +2487,17 @@ void free_hot_cold_page(struct page *page, bool cold)
 {
        struct zone *zone = page_zone(page);
        struct per_cpu_pages *pcp;
+       unsigned long flags;
        unsigned long pfn = page_to_pfn(page);
        int migratetype;
 
-       if (in_interrupt()) {
-               __free_pages_ok(page, 0);
-               return;
-       }
-
        if (!free_pcp_prepare(page))
                return;
 
        migratetype = get_pfnblock_migratetype(page, pfn);
        set_pcppage_migratetype(page, migratetype);
-       preempt_disable();
+       local_irq_save(flags);
+       __count_vm_event(PGFREE);
 
        /*
         * We only track unmovable, reclaimable and movable on pcp lists.
@@ -2508,7 +2514,6 @@ void free_hot_cold_page(struct page *page, bool cold)
                migratetype = MIGRATE_MOVABLE;
        }
 
-       __count_vm_event(PGFREE);
        pcp = &this_cpu_ptr(zone->pageset)->pcp;
        if (!cold)
                list_add(&page->lru, &pcp->lists[migratetype]);
@@ -2522,7 +2527,7 @@ void free_hot_cold_page(struct page *page, bool cold)
        }
 
 out:
-       preempt_enable();
+       local_irq_restore(flags);
 }
 
 /*
@@ -2647,8 +2652,6 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
 {
        struct page *page;
 
-       VM_BUG_ON(in_interrupt());
-
        do {
                if (list_empty(list)) {
                        pcp->count += rmqueue_bulk(zone, 0,
@@ -2679,8 +2682,9 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
        struct list_head *list;
        bool cold = ((gfp_flags & __GFP_COLD) != 0);
        struct page *page;
+       unsigned long flags;
 
-       preempt_disable();
+       local_irq_save(flags);
        pcp = &this_cpu_ptr(zone->pageset)->pcp;
        list = &pcp->lists[migratetype];
        page = __rmqueue_pcplist(zone,  migratetype, cold, pcp, list);
@@ -2688,7 +2692,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
                __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
                zone_statistics(preferred_zone, zone);
        }
-       preempt_enable();
+       local_irq_restore(flags);
        return page;
 }
 
@@ -2704,7 +2708,7 @@ struct page *rmqueue(struct zone *preferred_zone,
        unsigned long flags;
        struct page *page;
 
-       if (likely(order == 0) && !in_interrupt()) {
+       if (likely(order == 0)) {
                page = rmqueue_pcplist(preferred_zone, zone, order,
                                gfp_flags, migratetype);
                goto out;
@@ -4519,13 +4523,13 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                        K(node_page_state(pgdat, NR_FILE_MAPPED)),
                        K(node_page_state(pgdat, NR_FILE_DIRTY)),
                        K(node_page_state(pgdat, NR_WRITEBACK)),
+                       K(node_page_state(pgdat, NR_SHMEM)),
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
                        K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
                        K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
                                        * HPAGE_PMD_NR),
                        K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
 #endif
-                       K(node_page_state(pgdat, NR_SHMEM)),
                        K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
                        K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
                        node_page_state(pgdat, NR_PAGES_SCANNED),
index c4c9def..de9c40d 100644 (file)
@@ -111,12 +111,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
        if (pvmw->pmd && !pvmw->pte)
                return not_found(pvmw);
 
-       /* Only for THP, seek to next pte entry makes sense */
-       if (pvmw->pte) {
-               if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page))
-                       return not_found(pvmw);
+       if (pvmw->pte)
                goto next_pte;
-       }
 
        if (unlikely(PageHuge(pvmw->page))) {
                /* when pud is not present, pte will be NULL */
@@ -165,9 +161,14 @@ restart:
        while (1) {
                if (check_pte(pvmw))
                        return true;
-next_pte:      do {
+next_pte:
+               /* Seek to next pte only makes sense for THP */
+               if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page))
+                       return not_found(pvmw);
+               do {
                        pvmw->address += PAGE_SIZE;
-                       if (pvmw->address >=
+                       if (pvmw->address >= pvmw->vma->vm_end ||
+                           pvmw->address >=
                                        __vma_address(pvmw->page, pvmw->vma) +
                                        hpage_nr_pages(pvmw->page) * PAGE_SIZE)
                                return not_found(pvmw);
index 60a6488..e0aa8ae 100644 (file)
@@ -1284,18 +1284,7 @@ void free_percpu(void __percpu *ptr)
 }
 EXPORT_SYMBOL_GPL(free_percpu);
 
-/**
- * is_kernel_percpu_address - test whether address is from static percpu area
- * @addr: address to test
- *
- * Test whether @addr belongs to in-kernel static percpu area.  Module
- * static percpu areas are not considered.  For those, use
- * is_module_percpu_address().
- *
- * RETURNS:
- * %true if @addr is from in-kernel static percpu area, %false otherwise.
- */
-bool is_kernel_percpu_address(unsigned long addr)
+bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
 {
 #ifdef CONFIG_SMP
        const size_t static_size = __per_cpu_end - __per_cpu_start;
@@ -1304,16 +1293,39 @@ bool is_kernel_percpu_address(unsigned long addr)
 
        for_each_possible_cpu(cpu) {
                void *start = per_cpu_ptr(base, cpu);
+               void *va = (void *)addr;
 
-               if ((void *)addr >= start && (void *)addr < start + static_size)
+               if (va >= start && va < start + static_size) {
+                       if (can_addr) {
+                               *can_addr = (unsigned long) (va - start);
+                               *can_addr += (unsigned long)
+                                       per_cpu_ptr(base, get_boot_cpu_id());
+                       }
                        return true;
-        }
+               }
+       }
 #endif
        /* on UP, can't distinguish from other static vars, always false */
        return false;
 }
 
 /**
+ * is_kernel_percpu_address - test whether address is from static percpu area
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to in-kernel static percpu area.  Module
+ * static percpu areas are not considered.  For those, use
+ * is_module_percpu_address().
+ *
+ * RETURNS:
+ * %true if @addr is from in-kernel static percpu area, %false otherwise.
+ */
+bool is_kernel_percpu_address(unsigned long addr)
+{
+       return __is_kernel_percpu_address(addr, NULL);
+}
+
+/**
  * per_cpu_ptr_to_phys - convert translated percpu address to physical address
  * @addr: the address to be converted to physical address
  *
index c4910f1..5dabf44 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -670,30 +670,19 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
 
 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
 
-/*
- * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
- * workqueue, aiding in getting memory freed.
- */
-static struct workqueue_struct *lru_add_drain_wq;
-
-static int __init lru_init(void)
-{
-       lru_add_drain_wq = alloc_workqueue("lru-add-drain", WQ_MEM_RECLAIM, 0);
-
-       if (WARN(!lru_add_drain_wq,
-               "Failed to create workqueue lru_add_drain_wq"))
-               return -ENOMEM;
-
-       return 0;
-}
-early_initcall(lru_init);
-
 void lru_add_drain_all(void)
 {
        static DEFINE_MUTEX(lock);
        static struct cpumask has_work;
        int cpu;
 
+       /*
+        * Make sure nobody triggers this path before mm_percpu_wq is fully
+        * initialized.
+        */
+       if (WARN_ON(!mm_percpu_wq))
+               return;
+
        mutex_lock(&lock);
        get_online_cpus();
        cpumask_clear(&has_work);
@@ -707,7 +696,7 @@ void lru_add_drain_all(void)
                    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
                    need_activate_page_drain(cpu)) {
                        INIT_WORK(work, lru_add_drain_per_cpu);
-                       queue_work_on(cpu, lru_add_drain_wq, work);
+                       queue_work_on(cpu, mm_percpu_wq, work);
                        cpumask_set_cpu(cpu, &has_work);
                }
        }
index 310ac0b..ac6318a 100644 (file)
@@ -201,6 +201,8 @@ void swap_cgroup_swapoff(int type)
                        struct page *page = map[i];
                        if (page)
                                __free_page(page);
+                       if (!(i % SWAP_CLUSTER_MAX))
+                               cond_resched();
                }
                vfree(map);
        }
index 89f9539..5a4f5c5 100644 (file)
@@ -1552,7 +1552,6 @@ static const struct file_operations proc_vmstat_file_operations = {
 #endif /* CONFIG_PROC_FS */
 
 #ifdef CONFIG_SMP
-static struct workqueue_struct *vmstat_wq;
 static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
 int sysctl_stat_interval __read_mostly = HZ;
 
@@ -1623,7 +1622,7 @@ static void vmstat_update(struct work_struct *w)
                 * to occur in the future. Keep on running the
                 * update worker thread.
                 */
-               queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+               queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
                                this_cpu_ptr(&vmstat_work),
                                round_jiffies_relative(sysctl_stat_interval));
        }
@@ -1702,7 +1701,7 @@ static void vmstat_shepherd(struct work_struct *w)
                struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
 
                if (!delayed_work_pending(dw) && need_update(cpu))
-                       queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
+                       queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
        }
        put_online_cpus();
 
@@ -1718,7 +1717,6 @@ static void __init start_shepherd_timer(void)
                INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
                        vmstat_update);
 
-       vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
        schedule_delayed_work(&shepherd,
                round_jiffies_relative(sysctl_stat_interval));
 }
@@ -1764,11 +1762,15 @@ static int vmstat_cpu_dead(unsigned int cpu)
 
 #endif
 
+struct workqueue_struct *mm_percpu_wq;
+
 void __init init_mm_internals(void)
 {
-#ifdef CONFIG_SMP
-       int ret;
+       int ret __maybe_unused;
 
+       mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
+
+#ifdef CONFIG_SMP
        ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
                                        NULL, vmstat_cpu_dead);
        if (ret < 0)
index f9492bc..54f63c4 100644 (file)
@@ -185,6 +185,12 @@ static inline void z3fold_page_lock(struct z3fold_header *zhdr)
        spin_lock(&zhdr->page_lock);
 }
 
+/* Try to lock a z3fold page */
+static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
+{
+       return spin_trylock(&zhdr->page_lock);
+}
+
 /* Unlock a z3fold page */
 static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
 {
@@ -385,7 +391,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
                        spin_lock(&pool->lock);
                        zhdr = list_first_entry_or_null(&pool->unbuddied[i],
                                                struct z3fold_header, buddy);
-                       if (!zhdr) {
+                       if (!zhdr || !z3fold_page_trylock(zhdr)) {
                                spin_unlock(&pool->lock);
                                continue;
                        }
@@ -394,7 +400,6 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
                        spin_unlock(&pool->lock);
 
                        page = virt_to_page(zhdr);
-                       z3fold_page_lock(zhdr);
                        if (zhdr->first_chunks == 0) {
                                if (zhdr->middle_chunks != 0 &&
                                    chunks >= zhdr->start_middle)
index b7ee9c3..d41edd2 100644 (file)
@@ -276,7 +276,7 @@ struct zs_pool {
 struct zspage {
        struct {
                unsigned int fullness:FULLNESS_BITS;
-               unsigned int class:CLASS_BITS;
+               unsigned int class:CLASS_BITS + 1;
                unsigned int isolated:ISOLATED_BITS;
                unsigned int magic:MAGIC_VAL_BITS;
        };
index 3ce672a..8e5c6a8 100644 (file)
@@ -2101,6 +2101,10 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset)
                trace_9p_protocol_dump(clnt, req->rc);
                goto free_and_error;
        }
+       if (rsize < count) {
+               pr_err("bogus RREADDIR count (%d > %d)\n", count, rsize);
+               count = rsize;
+       }
 
        p9_debug(P9_DEBUG_9P, "<<< RREADDIR count %d\n", count);
 
index ea71513..430b53e 100644 (file)
@@ -119,6 +119,16 @@ static int br_dev_init(struct net_device *dev)
        return err;
 }
 
+static void br_dev_uninit(struct net_device *dev)
+{
+       struct net_bridge *br = netdev_priv(dev);
+
+       br_multicast_dev_del(br);
+       br_multicast_uninit_stats(br);
+       br_vlan_flush(br);
+       free_percpu(br->stats);
+}
+
 static int br_dev_open(struct net_device *dev)
 {
        struct net_bridge *br = netdev_priv(dev);
@@ -332,6 +342,7 @@ static const struct net_device_ops br_netdev_ops = {
        .ndo_open                = br_dev_open,
        .ndo_stop                = br_dev_stop,
        .ndo_init                = br_dev_init,
+       .ndo_uninit              = br_dev_uninit,
        .ndo_start_xmit          = br_dev_xmit,
        .ndo_get_stats64         = br_get_stats64,
        .ndo_set_mac_address     = br_set_mac_address,
@@ -356,14 +367,6 @@ static const struct net_device_ops br_netdev_ops = {
        .ndo_features_check      = passthru_features_check,
 };
 
-static void br_dev_free(struct net_device *dev)
-{
-       struct net_bridge *br = netdev_priv(dev);
-
-       free_percpu(br->stats);
-       free_netdev(dev);
-}
-
 static struct device_type br_type = {
        .name   = "bridge",
 };
@@ -376,7 +379,7 @@ void br_dev_setup(struct net_device *dev)
        ether_setup(dev);
 
        dev->netdev_ops = &br_netdev_ops;
-       dev->destructor = br_dev_free;
+       dev->destructor = free_netdev;
        dev->ethtool_ops = &br_ethtool_ops;
        SET_NETDEV_DEVTYPE(dev, &br_type);
        dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE;
index 8ac1770..a8d0ed2 100644 (file)
@@ -311,8 +311,6 @@ void br_dev_delete(struct net_device *dev, struct list_head *head)
 
        br_fdb_delete_by_port(br, NULL, 0, 1);
 
-       br_vlan_flush(br);
-       br_multicast_dev_del(br);
        cancel_delayed_work_sync(&br->gc_work);
 
        br_sysfs_delbr(br->dev);
index b760f26..faa7261 100644 (file)
@@ -2031,8 +2031,6 @@ void br_multicast_dev_del(struct net_bridge *br)
 
 out:
        spin_unlock_bh(&br->multicast_lock);
-
-       free_percpu(br->mcast_stats);
 }
 
 int br_multicast_set_router(struct net_bridge *br, unsigned long val)
@@ -2531,6 +2529,11 @@ int br_multicast_init_stats(struct net_bridge *br)
        return 0;
 }
 
+void br_multicast_uninit_stats(struct net_bridge *br)
+{
+       free_percpu(br->mcast_stats);
+}
+
 static void mcast_stats_add_dir(u64 *dst, u64 *src)
 {
        dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX];
index a8f6acd..225ef7d 100644 (file)
@@ -1165,11 +1165,14 @@ static int br_dev_newlink(struct net *src_net, struct net_device *dev,
                spin_unlock_bh(&br->lock);
        }
 
-       err = br_changelink(dev, tb, data);
+       err = register_netdevice(dev);
        if (err)
                return err;
 
-       return register_netdevice(dev);
+       err = br_changelink(dev, tb, data);
+       if (err)
+               unregister_netdevice(dev);
+       return err;
 }
 
 static size_t br_get_size(const struct net_device *brdev)
index 6136818..0d17728 100644 (file)
@@ -620,6 +620,7 @@ void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
 void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p,
                        const struct sk_buff *skb, u8 type, u8 dir);
 int br_multicast_init_stats(struct net_bridge *br);
+void br_multicast_uninit_stats(struct net_bridge *br);
 void br_multicast_get_stats(const struct net_bridge *br,
                            const struct net_bridge_port *p,
                            struct br_mcast_stats *dest);
@@ -760,6 +761,10 @@ static inline int br_multicast_init_stats(struct net_bridge *br)
        return 0;
 }
 
+static inline void br_multicast_uninit_stats(struct net_bridge *br)
+{
+}
+
 static inline int br_multicast_igmp_type(const struct sk_buff *skb)
 {
        return 0;
index ea63334..f4947e7 100644 (file)
@@ -398,7 +398,7 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
                           struct iov_iter *to, int len)
 {
        int start = skb_headlen(skb);
-       int i, copy = start - offset;
+       int i, copy = start - offset, start_off = offset, n;
        struct sk_buff *frag_iter;
 
        trace_skb_copy_datagram_iovec(skb, len);
@@ -407,11 +407,12 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
        if (copy > 0) {
                if (copy > len)
                        copy = len;
-               if (copy_to_iter(skb->data + offset, copy, to) != copy)
+               n = copy_to_iter(skb->data + offset, copy, to);
+               offset += n;
+               if (n != copy)
                        goto short_copy;
                if ((len -= copy) == 0)
                        return 0;
-               offset += copy;
        }
 
        /* Copy paged appendix. Hmm... why does this look so complicated? */
@@ -425,13 +426,14 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
                if ((copy = end - offset) > 0) {
                        if (copy > len)
                                copy = len;
-                       if (copy_page_to_iter(skb_frag_page(frag),
+                       n = copy_page_to_iter(skb_frag_page(frag),
                                              frag->page_offset + offset -
-                                             start, copy, to) != copy)
+                                             start, copy, to);
+                       offset += n;
+                       if (n != copy)
                                goto short_copy;
                        if (!(len -= copy))
                                return 0;
-                       offset += copy;
                }
                start = end;
        }
@@ -463,6 +465,7 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
         */
 
 fault:
+       iov_iter_revert(to, offset - start_off);
        return -EFAULT;
 
 short_copy:
@@ -613,7 +616,7 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
                                      __wsum *csump)
 {
        int start = skb_headlen(skb);
-       int i, copy = start - offset;
+       int i, copy = start - offset, start_off = offset;
        struct sk_buff *frag_iter;
        int pos = 0;
        int n;
@@ -623,11 +626,11 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
                if (copy > len)
                        copy = len;
                n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to);
+               offset += n;
                if (n != copy)
                        goto fault;
                if ((len -= copy) == 0)
                        return 0;
-               offset += copy;
                pos = copy;
        }
 
@@ -649,12 +652,12 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
                                                  offset - start, copy,
                                                  &csum2, to);
                        kunmap(page);
+                       offset += n;
                        if (n != copy)
                                goto fault;
                        *csump = csum_block_add(*csump, csum2, pos);
                        if (!(len -= copy))
                                return 0;
-                       offset += copy;
                        pos += copy;
                }
                start = end;
@@ -687,6 +690,7 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
                return 0;
 
 fault:
+       iov_iter_revert(to, offset - start_off);
        return -EFAULT;
 }
 
@@ -771,6 +775,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
        }
        return 0;
 csum_error:
+       iov_iter_revert(&msg->msg_iter, chunk);
        return -EINVAL;
 fault:
        return -EFAULT;
index 7869ae3..c57878b 100644 (file)
@@ -2450,6 +2450,9 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
 {
        unsigned long flags;
 
+       if (unlikely(!skb))
+               return;
+
        if (likely(atomic_read(&skb->users) == 1)) {
                smp_rmb();
                atomic_set(&skb->users, 0);
@@ -4240,7 +4243,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
                 */
                current->flags |= PF_MEMALLOC;
                ret = __netif_receive_skb_core(skb, true);
-               tsk_restore_flags(current, pflags, PF_MEMALLOC);
+               current_restore_flags(pflags, PF_MEMALLOC);
        } else
                ret = __netif_receive_skb_core(skb, false);
 
@@ -6757,7 +6760,6 @@ int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
 
        return err;
 }
-EXPORT_SYMBOL(dev_change_xdp_fd);
 
 /**
  *     dev_new_index   -       allocate an ifindex
index c35aae1..d98d499 100644 (file)
@@ -390,7 +390,7 @@ mpls:
                        unsigned char ar_tip[4];
                } *arp_eth, _arp_eth;
                const struct arphdr *arp;
-               struct arphdr *_arp;
+               struct arphdr _arp;
 
                arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data,
                                           hlen, &_arp);
index e7c12ca..4526cbd 100644 (file)
@@ -860,7 +860,8 @@ static void neigh_probe(struct neighbour *neigh)
        if (skb)
                skb = skb_clone(skb, GFP_ATOMIC);
        write_unlock(&neigh->lock);
-       neigh->ops->solicit(neigh, skb);
+       if (neigh->ops->solicit)
+               neigh->ops->solicit(neigh, skb);
        atomic_inc(&neigh->probes);
        kfree_skb(skb);
 }
index 9424673..29be246 100644 (file)
@@ -105,15 +105,21 @@ static void queue_process(struct work_struct *work)
        while ((skb = skb_dequeue(&npinfo->txq))) {
                struct net_device *dev = skb->dev;
                struct netdev_queue *txq;
+               unsigned int q_index;
 
                if (!netif_device_present(dev) || !netif_running(dev)) {
                        kfree_skb(skb);
                        continue;
                }
 
-               txq = skb_get_tx_queue(dev, skb);
-
                local_irq_save(flags);
+               /* check if skb->queue_mapping is still valid */
+               q_index = skb_get_queue_mapping(skb);
+               if (unlikely(q_index >= dev->real_num_tx_queues)) {
+                       q_index = q_index % dev->real_num_tx_queues;
+                       skb_set_queue_mapping(skb, q_index);
+               }
+               txq = netdev_get_tx_queue(dev, q_index);
                HARD_TX_LOCK(dev, txq, smp_processor_id());
                if (netif_xmit_frozen_or_stopped(txq) ||
                    netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) {
index 758f140..d28da7d 100644 (file)
 #include <net/tcp.h>
 
 static siphash_key_t net_secret __read_mostly;
+static siphash_key_t ts_secret __read_mostly;
 
 static __always_inline void net_secret_init(void)
 {
+       net_get_random_once(&ts_secret, sizeof(ts_secret));
        net_get_random_once(&net_secret, sizeof(net_secret));
 }
 #endif
@@ -45,6 +47,23 @@ static u32 seq_scale(u32 seq)
 #endif
 
 #if IS_ENABLED(CONFIG_IPV6)
+static u32 secure_tcpv6_ts_off(const __be32 *saddr, const __be32 *daddr)
+{
+       const struct {
+               struct in6_addr saddr;
+               struct in6_addr daddr;
+       } __aligned(SIPHASH_ALIGNMENT) combined = {
+               .saddr = *(struct in6_addr *)saddr,
+               .daddr = *(struct in6_addr *)daddr,
+       };
+
+       if (sysctl_tcp_timestamps != 1)
+               return 0;
+
+       return siphash(&combined, offsetofend(typeof(combined), daddr),
+                      &ts_secret);
+}
+
 u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
                                 __be16 sport, __be16 dport, u32 *tsoff)
 {
@@ -63,7 +82,7 @@ u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
        net_secret_init();
        hash = siphash(&combined, offsetofend(typeof(combined), dport),
                       &net_secret);
-       *tsoff = sysctl_tcp_timestamps == 1 ? (hash >> 32) : 0;
+       *tsoff = secure_tcpv6_ts_off(saddr, daddr);
        return seq_scale(hash);
 }
 EXPORT_SYMBOL(secure_tcpv6_sequence_number);
@@ -88,6 +107,14 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
 #endif
 
 #ifdef CONFIG_INET
+static u32 secure_tcp_ts_off(__be32 saddr, __be32 daddr)
+{
+       if (sysctl_tcp_timestamps != 1)
+               return 0;
+
+       return siphash_2u32((__force u32)saddr, (__force u32)daddr,
+                           &ts_secret);
+}
 
 /* secure_tcp_sequence_number(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d),
  * but fortunately, `sport' cannot be 0 in any circumstances. If this changes,
@@ -103,7 +130,7 @@ u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
        hash = siphash_3u32((__force u32)saddr, (__force u32)daddr,
                            (__force u32)sport << 16 | (__force u32)dport,
                            &net_secret);
-       *tsoff = sysctl_tcp_timestamps == 1 ? (hash >> 32) : 0;
+       *tsoff = secure_tcp_ts_off(saddr, daddr);
        return seq_scale(hash);
 }
 
index 9f78109..f1d0459 100644 (file)
@@ -1576,6 +1576,8 @@ done:
                skb_set_tail_pointer(skb, len);
        }
 
+       if (!skb->sk || skb->destructor == sock_edemux)
+               skb_condense(skb);
        return 0;
 }
 EXPORT_SYMBOL(___pskb_trim);
@@ -3082,22 +3084,32 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
        if (sg && csum && (mss != GSO_BY_FRAGS))  {
                if (!(features & NETIF_F_GSO_PARTIAL)) {
                        struct sk_buff *iter;
+                       unsigned int frag_len;
 
                        if (!list_skb ||
                            !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
                                goto normal;
 
-                       /* Split the buffer at the frag_list pointer.
-                        * This is based on the assumption that all
-                        * buffers in the chain excluding the last
-                        * containing the same amount of data.
+                       /* If we get here then all the required
+                        * GSO features except frag_list are supported.
+                        * Try to split the SKB to multiple GSO SKBs
+                        * with no frag_list.
+                        * Currently we can do that only when the buffers don't
+                        * have a linear part and all the buffers except
+                        * the last are of the same length.
                         */
+                       frag_len = list_skb->len;
                        skb_walk_frags(head_skb, iter) {
+                               if (frag_len != iter->len && iter->next)
+                                       goto normal;
                                if (skb_headlen(iter))
                                        goto normal;
 
                                len -= iter->len;
                        }
+
+                       if (len != frag_len)
+                               goto normal;
                }
 
                /* GSO partial only requires that we trim off any excess that
@@ -3807,6 +3819,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb,
        serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
        serr->ee.ee_info = tstype;
        serr->opt_stats = opt_stats;
+       serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
        if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
                serr->ee.ee_data = skb_shinfo(skb)->tskey;
                if (sk->sk_protocol == IPPROTO_TCP &&
index 2c4f574..b416a53 100644 (file)
@@ -325,7 +325,7 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 
        current->flags |= PF_MEMALLOC;
        ret = sk->sk_backlog_rcv(sk, skb);
-       tsk_restore_flags(current, pflags, PF_MEMALLOC);
+       current_restore_flags(pflags, PF_MEMALLOC);
 
        return ret;
 }
index 4ead336..7f9cc40 100644 (file)
@@ -408,14 +408,16 @@ static struct ctl_table net_core_table[] = {
                .data           = &sysctl_net_busy_poll,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-               .proc_handler   = proc_dointvec
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
        },
        {
                .procname       = "busy_read",
                .data           = &sysctl_net_busy_read,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-               .proc_handler   = proc_dointvec
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
        },
 #endif
 #ifdef CONFIG_NET_SCHED
index 6592d7b..32c467c 100644 (file)
 #include <linux/percpu.h>
 #include <linux/init.h>
 #include <linux/ratelimit.h>
+#include <linux/socket.h>
 
 #include <net/sock.h>
 #include <net/net_ratelimit.h>
+#include <net/ipv6.h>
 
 #include <asm/byteorder.h>
 #include <linux/uaccess.h>
@@ -300,6 +302,107 @@ out:
 }
 EXPORT_SYMBOL(in6_pton);
 
+static int inet4_pton(const char *src, u16 port_num,
+               struct sockaddr_storage *addr)
+{
+       struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
+       int srclen = strlen(src);
+
+       if (srclen > INET_ADDRSTRLEN)
+               return -EINVAL;
+
+       if (in4_pton(src, srclen, (u8 *)&addr4->sin_addr.s_addr,
+                    '\n', NULL) == 0)
+               return -EINVAL;
+
+       addr4->sin_family = AF_INET;
+       addr4->sin_port = htons(port_num);
+
+       return 0;
+}
+
+static int inet6_pton(struct net *net, const char *src, u16 port_num,
+               struct sockaddr_storage *addr)
+{
+       struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
+       const char *scope_delim;
+       int srclen = strlen(src);
+
+       if (srclen > INET6_ADDRSTRLEN)
+               return -EINVAL;
+
+       if (in6_pton(src, srclen, (u8 *)&addr6->sin6_addr.s6_addr,
+                    '%', &scope_delim) == 0)
+               return -EINVAL;
+
+       if (ipv6_addr_type(&addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL &&
+           src + srclen != scope_delim && *scope_delim == '%') {
+               struct net_device *dev;
+               char scope_id[16];
+               size_t scope_len = min_t(size_t, sizeof(scope_id) - 1,
+                                        src + srclen - scope_delim - 1);
+
+               memcpy(scope_id, scope_delim + 1, scope_len);
+               scope_id[scope_len] = '\0';
+
+               dev = dev_get_by_name(net, scope_id);
+               if (dev) {
+                       addr6->sin6_scope_id = dev->ifindex;
+                       dev_put(dev);
+               } else if (kstrtouint(scope_id, 0, &addr6->sin6_scope_id)) {
+                       return -EINVAL;
+               }
+       }
+
+       addr6->sin6_family = AF_INET6;
+       addr6->sin6_port = htons(port_num);
+
+       return 0;
+}
+
+/**
+ * inet_pton_with_scope - convert an IPv4/IPv6 and port to socket address
+ * @net: net namespace (used for scope handling)
+ * @af: address family, AF_INET, AF_INET6 or AF_UNSPEC for either
+ * @src: the start of the address string
+ * @port: the start of the port string (or NULL for none)
+ * @addr: output socket address
+ *
+ * Return zero on success, return errno when any error occurs.
+ */
+int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af,
+               const char *src, const char *port, struct sockaddr_storage *addr)
+{
+       u16 port_num;
+       int ret = -EINVAL;
+
+       if (port) {
+               if (kstrtou16(port, 0, &port_num))
+                       return -EINVAL;
+       } else {
+               port_num = 0;
+       }
+
+       switch (af) {
+       case AF_INET:
+               ret = inet4_pton(src, port_num, addr);
+               break;
+       case AF_INET6:
+               ret = inet6_pton(net, src, port_num, addr);
+               break;
+       case AF_UNSPEC:
+               ret = inet4_pton(src, port_num, addr);
+               if (ret)
+                       ret = inet6_pton(net, src, port_num, addr);
+               break;
+       default:
+               pr_err("unexpected address family %d\n", af);
+       };
+
+       return ret;
+}
+EXPORT_SYMBOL(inet_pton_with_scope);
+
 void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
                              __be32 from, __be32 to, bool pseudohdr)
 {
index 6b1fc6e..13a9a32 100644 (file)
@@ -1343,6 +1343,9 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb)
        if (*(u8 *)iph != 0x45)
                goto out_unlock;
 
+       if (ip_is_fragment(iph))
+               goto out_unlock;
+
        if (unlikely(ip_fast_csum((u8 *)iph, 5)))
                goto out_unlock;
 
index ebd953b..1d46d05 100644 (file)
@@ -488,16 +488,15 @@ static bool ipv4_datagram_support_cmsg(const struct sock *sk,
                return false;
 
        /* Support IP_PKTINFO on tstamp packets if requested, to correlate
-        * timestamp with egress dev. Not possible for packets without dev
+        * timestamp with egress dev. Not possible for packets without iif
         * or without payload (SOF_TIMESTAMPING_OPT_TSONLY).
         */
-       if ((!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) ||
-           (!skb->dev))
+       info = PKTINFO_SKB_CB(skb);
+       if (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG) ||
+           !info->ipi_ifindex)
                return false;
 
-       info = PKTINFO_SKB_CB(skb);
        info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr;
-       info->ipi_ifindex = skb->dev->ifindex;
        return true;
 }
 
@@ -591,6 +590,7 @@ static bool setsockopt_needs_rtnl(int optname)
        case MCAST_LEAVE_GROUP:
        case MCAST_LEAVE_SOURCE_GROUP:
        case MCAST_UNBLOCK_SOURCE:
+       case IP_ROUTER_ALERT:
                return true;
        }
        return false;
index fd9f34b..dfb2ab2 100644 (file)
@@ -306,7 +306,7 @@ static void __init ic_close_devs(void)
        while ((d = next)) {
                next = d->next;
                dev = d->dev;
-               if ((!ic_dev || dev != ic_dev->dev) && !netdev_uses_dsa(dev)) {
+               if (d != ic_dev && !netdev_uses_dsa(dev)) {
                        pr_debug("IP-Config: Downing %s\n", dev->name);
                        dev_change_flags(dev, d->flags);
                }
index c0317c9..b036e85 100644 (file)
@@ -1278,7 +1278,7 @@ static void mrtsock_destruct(struct sock *sk)
        struct net *net = sock_net(sk);
        struct mr_table *mrt;
 
-       rtnl_lock();
+       ASSERT_RTNL();
        ipmr_for_each_table(mrt, net) {
                if (sk == rtnl_dereference(mrt->mroute_sk)) {
                        IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
@@ -1289,7 +1289,6 @@ static void mrtsock_destruct(struct sock *sk)
                        mroute_clean_tables(mrt, false);
                }
        }
-       rtnl_unlock();
 }
 
 /* Socket options and virtual interface manipulation. The whole
@@ -1353,13 +1352,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
                if (sk != rcu_access_pointer(mrt->mroute_sk)) {
                        ret = -EACCES;
                } else {
-                       /* We need to unlock here because mrtsock_destruct takes
-                        * care of rtnl itself and we can't change that due to
-                        * the IP_ROUTER_ALERT setsockopt which runs without it.
-                        */
-                       rtnl_unlock();
                        ret = ip_ra_control(sk, 0, NULL);
-                       goto out;
+                       goto out_unlock;
                }
                break;
        case MRT_ADD_VIF:
@@ -1470,7 +1464,6 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
        }
 out_unlock:
        rtnl_unlock();
-out:
        return ret;
 }
 
index 52f2645..9b88413 100644 (file)
@@ -461,7 +461,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
 
        clusterip_config_put(cipinfo->config);
 
-       nf_ct_netns_get(par->net, par->family);
+       nf_ct_netns_put(par->net, par->family);
 }
 
 #ifdef CONFIG_COMPAT
index c9b52c3..53e49f5 100644 (file)
@@ -1260,16 +1260,6 @@ static const struct nf_conntrack_expect_policy snmp_exp_policy = {
        .timeout        = 180,
 };
 
-static struct nf_conntrack_helper snmp_helper __read_mostly = {
-       .me                     = THIS_MODULE,
-       .help                   = help,
-       .expect_policy          = &snmp_exp_policy,
-       .name                   = "snmp",
-       .tuple.src.l3num        = AF_INET,
-       .tuple.src.u.udp.port   = cpu_to_be16(SNMP_PORT),
-       .tuple.dst.protonum     = IPPROTO_UDP,
-};
-
 static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
        .me                     = THIS_MODULE,
        .help                   = help,
@@ -1288,22 +1278,16 @@ static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
 
 static int __init nf_nat_snmp_basic_init(void)
 {
-       int ret = 0;
-
        BUG_ON(nf_nat_snmp_hook != NULL);
        RCU_INIT_POINTER(nf_nat_snmp_hook, help);
 
-       ret = nf_conntrack_helper_register(&snmp_trap_helper);
-       if (ret < 0) {
-               nf_conntrack_helper_unregister(&snmp_helper);
-               return ret;
-       }
-       return ret;
+       return nf_conntrack_helper_register(&snmp_trap_helper);
 }
 
 static void __exit nf_nat_snmp_basic_fini(void)
 {
        RCU_INIT_POINTER(nf_nat_snmp_hook, NULL);
+       synchronize_rcu();
        nf_conntrack_helper_unregister(&snmp_trap_helper);
 }
 
index 2af6244..ccfbce1 100644 (file)
@@ -156,17 +156,18 @@ int ping_hash(struct sock *sk)
 void ping_unhash(struct sock *sk)
 {
        struct inet_sock *isk = inet_sk(sk);
+
        pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
+       write_lock_bh(&ping_table.lock);
        if (sk_hashed(sk)) {
-               write_lock_bh(&ping_table.lock);
                hlist_nulls_del(&sk->sk_nulls_node);
                sk_nulls_node_init(&sk->sk_nulls_node);
                sock_put(sk);
                isk->inet_num = 0;
                isk->inet_sport = 0;
                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-               write_unlock_bh(&ping_table.lock);
        }
+       write_unlock_bh(&ping_table.lock);
 }
 EXPORT_SYMBOL_GPL(ping_unhash);
 
index 8119e1f..9d94397 100644 (file)
@@ -682,7 +682,9 @@ static void raw_close(struct sock *sk, long timeout)
        /*
         * Raw sockets may have direct kernel references. Kill them.
         */
+       rtnl_lock();
        ip_ra_control(sk, 0, NULL);
+       rtnl_unlock();
 
        sk_common_release(sk);
 }
index 8471dd1..d972488 100644 (file)
@@ -2359,7 +2359,8 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
                }
 
                /* L3 master device is the loopback for that domain */
-               dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
+               dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(res)) ? :
+                       net->loopback_dev;
                fl4->flowi4_oif = dev_out->ifindex;
                flags |= RTCF_LOCAL;
                goto make_route;
@@ -2620,7 +2621,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
        skb_reset_network_header(skb);
 
        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
-       ip_hdr(skb)->protocol = IPPROTO_ICMP;
+       ip_hdr(skb)->protocol = IPPROTO_UDP;
        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
 
        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
index 1e319a5..40ba424 100644 (file)
@@ -2322,6 +2322,7 @@ int tcp_disconnect(struct sock *sk, int flags)
        tcp_init_send_head(sk);
        memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
        __sk_dst_reset(sk);
+       tcp_saved_syn_free(tp);
 
        /* Clean up fastopen related fields */
        tcp_free_fastopen_req(tp);
index 79c4817..6e3c512 100644 (file)
@@ -168,12 +168,8 @@ void tcp_assign_congestion_control(struct sock *sk)
        }
 out:
        rcu_read_unlock();
+       memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
 
-       /* Clear out private data before diag gets it and
-        * the ca has not been initialized.
-        */
-       if (ca->get_info)
-               memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
        if (ca->flags & TCP_CONG_NEEDS_ECN)
                INET_ECN_xmit(sk);
        else
@@ -200,11 +196,10 @@ static void tcp_reinit_congestion_control(struct sock *sk,
        tcp_cleanup_congestion_control(sk);
        icsk->icsk_ca_ops = ca;
        icsk->icsk_ca_setsockopt = 1;
+       memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
 
-       if (sk->sk_state != TCP_CLOSE) {
-               memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+       if (sk->sk_state != TCP_CLOSE)
                tcp_init_congestion_control(sk);
-       }
 }
 
 /* Manage refcounts on socket close. */
index c431197..659d1ba 100644 (file)
@@ -126,7 +126,8 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 #define REXMIT_LOST    1 /* retransmit packets marked lost */
 #define REXMIT_NEW     2 /* FRTO-style transmit of unsent/new packets */
 
-static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb)
+static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
+                            unsigned int len)
 {
        static bool __once __read_mostly;
 
@@ -137,8 +138,9 @@ static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb)
 
                rcu_read_lock();
                dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
-               pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
-                       dev ? dev->name : "Unknown driver");
+               if (!dev || len >= dev->mtu)
+                       pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
+                               dev ? dev->name : "Unknown driver");
                rcu_read_unlock();
        }
 }
@@ -161,8 +163,10 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
        if (len >= icsk->icsk_ack.rcv_mss) {
                icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
                                               tcp_sk(sk)->advmss);
-               if (unlikely(icsk->icsk_ack.rcv_mss != len))
-                       tcp_gro_dev_warn(sk, skb);
+               /* Account for possibly-removed options */
+               if (unlikely(len > icsk->icsk_ack.rcv_mss +
+                                  MAX_TCP_OPTION_SPACE))
+                       tcp_gro_dev_warn(sk, skb, len);
        } else {
                /* Otherwise, we make more careful check taking into account,
                 * that SACKs block is variable.
@@ -874,22 +878,11 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
                                  const int ts)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       if (metric > tp->reordering) {
-               int mib_idx;
+       int mib_idx;
 
+       if (metric > tp->reordering) {
                tp->reordering = min(sysctl_tcp_max_reordering, metric);
 
-               /* This exciting event is worth to be remembered. 8) */
-               if (ts)
-                       mib_idx = LINUX_MIB_TCPTSREORDER;
-               else if (tcp_is_reno(tp))
-                       mib_idx = LINUX_MIB_TCPRENOREORDER;
-               else if (tcp_is_fack(tp))
-                       mib_idx = LINUX_MIB_TCPFACKREORDER;
-               else
-                       mib_idx = LINUX_MIB_TCPSACKREORDER;
-
-               NET_INC_STATS(sock_net(sk), mib_idx);
 #if FASTRETRANS_DEBUG > 1
                pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
                         tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
@@ -902,6 +895,18 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
        }
 
        tp->rack.reord = 1;
+
+       /* This exciting event is worth to be remembered. 8) */
+       if (ts)
+               mib_idx = LINUX_MIB_TCPTSREORDER;
+       else if (tcp_is_reno(tp))
+               mib_idx = LINUX_MIB_TCPRENOREORDER;
+       else if (tcp_is_fack(tp))
+               mib_idx = LINUX_MIB_TCPFACKREORDER;
+       else
+               mib_idx = LINUX_MIB_TCPSACKREORDER;
+
+       NET_INC_STATS(sock_net(sk), mib_idx);
 }
 
 /* This must be called before lost_out is incremented */
@@ -1930,6 +1935,7 @@ void tcp_enter_loss(struct sock *sk)
        struct tcp_sock *tp = tcp_sk(sk);
        struct net *net = sock_net(sk);
        struct sk_buff *skb;
+       bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
        bool is_reneg;                  /* is receiver reneging on SACKs? */
        bool mark_lost;
 
@@ -1989,15 +1995,18 @@ void tcp_enter_loss(struct sock *sk)
        tp->high_seq = tp->snd_nxt;
        tcp_ecn_queue_cwr(tp);
 
-       /* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO
-        * if a previous recovery is underway, otherwise it may incorrectly
-        * call a timeout spurious if some previously retransmitted packets
-        * are s/acked (sec 3.2). We do not apply that retriction since
-        * retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS
-        * so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO
-        * on PTMU discovery to avoid sending new data.
+       /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
+        * loss recovery is underway except recurring timeout(s) on
+        * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+        *
+        * In theory F-RTO can be used repeatedly during loss recovery.
+        * In practice this interacts badly with broken middle-boxes that
+        * falsely raise the receive window, which results in repeated
+        * timeouts and stop-and-go behavior.
         */
-       tp->frto = sysctl_tcp_frto && !inet_csk(sk)->icsk_mtup.probe_size;
+       tp->frto = sysctl_tcp_frto &&
+                  (new_recovery || icsk->icsk_retransmits) &&
+                  !inet_csk(sk)->icsk_mtup.probe_size;
 }
 
 /* If ACK arrived pointing to a remembered SACK, it means that our
index 22548b5..a85d863 100644 (file)
@@ -1267,7 +1267,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
  * eventually). The difference is that pulled data not copied, but
  * immediately discarded.
  */
-static void __pskb_trim_head(struct sk_buff *skb, int len)
+static int __pskb_trim_head(struct sk_buff *skb, int len)
 {
        struct skb_shared_info *shinfo;
        int i, k, eat;
@@ -1277,7 +1277,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
                __skb_pull(skb, eat);
                len -= eat;
                if (!len)
-                       return;
+                       return 0;
        }
        eat = len;
        k = 0;
@@ -1303,23 +1303,28 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
        skb_reset_tail_pointer(skb);
        skb->data_len -= len;
        skb->len = skb->data_len;
+       return len;
 }
 
 /* Remove acked data from a packet in the transmit queue. */
 int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 {
+       u32 delta_truesize;
+
        if (skb_unclone(skb, GFP_ATOMIC))
                return -ENOMEM;
 
-       __pskb_trim_head(skb, len);
+       delta_truesize = __pskb_trim_head(skb, len);
 
        TCP_SKB_CB(skb)->seq += len;
        skb->ip_summed = CHECKSUM_PARTIAL;
 
-       skb->truesize        -= len;
-       sk->sk_wmem_queued   -= len;
-       sk_mem_uncharge(sk, len);
-       sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+       if (delta_truesize) {
+               skb->truesize      -= delta_truesize;
+               sk->sk_wmem_queued -= delta_truesize;
+               sk_mem_uncharge(sk, delta_truesize);
+               sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+       }
 
        /* Any change of skb->len requires recalculation of tso factor. */
        if (tcp_skb_pcount(skb) > 1)
@@ -2999,6 +3004,8 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
 {
        struct sk_buff *skb;
 
+       TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
+
        /* NOTE: No TCP options attached and we never retransmit this. */
        skb = alloc_skb(MAX_TCP_HEADER, priority);
        if (!skb) {
@@ -3014,8 +3021,6 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
        /* Send it off. */
        if (tcp_transmit_skb(sk, skb, 0, priority))
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
-
-       TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
 }
 
 /* Send a crossed SYN-ACK during socket establishment.
index 4ecb38a..d8acbd9 100644 (file)
@@ -12,7 +12,8 @@ static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
                /* Account for retransmits that are lost again */
                TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
                tp->retrans_out -= tcp_skb_pcount(skb);
-               NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
+               NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
+                             tcp_skb_pcount(skb));
        }
 }
 
index b2be1d9..7812501 100644 (file)
@@ -29,6 +29,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
        u16 mac_len = skb->mac_len;
        int udp_offset, outer_hlen;
        __wsum partial;
+       bool need_ipsec;
 
        if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
                goto out;
@@ -62,8 +63,10 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 
        ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
 
+       need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
        /* Try to offload checksum if possible */
        offload_csum = !!(need_csum &&
+                         !need_ipsec &&
                          (skb->dev->features &
                           (is_ipv6 ? (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM) :
                                      (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM))));
index 3631725..0ea96c4 100644 (file)
@@ -3271,14 +3271,24 @@ static void addrconf_gre_config(struct net_device *dev)
 static int fixup_permanent_addr(struct inet6_dev *idev,
                                struct inet6_ifaddr *ifp)
 {
-       if (!ifp->rt) {
-               struct rt6_info *rt;
+       /* rt6i_ref == 0 means the host route was removed from the
+        * FIB, for example, if 'lo' device is taken down. In that
+        * case regenerate the host route.
+        */
+       if (!ifp->rt || !atomic_read(&ifp->rt->rt6i_ref)) {
+               struct rt6_info *rt, *prev;
 
                rt = addrconf_dst_alloc(idev, &ifp->addr, false);
                if (unlikely(IS_ERR(rt)))
                        return PTR_ERR(rt);
 
+               /* ifp->rt can be accessed outside of rtnl */
+               spin_lock(&ifp->lock);
+               prev = ifp->rt;
                ifp->rt = rt;
+               spin_unlock(&ifp->lock);
+
+               ip6_rt_put(prev);
        }
 
        if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
@@ -3626,14 +3636,19 @@ restart:
        INIT_LIST_HEAD(&del_list);
        list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
                struct rt6_info *rt = NULL;
+               bool keep;
 
                addrconf_del_dad_work(ifa);
 
+               keep = keep_addr && (ifa->flags & IFA_F_PERMANENT) &&
+                       !addr_is_local(&ifa->addr);
+               if (!keep)
+                       list_move(&ifa->if_list, &del_list);
+
                write_unlock_bh(&idev->lock);
                spin_lock_bh(&ifa->lock);
 
-               if (keep_addr && (ifa->flags & IFA_F_PERMANENT) &&
-                   !addr_is_local(&ifa->addr)) {
+               if (keep) {
                        /* set state to skip the notifier below */
                        state = INET6_IFADDR_STATE_DEAD;
                        ifa->state = 0;
@@ -3645,8 +3660,6 @@ restart:
                } else {
                        state = ifa->state;
                        ifa->state = INET6_IFADDR_STATE_DEAD;
-
-                       list_move(&ifa->if_list, &del_list);
                }
 
                spin_unlock_bh(&ifa->lock);
index a9a9553..e82e59f 100644 (file)
@@ -933,8 +933,6 @@ static int __init inet6_init(void)
        if (err)
                goto igmp_fail;
 
-       ipv6_stub = &ipv6_stub_impl;
-
        err = ipv6_netfilter_init();
        if (err)
                goto netfilter_fail;
@@ -1010,6 +1008,10 @@ static int __init inet6_init(void)
        if (err)
                goto sysctl_fail;
 #endif
+
+       /* ensure that ipv6 stubs are visible only after ipv6 is ready */
+       wmb();
+       ipv6_stub = &ipv6_stub_impl;
 out:
        return err;
 
index eec27f8..e011122 100644 (file)
@@ -405,9 +405,6 @@ static inline bool ipv6_datagram_support_addr(struct sock_exterr_skb *serr)
  * At one point, excluding local errors was a quick test to identify icmp/icmp6
  * errors. This is no longer true, but the test remained, so the v6 stack,
  * unlike v4, also honors cmsg requests on all wifi and timestamp errors.
- *
- * Timestamp code paths do not initialize the fields expected by cmsg:
- * the PKTINFO fields in skb->cb[]. Fill those in here.
  */
 static bool ip6_datagram_support_cmsg(struct sk_buff *skb,
                                      struct sock_exterr_skb *serr)
@@ -419,14 +416,9 @@ static bool ip6_datagram_support_cmsg(struct sk_buff *skb,
        if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL)
                return false;
 
-       if (!skb->dev)
+       if (!IP6CB(skb)->iif)
                return false;
 
-       if (skb->protocol == htons(ETH_P_IPV6))
-               IP6CB(skb)->iif = skb->dev->ifindex;
-       else
-               PKTINFO_SKB_CB(skb)->ipi_ifindex = skb->dev->ifindex;
-
        return true;
 }
 
index 275cac6..d32e211 100644 (file)
@@ -388,7 +388,6 @@ looped_back:
                icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
                                  ((&hdr->segments_left) -
                                   skb_network_header(skb)));
-               kfree_skb(skb);
                return -1;
        }
 
@@ -910,6 +909,8 @@ static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto,
 {
        switch (opt->type) {
        case IPV6_SRCRT_TYPE_0:
+       case IPV6_SRCRT_STRICT:
+       case IPV6_SRCRT_TYPE_2:
                ipv6_push_rthdr0(skb, proto, opt, addr_p, saddr);
                break;
        case IPV6_SRCRT_TYPE_4:
@@ -1164,6 +1165,8 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
 
        switch (opt->srcrt->type) {
        case IPV6_SRCRT_TYPE_0:
+       case IPV6_SRCRT_STRICT:
+       case IPV6_SRCRT_TYPE_2:
                fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr;
                break;
        case IPV6_SRCRT_TYPE_4:
index aacfb4b..c45b12b 100644 (file)
@@ -122,11 +122,14 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
                        max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
        /*
         * RFC4291 2.5.3
+        * The loopback address must not be used as the source address in IPv6
+        * packets that are sent outside of a single node. [..]
         * A packet received on an interface with a destination address
         * of loopback must be dropped.
         */
-       if (!(dev->flags & IFF_LOOPBACK) &&
-           ipv6_addr_loopback(&hdr->daddr))
+       if ((ipv6_addr_loopback(&hdr->saddr) ||
+            ipv6_addr_loopback(&hdr->daddr)) &&
+            !(dev->flags & IFF_LOOPBACK))
                goto err;
 
        /* RFC4291 Errata ID: 3480
index 75fac93..a9692ec 100644 (file)
@@ -1037,7 +1037,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
        struct ip6_tnl *t = netdev_priv(dev);
        struct net *net = t->net;
        struct net_device_stats *stats = &t->dev->stats;
-       struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+       struct ipv6hdr *ipv6h;
        struct ipv6_tel_txoption opt;
        struct dst_entry *dst = NULL, *ndst = NULL;
        struct net_device *tdev;
@@ -1057,26 +1057,28 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
 
        /* NBMA tunnel */
        if (ipv6_addr_any(&t->parms.raddr)) {
-               struct in6_addr *addr6;
-               struct neighbour *neigh;
-               int addr_type;
+               if (skb->protocol == htons(ETH_P_IPV6)) {
+                       struct in6_addr *addr6;
+                       struct neighbour *neigh;
+                       int addr_type;
 
-               if (!skb_dst(skb))
-                       goto tx_err_link_failure;
+                       if (!skb_dst(skb))
+                               goto tx_err_link_failure;
 
-               neigh = dst_neigh_lookup(skb_dst(skb),
-                                        &ipv6_hdr(skb)->daddr);
-               if (!neigh)
-                       goto tx_err_link_failure;
+                       neigh = dst_neigh_lookup(skb_dst(skb),
+                                                &ipv6_hdr(skb)->daddr);
+                       if (!neigh)
+                               goto tx_err_link_failure;
 
-               addr6 = (struct in6_addr *)&neigh->primary_key;
-               addr_type = ipv6_addr_type(addr6);
+                       addr6 = (struct in6_addr *)&neigh->primary_key;
+                       addr_type = ipv6_addr_type(addr6);
 
-               if (addr_type == IPV6_ADDR_ANY)
-                       addr6 = &ipv6_hdr(skb)->daddr;
+                       if (addr_type == IPV6_ADDR_ANY)
+                               addr6 = &ipv6_hdr(skb)->daddr;
 
-               memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
-               neigh_release(neigh);
+                       memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
+                       neigh_release(neigh);
+               }
        } else if (!(t->parms.flags &
                     (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) {
                /* enable the cache only only if the routing decision does
index 6ba6c90..bf34d09 100644 (file)
@@ -774,7 +774,8 @@ failure:
  *     Delete a VIF entry
  */
 
-static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head)
+static int mif6_delete(struct mr6_table *mrt, int vifi, int notify,
+                      struct list_head *head)
 {
        struct mif_device *v;
        struct net_device *dev;
@@ -820,7 +821,7 @@ static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head)
                                             dev->ifindex, &in6_dev->cnf);
        }
 
-       if (v->flags & MIFF_REGISTER)
+       if ((v->flags & MIFF_REGISTER) && !notify)
                unregister_netdevice_queue(dev, head);
 
        dev_put(dev);
@@ -1331,7 +1332,6 @@ static int ip6mr_device_event(struct notifier_block *this,
        struct mr6_table *mrt;
        struct mif_device *v;
        int ct;
-       LIST_HEAD(list);
 
        if (event != NETDEV_UNREGISTER)
                return NOTIFY_DONE;
@@ -1340,10 +1340,9 @@ static int ip6mr_device_event(struct notifier_block *this,
                v = &mrt->vif6_table[0];
                for (ct = 0; ct < mrt->maxvif; ct++, v++) {
                        if (v->dev == dev)
-                               mif6_delete(mrt, ct, &list);
+                               mif6_delete(mrt, ct, 1, NULL);
                }
        }
-       unregister_netdevice_many(&list);
 
        return NOTIFY_DONE;
 }
@@ -1552,7 +1551,7 @@ static void mroute_clean_tables(struct mr6_table *mrt, bool all)
        for (i = 0; i < mrt->maxvif; i++) {
                if (!all && (mrt->vif6_table[i].flags & VIFF_STATIC))
                        continue;
-               mif6_delete(mrt, i, &list);
+               mif6_delete(mrt, i, 0, &list);
        }
        unregister_netdevice_many(&list);
 
@@ -1707,7 +1706,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
                if (copy_from_user(&mifi, optval, sizeof(mifi_t)))
                        return -EFAULT;
                rtnl_lock();
-               ret = mif6_delete(mrt, mifi, NULL);
+               ret = mif6_delete(mrt, mifi, 0, NULL);
                rtnl_unlock();
                return ret;
 
index 7ebac63..cb17667 100644 (file)
@@ -1749,7 +1749,8 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event,
                idev = in6_dev_get(dev);
                if (!idev)
                        break;
-               if (idev->cnf.ndisc_notify)
+               if (idev->cnf.ndisc_notify ||
+                   net->ipv6.devconf_all->ndisc_notify)
                        ndisc_send_unsol_na(dev);
                in6_dev_put(idev);
                break;
index f174e76..0da6a12 100644 (file)
@@ -1178,8 +1178,7 @@ static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg)
                spin_lock_bh(&sk->sk_receive_queue.lock);
                skb = skb_peek(&sk->sk_receive_queue);
                if (skb)
-                       amount = skb_tail_pointer(skb) -
-                               skb_transport_header(skb);
+                       amount = skb->len;
                spin_unlock_bh(&sk->sk_receive_queue.lock);
                return put_user(amount, (int __user *)arg);
        }
index 9db1418..fb174b5 100644 (file)
@@ -1854,6 +1854,10 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
        int addr_type;
        int err = -EINVAL;
 
+       /* RTF_PCPU is an internal flag; can not be set by userspace */
+       if (cfg->fc_flags & RTF_PCPU)
+               goto out;
+
        if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
                goto out;
 #ifndef CONFIG_IPV6_SUBTREES
index a855eb3..5f44ffe 100644 (file)
@@ -53,6 +53,9 @@ bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len)
                struct sr6_tlv *tlv;
                unsigned int tlv_len;
 
+               if (trailing < sizeof(*tlv))
+                       return false;
+
                tlv = (struct sr6_tlv *)((unsigned char *)srh + tlv_offset);
                tlv_len = sizeof(*tlv) + tlv->len;
 
index 309062f..31762f7 100644 (file)
@@ -1687,7 +1687,7 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
                struct kcm_attach info;
 
                if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
-                       err = -EFAULT;
+                       return -EFAULT;
 
                err = kcm_attach_ioctl(sock, &info);
 
@@ -1697,7 +1697,7 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
                struct kcm_unattach info;
 
                if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
-                       err = -EFAULT;
+                       return -EFAULT;
 
                err = kcm_unattach_ioctl(sock, &info);
 
@@ -1708,7 +1708,7 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
                struct socket *newsock = NULL;
 
                if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
-                       err = -EFAULT;
+                       return -EFAULT;
 
                err = kcm_clone(sock, &info, &newsock);
 
index c6252ed..be8cecc 100644 (file)
@@ -63,8 +63,13 @@ struct pfkey_sock {
                } u;
                struct sk_buff  *skb;
        } dump;
+       struct mutex dump_lock;
 };
 
+static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len,
+                              xfrm_address_t *saddr, xfrm_address_t *daddr,
+                              u16 *family);
+
 static inline struct pfkey_sock *pfkey_sk(struct sock *sk)
 {
        return (struct pfkey_sock *)sk;
@@ -139,6 +144,7 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol,
 {
        struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
        struct sock *sk;
+       struct pfkey_sock *pfk;
        int err;
 
        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -153,6 +159,9 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol,
        if (sk == NULL)
                goto out;
 
+       pfk = pfkey_sk(sk);
+       mutex_init(&pfk->dump_lock);
+
        sock->ops = &pfkey_ops;
        sock_init_data(sock, sk);
 
@@ -281,13 +290,23 @@ static int pfkey_do_dump(struct pfkey_sock *pfk)
        struct sadb_msg *hdr;
        int rc;
 
+       mutex_lock(&pfk->dump_lock);
+       if (!pfk->dump.dump) {
+               rc = 0;
+               goto out;
+       }
+
        rc = pfk->dump.dump(pfk);
-       if (rc == -ENOBUFS)
-               return 0;
+       if (rc == -ENOBUFS) {
+               rc = 0;
+               goto out;
+       }
 
        if (pfk->dump.skb) {
-               if (!pfkey_can_dump(&pfk->sk))
-                       return 0;
+               if (!pfkey_can_dump(&pfk->sk)) {
+                       rc = 0;
+                       goto out;
+               }
 
                hdr = (struct sadb_msg *) pfk->dump.skb->data;
                hdr->sadb_msg_seq = 0;
@@ -298,6 +317,9 @@ static int pfkey_do_dump(struct pfkey_sock *pfk)
        }
 
        pfkey_terminate_dump(pfk);
+
+out:
+       mutex_unlock(&pfk->dump_lock);
        return rc;
 }
 
@@ -1793,19 +1815,26 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_ms
        struct xfrm_address_filter *filter = NULL;
        struct pfkey_sock *pfk = pfkey_sk(sk);
 
-       if (pfk->dump.dump != NULL)
+       mutex_lock(&pfk->dump_lock);
+       if (pfk->dump.dump != NULL) {
+               mutex_unlock(&pfk->dump_lock);
                return -EBUSY;
+       }
 
        proto = pfkey_satype2proto(hdr->sadb_msg_satype);
-       if (proto == 0)
+       if (proto == 0) {
+               mutex_unlock(&pfk->dump_lock);
                return -EINVAL;
+       }
 
        if (ext_hdrs[SADB_X_EXT_FILTER - 1]) {
                struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1];
 
                filter = kmalloc(sizeof(*filter), GFP_KERNEL);
-               if (filter == NULL)
+               if (filter == NULL) {
+                       mutex_unlock(&pfk->dump_lock);
                        return -ENOMEM;
+               }
 
                memcpy(&filter->saddr, &xfilter->sadb_x_filter_saddr,
                       sizeof(xfrm_address_t));
@@ -1821,6 +1850,7 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_ms
        pfk->dump.dump = pfkey_dump_sa;
        pfk->dump.done = pfkey_dump_sa_done;
        xfrm_state_walk_init(&pfk->dump.u.state, proto, filter);
+       mutex_unlock(&pfk->dump_lock);
 
        return pfkey_do_dump(pfk);
 }
@@ -1913,19 +1943,14 @@ parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq)
 
        /* addresses present only in tunnel mode */
        if (t->mode == XFRM_MODE_TUNNEL) {
-               u8 *sa = (u8 *) (rq + 1);
-               int family, socklen;
+               int err;
 
-               family = pfkey_sockaddr_extract((struct sockaddr *)sa,
-                                               &t->saddr);
-               if (!family)
-                       return -EINVAL;
-
-               socklen = pfkey_sockaddr_len(family);
-               if (pfkey_sockaddr_extract((struct sockaddr *)(sa + socklen),
-                                          &t->id.daddr) != family)
-                       return -EINVAL;
-               t->encap_family = family;
+               err = parse_sockaddr_pair(
+                       (struct sockaddr *)(rq + 1),
+                       rq->sadb_x_ipsecrequest_len - sizeof(*rq),
+                       &t->saddr, &t->id.daddr, &t->encap_family);
+               if (err)
+                       return err;
        } else
                t->encap_family = xp->family;
 
@@ -1945,7 +1970,11 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol)
        if (pol->sadb_x_policy_len * 8 < sizeof(struct sadb_x_policy))
                return -EINVAL;
 
-       while (len >= sizeof(struct sadb_x_ipsecrequest)) {
+       while (len >= sizeof(*rq)) {
+               if (len < rq->sadb_x_ipsecrequest_len ||
+                   rq->sadb_x_ipsecrequest_len < sizeof(*rq))
+                       return -EINVAL;
+
                if ((err = parse_ipsecrequest(xp, rq)) < 0)
                        return err;
                len -= rq->sadb_x_ipsecrequest_len;
@@ -2408,7 +2437,6 @@ out:
        return err;
 }
 
-#ifdef CONFIG_NET_KEY_MIGRATE
 static int pfkey_sockaddr_pair_size(sa_family_t family)
 {
        return PFKEY_ALIGN8(pfkey_sockaddr_len(family) * 2);
@@ -2420,7 +2448,7 @@ static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len,
 {
        int af, socklen;
 
-       if (ext_len < pfkey_sockaddr_pair_size(sa->sa_family))
+       if (ext_len < 2 || ext_len < pfkey_sockaddr_pair_size(sa->sa_family))
                return -EINVAL;
 
        af = pfkey_sockaddr_extract(sa, saddr);
@@ -2436,6 +2464,7 @@ static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len,
        return 0;
 }
 
+#ifdef CONFIG_NET_KEY_MIGRATE
 static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len,
                                    struct xfrm_migrate *m)
 {
@@ -2443,13 +2472,14 @@ static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len,
        struct sadb_x_ipsecrequest *rq2;
        int mode;
 
-       if (len <= sizeof(struct sadb_x_ipsecrequest) ||
-           len < rq1->sadb_x_ipsecrequest_len)
+       if (len < sizeof(*rq1) ||
+           len < rq1->sadb_x_ipsecrequest_len ||
+           rq1->sadb_x_ipsecrequest_len < sizeof(*rq1))
                return -EINVAL;
 
        /* old endoints */
        err = parse_sockaddr_pair((struct sockaddr *)(rq1 + 1),
-                                 rq1->sadb_x_ipsecrequest_len,
+                                 rq1->sadb_x_ipsecrequest_len - sizeof(*rq1),
                                  &m->old_saddr, &m->old_daddr,
                                  &m->old_family);
        if (err)
@@ -2458,13 +2488,14 @@ static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len,
        rq2 = (struct sadb_x_ipsecrequest *)((u8 *)rq1 + rq1->sadb_x_ipsecrequest_len);
        len -= rq1->sadb_x_ipsecrequest_len;
 
-       if (len <= sizeof(struct sadb_x_ipsecrequest) ||
-           len < rq2->sadb_x_ipsecrequest_len)
+       if (len <= sizeof(*rq2) ||
+           len < rq2->sadb_x_ipsecrequest_len ||
+           rq2->sadb_x_ipsecrequest_len < sizeof(*rq2))
                return -EINVAL;
 
        /* new endpoints */
        err = parse_sockaddr_pair((struct sockaddr *)(rq2 + 1),
-                                 rq2->sadb_x_ipsecrequest_len,
+                                 rq2->sadb_x_ipsecrequest_len - sizeof(*rq2),
                                  &m->new_saddr, &m->new_daddr,
                                  &m->new_family);
        if (err)
@@ -2679,14 +2710,18 @@ static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, const struct sadb
 {
        struct pfkey_sock *pfk = pfkey_sk(sk);
 
-       if (pfk->dump.dump != NULL)
+       mutex_lock(&pfk->dump_lock);
+       if (pfk->dump.dump != NULL) {
+               mutex_unlock(&pfk->dump_lock);
                return -EBUSY;
+       }
 
        pfk->dump.msg_version = hdr->sadb_msg_version;
        pfk->dump.msg_portid = hdr->sadb_msg_pid;
        pfk->dump.dump = pfkey_dump_sp;
        pfk->dump.done = pfkey_dump_sp_done;
        xfrm_policy_walk_init(&pfk->dump.u.policy, XFRM_POLICY_TYPE_MAIN);
+       mutex_unlock(&pfk->dump_lock);
 
        return pfkey_do_dump(pfk);
 }
index 8adab63..e37d955 100644 (file)
@@ -278,7 +278,57 @@ struct l2tp_session *l2tp_session_find(struct net *net, struct l2tp_tunnel *tunn
 }
 EXPORT_SYMBOL_GPL(l2tp_session_find);
 
-struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth)
+/* Like l2tp_session_find() but takes a reference on the returned session.
+ * Optionally calls session->ref() too if do_ref is true.
+ */
+struct l2tp_session *l2tp_session_get(struct net *net,
+                                     struct l2tp_tunnel *tunnel,
+                                     u32 session_id, bool do_ref)
+{
+       struct hlist_head *session_list;
+       struct l2tp_session *session;
+
+       if (!tunnel) {
+               struct l2tp_net *pn = l2tp_pernet(net);
+
+               session_list = l2tp_session_id_hash_2(pn, session_id);
+
+               rcu_read_lock_bh();
+               hlist_for_each_entry_rcu(session, session_list, global_hlist) {
+                       if (session->session_id == session_id) {
+                               l2tp_session_inc_refcount(session);
+                               if (do_ref && session->ref)
+                                       session->ref(session);
+                               rcu_read_unlock_bh();
+
+                               return session;
+                       }
+               }
+               rcu_read_unlock_bh();
+
+               return NULL;
+       }
+
+       session_list = l2tp_session_id_hash(tunnel, session_id);
+       read_lock_bh(&tunnel->hlist_lock);
+       hlist_for_each_entry(session, session_list, hlist) {
+               if (session->session_id == session_id) {
+                       l2tp_session_inc_refcount(session);
+                       if (do_ref && session->ref)
+                               session->ref(session);
+                       read_unlock_bh(&tunnel->hlist_lock);
+
+                       return session;
+               }
+       }
+       read_unlock_bh(&tunnel->hlist_lock);
+
+       return NULL;
+}
+EXPORT_SYMBOL_GPL(l2tp_session_get);
+
+struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth,
+                                         bool do_ref)
 {
        int hash;
        struct l2tp_session *session;
@@ -288,6 +338,9 @@ struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth)
        for (hash = 0; hash < L2TP_HASH_SIZE; hash++) {
                hlist_for_each_entry(session, &tunnel->session_hlist[hash], hlist) {
                        if (++count > nth) {
+                               l2tp_session_inc_refcount(session);
+                               if (do_ref && session->ref)
+                                       session->ref(session);
                                read_unlock_bh(&tunnel->hlist_lock);
                                return session;
                        }
@@ -298,12 +351,13 @@ struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth)
 
        return NULL;
 }
-EXPORT_SYMBOL_GPL(l2tp_session_find_nth);
+EXPORT_SYMBOL_GPL(l2tp_session_get_nth);
 
 /* Lookup a session by interface name.
  * This is very inefficient but is only used by management interfaces.
  */
-struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname)
+struct l2tp_session *l2tp_session_get_by_ifname(struct net *net, char *ifname,
+                                               bool do_ref)
 {
        struct l2tp_net *pn = l2tp_pernet(net);
        int hash;
@@ -313,7 +367,11 @@ struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname)
        for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) {
                hlist_for_each_entry_rcu(session, &pn->l2tp_session_hlist[hash], global_hlist) {
                        if (!strcmp(session->ifname, ifname)) {
+                               l2tp_session_inc_refcount(session);
+                               if (do_ref && session->ref)
+                                       session->ref(session);
                                rcu_read_unlock_bh();
+
                                return session;
                        }
                }
@@ -323,7 +381,49 @@ struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname)
 
        return NULL;
 }
-EXPORT_SYMBOL_GPL(l2tp_session_find_by_ifname);
+EXPORT_SYMBOL_GPL(l2tp_session_get_by_ifname);
+
+static int l2tp_session_add_to_tunnel(struct l2tp_tunnel *tunnel,
+                                     struct l2tp_session *session)
+{
+       struct l2tp_session *session_walk;
+       struct hlist_head *g_head;
+       struct hlist_head *head;
+       struct l2tp_net *pn;
+
+       head = l2tp_session_id_hash(tunnel, session->session_id);
+
+       write_lock_bh(&tunnel->hlist_lock);
+       hlist_for_each_entry(session_walk, head, hlist)
+               if (session_walk->session_id == session->session_id)
+                       goto exist;
+
+       if (tunnel->version == L2TP_HDR_VER_3) {
+               pn = l2tp_pernet(tunnel->l2tp_net);
+               g_head = l2tp_session_id_hash_2(l2tp_pernet(tunnel->l2tp_net),
+                                               session->session_id);
+
+               spin_lock_bh(&pn->l2tp_session_hlist_lock);
+               hlist_for_each_entry(session_walk, g_head, global_hlist)
+                       if (session_walk->session_id == session->session_id)
+                               goto exist_glob;
+
+               hlist_add_head_rcu(&session->global_hlist, g_head);
+               spin_unlock_bh(&pn->l2tp_session_hlist_lock);
+       }
+
+       hlist_add_head(&session->hlist, head);
+       write_unlock_bh(&tunnel->hlist_lock);
+
+       return 0;
+
+exist_glob:
+       spin_unlock_bh(&pn->l2tp_session_hlist_lock);
+exist:
+       write_unlock_bh(&tunnel->hlist_lock);
+
+       return -EEXIST;
+}
 
 /* Lookup a tunnel by id
  */
@@ -633,6 +733,9 @@ discard:
  * a data (not control) frame before coming here. Fields up to the
  * session-id have already been parsed and ptr points to the data
  * after the session-id.
+ *
+ * session->ref() must have been called prior to l2tp_recv_common().
+ * session->deref() will be called automatically after skb is processed.
  */
 void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
                      unsigned char *ptr, unsigned char *optr, u16 hdrflags,
@@ -642,14 +745,6 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
        int offset;
        u32 ns, nr;
 
-       /* The ref count is increased since we now hold a pointer to
-        * the session. Take care to decrement the refcnt when exiting
-        * this function from now on...
-        */
-       l2tp_session_inc_refcount(session);
-       if (session->ref)
-               (*session->ref)(session);
-
        /* Parse and check optional cookie */
        if (session->peer_cookie_len > 0) {
                if (memcmp(ptr, &session->peer_cookie[0], session->peer_cookie_len)) {
@@ -802,8 +897,6 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
        /* Try to dequeue as many skbs from reorder_q as we can. */
        l2tp_recv_dequeue(session);
 
-       l2tp_session_dec_refcount(session);
-
        return;
 
 discard:
@@ -812,8 +905,6 @@ discard:
 
        if (session->deref)
                (*session->deref)(session);
-
-       l2tp_session_dec_refcount(session);
 }
 EXPORT_SYMBOL(l2tp_recv_common);
 
@@ -920,8 +1011,14 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
        }
 
        /* Find the session context */
-       session = l2tp_session_find(tunnel->l2tp_net, tunnel, session_id);
+       session = l2tp_session_get(tunnel->l2tp_net, tunnel, session_id, true);
        if (!session || !session->recv_skb) {
+               if (session) {
+                       if (session->deref)
+                               session->deref(session);
+                       l2tp_session_dec_refcount(session);
+               }
+
                /* Not found? Pass to userspace to deal with */
                l2tp_info(tunnel, L2TP_MSG_DATA,
                          "%s: no session found (%u/%u). Passing up.\n",
@@ -930,6 +1027,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
        }
 
        l2tp_recv_common(session, skb, ptr, optr, hdrflags, length, payload_hook);
+       l2tp_session_dec_refcount(session);
 
        return 0;
 
@@ -1738,6 +1836,7 @@ EXPORT_SYMBOL_GPL(l2tp_session_set_header_len);
 struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg)
 {
        struct l2tp_session *session;
+       int err;
 
        session = kzalloc(sizeof(struct l2tp_session) + priv_size, GFP_KERNEL);
        if (session != NULL) {
@@ -1793,6 +1892,13 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
 
                l2tp_session_set_header_len(session, tunnel->version);
 
+               err = l2tp_session_add_to_tunnel(tunnel, session);
+               if (err) {
+                       kfree(session);
+
+                       return ERR_PTR(err);
+               }
+
                /* Bump the reference count. The session context is deleted
                 * only when this drops to zero.
                 */
@@ -1802,28 +1908,14 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
                /* Ensure tunnel socket isn't deleted */
                sock_hold(tunnel->sock);
 
-               /* Add session to the tunnel's hash list */
-               write_lock_bh(&tunnel->hlist_lock);
-               hlist_add_head(&session->hlist,
-                              l2tp_session_id_hash(tunnel, session_id));
-               write_unlock_bh(&tunnel->hlist_lock);
-
-               /* And to the global session list if L2TPv3 */
-               if (tunnel->version != L2TP_HDR_VER_2) {
-                       struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net);
-
-                       spin_lock_bh(&pn->l2tp_session_hlist_lock);
-                       hlist_add_head_rcu(&session->global_hlist,
-                                          l2tp_session_id_hash_2(pn, session_id));
-                       spin_unlock_bh(&pn->l2tp_session_hlist_lock);
-               }
-
                /* Ignore management session in session count value */
                if (session->session_id != 0)
                        atomic_inc(&l2tp_session_count);
+
+               return session;
        }
 
-       return session;
+       return ERR_PTR(-ENOMEM);
 }
 EXPORT_SYMBOL_GPL(l2tp_session_create);
 
index aebf281..8ce7818 100644 (file)
@@ -230,11 +230,16 @@ out:
        return tunnel;
 }
 
+struct l2tp_session *l2tp_session_get(struct net *net,
+                                     struct l2tp_tunnel *tunnel,
+                                     u32 session_id, bool do_ref);
 struct l2tp_session *l2tp_session_find(struct net *net,
                                       struct l2tp_tunnel *tunnel,
                                       u32 session_id);
-struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth);
-struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname);
+struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth,
+                                         bool do_ref);
+struct l2tp_session *l2tp_session_get_by_ifname(struct net *net, char *ifname,
+                                               bool do_ref);
 struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id);
 struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth);
 
index 2d6760a..d100aed 100644 (file)
@@ -53,7 +53,7 @@ static void l2tp_dfs_next_tunnel(struct l2tp_dfs_seq_data *pd)
 
 static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd)
 {
-       pd->session = l2tp_session_find_nth(pd->tunnel, pd->session_idx);
+       pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx, true);
        pd->session_idx++;
 
        if (pd->session == NULL) {
@@ -238,10 +238,14 @@ static int l2tp_dfs_seq_show(struct seq_file *m, void *v)
        }
 
        /* Show the tunnel or session context */
-       if (pd->session == NULL)
+       if (!pd->session) {
                l2tp_dfs_seq_tunnel_show(m, pd->tunnel);
-       else
+       } else {
                l2tp_dfs_seq_session_show(m, pd->session);
+               if (pd->session->deref)
+                       pd->session->deref(pd->session);
+               l2tp_session_dec_refcount(pd->session);
+       }
 
 out:
        return 0;
index 8bf18a5..6fd41d7 100644 (file)
@@ -221,12 +221,6 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p
                goto out;
        }
 
-       session = l2tp_session_find(net, tunnel, session_id);
-       if (session) {
-               rc = -EEXIST;
-               goto out;
-       }
-
        if (cfg->ifname) {
                dev = dev_get_by_name(net, cfg->ifname);
                if (dev) {
@@ -240,8 +234,8 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p
 
        session = l2tp_session_create(sizeof(*spriv), tunnel, session_id,
                                      peer_session_id, cfg);
-       if (!session) {
-               rc = -ENOMEM;
+       if (IS_ERR(session)) {
+               rc = PTR_ERR(session);
                goto out;
        }
 
index d25038c..4d322c1 100644 (file)
@@ -143,19 +143,19 @@ static int l2tp_ip_recv(struct sk_buff *skb)
        }
 
        /* Ok, this is a data packet. Lookup the session. */
-       session = l2tp_session_find(net, NULL, session_id);
-       if (session == NULL)
+       session = l2tp_session_get(net, NULL, session_id, true);
+       if (!session)
                goto discard;
 
        tunnel = session->tunnel;
-       if (tunnel == NULL)
-               goto discard;
+       if (!tunnel)
+               goto discard_sess;
 
        /* Trace packet contents, if enabled */
        if (tunnel->debug & L2TP_MSG_DATA) {
                length = min(32u, skb->len);
                if (!pskb_may_pull(skb, length))
-                       goto discard;
+                       goto discard_sess;
 
                /* Point to L2TP header */
                optr = ptr = skb->data;
@@ -165,6 +165,7 @@ static int l2tp_ip_recv(struct sk_buff *skb)
        }
 
        l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, tunnel->recv_payload_hook);
+       l2tp_session_dec_refcount(session);
 
        return 0;
 
@@ -178,9 +179,10 @@ pass_up:
 
        tunnel_id = ntohl(*(__be32 *) &skb->data[4]);
        tunnel = l2tp_tunnel_find(net, tunnel_id);
-       if (tunnel != NULL)
+       if (tunnel) {
                sk = tunnel->sock;
-       else {
+               sock_hold(sk);
+       } else {
                struct iphdr *iph = (struct iphdr *) skb_network_header(skb);
 
                read_lock_bh(&l2tp_ip_lock);
@@ -202,6 +204,12 @@ pass_up:
 
        return sk_receive_skb(sk, skb, 1);
 
+discard_sess:
+       if (session->deref)
+               session->deref(session);
+       l2tp_session_dec_refcount(session);
+       goto discard;
+
 discard_put:
        sock_put(sk);
 
index a4abcbc..88b397c 100644 (file)
@@ -156,19 +156,19 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
        }
 
        /* Ok, this is a data packet. Lookup the session. */
-       session = l2tp_session_find(net, NULL, session_id);
-       if (session == NULL)
+       session = l2tp_session_get(net, NULL, session_id, true);
+       if (!session)
                goto discard;
 
        tunnel = session->tunnel;
-       if (tunnel == NULL)
-               goto discard;
+       if (!tunnel)
+               goto discard_sess;
 
        /* Trace packet contents, if enabled */
        if (tunnel->debug & L2TP_MSG_DATA) {
                length = min(32u, skb->len);
                if (!pskb_may_pull(skb, length))
-                       goto discard;
+                       goto discard_sess;
 
                /* Point to L2TP header */
                optr = ptr = skb->data;
@@ -179,6 +179,8 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
 
        l2tp_recv_common(session, skb, ptr, optr, 0, skb->len,
                         tunnel->recv_payload_hook);
+       l2tp_session_dec_refcount(session);
+
        return 0;
 
 pass_up:
@@ -191,9 +193,10 @@ pass_up:
 
        tunnel_id = ntohl(*(__be32 *) &skb->data[4]);
        tunnel = l2tp_tunnel_find(net, tunnel_id);
-       if (tunnel != NULL)
+       if (tunnel) {
                sk = tunnel->sock;
-       else {
+               sock_hold(sk);
+       } else {
                struct ipv6hdr *iph = ipv6_hdr(skb);
 
                read_lock_bh(&l2tp_ip6_lock);
@@ -215,6 +218,12 @@ pass_up:
 
        return sk_receive_skb(sk, skb, 1);
 
+discard_sess:
+       if (session->deref)
+               session->deref(session);
+       l2tp_session_dec_refcount(session);
+       goto discard;
+
 discard_put:
        sock_put(sk);
 
index 3620fba..7e3e669 100644 (file)
@@ -48,7 +48,8 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq,
 /* Accessed under genl lock */
 static const struct l2tp_nl_cmd_ops *l2tp_nl_cmd_ops[__L2TP_PWTYPE_MAX];
 
-static struct l2tp_session *l2tp_nl_session_find(struct genl_info *info)
+static struct l2tp_session *l2tp_nl_session_get(struct genl_info *info,
+                                               bool do_ref)
 {
        u32 tunnel_id;
        u32 session_id;
@@ -59,14 +60,15 @@ static struct l2tp_session *l2tp_nl_session_find(struct genl_info *info)
 
        if (info->attrs[L2TP_ATTR_IFNAME]) {
                ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]);
-               session = l2tp_session_find_by_ifname(net, ifname);
+               session = l2tp_session_get_by_ifname(net, ifname, do_ref);
        } else if ((info->attrs[L2TP_ATTR_SESSION_ID]) &&
                   (info->attrs[L2TP_ATTR_CONN_ID])) {
                tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]);
                session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]);
                tunnel = l2tp_tunnel_find(net, tunnel_id);
                if (tunnel)
-                       session = l2tp_session_find(net, tunnel, session_id);
+                       session = l2tp_session_get(net, tunnel, session_id,
+                                                  do_ref);
        }
 
        return session;
@@ -642,10 +644,12 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
                        session_id, peer_session_id, &cfg);
 
        if (ret >= 0) {
-               session = l2tp_session_find(net, tunnel, session_id);
-               if (session)
+               session = l2tp_session_get(net, tunnel, session_id, false);
+               if (session) {
                        ret = l2tp_session_notify(&l2tp_nl_family, info, session,
                                                  L2TP_CMD_SESSION_CREATE);
+                       l2tp_session_dec_refcount(session);
+               }
        }
 
 out:
@@ -658,7 +662,7 @@ static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *inf
        struct l2tp_session *session;
        u16 pw_type;
 
-       session = l2tp_nl_session_find(info);
+       session = l2tp_nl_session_get(info, true);
        if (session == NULL) {
                ret = -ENODEV;
                goto out;
@@ -672,6 +676,10 @@ static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *inf
                if (l2tp_nl_cmd_ops[pw_type] && l2tp_nl_cmd_ops[pw_type]->session_delete)
                        ret = (*l2tp_nl_cmd_ops[pw_type]->session_delete)(session);
 
+       if (session->deref)
+               session->deref(session);
+       l2tp_session_dec_refcount(session);
+
 out:
        return ret;
 }
@@ -681,7 +689,7 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
        int ret = 0;
        struct l2tp_session *session;
 
-       session = l2tp_nl_session_find(info);
+       session = l2tp_nl_session_get(info, false);
        if (session == NULL) {
                ret = -ENODEV;
                goto out;
@@ -716,6 +724,8 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
        ret = l2tp_session_notify(&l2tp_nl_family, info,
                                  session, L2TP_CMD_SESSION_MODIFY);
 
+       l2tp_session_dec_refcount(session);
+
 out:
        return ret;
 }
@@ -811,29 +821,34 @@ static int l2tp_nl_cmd_session_get(struct sk_buff *skb, struct genl_info *info)
        struct sk_buff *msg;
        int ret;
 
-       session = l2tp_nl_session_find(info);
+       session = l2tp_nl_session_get(info, false);
        if (session == NULL) {
                ret = -ENODEV;
-               goto out;
+               goto err;
        }
 
        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!msg) {
                ret = -ENOMEM;
-               goto out;
+               goto err_ref;
        }
 
        ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq,
                                   0, session, L2TP_CMD_SESSION_GET);
        if (ret < 0)
-               goto err_out;
+               goto err_ref_msg;
 
-       return genlmsg_unicast(genl_info_net(info), msg, info->snd_portid);
+       ret = genlmsg_unicast(genl_info_net(info), msg, info->snd_portid);
 
-err_out:
-       nlmsg_free(msg);
+       l2tp_session_dec_refcount(session);
 
-out:
+       return ret;
+
+err_ref_msg:
+       nlmsg_free(msg);
+err_ref:
+       l2tp_session_dec_refcount(session);
+err:
        return ret;
 }
 
@@ -852,7 +867,7 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback
                                goto out;
                }
 
-               session = l2tp_session_find_nth(tunnel, si);
+               session = l2tp_session_get_nth(tunnel, si, false);
                if (session == NULL) {
                        ti++;
                        tunnel = NULL;
@@ -862,8 +877,11 @@ static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback
 
                if (l2tp_nl_session_send(skb, NETLINK_CB(cb->skb).portid,
                                         cb->nlh->nlmsg_seq, NLM_F_MULTI,
-                                        session, L2TP_CMD_SESSION_GET) < 0)
+                                        session, L2TP_CMD_SESSION_GET) < 0) {
+                       l2tp_session_dec_refcount(session);
                        break;
+               }
+               l2tp_session_dec_refcount(session);
 
                si++;
        }
index 36cc56f..32ea0f3 100644 (file)
@@ -450,6 +450,10 @@ static void pppol2tp_session_close(struct l2tp_session *session)
 static void pppol2tp_session_destruct(struct sock *sk)
 {
        struct l2tp_session *session = sk->sk_user_data;
+
+       skb_queue_purge(&sk->sk_receive_queue);
+       skb_queue_purge(&sk->sk_write_queue);
+
        if (session) {
                sk->sk_user_data = NULL;
                BUG_ON(session->magic != L2TP_SESSION_MAGIC);
@@ -488,9 +492,6 @@ static int pppol2tp_release(struct socket *sock)
                l2tp_session_queue_purge(session);
                sock_put(sk);
        }
-       skb_queue_purge(&sk->sk_receive_queue);
-       skb_queue_purge(&sk->sk_write_queue);
-
        release_sock(sk);
 
        /* This will delete the session context via
@@ -582,6 +583,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
        int error = 0;
        u32 tunnel_id, peer_tunnel_id;
        u32 session_id, peer_session_id;
+       bool drop_refcnt = false;
        int ver = 2;
        int fd;
 
@@ -683,36 +685,36 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
        if (tunnel->peer_tunnel_id == 0)
                tunnel->peer_tunnel_id = peer_tunnel_id;
 
-       /* Create session if it doesn't already exist. We handle the
-        * case where a session was previously created by the netlink
-        * interface by checking that the session doesn't already have
-        * a socket and its tunnel socket are what we expect. If any
-        * of those checks fail, return EEXIST to the caller.
-        */
-       session = l2tp_session_find(sock_net(sk), tunnel, session_id);
-       if (session == NULL) {
-               /* Default MTU must allow space for UDP/L2TP/PPP
-                * headers.
+       session = l2tp_session_get(sock_net(sk), tunnel, session_id, false);
+       if (session) {
+               drop_refcnt = true;
+               ps = l2tp_session_priv(session);
+
+               /* Using a pre-existing session is fine as long as it hasn't
+                * been connected yet.
                 */
-               cfg.mtu = cfg.mru = 1500 - PPPOL2TP_HEADER_OVERHEAD;
+               if (ps->sock) {
+                       error = -EEXIST;
+                       goto end;
+               }
 
-               /* Allocate and initialize a new session context. */
-               session = l2tp_session_create(sizeof(struct pppol2tp_session),
-                                             tunnel, session_id,
-                                             peer_session_id, &cfg);
-               if (session == NULL) {
-                       error = -ENOMEM;
+               /* consistency checks */
+               if (ps->tunnel_sock != tunnel->sock) {
+                       error = -EEXIST;
                        goto end;
                }
        } else {
-               ps = l2tp_session_priv(session);
-               error = -EEXIST;
-               if (ps->sock != NULL)
-                       goto end;
+               /* Default MTU must allow space for UDP/L2TP/PPP headers */
+               cfg.mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD;
+               cfg.mru = cfg.mtu;
 
-               /* consistency checks */
-               if (ps->tunnel_sock != tunnel->sock)
+               session = l2tp_session_create(sizeof(struct pppol2tp_session),
+                                             tunnel, session_id,
+                                             peer_session_id, &cfg);
+               if (IS_ERR(session)) {
+                       error = PTR_ERR(session);
                        goto end;
+               }
        }
 
        /* Associate session with its PPPoL2TP socket */
@@ -777,6 +779,8 @@ out_no_ppp:
                  session->name);
 
 end:
+       if (drop_refcnt)
+               l2tp_session_dec_refcount(session);
        release_sock(sk);
 
        return error;
@@ -804,12 +808,6 @@ static int pppol2tp_session_create(struct net *net, u32 tunnel_id, u32 session_i
        if (tunnel->sock == NULL)
                goto out;
 
-       /* Check that this session doesn't already exist */
-       error = -EEXIST;
-       session = l2tp_session_find(net, tunnel, session_id);
-       if (session != NULL)
-               goto out;
-
        /* Default MTU values. */
        if (cfg->mtu == 0)
                cfg->mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD;
@@ -817,12 +815,13 @@ static int pppol2tp_session_create(struct net *net, u32 tunnel_id, u32 session_i
                cfg->mru = cfg->mtu;
 
        /* Allocate and initialize a new session context. */
-       error = -ENOMEM;
        session = l2tp_session_create(sizeof(struct pppol2tp_session),
                                      tunnel, session_id,
                                      peer_session_id, cfg);
-       if (session == NULL)
+       if (IS_ERR(session)) {
+               error = PTR_ERR(session);
                goto out;
+       }
 
        ps = l2tp_session_priv(session);
        ps->tunnel_sock = tunnel->sock;
@@ -1140,11 +1139,18 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel,
                if (stats.session_id != 0) {
                        /* resend to session ioctl handler */
                        struct l2tp_session *session =
-                               l2tp_session_find(sock_net(sk), tunnel, stats.session_id);
-                       if (session != NULL)
-                               err = pppol2tp_session_ioctl(session, cmd, arg);
-                       else
+                               l2tp_session_get(sock_net(sk), tunnel,
+                                                stats.session_id, true);
+
+                       if (session) {
+                               err = pppol2tp_session_ioctl(session, cmd,
+                                                            arg);
+                               if (session->deref)
+                                       session->deref(session);
+                               l2tp_session_dec_refcount(session);
+                       } else {
                                err = -EBADR;
+                       }
                        break;
                }
 #ifdef CONFIG_XFRM
@@ -1377,8 +1383,6 @@ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname,
        } else
                err = pppol2tp_session_setsockopt(sk, session, optname, val);
 
-       err = 0;
-
 end_put_sess:
        sock_put(sk);
 end:
@@ -1501,8 +1505,13 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname,
 
                err = pppol2tp_tunnel_getsockopt(sk, tunnel, optname, &val);
                sock_put(ps->tunnel_sock);
-       } else
+               if (err)
+                       goto end_put_sess;
+       } else {
                err = pppol2tp_session_getsockopt(sk, session, optname, &val);
+               if (err)
+                       goto end_put_sess;
+       }
 
        err = -EFAULT;
        if (put_user(len, optlen))
@@ -1554,7 +1563,7 @@ static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd)
 
 static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd)
 {
-       pd->session = l2tp_session_find_nth(pd->tunnel, pd->session_idx);
+       pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx, true);
        pd->session_idx++;
 
        if (pd->session == NULL) {
@@ -1681,10 +1690,14 @@ static int pppol2tp_seq_show(struct seq_file *m, void *v)
 
        /* Show the tunnel or session context.
         */
-       if (pd->session == NULL)
+       if (!pd->session) {
                pppol2tp_seq_tunnel_show(m, pd->tunnel);
-       else
+       } else {
                pppol2tp_seq_session_show(m, pd->session);
+               if (pd->session->deref)
+                       pd->session->deref(pd->session);
+               l2tp_session_dec_refcount(pd->session);
+       }
 
 out:
        return 0;
@@ -1843,4 +1856,4 @@ MODULE_DESCRIPTION("PPP over L2TP over UDP");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(PPPOL2TP_DRV_VERSION);
 MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_OL2TP);
-MODULE_ALIAS_L2TP_PWTYPE(11);
+MODULE_ALIAS_L2TP_PWTYPE(7);
index 40813dd..5bb0c50 100644 (file)
@@ -718,7 +718,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
        ieee80211_recalc_ps(local);
 
        if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
-           sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+           sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
+           local->ops->wake_tx_queue) {
                /* XXX: for AP_VLAN, actually track AP queues */
                netif_tx_start_all_queues(dev);
        } else if (dev) {
index e48724a..4d7543d 100644 (file)
@@ -208,6 +208,51 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
        return len;
 }
 
+static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata,
+                                        struct sk_buff *skb,
+                                        int rtap_vendor_space)
+{
+       struct {
+               struct ieee80211_hdr_3addr hdr;
+               u8 category;
+               u8 action_code;
+       } __packed action;
+
+       if (!sdata)
+               return;
+
+       BUILD_BUG_ON(sizeof(action) != IEEE80211_MIN_ACTION_SIZE + 1);
+
+       if (skb->len < rtap_vendor_space + sizeof(action) +
+                      VHT_MUMIMO_GROUPS_DATA_LEN)
+               return;
+
+       if (!is_valid_ether_addr(sdata->u.mntr.mu_follow_addr))
+               return;
+
+       skb_copy_bits(skb, rtap_vendor_space, &action, sizeof(action));
+
+       if (!ieee80211_is_action(action.hdr.frame_control))
+               return;
+
+       if (action.category != WLAN_CATEGORY_VHT)
+               return;
+
+       if (action.action_code != WLAN_VHT_ACTION_GROUPID_MGMT)
+               return;
+
+       if (!ether_addr_equal(action.hdr.addr1, sdata->u.mntr.mu_follow_addr))
+               return;
+
+       skb = skb_copy(skb, GFP_ATOMIC);
+       if (!skb)
+               return;
+
+       skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
+       skb_queue_tail(&sdata->skb_queue, skb);
+       ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+}
+
 /*
  * ieee80211_add_rx_radiotap_header - add radiotap header
  *
@@ -515,7 +560,6 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
        struct net_device *prev_dev = NULL;
        int present_fcs_len = 0;
        unsigned int rtap_vendor_space = 0;
-       struct ieee80211_mgmt *mgmt;
        struct ieee80211_sub_if_data *monitor_sdata =
                rcu_dereference(local->monitor_sdata);
 
@@ -553,6 +597,8 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
                return remove_monitor_info(local, origskb, rtap_vendor_space);
        }
 
+       ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_vendor_space);
+
        /* room for the radiotap header based on driver features */
        rt_hdrlen = ieee80211_rx_radiotap_hdrlen(local, status, origskb);
        needed_headroom = rt_hdrlen - rtap_vendor_space;
@@ -618,23 +664,6 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
                ieee80211_rx_stats(sdata->dev, skb->len);
        }
 
-       mgmt = (void *)skb->data;
-       if (monitor_sdata &&
-           skb->len >= IEEE80211_MIN_ACTION_SIZE + 1 + VHT_MUMIMO_GROUPS_DATA_LEN &&
-           ieee80211_is_action(mgmt->frame_control) &&
-           mgmt->u.action.category == WLAN_CATEGORY_VHT &&
-           mgmt->u.action.u.vht_group_notif.action_code == WLAN_VHT_ACTION_GROUPID_MGMT &&
-           is_valid_ether_addr(monitor_sdata->u.mntr.mu_follow_addr) &&
-           ether_addr_equal(mgmt->da, monitor_sdata->u.mntr.mu_follow_addr)) {
-               struct sk_buff *mu_skb = skb_copy(skb, GFP_ATOMIC);
-
-               if (mu_skb) {
-                       mu_skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
-                       skb_queue_tail(&monitor_sdata->skb_queue, mu_skb);
-                       ieee80211_queue_work(&local->hw, &monitor_sdata->work);
-               }
-       }
-
        if (prev_dev) {
                skb->dev = prev_dev;
                netif_receive_skb(skb);
@@ -3610,6 +3639,27 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
                            !ether_addr_equal(bssid, hdr->addr1))
                                return false;
                }
+
+               /*
+                * 802.11-2016 Table 9-26 says that for data frames, A1 must be
+                * the BSSID - we've checked that already but may have accepted
+                * the wildcard (ff:ff:ff:ff:ff:ff).
+                *
+                * It also says:
+                *      The BSSID of the Data frame is determined as follows:
+                *      a) If the STA is contained within an AP or is associated
+                *         with an AP, the BSSID is the address currently in use
+                *         by the STA contained in the AP.
+                *
+                * So we should not accept data frames with an address that's
+                * multicast.
+                *
+                * Accepting it also opens a security problem because stations
+                * could encrypt it with the GTK and inject traffic that way.
+                */
+               if (ieee80211_is_data(hdr->frame_control) && multicast)
+                       return false;
+
                return true;
        case NL80211_IFTYPE_WDS:
                if (bssid || !ieee80211_is_data(hdr->frame_control))
index da9df2d..22fc321 100644 (file)
@@ -290,6 +290,7 @@ void nf_conntrack_unregister_notifier(struct net *net,
        BUG_ON(notify != new);
        RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);
        mutex_unlock(&nf_ct_ecache_mutex);
+       /* synchronize_rcu() is called from ctnetlink_exit. */
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
 
@@ -326,6 +327,7 @@ void nf_ct_expect_unregister_notifier(struct net *net,
        BUG_ON(notify != new);
        RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL);
        mutex_unlock(&nf_ct_ecache_mutex);
+       /* synchronize_rcu() is called from ctnetlink_exit. */
 }
 EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
 
index 4b2e1fb..d800730 100644 (file)
@@ -57,7 +57,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
        hlist_del_rcu(&exp->hnode);
        net->ct.expect_count--;
 
-       hlist_del(&exp->lnode);
+       hlist_del_rcu(&exp->lnode);
        master_help->expecting[exp->class]--;
 
        nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report);
@@ -363,7 +363,7 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
        /* two references : one for hash insert, one for the timer */
        atomic_add(2, &exp->use);
 
-       hlist_add_head(&exp->lnode, &master_help->expectations);
+       hlist_add_head_rcu(&exp->lnode, &master_help->expectations);
        master_help->expecting[exp->class]++;
 
        hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]);
index 02bcf00..008299b 100644 (file)
@@ -53,7 +53,11 @@ nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id,
 
        rcu_read_lock();
        t = rcu_dereference(nf_ct_ext_types[id]);
-       BUG_ON(t == NULL);
+       if (!t) {
+               rcu_read_unlock();
+               return NULL;
+       }
+
        off = ALIGN(sizeof(struct nf_ct_ext), t->align);
        len = off + t->len + var_alloc_len;
        alloc_size = t->alloc_size + var_alloc_len;
@@ -88,7 +92,10 @@ void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id,
 
        rcu_read_lock();
        t = rcu_dereference(nf_ct_ext_types[id]);
-       BUG_ON(t == NULL);
+       if (!t) {
+               rcu_read_unlock();
+               return NULL;
+       }
 
        newoff = ALIGN(old->len, t->align);
        newlen = newoff + t->len + var_alloc_len;
@@ -175,6 +182,6 @@ void nf_ct_extend_unregister(struct nf_ct_ext_type *type)
        RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL);
        update_alloc_size(type);
        mutex_unlock(&nf_ct_ext_type_mutex);
-       rcu_barrier(); /* Wait for completion of call_rcu()'s */
+       synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(nf_ct_extend_unregister);
index 6dc44d9..4eeb341 100644 (file)
@@ -158,16 +158,25 @@ nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum)
 {
        struct nf_conntrack_helper *h;
 
+       rcu_read_lock();
+
        h = __nf_conntrack_helper_find(name, l3num, protonum);
 #ifdef CONFIG_MODULES
        if (h == NULL) {
-               if (request_module("nfct-helper-%s", name) == 0)
+               rcu_read_unlock();
+               if (request_module("nfct-helper-%s", name) == 0) {
+                       rcu_read_lock();
                        h = __nf_conntrack_helper_find(name, l3num, protonum);
+               } else {
+                       return h;
+               }
        }
 #endif
        if (h != NULL && !try_module_get(h->me))
                h = NULL;
 
+       rcu_read_unlock();
+
        return h;
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get);
@@ -311,38 +320,36 @@ void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n)
 }
 EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister);
 
+/* Caller should hold the rcu lock */
 struct nf_ct_helper_expectfn *
 nf_ct_helper_expectfn_find_by_name(const char *name)
 {
        struct nf_ct_helper_expectfn *cur;
        bool found = false;
 
-       rcu_read_lock();
        list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) {
                if (!strcmp(cur->name, name)) {
                        found = true;
                        break;
                }
        }
-       rcu_read_unlock();
        return found ? cur : NULL;
 }
 EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_name);
 
+/* Caller should hold the rcu lock */
 struct nf_ct_helper_expectfn *
 nf_ct_helper_expectfn_find_by_symbol(const void *symbol)
 {
        struct nf_ct_helper_expectfn *cur;
        bool found = false;
 
-       rcu_read_lock();
        list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) {
                if (cur->expectfn == symbol) {
                        found = true;
                        break;
                }
        }
-       rcu_read_unlock();
        return found ? cur : NULL;
 }
 EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_symbol);
index 6806b5e..dc7dfd6 100644 (file)
@@ -1488,11 +1488,16 @@ static int ctnetlink_change_helper(struct nf_conn *ct,
                 * treat the second attempt as a no-op instead of returning
                 * an error.
                 */
-               if (help && help->helper &&
-                   !strcmp(help->helper->name, helpname))
-                       return 0;
-               else
-                       return -EBUSY;
+               err = -EBUSY;
+               if (help) {
+                       rcu_read_lock();
+                       helper = rcu_dereference(help->helper);
+                       if (helper && !strcmp(helper->name, helpname))
+                               err = 0;
+                       rcu_read_unlock();
+               }
+
+               return err;
        }
 
        if (!strcmp(helpname, "")) {
@@ -1929,9 +1934,9 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
 
                        err = 0;
                        if (test_bit(IPS_EXPECTED_BIT, &ct->status))
-                               events = IPCT_RELATED;
+                               events = 1 << IPCT_RELATED;
                        else
-                               events = IPCT_NEW;
+                               events = 1 << IPCT_NEW;
 
                        if (cda[CTA_LABELS] &&
                            ctnetlink_attach_labels(ct, cda) == 0)
@@ -2675,8 +2680,8 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
        last = (struct nf_conntrack_expect *)cb->args[1];
        for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
 restart:
-               hlist_for_each_entry(exp, &nf_ct_expect_hash[cb->args[0]],
-                                    hnode) {
+               hlist_for_each_entry_rcu(exp, &nf_ct_expect_hash[cb->args[0]],
+                                        hnode) {
                        if (l3proto && exp->tuple.src.l3num != l3proto)
                                continue;
 
@@ -2727,7 +2732,7 @@ ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
        rcu_read_lock();
        last = (struct nf_conntrack_expect *)cb->args[1];
 restart:
-       hlist_for_each_entry(exp, &help->expectations, lnode) {
+       hlist_for_each_entry_rcu(exp, &help->expectations, lnode) {
                if (l3proto && exp->tuple.src.l3num != l3proto)
                        continue;
                if (cb->args[1]) {
@@ -2789,6 +2794,12 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl,
                return -ENOENT;
 
        ct = nf_ct_tuplehash_to_ctrack(h);
+       /* No expectation linked to this connection tracking. */
+       if (!nfct_help(ct)) {
+               nf_ct_put(ct);
+               return 0;
+       }
+
        c.data = ct;
 
        err = netlink_dump_start(ctnl, skb, nlh, &c);
@@ -3133,23 +3144,27 @@ ctnetlink_create_expect(struct net *net,
                return -ENOENT;
        ct = nf_ct_tuplehash_to_ctrack(h);
 
+       rcu_read_lock();
        if (cda[CTA_EXPECT_HELP_NAME]) {
                const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]);
 
                helper = __nf_conntrack_helper_find(helpname, u3,
                                                    nf_ct_protonum(ct));
                if (helper == NULL) {
+                       rcu_read_unlock();
 #ifdef CONFIG_MODULES
                        if (request_module("nfct-helper-%s", helpname) < 0) {
                                err = -EOPNOTSUPP;
                                goto err_ct;
                        }
+                       rcu_read_lock();
                        helper = __nf_conntrack_helper_find(helpname, u3,
                                                            nf_ct_protonum(ct));
                        if (helper) {
                                err = -EAGAIN;
-                               goto err_ct;
+                               goto err_rcu;
                        }
+                       rcu_read_unlock();
 #endif
                        err = -EOPNOTSUPP;
                        goto err_ct;
@@ -3159,11 +3174,13 @@ ctnetlink_create_expect(struct net *net,
        exp = ctnetlink_alloc_expect(cda, ct, helper, &tuple, &mask);
        if (IS_ERR(exp)) {
                err = PTR_ERR(exp);
-               goto err_ct;
+               goto err_rcu;
        }
 
        err = nf_ct_expect_related_report(exp, portid, report);
        nf_ct_expect_put(exp);
+err_rcu:
+       rcu_read_unlock();
 err_ct:
        nf_ct_put(ct);
        return err;
@@ -3442,6 +3459,7 @@ static void __exit ctnetlink_exit(void)
 #ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT
        RCU_INIT_POINTER(nfnl_ct_hook, NULL);
 #endif
+       synchronize_rcu();
 }
 
 module_init(ctnetlink_init);
index 94b14c5..82802e4 100644 (file)
@@ -903,6 +903,8 @@ static void __exit nf_nat_cleanup(void)
 #ifdef CONFIG_XFRM
        RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL);
 #endif
+       synchronize_rcu();
+
        for (i = 0; i < NFPROTO_NUMPROTO; i++)
                kfree(nf_nat_l4protos[i]);
 
index d438698..8606756 100644 (file)
@@ -101,11 +101,13 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
                rcu_read_lock();
                idev = __in6_dev_get(skb->dev);
                if (idev != NULL) {
+                       read_lock_bh(&idev->lock);
                        list_for_each_entry(ifa, &idev->addr_list, if_list) {
                                newdst = ifa->addr;
                                addr = true;
                                break;
                        }
+                       read_unlock_bh(&idev->lock);
                }
                rcu_read_unlock();
 
index de87823..d455581 100644 (file)
@@ -32,6 +32,13 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
 MODULE_DESCRIPTION("nfnl_cthelper: User-space connection tracking helpers");
 
+struct nfnl_cthelper {
+       struct list_head                list;
+       struct nf_conntrack_helper      helper;
+};
+
+static LIST_HEAD(nfnl_cthelper_list);
+
 static int
 nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff,
                        struct nf_conn *ct, enum ip_conntrack_info ctinfo)
@@ -161,6 +168,7 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper,
        int i, ret;
        struct nf_conntrack_expect_policy *expect_policy;
        struct nlattr *tb[NFCTH_POLICY_SET_MAX+1];
+       unsigned int class_max;
 
        ret = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr,
                               nfnl_cthelper_expect_policy_set);
@@ -170,19 +178,18 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper,
        if (!tb[NFCTH_POLICY_SET_NUM])
                return -EINVAL;
 
-       helper->expect_class_max =
-               ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM]));
-
-       if (helper->expect_class_max != 0 &&
-           helper->expect_class_max > NF_CT_MAX_EXPECT_CLASSES)
+       class_max = ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM]));
+       if (class_max == 0)
+               return -EINVAL;
+       if (class_max > NF_CT_MAX_EXPECT_CLASSES)
                return -EOVERFLOW;
 
        expect_policy = kzalloc(sizeof(struct nf_conntrack_expect_policy) *
-                               helper->expect_class_max, GFP_KERNEL);
+                               class_max, GFP_KERNEL);
        if (expect_policy == NULL)
                return -ENOMEM;
 
-       for (i=0; i<helper->expect_class_max; i++) {
+       for (i = 0; i < class_max; i++) {
                if (!tb[NFCTH_POLICY_SET+i])
                        goto err;
 
@@ -191,6 +198,8 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper,
                if (ret < 0)
                        goto err;
        }
+
+       helper->expect_class_max = class_max - 1;
        helper->expect_policy = expect_policy;
        return 0;
 err:
@@ -203,18 +212,20 @@ nfnl_cthelper_create(const struct nlattr * const tb[],
                     struct nf_conntrack_tuple *tuple)
 {
        struct nf_conntrack_helper *helper;
+       struct nfnl_cthelper *nfcth;
        int ret;
 
        if (!tb[NFCTH_TUPLE] || !tb[NFCTH_POLICY] || !tb[NFCTH_PRIV_DATA_LEN])
                return -EINVAL;
 
-       helper = kzalloc(sizeof(struct nf_conntrack_helper), GFP_KERNEL);
-       if (helper == NULL)
+       nfcth = kzalloc(sizeof(*nfcth), GFP_KERNEL);
+       if (nfcth == NULL)
                return -ENOMEM;
+       helper = &nfcth->helper;
 
        ret = nfnl_cthelper_parse_expect_policy(helper, tb[NFCTH_POLICY]);
        if (ret < 0)
-               goto err;
+               goto err1;
 
        strncpy(helper->name, nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN);
        helper->data_len = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
@@ -245,15 +256,101 @@ nfnl_cthelper_create(const struct nlattr * const tb[],
 
        ret = nf_conntrack_helper_register(helper);
        if (ret < 0)
-               goto err;
+               goto err2;
 
+       list_add_tail(&nfcth->list, &nfnl_cthelper_list);
        return 0;
-err:
-       kfree(helper);
+err2:
+       kfree(helper->expect_policy);
+err1:
+       kfree(nfcth);
        return ret;
 }
 
 static int
+nfnl_cthelper_update_policy_one(const struct nf_conntrack_expect_policy *policy,
+                               struct nf_conntrack_expect_policy *new_policy,
+                               const struct nlattr *attr)
+{
+       struct nlattr *tb[NFCTH_POLICY_MAX + 1];
+       int err;
+
+       err = nla_parse_nested(tb, NFCTH_POLICY_MAX, attr,
+                              nfnl_cthelper_expect_pol);
+       if (err < 0)
+               return err;
+
+       if (!tb[NFCTH_POLICY_NAME] ||
+           !tb[NFCTH_POLICY_EXPECT_MAX] ||
+           !tb[NFCTH_POLICY_EXPECT_TIMEOUT])
+               return -EINVAL;
+
+       if (nla_strcmp(tb[NFCTH_POLICY_NAME], policy->name))
+               return -EBUSY;
+
+       new_policy->max_expected =
+               ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX]));
+       new_policy->timeout =
+               ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_TIMEOUT]));
+
+       return 0;
+}
+
+static int nfnl_cthelper_update_policy_all(struct nlattr *tb[],
+                                          struct nf_conntrack_helper *helper)
+{
+       struct nf_conntrack_expect_policy new_policy[helper->expect_class_max + 1];
+       struct nf_conntrack_expect_policy *policy;
+       int i, err;
+
+       /* Check first that all policy attributes are well-formed, so we don't
+        * leave things in inconsistent state on errors.
+        */
+       for (i = 0; i < helper->expect_class_max + 1; i++) {
+
+               if (!tb[NFCTH_POLICY_SET + i])
+                       return -EINVAL;
+
+               err = nfnl_cthelper_update_policy_one(&helper->expect_policy[i],
+                                                     &new_policy[i],
+                                                     tb[NFCTH_POLICY_SET + i]);
+               if (err < 0)
+                       return err;
+       }
+       /* Now we can safely update them: class i takes its own new_policy[i]. */
+       for (i = 0; i < helper->expect_class_max + 1; i++) {
+               policy = (struct nf_conntrack_expect_policy *)
+                               &helper->expect_policy[i];
+               policy->max_expected = new_policy[i].max_expected;
+               policy->timeout = new_policy[i].timeout;
+       }
+
+       return 0;
+}
+
+static int nfnl_cthelper_update_policy(struct nf_conntrack_helper *helper,
+                                      const struct nlattr *attr)
+{
+       struct nlattr *tb[NFCTH_POLICY_SET_MAX + 1];
+       unsigned int class_max;
+       int err;
+
+       err = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr,
+                              nfnl_cthelper_expect_policy_set);
+       if (err < 0)
+               return err;
+
+       if (!tb[NFCTH_POLICY_SET_NUM])
+               return -EINVAL;
+
+       class_max = ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM]));
+       if (helper->expect_class_max + 1 != class_max)
+               return -EBUSY;
+
+       return nfnl_cthelper_update_policy_all(tb, helper);
+}
+
+static int
 nfnl_cthelper_update(const struct nlattr * const tb[],
                     struct nf_conntrack_helper *helper)
 {
@@ -263,8 +360,7 @@ nfnl_cthelper_update(const struct nlattr * const tb[],
                return -EBUSY;
 
        if (tb[NFCTH_POLICY]) {
-               ret = nfnl_cthelper_parse_expect_policy(helper,
-                                                       tb[NFCTH_POLICY]);
+               ret = nfnl_cthelper_update_policy(helper, tb[NFCTH_POLICY]);
                if (ret < 0)
                        return ret;
        }
@@ -293,7 +389,8 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
        const char *helper_name;
        struct nf_conntrack_helper *cur, *helper = NULL;
        struct nf_conntrack_tuple tuple;
-       int ret = 0, i;
+       struct nfnl_cthelper *nlcth;
+       int ret = 0;
 
        if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE])
                return -EINVAL;
@@ -304,31 +401,22 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
        if (ret < 0)
                return ret;
 
-       rcu_read_lock();
-       for (i = 0; i < nf_ct_helper_hsize && !helper; i++) {
-               hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) {
+       list_for_each_entry(nlcth, &nfnl_cthelper_list, list) {
+               cur = &nlcth->helper;
 
-                       /* skip non-userspace conntrack helpers. */
-                       if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
-                               continue;
+               if (strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN))
+                       continue;
 
-                       if (strncmp(cur->name, helper_name,
-                                       NF_CT_HELPER_NAME_LEN) != 0)
-                               continue;
+               if ((tuple.src.l3num != cur->tuple.src.l3num ||
+                    tuple.dst.protonum != cur->tuple.dst.protonum))
+                       continue;
 
-                       if ((tuple.src.l3num != cur->tuple.src.l3num ||
-                            tuple.dst.protonum != cur->tuple.dst.protonum))
-                               continue;
+               if (nlh->nlmsg_flags & NLM_F_EXCL)
+                       return -EEXIST;
 
-                       if (nlh->nlmsg_flags & NLM_F_EXCL) {
-                               ret = -EEXIST;
-                               goto err;
-                       }
-                       helper = cur;
-                       break;
-               }
+               helper = cur;
+               break;
        }
-       rcu_read_unlock();
 
        if (helper == NULL)
                ret = nfnl_cthelper_create(tb, &tuple);
@@ -336,9 +424,6 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
                ret = nfnl_cthelper_update(tb, helper);
 
        return ret;
-err:
-       rcu_read_unlock();
-       return ret;
 }
 
 static int
@@ -377,10 +462,10 @@ nfnl_cthelper_dump_policy(struct sk_buff *skb,
                goto nla_put_failure;
 
        if (nla_put_be32(skb, NFCTH_POLICY_SET_NUM,
-                        htonl(helper->expect_class_max)))
+                        htonl(helper->expect_class_max + 1)))
                goto nla_put_failure;
 
-       for (i=0; i<helper->expect_class_max; i++) {
+       for (i = 0; i < helper->expect_class_max + 1; i++) {
                nest_parms2 = nla_nest_start(skb,
                                (NFCTH_POLICY_SET+i) | NLA_F_NESTED);
                if (nest_parms2 == NULL)
@@ -502,11 +587,12 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
                             struct sk_buff *skb, const struct nlmsghdr *nlh,
                             const struct nlattr * const tb[])
 {
-       int ret = -ENOENT, i;
+       int ret = -ENOENT;
        struct nf_conntrack_helper *cur;
        struct sk_buff *skb2;
        char *helper_name = NULL;
        struct nf_conntrack_tuple tuple;
+       struct nfnl_cthelper *nlcth;
        bool tuple_set = false;
 
        if (nlh->nlmsg_flags & NLM_F_DUMP) {
@@ -527,45 +613,39 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
                tuple_set = true;
        }
 
-       for (i = 0; i < nf_ct_helper_hsize; i++) {
-               hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) {
+       list_for_each_entry(nlcth, &nfnl_cthelper_list, list) {
+               cur = &nlcth->helper;
+               if (helper_name &&
+                   strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN))
+                       continue;
 
-                       /* skip non-userspace conntrack helpers. */
-                       if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
-                               continue;
+               if (tuple_set &&
+                   (tuple.src.l3num != cur->tuple.src.l3num ||
+                    tuple.dst.protonum != cur->tuple.dst.protonum))
+                       continue;
 
-                       if (helper_name && strncmp(cur->name, helper_name,
-                                               NF_CT_HELPER_NAME_LEN) != 0) {
-                               continue;
-                       }
-                       if (tuple_set &&
-                           (tuple.src.l3num != cur->tuple.src.l3num ||
-                            tuple.dst.protonum != cur->tuple.dst.protonum))
-                               continue;
-
-                       skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-                       if (skb2 == NULL) {
-                               ret = -ENOMEM;
-                               break;
-                       }
+               skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+               if (skb2 == NULL) {
+                       ret = -ENOMEM;
+                       break;
+               }
 
-                       ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid,
-                                               nlh->nlmsg_seq,
-                                               NFNL_MSG_TYPE(nlh->nlmsg_type),
-                                               NFNL_MSG_CTHELPER_NEW, cur);
-                       if (ret <= 0) {
-                               kfree_skb(skb2);
-                               break;
-                       }
+               ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid,
+                                             nlh->nlmsg_seq,
+                                             NFNL_MSG_TYPE(nlh->nlmsg_type),
+                                             NFNL_MSG_CTHELPER_NEW, cur);
+               if (ret <= 0) {
+                       kfree_skb(skb2);
+                       break;
+               }
 
-                       ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
-                                               MSG_DONTWAIT);
-                       if (ret > 0)
-                               ret = 0;
+               ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
+                                     MSG_DONTWAIT);
+               if (ret > 0)
+                       ret = 0;
 
-                       /* this avoids a loop in nfnetlink. */
-                       return ret == -EAGAIN ? -ENOBUFS : ret;
-               }
+               /* this avoids a loop in nfnetlink. */
+               return ret == -EAGAIN ? -ENOBUFS : ret;
        }
        return ret;
 }
@@ -576,10 +656,10 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl,
 {
        char *helper_name = NULL;
        struct nf_conntrack_helper *cur;
-       struct hlist_node *tmp;
        struct nf_conntrack_tuple tuple;
        bool tuple_set = false, found = false;
-       int i, j = 0, ret;
+       struct nfnl_cthelper *nlcth, *n;
+       int j = 0, ret;
 
        if (tb[NFCTH_NAME])
                helper_name = nla_data(tb[NFCTH_NAME]);
@@ -592,28 +672,27 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl,
                tuple_set = true;
        }
 
-       for (i = 0; i < nf_ct_helper_hsize; i++) {
-               hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i],
-                                                               hnode) {
-                       /* skip non-userspace conntrack helpers. */
-                       if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
-                               continue;
+       list_for_each_entry_safe(nlcth, n, &nfnl_cthelper_list, list) {
+               cur = &nlcth->helper;
+               j++;
 
-                       j++;
+               if (helper_name &&
+                   strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN))
+                       continue;
 
-                       if (helper_name && strncmp(cur->name, helper_name,
-                                               NF_CT_HELPER_NAME_LEN) != 0) {
-                               continue;
-                       }
-                       if (tuple_set &&
-                           (tuple.src.l3num != cur->tuple.src.l3num ||
-                            tuple.dst.protonum != cur->tuple.dst.protonum))
-                               continue;
+               if (tuple_set &&
+                   (tuple.src.l3num != cur->tuple.src.l3num ||
+                    tuple.dst.protonum != cur->tuple.dst.protonum))
+                       continue;
 
-                       found = true;
-                       nf_conntrack_helper_unregister(cur);
-               }
+               found = true;
+               nf_conntrack_helper_unregister(cur);
+               kfree(cur->expect_policy);
+
+               list_del(&nlcth->list);
+               kfree(nlcth);
        }
+
        /* Make sure we return success if we flush and there is no helpers */
        return (found || j == 0) ? 0 : -ENOENT;
 }
@@ -662,20 +741,16 @@ err_out:
 static void __exit nfnl_cthelper_exit(void)
 {
        struct nf_conntrack_helper *cur;
-       struct hlist_node *tmp;
-       int i;
+       struct nfnl_cthelper *nlcth, *n;
 
        nfnetlink_subsys_unregister(&nfnl_cthelper_subsys);
 
-       for (i=0; i<nf_ct_helper_hsize; i++) {
-               hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i],
-                                                                       hnode) {
-                       /* skip non-userspace conntrack helpers. */
-                       if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
-                               continue;
+       list_for_each_entry_safe(nlcth, n, &nfnl_cthelper_list, list) {
+               cur = &nlcth->helper;
 
-                       nf_conntrack_helper_unregister(cur);
-               }
+               nf_conntrack_helper_unregister(cur);
+               kfree(cur->expect_policy);
+               kfree(nlcth);
        }
 }
 
index 139e086..47d6656 100644 (file)
@@ -646,8 +646,8 @@ static void __exit cttimeout_exit(void)
 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
        RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
        RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
+       synchronize_rcu();
 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-       rcu_barrier();
 }
 
 module_init(cttimeout_init);
index 3ee0b8a..933509e 100644 (file)
@@ -443,7 +443,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
        skb = alloc_skb(size, GFP_ATOMIC);
        if (!skb) {
                skb_tx_error(entskb);
-               return NULL;
+               goto nlmsg_failure;
        }
 
        nlh = nlmsg_put(skb, 0, 0,
@@ -452,7 +452,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
        if (!nlh) {
                skb_tx_error(entskb);
                kfree_skb(skb);
-               return NULL;
+               goto nlmsg_failure;
        }
        nfmsg = nlmsg_data(nlh);
        nfmsg->nfgen_family = entry->state.pf;
@@ -598,12 +598,17 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
        }
 
        nlh->nlmsg_len = skb->len;
+       if (seclen)
+               security_release_secctx(secdata, seclen);
        return skb;
 
 nla_put_failure:
        skb_tx_error(entskb);
        kfree_skb(skb);
        net_err_ratelimited("nf_queue: error creating packet message\n");
+nlmsg_failure:
+       if (seclen)
+               security_release_secctx(secdata, seclen);
        return NULL;
 }
 
index eb2721a..c4dad12 100644 (file)
@@ -21,6 +21,7 @@ struct nft_hash {
        enum nft_registers      sreg:8;
        enum nft_registers      dreg:8;
        u8                      len;
+       bool                    autogen_seed:1;
        u32                     modulus;
        u32                     seed;
        u32                     offset;
@@ -82,10 +83,12 @@ static int nft_hash_init(const struct nft_ctx *ctx,
        if (priv->offset + priv->modulus - 1 < priv->offset)
                return -EOVERFLOW;
 
-       if (tb[NFTA_HASH_SEED])
+       if (tb[NFTA_HASH_SEED]) {
                priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED]));
-       else
+       } else {
+               priv->autogen_seed = true;
                get_random_bytes(&priv->seed, sizeof(priv->seed));
+       }
 
        return nft_validate_register_load(priv->sreg, len) &&
               nft_validate_register_store(ctx, priv->dreg, NULL,
@@ -105,7 +108,8 @@ static int nft_hash_dump(struct sk_buff *skb,
                goto nla_put_failure;
        if (nla_put_be32(skb, NFTA_HASH_MODULUS, htonl(priv->modulus)))
                goto nla_put_failure;
-       if (nla_put_be32(skb, NFTA_HASH_SEED, htonl(priv->seed)))
+       if (!priv->autogen_seed &&
+           nla_put_be32(skb, NFTA_HASH_SEED, htonl(priv->seed)))
                goto nla_put_failure;
        if (priv->offset != 0)
                if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset)))
index 27241a7..c64aca6 100644 (file)
@@ -104,7 +104,7 @@ tcpmss_mangle_packet(struct sk_buff *skb,
        tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
        tcp_hdrlen = tcph->doff * 4;
 
-       if (len < tcp_hdrlen)
+       if (len < tcp_hdrlen || tcp_hdrlen < sizeof(struct tcphdr))
                return -1;
 
        if (info->mss == XT_TCPMSS_CLAMP_PMTU) {
@@ -152,6 +152,10 @@ tcpmss_mangle_packet(struct sk_buff *skb,
        if (len > tcp_hdrlen)
                return 0;
 
+       /* tcph->doff has 4 bits, do not wrap it to 0 */
+       if (tcp_hdrlen >= 15 * 4)
+               return 0;
+
        /*
         * MSS Option not found ?! add it..
         */
index 80cb7ba..df7f1df 100644 (file)
@@ -393,7 +393,8 @@ tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
 
        rcu_read_lock();
        indev = __in6_dev_get(skb->dev);
-       if (indev)
+       if (indev) {
+               read_lock_bh(&indev->lock);
                list_for_each_entry(ifa, &indev->addr_list, if_list) {
                        if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
                                continue;
@@ -401,6 +402,8 @@ tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
                        laddr = &ifa->addr;
                        break;
                }
+               read_unlock_bh(&indev->lock);
+       }
        rcu_read_unlock();
 
        return laddr ? laddr : daddr;
index e0a8777..7b2c2fc 100644 (file)
@@ -643,8 +643,8 @@ static bool skb_nfct_cached(struct net *net,
                 */
                if (nf_ct_is_confirmed(ct))
                        nf_ct_delete(ct, 0, 0);
-               else
-                       nf_conntrack_put(&ct->ct_general);
+
+               nf_conntrack_put(&ct->ct_general);
                nf_ct_set(skb, NULL, 0);
                return false;
        }
index 9d4bb8e..3f76cb7 100644 (file)
@@ -527,7 +527,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 
        /* Link layer. */
        clear_vlan(key);
-       if (key->mac_proto == MAC_PROTO_NONE) {
+       if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) {
                if (unlikely(eth_type_vlan(skb->protocol)))
                        return -EINVAL;
 
@@ -745,7 +745,13 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 
 int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
 {
-       return key_extract(skb, key);
+       int res;
+
+       res = key_extract(skb, key);
+       if (!res)
+               key->mac_proto &= ~SW_FLOW_KEY_INVALID;
+
+       return res;
 }
 
 static int key_extract_mac_proto(struct sk_buff *skb)
index a0dbe7c..ea81ccf 100644 (file)
@@ -3665,6 +3665,8 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
                        return -EBUSY;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;
+               if (val > INT_MAX)
+                       return -EINVAL;
                po->tp_reserve = val;
                return 0;
        }
@@ -3834,6 +3836,8 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
        case PACKET_HDRLEN:
                if (len > sizeof(int))
                        len = sizeof(int);
+               if (len < sizeof(int))
+                       return -EINVAL;
                if (copy_from_user(&val, optval, len))
                        return -EFAULT;
                switch (val) {
@@ -4193,8 +4197,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
                if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
                        goto out;
                if (po->tp_version >= TPACKET_V3 &&
-                   (int)(req->tp_block_size -
-                         BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
+                   req->tp_block_size <=
+                         BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv))
                        goto out;
                if (unlikely(req->tp_frame_size < po->tp_hdrlen +
                                        po->tp_reserve))
@@ -4205,6 +4209,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
                rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
                if (unlikely(rb->frames_per_block == 0))
                        goto out;
+               if (unlikely(req->tp_block_size > UINT_MAX / req->tp_block_nr))
+                       goto out;
                if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
                                        req->tp_frame_nr))
                        goto out;
index ae5ac17..9da7368 100644 (file)
@@ -658,7 +658,9 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
        }
 
        if (plen != len) {
-               skb_pad(skb, plen - len);
+               rc = skb_pad(skb, plen - len);
+               if (rc)
+                       goto out_node;
                skb_put(skb, plen - len);
        }
 
index 2256900..431404d 100644 (file)
@@ -84,13 +84,10 @@ static struct ctl_table rds_tcp_sysctl_table[] = {
 /* doing it this way avoids calling tcp_sk() */
 void rds_tcp_nonagle(struct socket *sock)
 {
-       mm_segment_t oldfs = get_fs();
        int val = 1;
 
-       set_fs(KERNEL_DS);
-       sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, (char __user *)&val,
+       kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (void *)&val,
                              sizeof(val));
-       set_fs(oldfs);
 }
 
 u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
index dcf4742..52d11d7 100644 (file)
 
 static void rds_tcp_cork(struct socket *sock, int val)
 {
-       mm_segment_t oldfs;
-
-       oldfs = get_fs();
-       set_fs(KERNEL_DS);
-       sock->ops->setsockopt(sock, SOL_TCP, TCP_CORK, (char __user *)&val,
-                             sizeof(val));
-       set_fs(oldfs);
+       kernel_setsockopt(sock, SOL_TCP, TCP_CORK, (void *)&val, sizeof(val));
 }
 
 void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp)
index b70aa57..e05b924 100644 (file)
@@ -529,20 +529,20 @@ errout:
        return err;
 }
 
-static int nla_memdup_cookie(struct tc_action *a, struct nlattr **tb)
+static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
 {
-       a->act_cookie = kzalloc(sizeof(*a->act_cookie), GFP_KERNEL);
-       if (!a->act_cookie)
-               return -ENOMEM;
+       struct tc_cookie *c = kzalloc(sizeof(*c), GFP_KERNEL);
+       if (!c)
+               return NULL;
 
-       a->act_cookie->data = nla_memdup(tb[TCA_ACT_COOKIE], GFP_KERNEL);
-       if (!a->act_cookie->data) {
-               kfree(a->act_cookie);
-               return -ENOMEM;
+       c->data = nla_memdup(tb[TCA_ACT_COOKIE], GFP_KERNEL);
+       if (!c->data) {
+               kfree(c);
+               return NULL;
        }
-       a->act_cookie->len = nla_len(tb[TCA_ACT_COOKIE]);
+       c->len = nla_len(tb[TCA_ACT_COOKIE]);
 
-       return 0;
+       return c;
 }
 
 struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
@@ -551,6 +551,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
 {
        struct tc_action *a;
        struct tc_action_ops *a_o;
+       struct tc_cookie *cookie = NULL;
        char act_name[IFNAMSIZ];
        struct nlattr *tb[TCA_ACT_MAX + 1];
        struct nlattr *kind;
@@ -566,6 +567,18 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
                        goto err_out;
                if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ)
                        goto err_out;
+               if (tb[TCA_ACT_COOKIE]) {
+                       int cklen = nla_len(tb[TCA_ACT_COOKIE]);
+
+                       if (cklen > TC_COOKIE_MAX_SIZE)
+                               goto err_out;
+
+                       cookie = nla_memdup_cookie(tb);
+                       if (!cookie) {
+                               err = -ENOMEM;
+                               goto err_out;
+                       }
+               }
        } else {
                err = -EINVAL;
                if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ)
@@ -604,20 +617,12 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
        if (err < 0)
                goto err_mod;
 
-       if (tb[TCA_ACT_COOKIE]) {
-               int cklen = nla_len(tb[TCA_ACT_COOKIE]);
-
-               if (cklen > TC_COOKIE_MAX_SIZE) {
-                       err = -EINVAL;
-                       tcf_hash_release(a, bind);
-                       goto err_mod;
-               }
-
-               if (nla_memdup_cookie(a, tb) < 0) {
-                       err = -ENOMEM;
-                       tcf_hash_release(a, bind);
-                       goto err_mod;
+       if (name == NULL && tb[TCA_ACT_COOKIE]) {
+               if (a->act_cookie) {
+                       kfree(a->act_cookie->data);
+                       kfree(a->act_cookie);
                }
+               a->act_cookie = cookie;
        }
 
        /* module count goes up only when brand new policy is created
@@ -632,6 +637,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
 err_mod:
        module_put(a_o->owner);
 err_out:
+       if (cookie) {
+               kfree(cookie->data);
+               kfree(cookie);
+       }
        return ERR_PTR(err);
 }
 
index b052b27..1a2f9e9 100644 (file)
@@ -794,7 +794,7 @@ static void attach_default_qdiscs(struct net_device *dev)
                }
        }
 #ifdef CONFIG_NET_SCHED
-       if (dev->qdisc)
+       if (dev->qdisc != &noop_qdisc)
                qdisc_hash_add(dev->qdisc);
 #endif
 }
index 0439a1a..a9708da 100644 (file)
@@ -246,6 +246,9 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
        if (!sctp_ulpq_init(&asoc->ulpq, asoc))
                goto fail_init;
 
+       if (sctp_stream_new(asoc, gfp))
+               goto fail_init;
+
        /* Assume that peer would support both address types unless we are
         * told otherwise.
         */
@@ -264,7 +267,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
        /* AUTH related initializations */
        INIT_LIST_HEAD(&asoc->endpoint_shared_keys);
        if (sctp_auth_asoc_copy_shkeys(ep, asoc, gfp))
-               goto fail_init;
+               goto stream_free;
 
        asoc->active_key_id = ep->active_key_id;
        asoc->prsctp_enable = ep->prsctp_enable;
@@ -287,6 +290,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 
        return asoc;
 
+stream_free:
+       sctp_stream_free(asoc->stream);
 fail_init:
        sock_put(asoc->base.sk);
        sctp_endpoint_put(asoc->ep);
@@ -1407,7 +1412,7 @@ sctp_assoc_choose_alter_transport(struct sctp_association *asoc,
 /* Update the association's pmtu and frag_point by going through all the
  * transports. This routine is called when a transport's PMTU has changed.
  */
-void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc)
+void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
 {
        struct sctp_transport *t;
        __u32 pmtu = 0;
@@ -1419,8 +1424,8 @@ void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc)
        list_for_each_entry(t, &asoc->peer.transport_addr_list,
                                transports) {
                if (t->pmtu_pending && t->dst) {
-                       sctp_transport_update_pmtu(sk, t,
-                                                  SCTP_TRUNC4(dst_mtu(t->dst)));
+                       sctp_transport_update_pmtu(
+                                       t, SCTP_TRUNC4(dst_mtu(t->dst)));
                        t->pmtu_pending = 0;
                }
                if (!pmtu || (t->pathmtu < pmtu))
index 2a28ab2..0e06a27 100644 (file)
@@ -401,10 +401,10 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
 
        if (t->param_flags & SPP_PMTUD_ENABLE) {
                /* Update transports view of the MTU */
-               sctp_transport_update_pmtu(sk, t, pmtu);
+               sctp_transport_update_pmtu(t, pmtu);
 
                /* Update association pmtu. */
-               sctp_assoc_sync_pmtu(sk, asoc);
+               sctp_assoc_sync_pmtu(asoc);
        }
 
        /* Retransmit with the new pmtu setting.
index 1224421..1409a87 100644 (file)
@@ -86,43 +86,53 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
 {
        struct sctp_transport *tp = packet->transport;
        struct sctp_association *asoc = tp->asoc;
+       struct sock *sk;
 
        pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag);
-
        packet->vtag = vtag;
 
-       if (asoc && tp->dst) {
-               struct sock *sk = asoc->base.sk;
-
-               rcu_read_lock();
-               if (__sk_dst_get(sk) != tp->dst) {
-                       dst_hold(tp->dst);
-                       sk_setup_caps(sk, tp->dst);
-               }
-
-               if (sk_can_gso(sk)) {
-                       struct net_device *dev = tp->dst->dev;
+       /* do the following jobs only once for a flush schedule */
+       if (!sctp_packet_empty(packet))
+               return;
 
-                       packet->max_size = dev->gso_max_size;
-               } else {
-                       packet->max_size = asoc->pathmtu;
-               }
-               rcu_read_unlock();
+       /* set packet max_size with pathmtu */
+       packet->max_size = tp->pathmtu;
+       if (!asoc)
+               return;
 
-       } else {
-               packet->max_size = tp->pathmtu;
+       /* update dst or transport pathmtu if in need */
+       sk = asoc->base.sk;
+       if (!sctp_transport_dst_check(tp)) {
+               sctp_transport_route(tp, NULL, sctp_sk(sk));
+               if (asoc->param_flags & SPP_PMTUD_ENABLE)
+                       sctp_assoc_sync_pmtu(asoc);
+       } else if (!sctp_transport_pmtu_check(tp)) {
+               if (asoc->param_flags & SPP_PMTUD_ENABLE)
+                       sctp_assoc_sync_pmtu(asoc);
        }
 
-       if (ecn_capable && sctp_packet_empty(packet)) {
-               struct sctp_chunk *chunk;
+       /* If there is a prepend chunk stick it on the list before
+        * any other chunks get appended.
+        */
+       if (ecn_capable) {
+               struct sctp_chunk *chunk = sctp_get_ecne_prepend(asoc);
 
-               /* If there a is a prepend chunk stick it on the list before
-                * any other chunks get appended.
-                */
-               chunk = sctp_get_ecne_prepend(asoc);
                if (chunk)
                        sctp_packet_append_chunk(packet, chunk);
        }
+
+       if (!tp->dst)
+               return;
+
+       /* set packet max_size with gso_max_size if gso is enabled */
+       rcu_read_lock();
+       if (__sk_dst_get(sk) != tp->dst) {
+               dst_hold(tp->dst);
+               sk_setup_caps(sk, tp->dst);
+       }
+       packet->max_size = sk_can_gso(sk) ? tp->dst->dev->gso_max_size
+                                         : asoc->pathmtu;
+       rcu_read_unlock();
 }
 
 /* Initialize the packet structure. */
@@ -582,12 +592,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
        sh->vtag = htonl(packet->vtag);
        sh->checksum = 0;
 
-       /* update dst if in need */
-       if (!sctp_transport_dst_check(tp)) {
-               sctp_transport_route(tp, NULL, sctp_sk(sk));
-               if (asoc && asoc->param_flags & SPP_PMTUD_ENABLE)
-                       sctp_assoc_sync_pmtu(sk, asoc);
-       }
+       /* drop packet if no dst */
        dst = dst_clone(tp->dst);
        if (!dst) {
                IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
@@ -704,7 +709,7 @@ static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet,
         */
 
        if ((sctp_sk(asoc->base.sk)->nodelay || inflight == 0) &&
-           !chunk->msg->force_delay)
+           !asoc->force_delay)
                /* Nothing unacked */
                return SCTP_XMIT_OK;
 
index 025ccff..8081476 100644 (file)
@@ -1026,8 +1026,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
                        /* RFC 2960 6.5 Every DATA chunk MUST carry a valid
                         * stream identifier.
                         */
-                       if (chunk->sinfo.sinfo_stream >=
-                           asoc->c.sinit_num_ostreams) {
+                       if (chunk->sinfo.sinfo_stream >= asoc->stream->outcnt) {
 
                                /* Mark as failed send. */
                                sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM);
index 206377f..a0b29d4 100644 (file)
@@ -361,8 +361,8 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
        sctp_seq_dump_remote_addrs(seq, assoc);
        seq_printf(seq, "\t%8lu %5d %5d %4d %4d %4d %8d "
                   "%8d %8d %8d %8d",
-               assoc->hbinterval, assoc->c.sinit_max_instreams,
-               assoc->c.sinit_num_ostreams, assoc->max_retrans,
+               assoc->hbinterval, assoc->stream->incnt,
+               assoc->stream->outcnt, assoc->max_retrans,
                assoc->init_retries, assoc->shutdown_retries,
                assoc->rtx_data_chunks,
                atomic_read(&sk->sk_wmem_alloc),
index 969a30c..118faff 100644 (file)
@@ -2460,15 +2460,10 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
         * association.
         */
        if (!asoc->temp) {
-               int error;
-
-               asoc->stream = sctp_stream_new(asoc->c.sinit_max_instreams,
-                                              asoc->c.sinit_num_ostreams, gfp);
-               if (!asoc->stream)
+               if (sctp_stream_init(asoc, gfp))
                        goto clean_up;
 
-               error = sctp_assoc_set_id(asoc, gfp);
-               if (error)
+               if (sctp_assoc_set_id(asoc, gfp))
                        goto clean_up;
        }
 
index e03bb1a..24c6ccc 100644 (file)
@@ -3946,7 +3946,7 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(struct net *net,
 
        /* Silently discard the chunk if stream-id is not valid */
        sctp_walk_fwdtsn(skip, chunk) {
-               if (ntohs(skip->stream) >= asoc->c.sinit_max_instreams)
+               if (ntohs(skip->stream) >= asoc->stream->incnt)
                        goto discard_noforce;
        }
 
@@ -4017,7 +4017,7 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn_fast(
 
        /* Silently discard the chunk if stream-id is not valid */
        sctp_walk_fwdtsn(skip, chunk) {
-               if (ntohs(skip->stream) >= asoc->c.sinit_max_instreams)
+               if (ntohs(skip->stream) >= asoc->stream->incnt)
                        goto gen_shutdown;
        }
 
@@ -6353,7 +6353,7 @@ static int sctp_eat_data(const struct sctp_association *asoc,
         * and discard the DATA chunk.
         */
        sid = ntohs(data_hdr->stream);
-       if (sid >= asoc->c.sinit_max_instreams) {
+       if (sid >= asoc->stream->incnt) {
                /* Mark tsn as received even though we drop it */
                sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_TSN, SCTP_U32(tsn));
 
index 0f378ea..d9d4c92 100644 (file)
@@ -1907,7 +1907,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
        }
 
        if (asoc->pmtu_pending)
-               sctp_assoc_pending_pmtu(sk, asoc);
+               sctp_assoc_pending_pmtu(asoc);
 
        /* If fragmentation is disabled and the message length exceeds the
         * association fragmentation point, return EMSGSIZE.  The I-D
@@ -1920,7 +1920,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
        }
 
        /* Check for invalid stream. */
-       if (sinfo->sinfo_stream >= asoc->c.sinit_num_ostreams) {
+       if (sinfo->sinfo_stream >= asoc->stream->outcnt) {
                err = -EINVAL;
                goto out_free;
        }
@@ -1965,7 +1965,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
                err = PTR_ERR(datamsg);
                goto out_free;
        }
-       datamsg->force_delay = !!(msg->msg_flags & MSG_MORE);
+       asoc->force_delay = !!(msg->msg_flags & MSG_MORE);
 
        /* Now send the (possibly) fragmented message. */
        list_for_each_entry(chunk, &datamsg->chunks, frag_list) {
@@ -2435,7 +2435,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
        if ((params->spp_flags & SPP_PMTUD_DISABLE) && params->spp_pathmtu) {
                if (trans) {
                        trans->pathmtu = params->spp_pathmtu;
-                       sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc);
+                       sctp_assoc_sync_pmtu(asoc);
                } else if (asoc) {
                        asoc->pathmtu = params->spp_pathmtu;
                } else {
@@ -2451,7 +2451,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
                                (trans->param_flags & ~SPP_PMTUD) | pmtud_change;
                        if (update) {
                                sctp_transport_pmtu(trans, sctp_opt2sk(sp));
-                               sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc);
+                               sctp_assoc_sync_pmtu(asoc);
                        }
                } else if (asoc) {
                        asoc->param_flags =
@@ -4461,8 +4461,8 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
        info->sctpi_rwnd = asoc->a_rwnd;
        info->sctpi_unackdata = asoc->unack_data;
        info->sctpi_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map);
-       info->sctpi_instrms = asoc->c.sinit_max_instreams;
-       info->sctpi_outstrms = asoc->c.sinit_num_ostreams;
+       info->sctpi_instrms = asoc->stream->incnt;
+       info->sctpi_outstrms = asoc->stream->outcnt;
        list_for_each(pos, &asoc->base.inqueue.in_chunk_list)
                info->sctpi_inqueue++;
        list_for_each(pos, &asoc->outqueue.out_chunk_list)
@@ -4691,8 +4691,8 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
        status.sstat_unackdata = asoc->unack_data;
 
        status.sstat_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map);
-       status.sstat_instrms = asoc->c.sinit_max_instreams;
-       status.sstat_outstrms = asoc->c.sinit_num_ostreams;
+       status.sstat_instrms = asoc->stream->incnt;
+       status.sstat_outstrms = asoc->stream->outcnt;
        status.sstat_fragmentation_point = asoc->frag_point;
        status.sstat_primary.spinfo_assoc_id = sctp_assoc2id(transport->asoc);
        memcpy(&status.sstat_primary.spinfo_address, &transport->ipaddr,
@@ -7034,6 +7034,9 @@ int sctp_inet_listen(struct socket *sock, int backlog)
        if (sock->state != SS_UNCONNECTED)
                goto out;
 
+       if (!sctp_sstate(sk, LISTENING) && !sctp_sstate(sk, CLOSED))
+               goto out;
+
        /* If backlog is zero, disable listening. */
        if (!backlog) {
                if (sctp_sstate(sk, CLOSED))
index 1c6cc04..bbed997 100644 (file)
 #include <net/sctp/sctp.h>
 #include <net/sctp/sm.h>
 
-struct sctp_stream *sctp_stream_new(__u16 incnt, __u16 outcnt, gfp_t gfp)
+int sctp_stream_new(struct sctp_association *asoc, gfp_t gfp)
 {
        struct sctp_stream *stream;
        int i;
 
        stream = kzalloc(sizeof(*stream), gfp);
        if (!stream)
-               return NULL;
+               return -ENOMEM;
 
-       stream->outcnt = outcnt;
+       stream->outcnt = asoc->c.sinit_num_ostreams;
        stream->out = kcalloc(stream->outcnt, sizeof(*stream->out), gfp);
        if (!stream->out) {
                kfree(stream);
-               return NULL;
+               return -ENOMEM;
        }
        for (i = 0; i < stream->outcnt; i++)
                stream->out[i].state = SCTP_STREAM_OPEN;
 
-       stream->incnt = incnt;
+       asoc->stream = stream;
+
+       return 0;
+}
+
+int sctp_stream_init(struct sctp_association *asoc, gfp_t gfp)
+{
+       struct sctp_stream *stream = asoc->stream;
+       int i;
+
+       /* Initial stream->out size may be very big, so free it and alloc
+        * a new one with new outcnt to save memory.
+        */
+       kfree(stream->out);
+       stream->outcnt = asoc->c.sinit_num_ostreams;
+       stream->out = kcalloc(stream->outcnt, sizeof(*stream->out), gfp);
+       if (!stream->out)
+               goto nomem;
+
+       for (i = 0; i < stream->outcnt; i++)
+               stream->out[i].state = SCTP_STREAM_OPEN;
+
+       stream->incnt = asoc->c.sinit_max_instreams;
        stream->in = kcalloc(stream->incnt, sizeof(*stream->in), gfp);
        if (!stream->in) {
                kfree(stream->out);
-               kfree(stream);
-               return NULL;
+               goto nomem;
        }
 
-       return stream;
+       return 0;
+
+nomem:
+       asoc->stream = NULL;
+       kfree(stream);
+
+       return -ENOMEM;
 }
 
 void sctp_stream_free(struct sctp_stream *stream)
index 3379668..721eeeb 100644 (file)
@@ -251,14 +251,13 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
                transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
 }
 
-void sctp_transport_update_pmtu(struct sock *sk, struct sctp_transport *t, u32 pmtu)
+void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
 {
-       struct dst_entry *dst;
+       struct dst_entry *dst = sctp_transport_dst_check(t);
 
        if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
                pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n",
-                       __func__, pmtu,
-                       SCTP_DEFAULT_MINSEGMENT);
+                       __func__, pmtu, SCTP_DEFAULT_MINSEGMENT);
                /* Use default minimum segment size and disable
                 * pmtu discovery on this transport.
                 */
@@ -267,17 +266,13 @@ void sctp_transport_update_pmtu(struct sock *sk, struct sctp_transport *t, u32 p
                t->pathmtu = pmtu;
        }
 
-       dst = sctp_transport_dst_check(t);
-       if (!dst)
-               t->af_specific->get_dst(t, &t->saddr, &t->fl, sk);
-
        if (dst) {
-               dst->ops->update_pmtu(dst, sk, NULL, pmtu);
-
+               dst->ops->update_pmtu(dst, t->asoc->base.sk, NULL, pmtu);
                dst = sctp_transport_dst_check(t);
-               if (!dst)
-                       t->af_specific->get_dst(t, &t->saddr, &t->fl, sk);
        }
+
+       if (!dst)
+               t->af_specific->get_dst(t, &t->saddr, &t->fl, t->asoc->base.sk);
 }
 
 /* Caches the dst entry and source address for a transport's destination
index 7130e73..bdce99f 100644 (file)
@@ -866,6 +866,14 @@ static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
        if (!tsk_peer_msg(tsk, hdr))
                goto exit;
 
+       if (unlikely(msg_errcode(hdr))) {
+               tipc_set_sk_state(sk, TIPC_DISCONNECTING);
+               tipc_node_remove_conn(sock_net(sk), tsk_peer_node(tsk),
+                                     tsk_peer_port(tsk));
+               sk->sk_state_change(sk);
+               goto exit;
+       }
+
        tsk->probe_unacked = false;
 
        if (mtyp == CONN_PROBE) {
@@ -1083,7 +1091,7 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen)
                }
        } while (sent < dlen && !rc);
 
-       return rc ? rc : sent;
+       return sent ? sent : rc;
 }
 
 /**
@@ -1259,7 +1267,10 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop)
        struct sock *sk = sock->sk;
        DEFINE_WAIT(wait);
        long timeo = *timeop;
-       int err;
+       int err = sock_error(sk);
+
+       if (err)
+               return err;
 
        for (;;) {
                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
@@ -1281,6 +1292,10 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop)
                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        break;
+
+               err = sock_error(sk);
+               if (err)
+                       break;
        }
        finish_wait(sk_sleep(sk), &wait);
        *timeop = timeo;
@@ -1484,7 +1499,7 @@ restart:
        if (unlikely(flags & MSG_PEEK))
                goto exit;
 
-       tsk->rcv_unacked += tsk_inc(tsk, hlen + sz);
+       tsk->rcv_unacked += tsk_inc(tsk, hlen + msg_data_sz(msg));
        if (unlikely(tsk->rcv_unacked >= (tsk->rcv_win / 4)))
                tipc_sk_send_ack(tsk);
        tsk_advance_rx_queue(sk);
@@ -1551,6 +1566,8 @@ static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
        struct sock *sk = &tsk->sk;
        struct net *net = sock_net(sk);
        struct tipc_msg *hdr = buf_msg(skb);
+       u32 pport = msg_origport(hdr);
+       u32 pnode = msg_orignode(hdr);
 
        if (unlikely(msg_mcast(hdr)))
                return false;
@@ -1558,18 +1575,28 @@ static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
        switch (sk->sk_state) {
        case TIPC_CONNECTING:
                /* Accept only ACK or NACK message */
-               if (unlikely(!msg_connected(hdr)))
-                       return false;
+               if (unlikely(!msg_connected(hdr))) {
+                       if (pport != tsk_peer_port(tsk) ||
+                           pnode != tsk_peer_node(tsk))
+                               return false;
+
+                       tipc_set_sk_state(sk, TIPC_DISCONNECTING);
+                       sk->sk_err = ECONNREFUSED;
+                       sk->sk_state_change(sk);
+                       return true;
+               }
 
                if (unlikely(msg_errcode(hdr))) {
                        tipc_set_sk_state(sk, TIPC_DISCONNECTING);
                        sk->sk_err = ECONNREFUSED;
+                       sk->sk_state_change(sk);
                        return true;
                }
 
                if (unlikely(!msg_isdata(hdr))) {
                        tipc_set_sk_state(sk, TIPC_DISCONNECTING);
                        sk->sk_err = EINVAL;
+                       sk->sk_state_change(sk);
                        return true;
                }
 
@@ -1581,8 +1608,7 @@ static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
                        return true;
 
                /* If empty 'ACK-' message, wake up sleeping connect() */
-               if (waitqueue_active(sk_sleep(sk)))
-                       wake_up_interruptible(sk_sleep(sk));
+               sk->sk_data_ready(sk);
 
                /* 'ACK-' message is neither accepted nor rejected: */
                msg_set_dest_droppable(hdr, 1);
index 16b6b59..570a2b6 100644 (file)
@@ -132,12 +132,10 @@ static int wiphy_resume(struct device *dev)
        /* Age scan results with time spent in suspend */
        cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at);
 
-       if (rdev->ops->resume) {
-               rtnl_lock();
-               if (rdev->wiphy.registered)
-                       ret = rdev_resume(rdev);
-               rtnl_unlock();
-       }
+       rtnl_lock();
+       if (rdev->wiphy.registered && rdev->ops->resume)
+               ret = rdev_resume(rdev);
+       rtnl_unlock();
 
        return ret;
 }
index 46bdb4f..e23570b 100644 (file)
@@ -395,7 +395,7 @@ resume:
                if (xo)
                        xfrm_gro = xo->flags & XFRM_GRO;
 
-               err = x->inner_mode->afinfo->transport_finish(skb, async);
+               err = x->inner_mode->afinfo->transport_finish(skb, xfrm_gro || async);
                if (xfrm_gro) {
                        skb_dst_drop(skb);
                        gro_cells_receive(&gro_cells, skb);
index 236cbbc..dfc77b9 100644 (file)
@@ -1006,6 +1006,10 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
                err = -ESRCH;
 out:
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+
+       if (cnt)
+               xfrm_garbage_collect(net);
+
        return err;
 }
 EXPORT_SYMBOL(xfrm_policy_flush);
index 8571d76..d4d77b0 100644 (file)
@@ -141,8 +141,8 @@ static void dump_statx(struct statx *stx)
        if (stx->stx_mask & STATX_BTIME)
                print_time(" Birth: ", &stx->stx_btime);
 
-       if (stx->stx_attributes) {
-               unsigned char bits;
+       if (stx->stx_attributes_mask) {
+               unsigned char bits, mbits;
                int loop, byte;
 
                static char attr_representation[64 + 1] =
@@ -160,14 +160,18 @@ static void dump_statx(struct statx *stx)
                printf("Attributes: %016llx (", stx->stx_attributes);
                for (byte = 64 - 8; byte >= 0; byte -= 8) {
                        bits = stx->stx_attributes >> byte;
+                       mbits = stx->stx_attributes_mask >> byte;
                        for (loop = 7; loop >= 0; loop--) {
                                int bit = byte + loop;
 
-                               if (bits & 0x80)
+                               if (!(mbits & 0x80))
+                                       putchar('.');   /* Not supported */
+                               else if (bits & 0x80)
                                        putchar(attr_representation[63 - bit]);
                                else
-                                       putchar('-');
+                                       putchar('-');   /* Not set */
                                bits <<= 1;
+                               mbits <<= 1;
                        }
                        if (byte)
                                putchar(' ');
index 0a07f90..7234e61 100644 (file)
@@ -155,7 +155,7 @@ else
 # $(call addtree,-I$(obj)) locates .h files in srctree, from generated .c files
 #   and locates generated .h files
 # FIXME: Replace both with specific CFLAGS* statements in the makefiles
-__c_flags      = $(if $(obj),-I$(srctree)/$(src) -I$(obj)) \
+__c_flags      = $(if $(obj),$(call addtree,-I$(src)) -I$(obj)) \
                  $(call flags,_c_flags)
 __a_flags      = $(call flags,_a_flags)
 __cpp_flags     = $(call flags,_cpp_flags)
index 3033be7..9d37aa4 100755 (executable)
@@ -12,7 +12,6 @@
 #      sh64 port by Paul Mundt
 #      Random bits by Matt Mackall <mpm@selenic.com>
 #      M68k port by Geert Uytterhoeven and Andreas Schwab
-#      AVR32 port by Haavard Skinnemoen (Atmel)
 #      AArch64, PARISC ports by Kyle McMartin
 #      sparc port by Martin Habets <errandir_news@mph.eclipse.co.uk>
 #
@@ -51,10 +50,6 @@ my (@stack, $re, $dre, $x, $xs, $funcre);
        } elsif ($arch eq 'arm') {
                #c0008ffc:      e24dd064        sub     sp, sp, #100    ; 0x64
                $re = qr/.*sub.*sp, sp, #(([0-9]{2}|[3-9])[0-9]{2})/o;
-       } elsif ($arch eq 'avr32') {
-               #8000008a:       20 1d           sub sp,4
-               #80000ca8:       fa cd 05 b0     sub sp,sp,1456
-               $re = qr/^.*sub.*sp.*,([0-9]{1,8})/o;
        } elsif ($arch =~ /^x86(_64)?$/ || $arch =~ /^i[3456]86$/) {
                #c0105234:       81 ec ac 05 00 00       sub    $0x5ac,%esp
                # or
index 2c9082b..116b773 100755 (executable)
@@ -148,6 +148,7 @@ cat << EOF
 #define __IGNORE_sysfs
 #define __IGNORE_uselib
 #define __IGNORE__sysctl
+#define __IGNORE_arch_prctl
 
 /* ... including the "new" 32-bit uid syscalls */
 #define __IGNORE_lchown32
index 26d208b..cfddddb 100644 (file)
@@ -914,7 +914,7 @@ on_treeview2_button_press_event(GtkWidget * widget,
                        current = menu;
                        display_tree_part();
                        gtk_widget_set_sensitive(back_btn, TRUE);
-               } else if ((col == COL_OPTION)) {
+               } else if (col == COL_OPTION) {
                        toggle_sym_value(menu);
                        gtk_tree_view_expand_row(view, path, TRUE);
                }
index d900f47..213df4d 100644 (file)
@@ -125,17 +125,8 @@ config HAVE_HARDENED_USERCOPY_ALLOCATOR
          validating memory ranges against heap object sizes in
          support of CONFIG_HARDENED_USERCOPY.
 
-config HAVE_ARCH_HARDENED_USERCOPY
-       bool
-       help
-         The architecture supports CONFIG_HARDENED_USERCOPY by
-         calling check_object_size() just before performing the
-         userspace copies in the low level implementation of
-         copy_to_user() and copy_from_user().
-
 config HARDENED_USERCOPY
        bool "Harden memory copies between kernel and userspace"
-       depends on HAVE_ARCH_HARDENED_USERCOPY
        depends on HAVE_HARDENED_USERCOPY_ALLOCATOR
        select BUG
        help
index addf060..9cb4fe4 100644 (file)
@@ -46,7 +46,7 @@ static unsigned long key_gc_flags;
  * immediately unlinked.
  */
 struct key_type key_type_dead = {
-       .name = "dead",
+       .name = ".dead",
 };
 
 /*
index 52c3453..4ad3212 100644 (file)
@@ -273,7 +273,8 @@ error:
  * Create and join an anonymous session keyring or join a named session
  * keyring, creating it if necessary.  A named session keyring must have Search
  * permission for it to be joined.  Session keyrings without this permit will
- * be skipped over.
+ * be skipped over.  It is not permitted for userspace to create or join
+ * keyrings whose names begin with a dot.
  *
  * If successful, the ID of the joined session keyring will be returned.
  */
@@ -290,12 +291,16 @@ long keyctl_join_session_keyring(const char __user *_name)
                        ret = PTR_ERR(name);
                        goto error;
                }
+
+               ret = -EPERM;
+               if (name[0] == '.')
+                       goto error_name;
        }
 
        /* join the session */
        ret = join_session_keyring(name);
+error_name:
        kfree(name);
-
 error:
        return ret;
 }
@@ -1253,8 +1258,8 @@ error:
  * Read or set the default keyring in which request_key() will cache keys and
  * return the old setting.
  *
- * If a process keyring is specified then this will be created if it doesn't
- * yet exist.  The old setting will be returned if successful.
+ * If a thread or process keyring is specified then it will be created if it
+ * doesn't yet exist.  The old setting will be returned if successful.
  */
 long keyctl_set_reqkey_keyring(int reqkey_defl)
 {
@@ -1279,11 +1284,8 @@ long keyctl_set_reqkey_keyring(int reqkey_defl)
 
        case KEY_REQKEY_DEFL_PROCESS_KEYRING:
                ret = install_process_keyring_to_cred(new);
-               if (ret < 0) {
-                       if (ret != -EEXIST)
-                               goto error;
-                       ret = 0;
-               }
+               if (ret < 0)
+                       goto error;
                goto set;
 
        case KEY_REQKEY_DEFL_DEFAULT:
index b6fdd22..9139b18 100644 (file)
@@ -128,13 +128,18 @@ error:
 }
 
 /*
- * Install a fresh thread keyring directly to new credentials.  This keyring is
- * allowed to overrun the quota.
+ * Install a thread keyring to the given credentials struct if it didn't have
+ * one already.  This is allowed to overrun the quota.
+ *
+ * Return: 0 if a thread keyring is now present; -errno on failure.
  */
 int install_thread_keyring_to_cred(struct cred *new)
 {
        struct key *keyring;
 
+       if (new->thread_keyring)
+               return 0;
+
        keyring = keyring_alloc("_tid", new->uid, new->gid, new,
                                KEY_POS_ALL | KEY_USR_VIEW,
                                KEY_ALLOC_QUOTA_OVERRUN,
@@ -147,7 +152,9 @@ int install_thread_keyring_to_cred(struct cred *new)
 }
 
 /*
- * Install a fresh thread keyring, discarding the old one.
+ * Install a thread keyring to the current task if it didn't have one already.
+ *
+ * Return: 0 if a thread keyring is now present; -errno on failure.
  */
 static int install_thread_keyring(void)
 {
@@ -158,8 +165,6 @@ static int install_thread_keyring(void)
        if (!new)
                return -ENOMEM;
 
-       BUG_ON(new->thread_keyring);
-
        ret = install_thread_keyring_to_cred(new);
        if (ret < 0) {
                abort_creds(new);
@@ -170,17 +175,17 @@ static int install_thread_keyring(void)
 }
 
 /*
- * Install a process keyring directly to a credentials struct.
+ * Install a process keyring to the given credentials struct if it didn't have
+ * one already.  This is allowed to overrun the quota.
  *
- * Returns -EEXIST if there was already a process keyring, 0 if one installed,
- * and other value on any other error
+ * Return: 0 if a process keyring is now present; -errno on failure.
  */
 int install_process_keyring_to_cred(struct cred *new)
 {
        struct key *keyring;
 
        if (new->process_keyring)
-               return -EEXIST;
+               return 0;
 
        keyring = keyring_alloc("_pid", new->uid, new->gid, new,
                                KEY_POS_ALL | KEY_USR_VIEW,
@@ -194,11 +199,9 @@ int install_process_keyring_to_cred(struct cred *new)
 }
 
 /*
- * Make sure a process keyring is installed for the current process.  The
- * existing process keyring is not replaced.
+ * Install a process keyring to the current task if it didn't have one already.
  *
- * Returns 0 if there is a process keyring by the end of this function, some
- * error otherwise.
+ * Return: 0 if a process keyring is now present; -errno on failure.
  */
 static int install_process_keyring(void)
 {
@@ -212,14 +215,18 @@ static int install_process_keyring(void)
        ret = install_process_keyring_to_cred(new);
        if (ret < 0) {
                abort_creds(new);
-               return ret != -EEXIST ? ret : 0;
+               return ret;
        }
 
        return commit_creds(new);
 }
 
 /*
- * Install a session keyring directly to a credentials struct.
+ * Install the given keyring as the session keyring of the given credentials
+ * struct, replacing the existing one if any.  If the given keyring is NULL,
+ * then install a new anonymous session keyring.
+ *
+ * Return: 0 on success; -errno on failure.
  */
 int install_session_keyring_to_cred(struct cred *cred, struct key *keyring)
 {
@@ -254,8 +261,11 @@ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring)
 }
 
 /*
- * Install a session keyring, discarding the old one.  If a keyring is not
- * supplied, an empty one is invented.
+ * Install the given keyring as the session keyring of the current task,
+ * replacing the existing one if any.  If the given keyring is NULL, then
+ * install a new anonymous session keyring.
+ *
+ * Return: 0 on success; -errno on failure.
  */
 static int install_session_keyring(struct key *keyring)
 {
index 9752771..6c02ac4 100644 (file)
@@ -608,7 +608,7 @@ static int tomoyo_check_unix_address(struct sockaddr *addr,
 static bool tomoyo_kernel_service(void)
 {
        /* Nothing to do if I am a kernel service. */
-       return segment_eq(get_fs(), KERNEL_DS);
+       return uaccess_kernel();
 }
 
 /**
index 3b693e9..12ba833 100644 (file)
 /* wait until all locks are released */
 void snd_use_lock_sync_helper(snd_use_lock_t *lockp, const char *file, int line)
 {
-       int max_count = 5 * HZ;
+       int warn_count = 5 * HZ;
 
        if (atomic_read(lockp) < 0) {
                pr_warn("ALSA: seq_lock: lock trouble [counter = %d] in %s:%d\n", atomic_read(lockp), file, line);
                return;
        }
        while (atomic_read(lockp) > 0) {
-               if (max_count == 0) {
-                       pr_warn("ALSA: seq_lock: timeout [%d left] in %s:%d\n", atomic_read(lockp), file, line);
-                       break;
-               }
+               if (warn_count-- == 0)
+                       pr_warn("ALSA: seq_lock: waiting [%d left] in %s:%d\n", atomic_read(lockp), file, line);
                schedule_timeout_uninterruptible(1);
-               max_count--;
        }
 }
 
index f676931..c3768cd 100644 (file)
@@ -45,7 +45,7 @@ struct snd_fw_async_midi_port {
 
        struct snd_rawmidi_substream *substream;
        snd_fw_async_midi_port_fill fill;
-       unsigned int consume_bytes;
+       int consume_bytes;
 };
 
 int snd_fw_async_midi_port_init(struct snd_fw_async_midi_port *port,
index 74d7fb6..413ab63 100644 (file)
@@ -227,11 +227,11 @@ static void do_registration(struct work_struct *work)
        if (err < 0)
                goto error;
 
-       err = detect_quirks(oxfw);
+       err = snd_oxfw_stream_discover(oxfw);
        if (err < 0)
                goto error;
 
-       err = snd_oxfw_stream_discover(oxfw);
+       err = detect_quirks(oxfw);
        if (err < 0)
                goto error;
 
index 19d41da..7efa7bd 100644 (file)
@@ -2,11 +2,11 @@
  * to be included from codec driver
  */
 
-#if IS_ENABLED(CONFIG_LEDS_DELL_NETBOOKS)
+#if IS_ENABLED(CONFIG_DELL_LAPTOP)
 #include <linux/dell-led.h>
 
 static int dell_led_value;
-static int (*dell_led_set_func)(int, int);
+static int (*dell_micmute_led_set_func)(int);
 static void (*dell_old_cap_hook)(struct hda_codec *,
                                 struct snd_kcontrol *,
                                 struct snd_ctl_elem_value *);
@@ -18,7 +18,7 @@ static void update_dell_wmi_micmute_led(struct hda_codec *codec,
        if (dell_old_cap_hook)
                dell_old_cap_hook(codec, kcontrol, ucontrol);
 
-       if (!ucontrol || !dell_led_set_func)
+       if (!ucontrol || !dell_micmute_led_set_func)
                return;
        if (strcmp("Capture Switch", ucontrol->id.name) == 0 && ucontrol->id.index == 0) {
                /* TODO: How do I verify if it's a mono or stereo here? */
@@ -26,8 +26,8 @@ static void update_dell_wmi_micmute_led(struct hda_codec *codec,
                if (val == dell_led_value)
                        return;
                dell_led_value = val;
-               if (dell_led_set_func)
-                       dell_led_set_func(DELL_LED_MICMUTE, dell_led_value);
+               if (dell_micmute_led_set_func)
+                       dell_micmute_led_set_func(dell_led_value);
        }
 }
 
@@ -39,15 +39,15 @@ static void alc_fixup_dell_wmi(struct hda_codec *codec,
        bool removefunc = false;
 
        if (action == HDA_FIXUP_ACT_PROBE) {
-               if (!dell_led_set_func)
-                       dell_led_set_func = symbol_request(dell_app_wmi_led_set);
-               if (!dell_led_set_func) {
-                       codec_warn(codec, "Failed to find dell wmi symbol dell_app_wmi_led_set\n");
+               if (!dell_micmute_led_set_func)
+                       dell_micmute_led_set_func = symbol_request(dell_micmute_led_set);
+               if (!dell_micmute_led_set_func) {
+                       codec_warn(codec, "Failed to find dell wmi symbol dell_micmute_led_set\n");
                        return;
                }
 
                removefunc = true;
-               if (dell_led_set_func(DELL_LED_MICMUTE, false) >= 0) {
+               if (dell_micmute_led_set_func(false) >= 0) {
                        dell_led_value = 0;
                        if (spec->gen.num_adc_nids > 1 && !spec->gen.dyn_adc_switch)
                                codec_dbg(codec, "Skipping micmute LED control due to several ADCs");
@@ -60,17 +60,17 @@ static void alc_fixup_dell_wmi(struct hda_codec *codec,
 
        }
 
-       if (dell_led_set_func && (action == HDA_FIXUP_ACT_FREE || removefunc)) {
-               symbol_put(dell_app_wmi_led_set);
-               dell_led_set_func = NULL;
+       if (dell_micmute_led_set_func && (action == HDA_FIXUP_ACT_FREE || removefunc)) {
+               symbol_put(dell_micmute_led_set);
+               dell_micmute_led_set_func = NULL;
                dell_old_cap_hook = NULL;
        }
 }
 
-#else /* CONFIG_LEDS_DELL_NETBOOKS */
+#else /* CONFIG_DELL_LAPTOP */
 static void alc_fixup_dell_wmi(struct hda_codec *codec,
                               const struct hda_fixup *fix, int action)
 {
 }
 
-#endif /* CONFIG_LEDS_DELL_NETBOOKS */
+#endif /* CONFIG_DELL_LAPTOP */
index 5c7219f..9e2a340 100644 (file)
@@ -621,7 +621,7 @@ static struct snd_soc_dai_link byt_rt5640_dais[] = {
                .codec_dai_name = "snd-soc-dummy-dai",
                .codec_name = "snd-soc-dummy",
                .platform_name = "sst-mfld-platform",
-               .ignore_suspend = 1,
+               .nonatomic = true,
                .dynamic = 1,
                .dpcm_playback = 1,
                .dpcm_capture = 1,
@@ -634,7 +634,6 @@ static struct snd_soc_dai_link byt_rt5640_dais[] = {
                .codec_dai_name = "snd-soc-dummy-dai",
                .codec_name = "snd-soc-dummy",
                .platform_name = "sst-mfld-platform",
-               .ignore_suspend = 1,
                .nonatomic = true,
                .dynamic = 1,
                .dpcm_playback = 1,
@@ -661,6 +660,7 @@ static struct snd_soc_dai_link byt_rt5640_dais[] = {
                                                | SND_SOC_DAIFMT_CBS_CFS,
                .be_hw_params_fixup = byt_rt5640_codec_fixup,
                .ignore_suspend = 1,
+               .nonatomic = true,
                .dpcm_playback = 1,
                .dpcm_capture = 1,
                .init = byt_rt5640_init,
index 3186f01..8164bec 100644 (file)
@@ -235,7 +235,6 @@ static struct snd_soc_dai_link byt_rt5651_dais[] = {
                .codec_dai_name = "snd-soc-dummy-dai",
                .codec_name = "snd-soc-dummy",
                .platform_name = "sst-mfld-platform",
-               .ignore_suspend = 1,
                .nonatomic = true,
                .dynamic = 1,
                .dpcm_playback = 1,
@@ -249,7 +248,6 @@ static struct snd_soc_dai_link byt_rt5651_dais[] = {
                .codec_dai_name = "snd-soc-dummy-dai",
                .codec_name = "snd-soc-dummy",
                .platform_name = "sst-mfld-platform",
-               .ignore_suspend = 1,
                .nonatomic = true,
                .dynamic = 1,
                .dpcm_playback = 1,
index 3e9b1c0..058bc99 100644 (file)
@@ -933,6 +933,7 @@ static int soc_tplg_denum_create_texts(struct soc_enum *se,
                }
        }
 
+       se->texts = (const char * const *)se->dobj.control.dtexts;
        return 0;
 
 err:
index d487dd2..cfcb0ea 100644 (file)
@@ -1299,6 +1299,7 @@ struct uniperif {
        int ver; /* IP version, used by register access macros */
        struct regmap_field *clk_sel;
        struct regmap_field *valid_sel;
+       spinlock_t irq_lock; /* used to prevent race condition with IRQ */
 
        /* capabilities */
        const struct snd_pcm_hardware *hw;
index 60ae31a..d7e8dd4 100644 (file)
@@ -65,10 +65,13 @@ static irqreturn_t uni_player_irq_handler(int irq, void *dev_id)
        unsigned int status;
        unsigned int tmp;
 
-       if (player->state == UNIPERIF_STATE_STOPPED) {
-               /* Unexpected IRQ: do nothing */
-               return IRQ_NONE;
-       }
+       spin_lock(&player->irq_lock);
+       if (!player->substream)
+               goto irq_spin_unlock;
+
+       snd_pcm_stream_lock(player->substream);
+       if (player->state == UNIPERIF_STATE_STOPPED)
+               goto stream_unlock;
 
        /* Get interrupt status & clear them immediately */
        status = GET_UNIPERIF_ITS(player);
@@ -88,9 +91,7 @@ static irqreturn_t uni_player_irq_handler(int irq, void *dev_id)
                        SET_UNIPERIF_ITM_BCLR_FIFO_ERROR(player);
 
                        /* Stop the player */
-                       snd_pcm_stream_lock(player->substream);
                        snd_pcm_stop(player->substream, SNDRV_PCM_STATE_XRUN);
-                       snd_pcm_stream_unlock(player->substream);
                }
 
                ret = IRQ_HANDLED;
@@ -104,9 +105,7 @@ static irqreturn_t uni_player_irq_handler(int irq, void *dev_id)
                SET_UNIPERIF_ITM_BCLR_DMA_ERROR(player);
 
                /* Stop the player */
-               snd_pcm_stream_lock(player->substream);
                snd_pcm_stop(player->substream, SNDRV_PCM_STATE_XRUN);
-               snd_pcm_stream_unlock(player->substream);
 
                ret = IRQ_HANDLED;
        }
@@ -116,7 +115,8 @@ static irqreturn_t uni_player_irq_handler(int irq, void *dev_id)
                if (!player->underflow_enabled) {
                        dev_err(player->dev,
                                "unexpected Underflow recovering\n");
-                       return -EPERM;
+                       ret = -EPERM;
+                       goto stream_unlock;
                }
                /* Read the underflow recovery duration */
                tmp = GET_UNIPERIF_STATUS_1_UNDERFLOW_DURATION(player);
@@ -138,13 +138,16 @@ static irqreturn_t uni_player_irq_handler(int irq, void *dev_id)
                dev_err(player->dev, "Underflow recovery failed\n");
 
                /* Stop the player */
-               snd_pcm_stream_lock(player->substream);
                snd_pcm_stop(player->substream, SNDRV_PCM_STATE_XRUN);
-               snd_pcm_stream_unlock(player->substream);
 
                ret = IRQ_HANDLED;
        }
 
+stream_unlock:
+       snd_pcm_stream_unlock(player->substream);
+irq_spin_unlock:
+       spin_unlock(&player->irq_lock);
+
        return ret;
 }
 
@@ -588,6 +591,7 @@ static int uni_player_ctl_iec958_put(struct snd_kcontrol *kcontrol,
        struct sti_uniperiph_data *priv = snd_soc_dai_get_drvdata(dai);
        struct uniperif *player = priv->dai_data.uni;
        struct snd_aes_iec958 *iec958 =  &player->stream_settings.iec958;
+       unsigned long flags;
 
        mutex_lock(&player->ctrl_lock);
        iec958->status[0] = ucontrol->value.iec958.status[0];
@@ -596,12 +600,14 @@ static int uni_player_ctl_iec958_put(struct snd_kcontrol *kcontrol,
        iec958->status[3] = ucontrol->value.iec958.status[3];
        mutex_unlock(&player->ctrl_lock);
 
+       spin_lock_irqsave(&player->irq_lock, flags);
        if (player->substream && player->substream->runtime)
                uni_player_set_channel_status(player,
                                              player->substream->runtime);
        else
                uni_player_set_channel_status(player, NULL);
 
+       spin_unlock_irqrestore(&player->irq_lock, flags);
        return 0;
 }
 
@@ -686,9 +692,12 @@ static int uni_player_startup(struct snd_pcm_substream *substream,
 {
        struct sti_uniperiph_data *priv = snd_soc_dai_get_drvdata(dai);
        struct uniperif *player = priv->dai_data.uni;
+       unsigned long flags;
        int ret;
 
+       spin_lock_irqsave(&player->irq_lock, flags);
        player->substream = substream;
+       spin_unlock_irqrestore(&player->irq_lock, flags);
 
        player->clk_adj = 0;
 
@@ -986,12 +995,15 @@ static void uni_player_shutdown(struct snd_pcm_substream *substream,
 {
        struct sti_uniperiph_data *priv = snd_soc_dai_get_drvdata(dai);
        struct uniperif *player = priv->dai_data.uni;
+       unsigned long flags;
 
+       spin_lock_irqsave(&player->irq_lock, flags);
        if (player->state != UNIPERIF_STATE_STOPPED)
                /* Stop the player */
                uni_player_stop(player);
 
        player->substream = NULL;
+       spin_unlock_irqrestore(&player->irq_lock, flags);
 }
 
 static int uni_player_parse_dt_audio_glue(struct platform_device *pdev,
@@ -1096,6 +1108,7 @@ int uni_player_init(struct platform_device *pdev,
        }
 
        mutex_init(&player->ctrl_lock);
+       spin_lock_init(&player->irq_lock);
 
        /* Ensure that disabled by default */
        SET_UNIPERIF_CONFIG_BACK_STALL_REQ_DISABLE(player);
index 93a8df6..ee0055e 100644 (file)
@@ -46,10 +46,15 @@ static irqreturn_t uni_reader_irq_handler(int irq, void *dev_id)
        struct uniperif *reader = dev_id;
        unsigned int status;
 
+       spin_lock(&reader->irq_lock);
+       if (!reader->substream)
+               goto irq_spin_unlock;
+
+       snd_pcm_stream_lock(reader->substream);
        if (reader->state == UNIPERIF_STATE_STOPPED) {
                /* Unexpected IRQ: do nothing */
                dev_warn(reader->dev, "unexpected IRQ\n");
-               return IRQ_HANDLED;
+               goto stream_unlock;
        }
 
        /* Get interrupt status & clear them immediately */
@@ -60,13 +65,16 @@ static irqreturn_t uni_reader_irq_handler(int irq, void *dev_id)
        if (unlikely(status & UNIPERIF_ITS_FIFO_ERROR_MASK(reader))) {
                dev_err(reader->dev, "FIFO error detected\n");
 
-               snd_pcm_stream_lock(reader->substream);
                snd_pcm_stop(reader->substream, SNDRV_PCM_STATE_XRUN);
-               snd_pcm_stream_unlock(reader->substream);
 
-               return IRQ_HANDLED;
+               ret = IRQ_HANDLED;
        }
 
+stream_unlock:
+       snd_pcm_stream_unlock(reader->substream);
+irq_spin_unlock:
+       spin_unlock(&reader->irq_lock);
+
        return ret;
 }
 
@@ -347,9 +355,12 @@ static int uni_reader_startup(struct snd_pcm_substream *substream,
 {
        struct sti_uniperiph_data *priv = snd_soc_dai_get_drvdata(dai);
        struct uniperif *reader = priv->dai_data.uni;
+       unsigned long flags;
        int ret;
 
+       spin_lock_irqsave(&reader->irq_lock, flags);
        reader->substream = substream;
+       spin_unlock_irqrestore(&reader->irq_lock, flags);
 
        if (!UNIPERIF_TYPE_IS_TDM(reader))
                return 0;
@@ -375,12 +386,15 @@ static void uni_reader_shutdown(struct snd_pcm_substream *substream,
 {
        struct sti_uniperiph_data *priv = snd_soc_dai_get_drvdata(dai);
        struct uniperif *reader = priv->dai_data.uni;
+       unsigned long flags;
 
+       spin_lock_irqsave(&reader->irq_lock, flags);
        if (reader->state != UNIPERIF_STATE_STOPPED) {
                /* Stop the reader */
                uni_reader_stop(reader);
        }
        reader->substream = NULL;
+       spin_unlock_irqrestore(&reader->irq_lock, flags);
 }
 
 static const struct snd_soc_dai_ops uni_reader_dai_ops = {
@@ -415,6 +429,8 @@ int uni_reader_init(struct platform_device *pdev,
                return -EBUSY;
        }
 
+       spin_lock_init(&reader->irq_lock);
+
        return 0;
 }
 EXPORT_SYMBOL_GPL(uni_reader_init);
index af05f8e..6ebd3e6 100644 (file)
@@ -181,10 +181,23 @@ struct kvm_arch_memory_slot {
 #define KVM_DEV_ARM_VGIC_GRP_CPU_REGS  2
 #define   KVM_DEV_ARM_VGIC_CPUID_SHIFT 32
 #define   KVM_DEV_ARM_VGIC_CPUID_MASK  (0xffULL << KVM_DEV_ARM_VGIC_CPUID_SHIFT)
+#define   KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT 32
+#define   KVM_DEV_ARM_VGIC_V3_MPIDR_MASK \
+                       (0xffffffffULL << KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT)
 #define   KVM_DEV_ARM_VGIC_OFFSET_SHIFT        0
 #define   KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
+#define   KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK (0xffff)
 #define KVM_DEV_ARM_VGIC_GRP_NR_IRQS   3
 #define KVM_DEV_ARM_VGIC_GRP_CTRL       4
+#define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5
+#define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6
+#define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO  7
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT 10
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \
+                       (0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT)
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff
+#define VGIC_LEVEL_INFO_LINE_LEVEL     0
+
 #define   KVM_DEV_ARM_VGIC_CTRL_INIT    0
 
 /* KVM_IRQ_LINE irq field index values */
index 3051f86..c286035 100644 (file)
@@ -201,10 +201,23 @@ struct kvm_arch_memory_slot {
 #define KVM_DEV_ARM_VGIC_GRP_CPU_REGS  2
 #define   KVM_DEV_ARM_VGIC_CPUID_SHIFT 32
 #define   KVM_DEV_ARM_VGIC_CPUID_MASK  (0xffULL << KVM_DEV_ARM_VGIC_CPUID_SHIFT)
+#define   KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT 32
+#define   KVM_DEV_ARM_VGIC_V3_MPIDR_MASK \
+                       (0xffffffffULL << KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT)
 #define   KVM_DEV_ARM_VGIC_OFFSET_SHIFT        0
 #define   KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
+#define   KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK (0xffff)
 #define KVM_DEV_ARM_VGIC_GRP_NR_IRQS   3
 #define KVM_DEV_ARM_VGIC_GRP_CTRL      4
+#define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5
+#define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6
+#define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO  7
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT 10
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \
+                       (0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT)
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff
+#define VGIC_LEVEL_INFO_LINE_LEVEL     0
+
 #define   KVM_DEV_ARM_VGIC_CTRL_INIT   0
 
 /* Device Control API on vcpu fd */
index 3603b6f..4edbe4b 100644 (file)
@@ -413,6 +413,26 @@ struct kvm_get_htab_header {
        __u16   n_invalid;
 };
 
+/* For KVM_PPC_CONFIGURE_V3_MMU */
+struct kvm_ppc_mmuv3_cfg {
+       __u64   flags;
+       __u64   process_table;  /* second doubleword of partition table entry */
+};
+
+/* Flag values for KVM_PPC_CONFIGURE_V3_MMU */
+#define KVM_PPC_MMUV3_RADIX    1       /* 1 = radix mode, 0 = HPT */
+#define KVM_PPC_MMUV3_GTSE     2       /* global translation shootdown enb. */
+
+/* For KVM_PPC_GET_RMMU_INFO */
+struct kvm_ppc_rmmu_info {
+       struct kvm_ppc_radix_geom {
+               __u8    page_shift;
+               __u8    level_bits[4];
+               __u8    pad[3];
+       }       geometries[8];
+       __u32   ap_encodings[8];
+};
+
 /* Per-vcpu XICS interrupt controller state */
 #define KVM_REG_PPC_ICP_STATE  (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c)
 
@@ -613,5 +633,7 @@ struct kvm_get_htab_header {
 #define  KVM_XICS_LEVEL_SENSITIVE      (1ULL << 40)
 #define  KVM_XICS_MASKED               (1ULL << 41)
 #define  KVM_XICS_PENDING              (1ULL << 42)
+#define  KVM_XICS_PRESENTED            (1ULL << 43)
+#define  KVM_XICS_QUEUED               (1ULL << 44)
 
 #endif /* __LINUX_KVM_POWERPC_H */
index 059e33e..328eece 100644 (file)
@@ -7,6 +7,8 @@
 
 #define LOCK_PREFIX "\n\tlock; "
 
+#include <asm/cmpxchg.h>
+
 /*
  * Atomic operations that C can't guarantee us.  Useful for
  * resource counting etc..
@@ -62,4 +64,9 @@ static inline int atomic_dec_and_test(atomic_t *v)
        GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e");
 }
 
+static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new)
+{
+       return cmpxchg(&v->counter, old, new);
+}
+
 #endif /* _TOOLS_LINUX_ASM_X86_ATOMIC_H */
diff --git a/tools/arch/x86/include/asm/cmpxchg.h b/tools/arch/x86/include/asm/cmpxchg.h
new file mode 100644 (file)
index 0000000..f525326
--- /dev/null
@@ -0,0 +1,89 @@
+#ifndef TOOLS_ASM_X86_CMPXCHG_H
+#define TOOLS_ASM_X86_CMPXCHG_H
+
+#include <linux/compiler.h>
+
+/*
+ * Non-existent functions to indicate usage errors at link time
+ * (or compile-time if the compiler implements __compiletime_error()).
+ */
+extern void __cmpxchg_wrong_size(void)
+       __compiletime_error("Bad argument size for cmpxchg");
+
+/*
+ * Constants for operation sizes. On 32-bit, the 64-bit size is set to
+ * -1 because sizeof will never return -1, thereby making those switch
+ * case statements guaranteed dead code which the compiler will
+ * eliminate, and allowing the "missing symbol in the default case" to
+ * indicate a usage error.
+ */
+#define __X86_CASE_B   1
+#define __X86_CASE_W   2
+#define __X86_CASE_L   4
+#ifdef __x86_64__
+#define __X86_CASE_Q   8
+#else
+#define        __X86_CASE_Q    -1              /* sizeof will never return -1 */
+#endif
+
+/*
+ * Atomic compare and exchange.  Compare OLD with MEM, if identical,
+ * store NEW in MEM.  Return the initial value in MEM.  Success is
+ * indicated by comparing RETURN with OLD.
+ */
+#define __raw_cmpxchg(ptr, old, new, size, lock)                       \
+({                                                                     \
+       __typeof__(*(ptr)) __ret;                                       \
+       __typeof__(*(ptr)) __old = (old);                               \
+       __typeof__(*(ptr)) __new = (new);                               \
+       switch (size) {                                                 \
+       case __X86_CASE_B:                                              \
+       {                                                               \
+               volatile u8 *__ptr = (volatile u8 *)(ptr);              \
+               asm volatile(lock "cmpxchgb %2,%1"                      \
+                            : "=a" (__ret), "+m" (*__ptr)              \
+                            : "q" (__new), "0" (__old)                 \
+                            : "memory");                               \
+               break;                                                  \
+       }                                                               \
+       case __X86_CASE_W:                                              \
+       {                                                               \
+               volatile u16 *__ptr = (volatile u16 *)(ptr);            \
+               asm volatile(lock "cmpxchgw %2,%1"                      \
+                            : "=a" (__ret), "+m" (*__ptr)              \
+                            : "r" (__new), "0" (__old)                 \
+                            : "memory");                               \
+               break;                                                  \
+       }                                                               \
+       case __X86_CASE_L:                                              \
+       {                                                               \
+               volatile u32 *__ptr = (volatile u32 *)(ptr);            \
+               asm volatile(lock "cmpxchgl %2,%1"                      \
+                            : "=a" (__ret), "+m" (*__ptr)              \
+                            : "r" (__new), "0" (__old)                 \
+                            : "memory");                               \
+               break;                                                  \
+       }                                                               \
+       case __X86_CASE_Q:                                              \
+       {                                                               \
+               volatile u64 *__ptr = (volatile u64 *)(ptr);            \
+               asm volatile(lock "cmpxchgq %2,%1"                      \
+                            : "=a" (__ret), "+m" (*__ptr)              \
+                            : "r" (__new), "0" (__old)                 \
+                            : "memory");                               \
+               break;                                                  \
+       }                                                               \
+       default:                                                        \
+               __cmpxchg_wrong_size();                                 \
+       }                                                               \
+       __ret;                                                          \
+})
+
+#define __cmpxchg(ptr, old, new, size)                                 \
+       __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
+
+#define cmpxchg(ptr, old, new)                                         \
+       __cmpxchg(ptr, old, new, sizeof(*(ptr)))
+
+
+#endif /* TOOLS_ASM_X86_CMPXCHG_H */
index 293149a..0fe0044 100644 (file)
 #define X86_FEATURE_XTOPOLOGY  ( 3*32+22) /* cpu topology enum extensions */
 #define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
 #define X86_FEATURE_NONSTOP_TSC        ( 3*32+24) /* TSC does not stop in C states */
-/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */
+#define X86_FEATURE_CPUID      ( 3*32+25) /* CPU has CPUID instruction itself */
 #define X86_FEATURE_EXTD_APICID        ( 3*32+26) /* has extended APICID (8 bits) */
 #define X86_FEATURE_AMD_DCM     ( 3*32+27) /* multi-node processor */
 #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
  *
  * Reuse free bits when adding new feature flags!
  */
-
+#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */
+#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */
 #define X86_FEATURE_CPB                ( 7*32+ 2) /* AMD Core Performance Boost */
 #define X86_FEATURE_EPB                ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
 #define X86_FEATURE_CAT_L3     ( 7*32+ 4) /* Cache Allocation Technology L3 */
 #define X86_FEATURE_PKU                (16*32+ 3) /* Protection Keys for Userspace */
 #define X86_FEATURE_OSPKE      (16*32+ 4) /* OS Protection Keys Enable */
 #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
-#define X86_FEATURE_RDPID      (16*32+ 22) /* RDPID instruction */
+#define X86_FEATURE_LA57       (16*32+16) /* 5-level page tables */
+#define X86_FEATURE_RDPID      (16*32+22) /* RDPID instruction */
 
 /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */
 #define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */
 #define X86_BUG_SWAPGS_FENCE   X86_BUG(11) /* SWAPGS without input dep on GS */
 #define X86_BUG_MONITOR                X86_BUG(12) /* IPI required to wake up remote CPU */
 #define X86_BUG_AMD_E400       X86_BUG(13) /* CPU is among the affected by Erratum 400 */
-
 #endif /* _ASM_X86_CPUFEATURES_H */
index 49e6eba..98dcc11 100644 (file)
@@ -286,7 +286,7 @@ ENDPROC(memcpy_mcsafe_unrolled)
        _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
-       _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
+       _ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
        _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
index e3fb5ec..523911f 100644 (file)
@@ -63,6 +63,7 @@ FEATURE_TESTS_BASIC :=                  \
         lzma                            \
         get_cpuid                       \
         bpf                             \
+        sched_getcpu                   \
         sdt
 
 # FEATURE_TESTS_BASIC + FEATURE_TESTS_EXTRA is the complete list
index b564a2e..e35e4e5 100644 (file)
@@ -48,21 +48,22 @@ FILES=                                          \
          test-get_cpuid.bin                     \
          test-sdt.bin                           \
          test-cxx.bin                           \
-         test-jvmti.bin
+         test-jvmti.bin                                \
+         test-sched_getcpu.bin
 
 FILES := $(addprefix $(OUTPUT),$(FILES))
 
-CC := $(CROSS_COMPILE)gcc -MD
-CXX := $(CROSS_COMPILE)g++ -MD
-PKG_CONFIG := $(CROSS_COMPILE)pkg-config
+CC ?= $(CROSS_COMPILE)gcc
+CXX ?= $(CROSS_COMPILE)g++
+PKG_CONFIG ?= $(CROSS_COMPILE)pkg-config
 LLVM_CONFIG ?= llvm-config
 
 all: $(FILES)
 
-__BUILD = $(CC) $(CFLAGS) -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(LDFLAGS)
+__BUILD = $(CC) $(CFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(LDFLAGS)
   BUILD = $(__BUILD) > $(@:.bin=.make.output) 2>&1
 
-__BUILDXX = $(CXX) $(CXXFLAGS) -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$(@F)) $(LDFLAGS)
+__BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$(@F)) $(LDFLAGS)
   BUILDXX = $(__BUILDXX) > $(@:.bin=.make.output) 2>&1
 
 ###############################
@@ -91,6 +92,9 @@ $(OUTPUT)test-libelf.bin:
 $(OUTPUT)test-glibc.bin:
        $(BUILD)
 
+$(OUTPUT)test-sched_getcpu.bin:
+       $(BUILD)
+
 DWARFLIBS := -ldw
 ifeq ($(findstring -static,${LDFLAGS}),-static)
 DWARFLIBS += -lelf -lebl -lz -llzma -lbz2
@@ -171,7 +175,7 @@ $(OUTPUT)test-libperl.bin:
        $(BUILD) $(FLAGS_PERL_EMBED)
 
 $(OUTPUT)test-libpython.bin:
-       $(BUILD)
+       $(BUILD) $(FLAGS_PYTHON_EMBED)
 
 $(OUTPUT)test-libpython-version.bin:
        $(BUILD)
index 699e436..cc6c7c0 100644 (file)
 # include "test-pthread-attr-setaffinity-np.c"
 #undef main
 
+#define main main_test_sched_getcpu
+# include "test-sched_getcpu.c"
+#undef main
+
 # if 0
 /*
  * Disable libbabeltrace check for test-all, because the requested
@@ -182,6 +186,7 @@ int main(int argc, char *argv[])
        main_test_get_cpuid();
        main_test_bpf();
        main_test_libcrypto();
+       main_test_sched_getcpu();
        main_test_sdt();
 
        return 0;
diff --git a/tools/build/feature/test-sched_getcpu.c b/tools/build/feature/test-sched_getcpu.c
new file mode 100644 (file)
index 0000000..c4a148d
--- /dev/null
@@ -0,0 +1,7 @@
+#define _GNU_SOURCE
+#include <sched.h>
+
+int main(void)
+{
+       return sched_getcpu();
+}
index 2ba78c9..5e9738f 100644 (file)
@@ -60,4 +60,12 @@ static inline int atomic_dec_and_test(atomic_t *v)
        return __sync_sub_and_fetch(&v->counter, 1) == 0;
 }
 
+#define cmpxchg(ptr, oldval, newval) \
+       __sync_val_compare_and_swap(ptr, oldval, newval)
+
+static inline int atomic_cmpxchg(atomic_t *v, int oldval, int newval)
+{
+       return cmpxchg(&(v)->counter, oldval, newval);
+}
+
 #endif /* __TOOLS_ASM_GENERIC_ATOMIC_H */
index 4e3d3d1..9f21fc2 100644 (file)
@@ -3,4 +3,10 @@
 
 #include <asm/atomic.h>
 
+/* atomic_cmpxchg_relaxed */
+#ifndef atomic_cmpxchg_relaxed
+#define  atomic_cmpxchg_relaxed                atomic_cmpxchg
+#define  atomic_cmpxchg_release         atomic_cmpxchg
+#endif /* atomic_cmpxchg_relaxed */
+
 #endif /* __TOOLS_LINUX_ATOMIC_H */
diff --git a/tools/include/linux/bug.h b/tools/include/linux/bug.h
new file mode 100644 (file)
index 0000000..8e4a4f4
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef _TOOLS_PERF_LINUX_BUG_H
+#define _TOOLS_PERF_LINUX_BUG_H
+
+/* Force a compilation error if condition is true, but also produce a
+   result (of value 0 and type size_t), so the expression can be used
+   e.g. in a structure initializer (or where-ever else comma expressions
+   aren't permitted). */
+#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
+
+#endif /* _TOOLS_PERF_LINUX_BUG_H */
index 48af2f1..825d44f 100644 (file)
 #if GCC_VERSION >= 70000 && !defined(__CHECKER__)
 # define __fallthrough __attribute__ ((fallthrough))
 #endif
+
+#if GCC_VERSION >= 40300
+# define __compiletime_error(message) __attribute__((error(message)))
+#endif /* GCC_VERSION >= 40300 */
+
+/* &a[0] degrades to a pointer: a different type from an array */
+#define __must_be_array(a)     BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
index 8de163b..23299d7 100644 (file)
@@ -5,6 +5,10 @@
 #include <linux/compiler-gcc.h>
 #endif
 
+#ifndef __compiletime_error
+# define __compiletime_error(message)
+#endif
+
 /* Optimization barrier */
 /* The "volatile" is due to gcc bugs */
 #define barrier() __asm__ __volatile__("": : :"memory")
 # define __always_inline       inline __attribute__((always_inline))
 #endif
 
+/* Are two types/vars the same type (ignoring qualifiers)? */
+#ifndef __same_type
+# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
+#endif
+
 #ifdef __ANDROID__
 /*
  * FIXME: Big hammer to get rid of tons of:
index 122153b..390d7c9 100644 (file)
                .off   = OFF,                                   \
                .imm   = 0 })
 
+/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */
+
+#define BPF_STX_XADD(SIZE, DST, SRC, OFF)                      \
+       ((struct bpf_insn) {                                    \
+               .code  = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD,   \
+               .dst_reg = DST,                                 \
+               .src_reg = SRC,                                 \
+               .off   = OFF,                                   \
+               .imm   = 0 })
+
 /* Memory store, *(uint *) (dst_reg + off16) = imm32 */
 
 #define BPF_ST_MEM(SIZE, DST, OFF, IMM)                                \
index c65cc0a..251eabf 100644 (file)
 #include <linux/hash.h>
 #include <linux/log2.h>
 
-#ifndef ARRAY_SIZE
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
-
 #define DEFINE_HASHTABLE(name, bits)                                           \
        struct hlist_head name[1 << (bits)] =                                   \
                        { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT }
index 28607db..73ccc48 100644 (file)
@@ -4,6 +4,11 @@
 #include <stdarg.h>
 #include <stddef.h>
 #include <assert.h>
+#include <linux/compiler.h>
+
+#ifndef UINT_MAX
+#define UINT_MAX       (~0U)
+#endif
 
 #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
 
@@ -72,6 +77,8 @@
 int vscnprintf(char *buf, size_t size, const char *fmt, va_list args);
 int scnprintf(char * buf, size_t size, const char * fmt, ...);
 
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
+
 /*
  * This looks more complex than it should be. But we need to
  * get the type for the ~ right in round_down (it needs to be
index d5677d3..0325cef 100644 (file)
@@ -12,6 +12,9 @@
 #ifndef _TOOLS_LINUX_LOG2_H
 #define _TOOLS_LINUX_LOG2_H
 
+#include <linux/bitops.h>
+#include <linux/types.h>
+
 /*
  * non-constant log of base 2 calculators
  * - the arch may override these in asm/bitops.h if they can be implemented
diff --git a/tools/include/linux/refcount.h b/tools/include/linux/refcount.h
new file mode 100644 (file)
index 0000000..a0177c1
--- /dev/null
@@ -0,0 +1,151 @@
+#ifndef _TOOLS_LINUX_REFCOUNT_H
+#define _TOOLS_LINUX_REFCOUNT_H
+
+/*
+ * Variant of atomic_t specialized for reference counts.
+ *
+ * The interface matches the atomic_t interface (to aid in porting) but only
+ * provides the few functions one should use for reference counting.
+ *
+ * It differs in that the counter saturates at UINT_MAX and will not move once
+ * there. This avoids wrapping the counter and causing 'spurious'
+ * use-after-free issues.
+ *
+ * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
+ * and provide only what is strictly required for refcounts.
+ *
+ * The increments are fully relaxed; these will not provide ordering. The
+ * rationale is that whatever is used to obtain the object we're increasing the
+ * reference count on will provide the ordering. For locked data structures,
+ * it's the lock acquire, for RCU/lockless data structures it's the dependent
+ * load.
+ *
+ * Do note that inc_not_zero() provides a control dependency which will order
+ * future stores against the inc, this ensures we'll never modify the object
+ * if we did not in fact acquire a reference.
+ *
+ * The decrements will provide release order, such that all the prior loads and
+ * stores will be issued before, it also provides a control dependency, which
+ * will order us against the subsequent free().
+ *
+ * The control dependency is against the load of the cmpxchg (ll/sc) that
+ * succeeded. This means the stores aren't fully ordered, but this is fine
+ * because the 1->0 transition indicates no concurrency.
+ *
+ * Note that the allocator is responsible for ordering things between free()
+ * and alloc().
+ *
+ */
+
+#include <linux/atomic.h>
+#include <linux/kernel.h>
+
+#ifdef NDEBUG
+#define REFCOUNT_WARN(cond, str) (void)(cond)
+#define __refcount_check
+#else
+#define REFCOUNT_WARN(cond, str) BUG_ON(cond)
+#define __refcount_check       __must_check
+#endif
+
+typedef struct refcount_struct {
+       atomic_t refs;
+} refcount_t;
+
+#define REFCOUNT_INIT(n)       { .refs = ATOMIC_INIT(n), }
+
+static inline void refcount_set(refcount_t *r, unsigned int n)
+{
+       atomic_set(&r->refs, n);
+}
+
+static inline unsigned int refcount_read(const refcount_t *r)
+{
+       return atomic_read(&r->refs);
+}
+
+/*
+ * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller has guaranteed the
+ * object memory to be stable (RCU, etc.). It does provide a control dependency
+ * and thereby orders future stores. See the comment on top.
+ */
+static inline __refcount_check
+bool refcount_inc_not_zero(refcount_t *r)
+{
+       unsigned int old, new, val = atomic_read(&r->refs);
+
+       for (;;) {
+               new = val + 1;
+
+               if (!val)
+                       return false;
+
+               if (unlikely(!new))
+                       return true;
+
+               old = atomic_cmpxchg_relaxed(&r->refs, val, new);
+               if (old == val)
+                       break;
+
+               val = old;
+       }
+
+       REFCOUNT_WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
+
+       return true;
+}
+
+/*
+ * Similar to atomic_inc(), will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller already has a
+ * reference on the object, will WARN when this is not so.
+ */
+static inline void refcount_inc(refcount_t *r)
+{
+       REFCOUNT_WARN(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n");
+}
+
+/*
+ * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
+ * decrement when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ */
+static inline __refcount_check
+bool refcount_sub_and_test(unsigned int i, refcount_t *r)
+{
+       unsigned int old, new, val = atomic_read(&r->refs);
+
+       for (;;) {
+               if (unlikely(val == UINT_MAX))
+                       return false;
+
+               new = val - i;
+               if (new > val) {
+                       REFCOUNT_WARN(new > val, "refcount_t: underflow; use-after-free.\n");
+                       return false;
+               }
+
+               old = atomic_cmpxchg_release(&r->refs, val, new);
+               if (old == val)
+                       break;
+
+               val = old;
+       }
+
+       return !new;
+}
+
+static inline __refcount_check
+bool refcount_dec_and_test(refcount_t *r)
+{
+       return refcount_sub_and_test(1, r);
+}
+
+
+#endif /* _TOOLS_LINUX_REFCOUNT_H */
index c24b3e3..77a28a2 100644 (file)
@@ -7,6 +7,7 @@
 
 #define __SANE_USERSPACE_TYPES__       /* For PPC64, to get LL64 types */
 #include <asm/types.h>
+#include <asm/posix_types.h>
 
 struct page;
 struct kmem_cache;
diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h
new file mode 100644 (file)
index 0000000..813afd6
--- /dev/null
@@ -0,0 +1,72 @@
+#ifndef _UAPI_LINUX_FCNTL_H
+#define _UAPI_LINUX_FCNTL_H
+
+#include <asm/fcntl.h>
+
+#define F_SETLEASE     (F_LINUX_SPECIFIC_BASE + 0)
+#define F_GETLEASE     (F_LINUX_SPECIFIC_BASE + 1)
+
+/*
+ * Cancel a blocking posix lock; internal use only until we expose an
+ * asynchronous lock api to userspace:
+ */
+#define F_CANCELLK     (F_LINUX_SPECIFIC_BASE + 5)
+
+/* Create a file descriptor with FD_CLOEXEC set. */
+#define F_DUPFD_CLOEXEC        (F_LINUX_SPECIFIC_BASE + 6)
+
+/*
+ * Request notifications on a directory.
+ * See below for events that may be notified.
+ */
+#define F_NOTIFY       (F_LINUX_SPECIFIC_BASE+2)
+
+/*
+ * Set and get of pipe page size array
+ */
+#define F_SETPIPE_SZ   (F_LINUX_SPECIFIC_BASE + 7)
+#define F_GETPIPE_SZ   (F_LINUX_SPECIFIC_BASE + 8)
+
+/*
+ * Set/Get seals
+ */
+#define F_ADD_SEALS    (F_LINUX_SPECIFIC_BASE + 9)
+#define F_GET_SEALS    (F_LINUX_SPECIFIC_BASE + 10)
+
+/*
+ * Types of seals
+ */
+#define F_SEAL_SEAL    0x0001  /* prevent further seals from being set */
+#define F_SEAL_SHRINK  0x0002  /* prevent file from shrinking */
+#define F_SEAL_GROW    0x0004  /* prevent file from growing */
+#define F_SEAL_WRITE   0x0008  /* prevent writes */
+/* (1U << 31) is reserved for signed error codes */
+
+/*
+ * Types of directory notifications that may be requested.
+ */
+#define DN_ACCESS      0x00000001      /* File accessed */
+#define DN_MODIFY      0x00000002      /* File modified */
+#define DN_CREATE      0x00000004      /* File created */
+#define DN_DELETE      0x00000008      /* File removed */
+#define DN_RENAME      0x00000010      /* File renamed */
+#define DN_ATTRIB      0x00000020      /* File changed attributes */
+#define DN_MULTISHOT   0x80000000      /* Don't remove notifier */
+
+#define AT_FDCWD               -100    /* Special value used to indicate
+                                           openat should use the current
+                                           working directory. */
+#define AT_SYMLINK_NOFOLLOW    0x100   /* Do not follow symbolic links.  */
+#define AT_REMOVEDIR           0x200   /* Remove directory instead of
+                                           unlinking file.  */
+#define AT_SYMLINK_FOLLOW      0x400   /* Follow symbolic links.  */
+#define AT_NO_AUTOMOUNT                0x800   /* Suppress terminal automount traversal */
+#define AT_EMPTY_PATH          0x1000  /* Allow empty relative pathname */
+
+#define AT_STATX_SYNC_TYPE     0x6000  /* Type of synchronisation required from statx() */
+#define AT_STATX_SYNC_AS_STAT  0x0000  /* - Do whatever stat() does */
+#define AT_STATX_FORCE_SYNC    0x2000  /* - Force the attributes to be sync'd with the server */
+#define AT_STATX_DONT_SYNC     0x4000  /* - Don't sync attributes with the server */
+
+
+#endif /* _UAPI_LINUX_FCNTL_H */
index c66a485..d09a9cd 100644 (file)
@@ -344,7 +344,8 @@ struct perf_event_attr {
                                use_clockid    :  1, /* use @clockid for time fields */
                                context_switch :  1, /* context switch data */
                                write_backward :  1, /* Write ring buffer from end to beginning */
-                               __reserved_1   : 36;
+                               namespaces     :  1, /* include namespaces data */
+                               __reserved_1   : 35;
 
        union {
                __u32           wakeup_events;    /* wakeup every n events */
@@ -610,6 +611,23 @@ struct perf_event_header {
        __u16   size;
 };
 
+struct perf_ns_link_info {
+       __u64   dev;
+       __u64   ino;
+};
+
+enum {
+       NET_NS_INDEX            = 0,
+       UTS_NS_INDEX            = 1,
+       IPC_NS_INDEX            = 2,
+       PID_NS_INDEX            = 3,
+       USER_NS_INDEX           = 4,
+       MNT_NS_INDEX            = 5,
+       CGROUP_NS_INDEX         = 6,
+
+       NR_NAMESPACES,          /* number of available namespaces */
+};
+
 enum perf_event_type {
 
        /*
@@ -862,6 +880,18 @@ enum perf_event_type {
         */
        PERF_RECORD_SWITCH_CPU_WIDE             = 15,
 
+       /*
+        * struct {
+        *      struct perf_event_header        header;
+        *      u32                             pid;
+        *      u32                             tid;
+        *      u64                             nr_namespaces;
+        *      { u64                           dev, inode; } [nr_namespaces];
+        *      struct sample_id                sample_id;
+        * };
+        */
+       PERF_RECORD_NAMESPACES                  = 16,
+
        PERF_RECORD_MAX,                        /* non-ABI */
 };
 
@@ -885,6 +915,7 @@ enum perf_callchain_context {
  */
 #define PERF_AUX_FLAG_TRUNCATED                0x01    /* record was truncated to fit */
 #define PERF_AUX_FLAG_OVERWRITE                0x02    /* snapshot from overwrite mode */
+#define PERF_AUX_FLAG_PARTIAL          0x04    /* record contains gaps */
 
 #define PERF_FLAG_FD_NO_GROUP          (1UL << 0)
 #define PERF_FLAG_FD_OUTPUT            (1UL << 1)
diff --git a/tools/include/uapi/linux/stat.h b/tools/include/uapi/linux/stat.h
new file mode 100644 (file)
index 0000000..d538897
--- /dev/null
@@ -0,0 +1,177 @@
+#ifndef _UAPI_LINUX_STAT_H
+#define _UAPI_LINUX_STAT_H
+
+#include <linux/types.h>
+
+#if defined(__KERNEL__) || !defined(__GLIBC__) || (__GLIBC__ < 2)
+
+#define S_IFMT  00170000
+#define S_IFSOCK 0140000
+#define S_IFLNK         0120000
+#define S_IFREG  0100000
+#define S_IFBLK  0060000
+#define S_IFDIR  0040000
+#define S_IFCHR  0020000
+#define S_IFIFO  0010000
+#define S_ISUID  0004000
+#define S_ISGID  0002000
+#define S_ISVTX  0001000
+
+#define S_ISLNK(m)     (((m) & S_IFMT) == S_IFLNK)
+#define S_ISREG(m)     (((m) & S_IFMT) == S_IFREG)
+#define S_ISDIR(m)     (((m) & S_IFMT) == S_IFDIR)
+#define S_ISCHR(m)     (((m) & S_IFMT) == S_IFCHR)
+#define S_ISBLK(m)     (((m) & S_IFMT) == S_IFBLK)
+#define S_ISFIFO(m)    (((m) & S_IFMT) == S_IFIFO)
+#define S_ISSOCK(m)    (((m) & S_IFMT) == S_IFSOCK)
+
+#define S_IRWXU 00700
+#define S_IRUSR 00400
+#define S_IWUSR 00200
+#define S_IXUSR 00100
+
+#define S_IRWXG 00070
+#define S_IRGRP 00040
+#define S_IWGRP 00020
+#define S_IXGRP 00010
+
+#define S_IRWXO 00007
+#define S_IROTH 00004
+#define S_IWOTH 00002
+#define S_IXOTH 00001
+
+#endif
+
+/*
+ * Timestamp structure for the timestamps in struct statx.
+ *
+ * tv_sec holds the number of seconds before (negative) or after (positive)
+ * 00:00:00 1st January 1970 UTC.
+ *
+ * tv_nsec holds a number of nanoseconds before (0..-999,999,999 if tv_sec is
+ * negative) or after (0..999,999,999 if tv_sec is positive) the tv_sec time.
+ *
+ * Note that if both tv_sec and tv_nsec are non-zero, then the two values must
+ * either be both positive or both negative.
+ *
+ * __reserved is held in case we need a yet finer resolution.
+ */
+struct statx_timestamp {
+       __s64   tv_sec;
+       __s32   tv_nsec;
+       __s32   __reserved;
+};
+
+/*
+ * Structures for the extended file attribute retrieval system call
+ * (statx()).
+ *
+ * The caller passes a mask of what they're specifically interested in as a
+ * parameter to statx().  What statx() actually got will be indicated in
+ * st_mask upon return.
+ *
+ * For each bit in the mask argument:
+ *
+ * - if the datum is not supported:
+ *
+ *   - the bit will be cleared, and
+ *
+ *   - the datum will be set to an appropriate fabricated value if one is
+ *     available (eg. CIFS can take a default uid and gid), otherwise
+ *
+ *   - the field will be cleared;
+ *
+ * - otherwise, if explicitly requested:
+ *
+ *   - the datum will be synchronised to the server if AT_STATX_FORCE_SYNC is
+ *     set or if the datum is considered out of date, and
+ *
+ *   - the field will be filled in and the bit will be set;
+ *
+ * - otherwise, if not requested, but available in approximate form without any
+ *   effort, it will be filled in anyway, and the bit will be set upon return
+ *   (it might not be up to date, however, and no attempt will be made to
+ *   synchronise the internal state first);
+ *
+ * - otherwise the field and the bit will be cleared before returning.
+ *
+ * Items in STATX_BASIC_STATS may be marked unavailable on return, but they
+ * will have values installed for compatibility purposes so that stat() and
+ * co. can be emulated in userspace.
+ */
+struct statx {
+       /* 0x00 */
+       __u32   stx_mask;       /* What results were written [uncond] */
+       __u32   stx_blksize;    /* Preferred general I/O size [uncond] */
+       __u64   stx_attributes; /* Flags conveying information about the file [uncond] */
+       /* 0x10 */
+       __u32   stx_nlink;      /* Number of hard links */
+       __u32   stx_uid;        /* User ID of owner */
+       __u32   stx_gid;        /* Group ID of owner */
+       __u16   stx_mode;       /* File mode */
+       __u16   __spare0[1];
+       /* 0x20 */
+       __u64   stx_ino;        /* Inode number */
+       __u64   stx_size;       /* File size */
+       __u64   stx_blocks;     /* Number of 512-byte blocks allocated */
+       __u64   stx_attributes_mask; /* Mask to show what's supported in stx_attributes */
+       /* 0x40 */
+       struct statx_timestamp  stx_atime;      /* Last access time */
+       struct statx_timestamp  stx_btime;      /* File creation time */
+       struct statx_timestamp  stx_ctime;      /* Last attribute change time */
+       struct statx_timestamp  stx_mtime;      /* Last data modification time */
+       /* 0x80 */
+       __u32   stx_rdev_major; /* Device ID of special file [if bdev/cdev] */
+       __u32   stx_rdev_minor;
+       __u32   stx_dev_major;  /* ID of device containing file [uncond] */
+       __u32   stx_dev_minor;
+       /* 0x90 */
+       __u64   __spare2[14];   /* Spare space for future expansion */
+       /* 0x100 */
+};
+
+/*
+ * Flags to be set in stx_mask
+ *
+ * Query request/result mask for statx() and struct statx::stx_mask.
+ *
+ * These bits should be set in the mask argument of statx() to request
+ * particular items when calling statx().
+ */
+#define STATX_TYPE             0x00000001U     /* Want/got stx_mode & S_IFMT */
+#define STATX_MODE             0x00000002U     /* Want/got stx_mode & ~S_IFMT */
+#define STATX_NLINK            0x00000004U     /* Want/got stx_nlink */
+#define STATX_UID              0x00000008U     /* Want/got stx_uid */
+#define STATX_GID              0x00000010U     /* Want/got stx_gid */
+#define STATX_ATIME            0x00000020U     /* Want/got stx_atime */
+#define STATX_MTIME            0x00000040U     /* Want/got stx_mtime */
+#define STATX_CTIME            0x00000080U     /* Want/got stx_ctime */
+#define STATX_INO              0x00000100U     /* Want/got stx_ino */
+#define STATX_SIZE             0x00000200U     /* Want/got stx_size */
+#define STATX_BLOCKS           0x00000400U     /* Want/got stx_blocks */
+#define STATX_BASIC_STATS      0x000007ffU     /* The stuff in the normal stat struct */
+#define STATX_BTIME            0x00000800U     /* Want/got stx_btime */
+#define STATX_ALL              0x00000fffU     /* All currently supported flags */
+#define STATX__RESERVED                0x80000000U     /* Reserved for future struct statx expansion */
+
+/*
+ * Attributes to be found in stx_attributes and masked in stx_attributes_mask.
+ *
+ * These give information about the features or the state of a file that might
+ * be of use to ordinary userspace programs such as GUIs or ls rather than
+ * specialised tools.
+ *
+ * Note that the flags marked [I] correspond to generic FS_IOC_FLAGS
+ * semantically.  Where possible, the numerical value is picked to correspond
+ * also.
+ */
+#define STATX_ATTR_COMPRESSED          0x00000004 /* [I] File is compressed by the fs */
+#define STATX_ATTR_IMMUTABLE           0x00000010 /* [I] File is marked immutable */
+#define STATX_ATTR_APPEND              0x00000020 /* [I] File is append-only */
+#define STATX_ATTR_NODUMP              0x00000040 /* [I] File is not to be dumped */
+#define STATX_ATTR_ENCRYPTED           0x00000800 /* [I] File requires key to decrypt in fs */
+
+#define STATX_ATTR_AUTOMOUNT           0x00001000 /* Dir: Automount trigger */
+
+
+#endif /* _UAPI_LINUX_STAT_H */
index 5d19fdf..897cd6f 100644 (file)
@@ -3339,7 +3339,7 @@ int main(int argc, char *argv[])
         * simple, single region.
         */
        boot->e820_entries = 1;
-       boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });
+       boot->e820_table[0] = ((struct e820_entry) { 0, mem, E820_TYPE_RAM });
        /*
         * The boot header contains a command line pointer: we put the command
         * line after the boot header.
index 4b6bfc4..809c772 100644 (file)
@@ -439,6 +439,35 @@ int sysfs__read_str(const char *entry, char **buf, size_t *sizep)
        return filename__read_str(path, buf, sizep);
 }
 
+int sysfs__read_bool(const char *entry, bool *value)
+{
+       char *buf;
+       size_t size;
+       int ret;
+
+       ret = sysfs__read_str(entry, &buf, &size);
+       if (ret < 0)
+               return ret;
+
+       switch (buf[0]) {
+       case '1':
+       case 'y':
+       case 'Y':
+               *value = true;
+               break;
+       case '0':
+       case 'n':
+       case 'N':
+               *value = false;
+               break;
+       default:
+               ret = -1;
+       }
+
+       free(buf);
+
+       return ret;
+}
 int sysctl__read_int(const char *sysctl, int *value)
 {
        char path[PATH_MAX];
index 6b332dc..956c211 100644 (file)
@@ -37,4 +37,5 @@ int sysctl__read_int(const char *sysctl, int *value);
 int sysfs__read_int(const char *entry, int *value);
 int sysfs__read_ull(const char *entry, unsigned long long *value);
 int sysfs__read_str(const char *entry, char **buf, size_t *sizep);
+int sysfs__read_bool(const char *entry, bool *value);
 #endif /* __API_FS__ */
index e145a02..9bd4223 100644 (file)
@@ -2,6 +2,7 @@
 #define __SUBCMD_HELP_H
 
 #include <sys/types.h>
+#include <stdio.h>
 
 struct cmdnames {
        size_t alloc;
index 5e43107..d270ac0 100644 (file)
@@ -1,3 +1,4 @@
+#include <ctype.h>
 #include "symbol/kallsyms.h"
 #include <stdio.h>
 #include <stdlib.h>
index 066086d..282a603 100644 (file)
@@ -36,8 +36,7 @@
 #include "warn.h"
 
 #include <linux/hashtable.h>
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#include <linux/kernel.h>
 
 #define STATE_FP_SAVED         0x1
 #define STATE_FP_SETUP         0x2
index 46c326d..ecc5b1b 100644 (file)
 #include <stdlib.h>
 #include <subcmd/exec-cmd.h>
 #include <subcmd/pager.h>
+#include <linux/kernel.h>
 
 #include "builtin.h"
 
-#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
-
 struct cmd_struct {
        const char *name;
        int (*fn)(int, const char **);
index 3db3db9..643cc4b 100644 (file)
@@ -31,3 +31,5 @@ config.mak.autogen
 .config-detected
 util/intel-pt-decoder/inat-tables.c
 arch/*/include/generated/
+pmu-events/pmu-events.c
+pmu-events/jevents
index 9b79f8d..bd8eeb6 100644 (file)
@@ -50,5 +50,6 @@ libperf-y += util/
 libperf-y += arch/
 libperf-y += ui/
 libperf-y += scripts/
+libperf-y += trace/beauty/
 
 gtk-y += ui/gtk/
index 2d96de6..6e6a8b2 100644 (file)
@@ -30,6 +30,24 @@ OPTIONS
 --verbose=::
         Verbosity level.
 
+-p::
+--pid=::
+       Trace on existing process id (comma separated list).
+
+-a::
+--all-cpus::
+       Force system-wide collection.  Scripts run without a <command>
+       normally use -a by default, while scripts run with a <command>
+       normally don't - this option allows the latter to be run in
+       system-wide mode.
+
+-C::
+--cpu=::
+       Only trace for the list of CPUs provided.  Multiple CPUs can
+       be provided as a comma separated list with no space like: 0,1.
+       Ranges of CPUs are specified with -: 0-2.
+       Default is to trace on all online CPUs.
+
 
 SEE ALSO
 --------
index 41857cc..f709de5 100644 (file)
@@ -8,7 +8,7 @@ perf-list - List all symbolic event types
 SYNOPSIS
 --------
 [verse]
-'perf list' [--no-desc] [--long-desc] [hw|sw|cache|tracepoint|pmu|event_glob]
+'perf list' [--no-desc] [--long-desc] [hw|sw|cache|tracepoint|pmu|sdt|event_glob]
 
 DESCRIPTION
 -----------
@@ -24,6 +24,10 @@ Don't print descriptions.
 --long-desc::
 Print longer event descriptions.
 
+--details::
+Print how named events are resolved internally into perf events, and also
+any extra expressions computed by perf stat.
+
 
 [[EVENT_MODIFIERS]]
 EVENT MODIFIERS
@@ -240,6 +244,8 @@ To limit the list use:
 
 . 'pmu' to print the kernel supplied PMU events.
 
+. 'sdt' to list all Statically Defined Tracepoint events.
+
 . If none of the above is matched, it will apply the supplied glob to all
   events, printing the ones that match.
 
index b16003e..ea3789d 100644 (file)
@@ -347,6 +347,9 @@ Enable weightened sampling. An additional weight is recorded per sample and can
 displayed with the weight and local_weight sort keys.  This currently works for TSX
 abort events and some memory events in precise mode on modern Intel CPUs.
 
+--namespaces::
+Record events of type PERF_RECORD_NAMESPACES.
+
 --transaction::
 Record transaction flags for transaction related events.
 
index c04cc06..37a1759 100644 (file)
@@ -72,7 +72,8 @@ OPTIONS
 --sort=::
        Sort histogram entries by given key(s) - multiple keys can be specified
        in CSV format.  Following sort keys are available:
-       pid, comm, dso, symbol, parent, cpu, socket, srcline, weight, local_weight.
+       pid, comm, dso, symbol, parent, cpu, socket, srcline, weight,
+       local_weight, cgroup_id.
 
        Each key has following meaning:
 
@@ -80,6 +81,7 @@ OPTIONS
        - pid: command and tid of the task
        - dso: name of library or module executed at the time of sample
        - symbol: name of function executed at the time of sample
+       - symbol_size: size of function executed at the time of sample
        - parent: name of function matched to the parent regex filter. Unmatched
        entries are displayed as "[other]".
        - cpu: cpu number the task ran at the time of sample
@@ -91,6 +93,7 @@ OPTIONS
        - weight: Event specific weight, e.g. memory latency or transaction
        abort cost. This is the global weight.
        - local_weight: Local weight version of the weight above.
+       - cgroup_id: ID derived from cgroup namespace device and inode numbers.
        - transaction: Transaction abort flags.
        - overhead: Overhead percentage of sample
        - overhead_sys: Overhead percentage of sample running in system mode
@@ -172,6 +175,9 @@ OPTIONS
        By default, every sort keys not specified in -F will be appended
        automatically.
 
+       If the keys start with a prefix '+', then it will append the specified
+        field(s) to the default field order. For example: perf report -F +period,sample.
+
 -p::
 --parent=<regex>::
         A regex filter to identify parent. The parent is a caller of this
@@ -229,6 +235,7 @@ OPTIONS
        sort_key can be:
        - function: compare on functions (default)
        - address: compare on individual code addresses
+       - srcline: compare on source filename and line number
 
        branch can be:
        - branch: include last branch information in callgraph when available.
@@ -424,6 +431,10 @@ include::itrace.txt[]
 --hierarchy::
        Enable hierarchical output.
 
+--inline::
+       If a callgraph address belongs to an inlined function, the inline stack
+       will be printed. Each entry is function name or file/line.
+
 include::callchain-overhead-calculation.txt[]
 
 SEE ALSO
index d33dedd..a092a24 100644 (file)
@@ -132,6 +132,10 @@ OPTIONS for 'perf sched timehist'
 --migrations::
        Show migration events.
 
+-n::
+--next::
+       Show next task.
+
 -I::
 --idle-hist::
        Show idle-related events only.
index 4ed5f23..cb0eda3 100644 (file)
@@ -116,7 +116,7 @@ OPTIONS
 --fields::
         Comma separated list of fields to print. Options are:
         comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
-        srcline, period, iregs, brstack, brstacksym, flags, bpf-output,
+        srcline, period, iregs, brstack, brstacksym, flags, bpf-output, brstackinsn,
         callindent, insn, insnlen. Field list can be prepended with the type, trace, sw or hw,
         to indicate to which event type the field list applies.
         e.g., -F sw:comm,tid,time,ip,sym  and -F trace:time,cpu,trace
@@ -189,15 +189,20 @@ OPTIONS
        i.e., -F "" is not allowed.
 
        The brstack output includes branch related information with raw addresses using the
-       /v/v/v/v/ syntax in the following order:
+       /v/v/v/v/cycles syntax in the following order:
        FROM: branch source instruction
        TO  : branch target instruction
         M/P/-: M=branch target mispredicted or branch direction was mispredicted, P=target predicted or direction predicted, -=not supported
        X/- : X=branch inside a transactional region, -=not in transaction region or not supported
        A/- : A=TSX abort entry, -=not aborted region or not supported
+       cycles
 
        The brstacksym is identical to brstack, except that the FROM and TO addresses are printed in a symbolic form if possible.
 
+       When brstackinsn is specified the full assembler sequences of branch sequences for each sample
+       are printed. This is the full execution path leading to the sample. This is only supported when the
+       sample was recorded with perf record -b or -j any.
+
 -k::
 --vmlinux=<file>::
         vmlinux pathname
@@ -248,6 +253,9 @@ OPTIONS
 --show-mmap-events
        Display mmap related events (e.g. MMAP, MMAP2).
 
+--show-namespace-events
+       Display namespace events i.e. events of type PERF_RECORD_NAMESPACES.
+
 --show-switch-events
        Display context switch events i.e. events of type PERF_RECORD_SWITCH or
        PERF_RECORD_SWITCH_CPU_WIDE.
@@ -299,6 +307,10 @@ include::itrace.txt[]
        stop time is not given (i.e, time string is 'x.y,') then analysis goes
        to end of file.
 
+--max-blocks::
+       Set the maximum number of program blocks to print with brstackinsn for
+       each sample.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-script-perl[1],
index aecf2a8..bd0e441 100644 (file)
@@ -94,8 +94,7 @@ to activate system-wide monitoring. Default is to count on all CPUs.
 
 -A::
 --no-aggr::
-Do not aggregate counts across all monitored CPUs in system-wide mode (-a).
-This option is only valid in system-wide mode.
+Do not aggregate counts across all monitored CPUs.
 
 -n::
 --null::
@@ -237,6 +236,9 @@ To interpret the results it is usually needed to know on which
 CPUs the workload runs on. If needed the CPUs can be forced using
 taskset.
 
+--no-merge::
+Do not merge results from same PMUs.
+
 EXAMPLES
 --------
 
index afd7286..c1e3288 100644 (file)
@@ -123,7 +123,8 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
        major or all pagefaults. Default value is maj.
 
 --syscalls::
-       Trace system calls. This options is enabled by default.
+       Trace system calls. This option is enabled by default, disable with
+       --no-syscalls.
 
 --call-graph [mode,type,min[,limit],order[,key][,branch]]::
         Setup and enable call-graph (stack chain/backtrace) recording.
index b664b18..fa2a913 100644 (file)
@@ -11,8 +11,8 @@ All fields are in native-endian of the machine that generated the perf.data.
 
 When perf is writing to a pipe it uses a special version of the file
 format that does not rely on seeking to adjust data offsets.  This
-format is not described here. The pipe version can be converted to
-normal perf.data with perf inject.
+format is described in "Pipe-mode data" section. The pipe data version can be
+augmented with additional events using perf inject.
 
 The file starts with a perf_header:
 
@@ -411,6 +411,21 @@ An array bound by the perf_file_section size.
 
 ids points to a array of uint64_t defining the ids for event attr attr.
 
+Pipe-mode data
+
+Pipe-mode avoids seeks in the file by removing the perf_file_section and flags
+from the struct perf_header. The trimmed header is:
+
+struct perf_pipe_file_header {
+       u64                             magic;
+       u64                             size;
+};
+
+The information about attrs, data, and event_types is instead in the
+synthesized events PERF_RECORD_ATTR, PERF_RECORD_HEADER_TRACING_DATA and
+PERF_RECORD_HEADER_EVENT_TYPE that are generated by perf record in pipe-mode.
+
+
 References:
 
 include/uapi/linux/perf_event.h
index 8672f83..a29da46 100644 (file)
@@ -12,6 +12,7 @@ tools/arch/sparc/include/asm/barrier_32.h
 tools/arch/sparc/include/asm/barrier_64.h
 tools/arch/tile/include/asm/barrier.h
 tools/arch/x86/include/asm/barrier.h
+tools/arch/x86/include/asm/cmpxchg.h
 tools/arch/x86/include/asm/cpufeatures.h
 tools/arch/x86/include/asm/disabled-features.h
 tools/arch/x86/include/asm/required-features.h
@@ -63,6 +64,7 @@ tools/include/linux/bitops.h
 tools/include/linux/compiler.h
 tools/include/linux/compiler-gcc.h
 tools/include/linux/coresight-pmu.h
+tools/include/linux/bug.h
 tools/include/linux/filter.h
 tools/include/linux/hash.h
 tools/include/linux/kernel.h
@@ -72,12 +74,15 @@ tools/include/uapi/asm-generic/mman-common.h
 tools/include/uapi/asm-generic/mman.h
 tools/include/uapi/linux/bpf.h
 tools/include/uapi/linux/bpf_common.h
+tools/include/uapi/linux/fcntl.h
 tools/include/uapi/linux/hw_breakpoint.h
 tools/include/uapi/linux/mman.h
 tools/include/uapi/linux/perf_event.h
+tools/include/uapi/linux/stat.h
 tools/include/linux/poison.h
 tools/include/linux/rbtree.h
 tools/include/linux/rbtree_augmented.h
+tools/include/linux/refcount.h
 tools/include/linux/string.h
 tools/include/linux/stringify.h
 tools/include/linux/types.h
index 27c9fbc..8354d04 100644 (file)
@@ -170,13 +170,20 @@ PYTHON2_CONFIG := \
 override PYTHON_CONFIG := \
   $(call get-executable-or-default,PYTHON_CONFIG,$(PYTHON2_CONFIG))
 
-PYTHON_CONFIG_SQ := $(call shell-sq,$(PYTHON_CONFIG))
+grep-libs  = $(filter -l%,$(1))
+strip-libs  = $(filter-out -l%,$(1))
 
-PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null)
-PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
+PYTHON_CONFIG_SQ := $(call shell-sq,$(PYTHON_CONFIG))
 
-ifeq ($(CC), clang)
-  PYTHON_EMBED_CCOPTS := $(filter-out -specs=%,$(PYTHON_EMBED_CCOPTS))
+ifdef PYTHON_CONFIG
+  PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null)
+  PYTHON_EMBED_LDFLAGS := $(call strip-libs,$(PYTHON_EMBED_LDOPTS))
+  PYTHON_EMBED_LIBADD := $(call grep-libs,$(PYTHON_EMBED_LDOPTS)) -lutil
+  PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
+  ifeq ($(CC), clang)
+    PYTHON_EMBED_CCOPTS := $(filter-out -specs=%,$(PYTHON_EMBED_CCOPTS))
+  endif
+  FLAGS_PYTHON_EMBED := $(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
 endif
 
 FEATURE_CHECK_CFLAGS-libpython := $(PYTHON_EMBED_CCOPTS)
@@ -267,6 +274,7 @@ ifdef NO_LIBELF
   NO_LIBUNWIND := 1
   NO_LIBDW_DWARF_UNWIND := 1
   NO_LIBBPF := 1
+  NO_JVMTI := 1
 else
   ifeq ($(feature-libelf), 0)
     ifeq ($(feature-glibc), 1)
@@ -276,7 +284,7 @@ else
       LIBC_SUPPORT := 1
     endif
     ifeq ($(LIBC_SUPPORT),1)
-      msg := $(warning No libelf found, disables 'probe' tool and BPF support in 'perf record', please install libelf-dev, libelf-devel or elfutils-libelf-devel);
+      msg := $(warning No libelf found. Disables 'probe' tool, jvmti and BPF support in 'perf record'. Please install libelf-dev, libelf-devel or elfutils-libelf-devel);
 
       NO_LIBELF := 1
       NO_DWARF := 1
@@ -284,6 +292,7 @@ else
       NO_LIBUNWIND := 1
       NO_LIBDW_DWARF_UNWIND := 1
       NO_LIBBPF := 1
+      NO_JVMTI := 1
     else
       ifneq ($(filter s% -static%,$(LDFLAGS),),)
         msg := $(error No static glibc found, please install glibc-static);
@@ -317,6 +326,10 @@ ifdef NO_DWARF
   NO_LIBDW_DWARF_UNWIND := 1
 endif
 
+ifeq ($(feature-sched_getcpu), 1)
+  CFLAGS += -DHAVE_SCHED_GETCPU_SUPPORT
+endif
+
 ifndef NO_LIBELF
   CFLAGS += -DHAVE_LIBELF_SUPPORT
   EXTLIBS += -lelf
@@ -550,8 +563,6 @@ ifndef NO_GTK2
   endif
 endif
 
-grep-libs  = $(filter -l%,$(1))
-strip-libs = $(filter-out -l%,$(1))
 
 ifdef NO_LIBPERL
   CFLAGS += -DNO_LIBPERL
@@ -599,21 +610,9 @@ else
       $(call disable-python,No 'python-config' tool was found: disables Python support - please install python-devel/python-dev)
     else
 
-      PYTHON_CONFIG_SQ := $(call shell-sq,$(PYTHON_CONFIG))
-
-      PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null)
-      PYTHON_EMBED_LDFLAGS := $(call strip-libs,$(PYTHON_EMBED_LDOPTS))
-      PYTHON_EMBED_LIBADD := $(call grep-libs,$(PYTHON_EMBED_LDOPTS)) -lutil
-      PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
-      ifeq ($(CC), clang)
-        PYTHON_EMBED_CCOPTS := $(filter-out -specs=%,$(PYTHON_EMBED_CCOPTS))
-      endif
-      FLAGS_PYTHON_EMBED := $(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
-
       ifneq ($(feature-libpython), 1)
         $(call disable-python,No 'Python.h' (for Python 2.x support) was found: disables Python support - please install python-devel/python-dev)
       else
-
         ifneq ($(feature-libpython-version), 1)
           $(warning Python 3 is not yet supported; please set)
           $(warning PYTHON and/or PYTHON_CONFIG appropriately.)
index dfea6b6..29361d9 100644 (file)
@@ -33,6 +33,7 @@
 #include "../../util/cs-etm.h"
 
 #include <stdlib.h>
+#include <sys/stat.h>
 
 #define ENABLE_SINK_MAX        128
 #define CS_BUS_DEVICE_PATH "/bus/coresight/devices/"
index 33ec5b3..8bb176a 100644 (file)
@@ -9,6 +9,7 @@
  */
 
 #include <stddef.h>
+#include <linux/stringify.h>
 #include <dwarf-regs.h>
 
 struct pt_regs_dwarfnum {
@@ -16,10 +17,9 @@ struct pt_regs_dwarfnum {
        unsigned int dwarfnum;
 };
 
-#define STR(s) #s
 #define REG_DWARFNUM_NAME(r, num) {.name = r, .dwarfnum = num}
 #define GPR_DWARFNUM_NAME(num) \
-       {.name = STR(%r##num), .dwarfnum = num}
+       {.name = __stringify(%r##num), .dwarfnum = num}
 #define REG_DWARFNUM_END {.name = NULL, .dwarfnum = 0}
 
 /*
index b4176c6..bacfa00 100644 (file)
@@ -1,6 +1,7 @@
 #include <elfutils/libdwfl.h>
 #include "../../util/unwind-libdw.h"
 #include "../../util/perf_regs.h"
+#include "../../util/event.h"
 
 bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
 {
index 068b618..cd764a9 100644 (file)
@@ -8,9 +8,12 @@
  * published by the Free Software Foundation.
  */
 
+#include <errno.h>
 #include <stddef.h>
+#include <string.h>
 #include <dwarf-regs.h>
 #include <linux/ptrace.h> /* for struct user_pt_regs */
+#include <linux/stringify.h>
 #include "util.h"
 
 struct pt_regs_dwarfnum {
@@ -20,7 +23,7 @@ struct pt_regs_dwarfnum {
 
 #define REG_DWARFNUM_NAME(r, num) {.name = r, .dwarfnum = num}
 #define GPR_DWARFNUM_NAME(num) \
-       {.name = STR(%x##num), .dwarfnum = num}
+       {.name = __stringify(%x##num), .dwarfnum = num}
 #define REG_DWARFNUM_END {.name = NULL, .dwarfnum = 0}
 #define DWARFNUM2OFFSET(index) \
        (index * sizeof((struct user_pt_regs *)0)->regs[0])
index c116b71..b415dfd 100644 (file)
@@ -1,6 +1,6 @@
+#include <errno.h>
 
 #ifndef REMOTE_UNWIND_LIBUNWIND
-#include <errno.h>
 #include <libunwind.h>
 #include "perf_regs.h"
 #include "../../util/unwind.h"
index 886dd2a..837067f 100644 (file)
@@ -4,6 +4,8 @@
 #include "../util/util.h"
 #include "../util/debug.h"
 
+#include "sane_ctype.h"
+
 const char *const arm_triplets[] = {
        "arm-eabi-",
        "arm-linux-androideabi-",
index 41bdf95..98ac870 100644 (file)
@@ -15,6 +15,7 @@
 #include <dwarf-regs.h>
 #include <linux/ptrace.h>
 #include <linux/kernel.h>
+#include <linux/stringify.h>
 #include "util.h"
 
 struct pt_regs_dwarfnum {
@@ -24,10 +25,10 @@ struct pt_regs_dwarfnum {
 };
 
 #define REG_DWARFNUM_NAME(r, num)                                      \
-               {.name = STR(%)STR(r), .dwarfnum = num,                 \
+               {.name = __stringify(%)__stringify(r), .dwarfnum = num,                 \
                .ptregs_offset = offsetof(struct pt_regs, r)}
 #define GPR_DWARFNUM_NAME(num)                                         \
-               {.name = STR(%gpr##num), .dwarfnum = num,               \
+               {.name = __stringify(%gpr##num), .dwarfnum = num,               \
                .ptregs_offset = offsetof(struct pt_regs, gpr[num])}
 #define REG_DWARFNUM_END {.name = NULL, .dwarfnum = 0, .ptregs_offset = 0}
 
index 74eee30..249723f 100644 (file)
@@ -1,3 +1,4 @@
+#include <errno.h>
 #include "util/kvm-stat.h"
 #include "util/parse-events.h"
 #include "util/debug.h"
index a3c3e1c..f860dc4 100644 (file)
@@ -1,5 +1,11 @@
+#include <errno.h>
+#include <string.h>
+#include <regex.h>
+
 #include "../../perf.h"
+#include "../../util/util.h"
 #include "../../util/perf_regs.h"
+#include "../../util/debug.h"
 
 const struct sample_reg sample_reg_masks[] = {
        SMPL_REG(r0, PERF_REG_POWERPC_R0),
@@ -47,3 +53,109 @@ const struct sample_reg sample_reg_masks[] = {
        SMPL_REG(dsisr, PERF_REG_POWERPC_DSISR),
        SMPL_REG_END
 };
+
+/* REG or %rREG */
+#define SDT_OP_REGEX1  "^(%r)?([1-2]?[0-9]|3[0-1])$"
+
+/* -NUM(REG) or NUM(REG) or -NUM(%rREG) or NUM(%rREG) */
+#define SDT_OP_REGEX2  "^(\\-)?([0-9]+)\\((%r)?([1-2]?[0-9]|3[0-1])\\)$"
+
+static regex_t sdt_op_regex1, sdt_op_regex2;
+
+static int sdt_init_op_regex(void)
+{
+       static int initialized;
+       int ret = 0;
+
+       if (initialized)
+               return 0;
+
+       ret = regcomp(&sdt_op_regex1, SDT_OP_REGEX1, REG_EXTENDED);
+       if (ret)
+               goto error;
+
+       ret = regcomp(&sdt_op_regex2, SDT_OP_REGEX2, REG_EXTENDED);
+       if (ret)
+               goto free_regex1;
+
+       initialized = 1;
+       return 0;
+
+free_regex1:
+       regfree(&sdt_op_regex1);
+error:
+       pr_debug4("Regex compilation error.\n");
+       return ret;
+}
+
+/*
+ * Parse OP and convert it into uprobe format, which is, +/-NUM(%gprREG).
+ * Possible variants of OP are:
+ *     Format          Example
+ *     -------------------------
+ *     NUM(REG)        48(18)
+ *     -NUM(REG)       -48(18)
+ *     NUM(%rREG)      48(%r18)
+ *     -NUM(%rREG)     -48(%r18)
+ *     REG             18
+ *     %rREG           %r18
+ *     iNUM            i0
+ *     i-NUM           i-1
+ *
+ * SDT marker arguments on Powerpc use the %rREG form with the -mregnames
+ * flag and the REG form with -mno-regnames. Here REG is a general purpose
+ * register number in the range 0 to 31.
+ */
+int arch_sdt_arg_parse_op(char *old_op, char **new_op)
+{
+       int ret, new_len;
+       regmatch_t rm[5];
+       char prefix;
+
+       /* Constant argument. Uprobe does not support it */
+       if (old_op[0] == 'i') {
+               pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
+               return SDT_ARG_SKIP;
+       }
+
+       ret = sdt_init_op_regex();
+       if (ret < 0)
+               return ret;
+
+       if (!regexec(&sdt_op_regex1, old_op, 3, rm, 0)) {
+               /* REG or %rREG --> %gprREG */
+
+               new_len = 5;    /* % g p r NULL */
+               new_len += (int)(rm[2].rm_eo - rm[2].rm_so);
+
+               *new_op = zalloc(new_len);
+               if (!*new_op)
+                       return -ENOMEM;
+
+               scnprintf(*new_op, new_len, "%%gpr%.*s",
+                       (int)(rm[2].rm_eo - rm[2].rm_so), old_op + rm[2].rm_so);
+       } else if (!regexec(&sdt_op_regex2, old_op, 5, rm, 0)) {
+               /*
+                * -NUM(REG) or NUM(REG) or -NUM(%rREG) or NUM(%rREG) -->
+                *      +/-NUM(%gprREG)
+                */
+               prefix = (rm[1].rm_so == -1) ? '+' : '-';
+
+               new_len = 8;    /* +/- ( % g p r ) NULL */
+               new_len += (int)(rm[2].rm_eo - rm[2].rm_so);
+               new_len += (int)(rm[4].rm_eo - rm[4].rm_so);
+
+               *new_op = zalloc(new_len);
+               if (!*new_op)
+                       return -ENOMEM;
+
+               scnprintf(*new_op, new_len, "%c%.*s(%%gpr%.*s)", prefix,
+                       (int)(rm[2].rm_eo - rm[2].rm_so), old_op + rm[2].rm_so,
+                       (int)(rm[4].rm_eo - rm[4].rm_so), old_op + rm[4].rm_so);
+       } else {
+               pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
+               return SDT_ARG_SKIP;
+       }
+
+       return SDT_ARG_VALID;
+}
index 1030a6e..39dbe51 100644 (file)
@@ -10,6 +10,7 @@
 #include "symbol.h"
 #include "map.h"
 #include "probe-event.h"
+#include "probe-file.h"
 
 #ifdef HAVE_LIBELF_SUPPORT
 bool elf__needs_adjust_symbols(GElf_Ehdr ehdr)
@@ -79,13 +80,18 @@ void arch__fix_tev_from_maps(struct perf_probe_event *pev,
         * However, if the user specifies an offset, we fall back to using the
         * GEP since all userspace applications (objdump/readelf) show function
         * disassembly with offsets from the GEP.
-        *
-        * In addition, we shouldn't specify an offset for kretprobes.
         */
-       if (pev->point.offset || (!pev->uprobes && pev->point.retprobe) ||
-           !map || !sym)
+       if (pev->point.offset || !map || !sym)
                return;
 
+       /* For kretprobes, add an offset only if the kernel supports it */
+       if (!pev->uprobes && pev->point.retprobe) {
+#ifdef HAVE_LIBELF_SUPPORT
+               if (!kretprobe_offset_is_supported())
+#endif
+                       return;
+       }
+
        lep_offset = PPC64_LOCAL_ENTRY_OFFSET(sym->arch_sym);
 
        if (map->dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS)
diff --git a/tools/perf/arch/s390/annotate/instructions.c b/tools/perf/arch/s390/annotate/instructions.c
new file mode 100644 (file)
index 0000000..745b4b1
--- /dev/null
@@ -0,0 +1,30 @@
+static struct ins_ops *s390__associate_ins_ops(struct arch *arch, const char *name)
+{
+       struct ins_ops *ops = NULL;
+
+       /* catch all kinds of jumps */
+       if (strchr(name, 'j') ||
+           !strncmp(name, "bct", 3) ||
+           !strncmp(name, "br", 2))
+               ops = &jump_ops;
+       /* override call/returns */
+       if (!strcmp(name, "bras") ||
+           !strcmp(name, "brasl") ||
+           !strcmp(name, "basr"))
+               ops = &call_ops;
+       if (!strcmp(name, "br"))
+               ops = &ret_ops;
+
+       arch__associate_ins_ops(arch, name, ops);
+       return ops;
+}
+
+static int s390__annotate_init(struct arch *arch)
+{
+       if (!arch->initialized) {
+               arch->initialized = true;
+               arch->associate_instruction_ops = s390__associate_ins_ops;
+       }
+
+       return 0;
+}
index ed57df2..d233e2e 100644 (file)
@@ -9,6 +9,7 @@
  * as published by the Free Software Foundation.
  */
 
+#include <errno.h>
 #include "../../util/kvm-stat.h"
 #include <asm/sie.h>
 
index e93ef0b..5aef183 100644 (file)
 329    common  pkey_mprotect           sys_pkey_mprotect
 330    common  pkey_alloc              sys_pkey_alloc
 331    common  pkey_free               sys_pkey_free
+332    common  statx                   sys_statx
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
index 7f064eb..f9713a7 100644 (file)
@@ -6,7 +6,10 @@
 #include "evsel.h"
 #include "arch-tests.h"
 
+#include <signal.h>
 #include <sys/mman.h>
+#include <sys/wait.h>
+#include <errno.h>
 #include <string.h>
 
 static pid_t spawn(void)
index 5c76cc8..e3ae9cf 100644 (file)
@@ -1,3 +1,5 @@
+#include <errno.h>
+#include <inttypes.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <linux/types.h>
index cc1d865..6aa3f2a 100644 (file)
@@ -13,6 +13,7 @@
  *
  */
 
+#include <errno.h>
 #include <stdbool.h>
 
 #include "../../util/header.h"
index 5132775..af2bce7 100644 (file)
@@ -13,6 +13,7 @@
  *
  */
 
+#include <errno.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/bitops.h>
index 90fa228..f630de0 100644 (file)
@@ -13,6 +13,7 @@
  *
  */
 
+#include <errno.h>
 #include <stdbool.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
index b63d4be..bf817be 100644 (file)
@@ -1,3 +1,4 @@
+#include <errno.h>
 #include "../../util/kvm-stat.h"
 #include <asm/svm.h>
 #include <asm/vmx.h>
index c5db14f..f95edeb 100644 (file)
@@ -1,5 +1,11 @@
+#include <errno.h>
+#include <string.h>
+#include <regex.h>
+
 #include "../../perf.h"
+#include "../../util/util.h"
 #include "../../util/perf_regs.h"
+#include "../../util/debug.h"
 
 const struct sample_reg sample_reg_masks[] = {
        SMPL_REG(AX, PERF_REG_X86_AX),
@@ -26,3 +32,224 @@ const struct sample_reg sample_reg_masks[] = {
 #endif
        SMPL_REG_END
 };
+
+struct sdt_name_reg {
+       const char *sdt_name;
+       const char *uprobe_name;
+};
+#define SDT_NAME_REG(n, m) {.sdt_name = "%" #n, .uprobe_name = "%" #m}
+#define SDT_NAME_REG_END {.sdt_name = NULL, .uprobe_name = NULL}
+
+static const struct sdt_name_reg sdt_reg_tbl[] = {
+       SDT_NAME_REG(eax, ax),
+       SDT_NAME_REG(rax, ax),
+       SDT_NAME_REG(al,  ax),
+       SDT_NAME_REG(ah,  ax),
+       SDT_NAME_REG(ebx, bx),
+       SDT_NAME_REG(rbx, bx),
+       SDT_NAME_REG(bl,  bx),
+       SDT_NAME_REG(bh,  bx),
+       SDT_NAME_REG(ecx, cx),
+       SDT_NAME_REG(rcx, cx),
+       SDT_NAME_REG(cl,  cx),
+       SDT_NAME_REG(ch,  cx),
+       SDT_NAME_REG(edx, dx),
+       SDT_NAME_REG(rdx, dx),
+       SDT_NAME_REG(dl,  dx),
+       SDT_NAME_REG(dh,  dx),
+       SDT_NAME_REG(esi, si),
+       SDT_NAME_REG(rsi, si),
+       SDT_NAME_REG(sil, si),
+       SDT_NAME_REG(edi, di),
+       SDT_NAME_REG(rdi, di),
+       SDT_NAME_REG(dil, di),
+       SDT_NAME_REG(ebp, bp),
+       SDT_NAME_REG(rbp, bp),
+       SDT_NAME_REG(bpl, bp),
+       SDT_NAME_REG(rsp, sp),
+       SDT_NAME_REG(esp, sp),
+       SDT_NAME_REG(spl, sp),
+
+       /* rNN registers */
+       SDT_NAME_REG(r8b,  r8),
+       SDT_NAME_REG(r8w,  r8),
+       SDT_NAME_REG(r8d,  r8),
+       SDT_NAME_REG(r9b,  r9),
+       SDT_NAME_REG(r9w,  r9),
+       SDT_NAME_REG(r9d,  r9),
+       SDT_NAME_REG(r10b, r10),
+       SDT_NAME_REG(r10w, r10),
+       SDT_NAME_REG(r10d, r10),
+       SDT_NAME_REG(r11b, r11),
+       SDT_NAME_REG(r11w, r11),
+       SDT_NAME_REG(r11d, r11),
+       SDT_NAME_REG(r12b, r12),
+       SDT_NAME_REG(r12w, r12),
+       SDT_NAME_REG(r12d, r12),
+       SDT_NAME_REG(r13b, r13),
+       SDT_NAME_REG(r13w, r13),
+       SDT_NAME_REG(r13d, r13),
+       SDT_NAME_REG(r14b, r14),
+       SDT_NAME_REG(r14w, r14),
+       SDT_NAME_REG(r14d, r14),
+       SDT_NAME_REG(r15b, r15),
+       SDT_NAME_REG(r15w, r15),
+       SDT_NAME_REG(r15d, r15),
+       SDT_NAME_REG_END,
+};
+
+/*
+ * Perf only supports an OP which is in  +/-NUM(REG)  form.
+ * Here the plus/minus sign, NUM and parentheses are optional;
+ * only REG is mandatory.
+ *
+ * SDT events also support indirect addressing mode with a
+ * symbol as offset, scaled mode and constants in OP, but
+ * perf does not support them yet. Below are a few examples.
+ *
+ * OP with scaled mode:
+ *     (%rax,%rsi,8)
+ *     10(%rax,%rsi,8)
+ *
+ * OP with indirect addressing mode:
+ *     check_action(%rip)
+ *     mp_+52(%rip)
+ *     44+mp_(%rip)
+ *
+ * OP with constant values:
+ *     $0
+ *     $123
+ *     $-1
+ */
+#define SDT_OP_REGEX  "^([+\\-]?)([0-9]*)(\\(?)(%[a-z][a-z0-9]+)(\\)?)$"
+
+static regex_t sdt_op_regex;
+
+static int sdt_init_op_regex(void)
+{
+       static int initialized;
+       int ret = 0;
+
+       if (initialized)
+               return 0;
+
+       ret = regcomp(&sdt_op_regex, SDT_OP_REGEX, REG_EXTENDED);
+       if (ret < 0) {
+               pr_debug4("Regex compilation error.\n");
+               return ret;
+       }
+
+       initialized = 1;
+       return 0;
+}
+
+/*
+ * Max x86 register name length is 5 (ex: %r15d). So, the 6th char
+ * should always contain NUL. This helps to find the register name
+ * length using strlen, instead of maintaining one more variable.
+ */
+#define SDT_REG_NAME_SIZE  6
+
+/*
+ * The uprobe parser does not support all gas register names,
+ * so we have to replace them (ex. for x86_64: %rax -> %ax).
+ * Note: If a register does not require renaming, just copy
+ * it as is; don't leave the output empty.
+ */
+static void sdt_rename_register(char *sdt_reg, int sdt_len, char *uprobe_reg)
+{
+       int i = 0;
+
+       for (i = 0; sdt_reg_tbl[i].sdt_name != NULL; i++) {
+               if (!strncmp(sdt_reg_tbl[i].sdt_name, sdt_reg, sdt_len)) {
+                       strcpy(uprobe_reg, sdt_reg_tbl[i].uprobe_name);
+                       return;
+               }
+       }
+
+       strncpy(uprobe_reg, sdt_reg, sdt_len);
+}
+
+int arch_sdt_arg_parse_op(char *old_op, char **new_op)
+{
+       char new_reg[SDT_REG_NAME_SIZE] = {0};
+       int new_len = 0, ret;
+       /*
+        * rm[0]:  +/-NUM(REG)
+        * rm[1]:  +/-
+        * rm[2]:  NUM
+        * rm[3]:  (
+        * rm[4]:  REG
+        * rm[5]:  )
+        */
+       regmatch_t rm[6];
+       /*
+        * Max prefix length is 2 as it may contain a sign (+/-)
+        * and displacement 0 (both sign and displacement 0 are
+        * optional, so it may be empty). Use one more character
+        * to hold the trailing NUL so that strlen can be used to find
+        * the prefix length, instead of maintaining one more variable.
+        */
+       char prefix[3] = {0};
+
+       ret = sdt_init_op_regex();
+       if (ret < 0)
+               return ret;
+
+       /*
+        * If unsupported OR does not match with regex OR
+        * register name too long, skip it.
+        */
+       if (strchr(old_op, ',') || strchr(old_op, '$') ||
+           regexec(&sdt_op_regex, old_op, 6, rm, 0)   ||
+           rm[4].rm_eo - rm[4].rm_so > SDT_REG_NAME_SIZE) {
+               pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
+               return SDT_ARG_SKIP;
+       }
+
+       /*
+        * Prepare prefix.
+        * If SDT OP has parenthesis but does not provide
+        * displacement, add 0 for displacement.
+        *     SDT         Uprobe     Prefix
+        *     -----------------------------
+        *     +24(%rdi)   +24(%di)   +
+        *     24(%rdi)    +24(%di)   +
+        *     %rdi        %di
+        *     (%rdi)      +0(%di)    +0
+        *     -80(%rbx)   -80(%bx)   -
+        */
+       if (rm[3].rm_so != rm[3].rm_eo) {
+               if (rm[1].rm_so != rm[1].rm_eo)
+                       prefix[0] = *(old_op + rm[1].rm_so);
+               else if (rm[2].rm_so != rm[2].rm_eo)
+                       prefix[0] = '+';
+               else
+                       strncpy(prefix, "+0", 2);
+       }
+
+       /* Rename register */
+       sdt_rename_register(old_op + rm[4].rm_so, rm[4].rm_eo - rm[4].rm_so,
+                           new_reg);
+
+       /* Prepare final OP which should be valid for uprobe_events */
+       new_len = strlen(prefix)              +
+                 (rm[2].rm_eo - rm[2].rm_so) +
+                 (rm[3].rm_eo - rm[3].rm_so) +
+                 strlen(new_reg)             +
+                 (rm[5].rm_eo - rm[5].rm_so) +
+                 1;                                    /* NULL */
+
+       *new_op = zalloc(new_len);
+       if (!*new_op)
+               return -ENOMEM;
+
+       scnprintf(*new_op, new_len, "%.*s%.*s%.*s%.*s%.*s",
+                 strlen(prefix), prefix,
+                 (int)(rm[2].rm_eo - rm[2].rm_so), old_op + rm[2].rm_so,
+                 (int)(rm[3].rm_eo - rm[3].rm_so), old_op + rm[3].rm_so,
+                 strlen(new_reg), new_reg,
+                 (int)(rm[5].rm_eo - rm[5].rm_so), old_op + rm[5].rm_so);
+
+       return SDT_ARG_VALID;
+}
index c4b7217..38dc9bb 100644 (file)
@@ -1,6 +1,7 @@
 #include <elfutils/libdwfl.h>
 #include "../../util/unwind-libdw.h"
 #include "../../util/perf_regs.h"
+#include "../../util/event.h"
 
 bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
 {
index 579a592..842ab27 100644 (file)
 # endif
 #endif
 
-int bench_numa(int argc, const char **argv, const char *prefix);
-int bench_sched_messaging(int argc, const char **argv, const char *prefix);
-int bench_sched_pipe(int argc, const char **argv, const char *prefix);
-int bench_mem_memcpy(int argc, const char **argv, const char *prefix);
-int bench_mem_memset(int argc, const char **argv, const char *prefix);
-int bench_futex_hash(int argc, const char **argv, const char *prefix);
-int bench_futex_wake(int argc, const char **argv, const char *prefix);
-int bench_futex_wake_parallel(int argc, const char **argv, const char *prefix);
-int bench_futex_requeue(int argc, const char **argv, const char *prefix);
+int bench_numa(int argc, const char **argv);
+int bench_sched_messaging(int argc, const char **argv);
+int bench_sched_pipe(int argc, const char **argv);
+int bench_mem_memcpy(int argc, const char **argv);
+int bench_mem_memset(int argc, const char **argv);
+int bench_futex_hash(int argc, const char **argv);
+int bench_futex_wake(int argc, const char **argv);
+int bench_futex_wake_parallel(int argc, const char **argv);
+int bench_futex_requeue(int argc, const char **argv);
 /* pi futexes */
-int bench_futex_lock_pi(int argc, const char **argv, const char *prefix);
+int bench_futex_lock_pi(int argc, const char **argv);
 
 #define BENCH_FORMAT_DEFAULT_STR       "default"
 #define BENCH_FORMAT_DEFAULT           0
index da04b8c..fe16b31 100644 (file)
@@ -9,6 +9,7 @@
  */
 
 /* For the CLR_() macros */
+#include <string.h>
 #include <pthread.h>
 
 #include <errno.h>
@@ -113,8 +114,7 @@ static void print_summary(void)
               (int) runtime.tv_sec);
 }
 
-int bench_futex_hash(int argc, const char **argv,
-                    const char *prefix __maybe_unused)
+int bench_futex_hash(int argc, const char **argv)
 {
        int ret = 0;
        cpu_set_t cpu;
index 9187777..73a1c44 100644 (file)
@@ -3,6 +3,7 @@
  */
 
 /* For the CLR_() macros */
+#include <string.h>
 #include <pthread.h>
 
 #include <signal.h>
@@ -139,8 +140,7 @@ static void create_threads(struct worker *w, pthread_attr_t thread_attr)
        }
 }
 
-int bench_futex_lock_pi(int argc, const char **argv,
-                       const char *prefix __maybe_unused)
+int bench_futex_lock_pi(int argc, const char **argv)
 {
        int ret = 0;
        unsigned int i;
index 2b9705a..41786cb 100644 (file)
@@ -9,6 +9,7 @@
  */
 
 /* For the CLR_() macros */
+#include <string.h>
 #include <pthread.h>
 
 #include <signal.h>
@@ -108,8 +109,7 @@ static void toggle_done(int sig __maybe_unused,
        done = true;
 }
 
-int bench_futex_requeue(int argc, const char **argv,
-                       const char *prefix __maybe_unused)
+int bench_futex_requeue(int argc, const char **argv)
 {
        int ret = 0;
        unsigned int i, j;
index 2c8fa67..4ab12c8 100644 (file)
@@ -8,6 +8,7 @@
  */
 
 /* For the CLR_() macros */
+#include <string.h>
 #include <pthread.h>
 
 #include <signal.h>
@@ -196,8 +197,7 @@ static void toggle_done(int sig __maybe_unused,
        done = true;
 }
 
-int bench_futex_wake_parallel(int argc, const char **argv,
-                             const char *prefix __maybe_unused)
+int bench_futex_wake_parallel(int argc, const char **argv)
 {
        int ret = 0;
        unsigned int i, j;
index e246b1b..2fa4922 100644 (file)
@@ -9,6 +9,7 @@
  */
 
 /* For the CLR_() macros */
+#include <string.h>
 #include <pthread.h>
 
 #include <signal.h>
@@ -114,8 +115,7 @@ static void toggle_done(int sig __maybe_unused,
        done = true;
 }
 
-int bench_futex_wake(int argc, const char **argv,
-                    const char *prefix __maybe_unused)
+int bench_futex_wake(int argc, const char **argv)
 {
        int ret = 0;
        unsigned int i, j;
index b2e06d1..e44fd32 100644 (file)
@@ -88,13 +88,11 @@ futex_cmp_requeue(u_int32_t *uaddr, u_int32_t val, u_int32_t *uaddr2, int nr_wak
 
 #ifndef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
 #include <pthread.h>
-static inline int pthread_attr_setaffinity_np(pthread_attr_t *attr,
-                                             size_t cpusetsize,
-                                             cpu_set_t *cpuset)
+#include <linux/compiler.h>
+static inline int pthread_attr_setaffinity_np(pthread_attr_t *attr __maybe_unused,
+                                             size_t cpusetsize __maybe_unused,
+                                             cpu_set_t *cpuset __maybe_unused)
 {
-       attr = attr;
-       cpusetsize = cpusetsize;
-       cpuset = cpuset;
        return 0;
 }
 #endif
index 52504a8..fbd732b 100644 (file)
@@ -12,6 +12,7 @@
 #include <subcmd/parse-options.h>
 #include "../util/header.h"
 #include "../util/cloexec.h"
+#include "../util/string2.h"
 #include "bench.h"
 #include "mem-memcpy-arch.h"
 #include "mem-memset-arch.h"
@@ -284,7 +285,7 @@ static const char * const bench_mem_memcpy_usage[] = {
        NULL
 };
 
-int bench_mem_memcpy(int argc, const char **argv, const char *prefix __maybe_unused)
+int bench_mem_memcpy(int argc, const char **argv)
 {
        struct bench_mem_info info = {
                .functions              = memcpy_functions,
@@ -358,7 +359,7 @@ static const struct function memset_functions[] = {
        { .name = NULL, }
 };
 
-int bench_mem_memset(int argc, const char **argv, const char *prefix __maybe_unused)
+int bench_mem_memset(int argc, const char **argv)
 {
        struct bench_mem_info info = {
                .functions              = memset_functions,
index 3083fc3..27de0c8 100644 (file)
@@ -4,6 +4,7 @@
  * numa: Simulate NUMA-sensitive workload and measure their NUMA performance
  */
 
+#include <inttypes.h>
 /* For the CLR_() macros */
 #include <pthread.h>
 
@@ -30,6 +31,7 @@
 #include <sys/wait.h>
 #include <sys/prctl.h>
 #include <sys/types.h>
+#include <linux/kernel.h>
 #include <linux/time64.h>
 
 #include <numa.h>
@@ -187,7 +189,8 @@ static const struct option options[] = {
        OPT_INCR   ('d', "show_details" , &p0.show_details,     "Show details"),
        OPT_INCR   ('a', "all"          , &p0.run_all,          "Run all tests in the suite"),
        OPT_INTEGER('H', "thp"          , &p0.thp,              "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"),
-       OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"),
+       OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details, "
+                   "convergence is reached when each process (all its threads) is running on a single NUMA node."),
        OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"),
        OPT_BOOLEAN('q', "quiet"        , &p0.show_quiet,       "quiet mode"),
        OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"),
@@ -1766,7 +1769,7 @@ static int bench_all(void)
        return 0;
 }
 
-int bench_numa(int argc, const char **argv, const char *prefix __maybe_unused)
+int bench_numa(int argc, const char **argv)
 {
        init_params(&p0, "main,", argc, argv);
        argc = parse_options(argc, argv, options, bench_numa_usage, 0);
index 6a111e7..4f961e7 100644 (file)
@@ -260,8 +260,7 @@ static const char * const bench_sched_message_usage[] = {
        NULL
 };
 
-int bench_sched_messaging(int argc, const char **argv,
-                   const char *prefix __maybe_unused)
+int bench_sched_messaging(int argc, const char **argv)
 {
        unsigned int i, total_children;
        struct timeval start, stop, diff;
index 2243f01..a152737 100644 (file)
@@ -76,7 +76,7 @@ static void *worker_thread(void *__tdata)
        return NULL;
 }
 
-int bench_sched_pipe(int argc, const char **argv, const char *prefix __maybe_unused)
+int bench_sched_pipe(int argc, const char **argv)
 {
        struct thread_data threads[2], *td;
        int pipe_1[2], pipe_2[2];
index 4f52d85..7a5dc7e 100644 (file)
@@ -33,6 +33,7 @@
 #include "util/block-range.h"
 
 #include <dlfcn.h>
+#include <errno.h>
 #include <linux/bitmap.h>
 
 struct perf_annotate {
@@ -383,7 +384,7 @@ static const char * const annotate_usage[] = {
        NULL
 };
 
-int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_annotate(int argc, const char **argv)
 {
        struct perf_annotate annotate = {
                .tool = {
@@ -393,6 +394,9 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
                        .comm   = perf_event__process_comm,
                        .exit   = perf_event__process_exit,
                        .fork   = perf_event__process_fork,
+                       .namespaces = perf_event__process_namespaces,
+                       .attr   = perf_event__process_attr,
+                       .build_id = perf_event__process_build_id,
                        .ordered_events = true,
                        .ordering_requires_timestamps = true,
                },
index a1cddc6..445e628 100644 (file)
@@ -25,7 +25,7 @@
 #include <string.h>
 #include <sys/prctl.h>
 
-typedef int (*bench_fn_t)(int argc, const char **argv, const char *prefix);
+typedef int (*bench_fn_t)(int argc, const char **argv);
 
 struct bench {
        const char      *name;
@@ -155,7 +155,7 @@ static int bench_str2int(const char *str)
  * to something meaningful:
  */
 static int run_bench(const char *coll_name, const char *bench_name, bench_fn_t fn,
-                    int argc, const char **argv, const char *prefix)
+                    int argc, const char **argv)
 {
        int size;
        char *name;
@@ -171,7 +171,7 @@ static int run_bench(const char *coll_name, const char *bench_name, bench_fn_t f
        prctl(PR_SET_NAME, name);
        argv[0] = name;
 
-       ret = fn(argc, argv, prefix);
+       ret = fn(argc, argv);
 
        free(name);
 
@@ -198,7 +198,7 @@ static void run_collection(struct collection *coll)
                fflush(stdout);
 
                argv[1] = bench->name;
-               run_bench(coll->name, bench->name, bench->fn, 1, argv, NULL);
+               run_bench(coll->name, bench->name, bench->fn, 1, argv);
                printf("\n");
        }
 }
@@ -211,7 +211,7 @@ static void run_all_collections(void)
                run_collection(coll);
 }
 
-int cmd_bench(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_bench(int argc, const char **argv)
 {
        struct collection *coll;
        int ret = 0;
@@ -270,7 +270,7 @@ int cmd_bench(int argc, const char **argv, const char *prefix __maybe_unused)
                        if (bench_format == BENCH_FORMAT_DEFAULT)
                                printf("# Running '%s/%s' benchmark:\n", coll->name, bench->name);
                        fflush(stdout);
-                       ret = run_bench(coll->name, bench->name, bench->fn, argc-1, argv+1, prefix);
+                       ret = run_bench(coll->name, bench->name, bench->fn, argc-1, argv+1);
                        goto end;
                }
 
index 30e2b2c..64b44e8 100644 (file)
@@ -10,6 +10,7 @@
 #include <sys/time.h>
 #include <time.h>
 #include <dirent.h>
+#include <errno.h>
 #include <unistd.h>
 #include "builtin.h"
 #include "perf.h"
@@ -21,6 +22,7 @@
 #include "util/build-id.h"
 #include "util/session.h"
 #include "util/symbol.h"
+#include "util/time-utils.h"
 
 static int build_id_cache__kcore_buildid(const char *proc_dir, char *sbuildid)
 {
@@ -276,8 +278,7 @@ static int build_id_cache__update_file(const char *filename)
        return err;
 }
 
-int cmd_buildid_cache(int argc, const char **argv,
-                     const char *prefix __maybe_unused)
+int cmd_buildid_cache(int argc, const char **argv)
 {
        struct strlist *list;
        struct str_node *pos;
index 5e914ee..fdaca16 100644 (file)
@@ -16,6 +16,7 @@
 #include "util/session.h"
 #include "util/symbol.h"
 #include "util/data.h"
+#include <errno.h>
 
 static int sysfs__fprintf_build_id(FILE *fp)
 {
@@ -87,8 +88,7 @@ out:
        return 0;
 }
 
-int cmd_buildid_list(int argc, const char **argv,
-                    const char *prefix __maybe_unused)
+int cmd_buildid_list(int argc, const char **argv)
 {
        bool show_kernel = false;
        bool with_hits = false;
index e2b2172..e33b4ac 100644 (file)
@@ -9,10 +9,13 @@
  *   Dick Fowles <fowles@inreach.com>
  *   Joe Mario <jmario@redhat.com>
  */
+#include <errno.h>
+#include <inttypes.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
 #include <linux/stringify.h>
 #include <asm/bug.h>
+#include <sys/param.h>
 #include "util.h"
 #include "debug.h"
 #include "builtin.h"
@@ -29,6 +32,7 @@
 #include <asm/bug.h>
 #include "ui/browsers/hists.h"
 #include "evlist.h"
+#include "thread.h"
 
 struct c2c_hists {
        struct hists            hists;
@@ -2334,7 +2338,7 @@ out:
 
 static void perf_c2c_display(struct perf_session *session)
 {
-       if (c2c.use_stdio)
+       if (use_browser == 0)
                perf_c2c__hists_fprintf(stdout, session);
        else
                perf_c2c__hists_browse(&c2c.hists.hists);
@@ -2536,7 +2540,7 @@ static int perf_c2c__report(int argc, const char **argv)
        OPT_BOOLEAN(0, "stdio", &c2c.use_stdio, "Use the stdio interface"),
 #endif
        OPT_BOOLEAN(0, "stats", &c2c.stats_only,
-                   "Use the stdio interface"),
+                   "Display only statistic tables (implies --stdio)"),
        OPT_BOOLEAN(0, "full-symbols", &c2c.symbol_full,
                    "Display full length of symbols"),
        OPT_BOOLEAN(0, "no-source", &no_source,
@@ -2755,12 +2759,12 @@ static int perf_c2c__record(int argc, const char **argv)
                pr_debug("\n");
        }
 
-       ret = cmd_record(i, rec_argv, NULL);
+       ret = cmd_record(i, rec_argv);
        free(rec_argv);
        return ret;
 }
 
-int cmd_c2c(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_c2c(int argc, const char **argv)
 {
        argc = parse_options(argc, argv, c2c_options, c2c_usage,
                             PARSE_OPT_STOP_AT_NON_OPTION);
index 8c0d93b..55f04f8 100644 (file)
@@ -154,7 +154,7 @@ static int parse_config_arg(char *arg, char **var, char **value)
        return 0;
 }
 
-int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_config(int argc, const char **argv)
 {
        int i, ret = 0;
        struct perf_config_set *set;
index 7ad6e17..0adb5f8 100644 (file)
@@ -6,7 +6,7 @@
 #include "data-convert.h"
 #include "data-convert-bt.h"
 
-typedef int (*data_cmd_fn_t)(int argc, const char **argv, const char *prefix);
+typedef int (*data_cmd_fn_t)(int argc, const char **argv);
 
 struct data_cmd {
        const char      *name;
@@ -50,8 +50,7 @@ static const char * const data_convert_usage[] = {
        NULL
 };
 
-static int cmd_data_convert(int argc, const char **argv,
-                           const char *prefix __maybe_unused)
+static int cmd_data_convert(int argc, const char **argv)
 {
        const char *to_ctf     = NULL;
        struct perf_data_convert_opts opts = {
@@ -98,7 +97,7 @@ static struct data_cmd data_cmds[] = {
        { .name = NULL, },
 };
 
-int cmd_data(int argc, const char **argv, const char *prefix)
+int cmd_data(int argc, const char **argv)
 {
        struct data_cmd *cmd;
        const char *cmdstr;
@@ -118,7 +117,7 @@ int cmd_data(int argc, const char **argv, const char *prefix)
                if (strcmp(cmd->name, cmdstr))
                        continue;
 
-               return cmd->fn(argc, argv, prefix);
+               return cmd->fn(argc, argv);
        }
 
        pr_err("Unknown command: %s\n", cmdstr);
index 1b96a31..eec5df8 100644 (file)
@@ -19,6 +19,8 @@
 #include "util/data.h"
 #include "util/config.h"
 
+#include <errno.h>
+#include <inttypes.h>
 #include <stdlib.h>
 #include <math.h>
 
@@ -364,6 +366,7 @@ static struct perf_tool tool = {
        .exit   = perf_event__process_exit,
        .fork   = perf_event__process_fork,
        .lost   = perf_event__process_lost,
+       .namespaces = perf_event__process_namespaces,
        .ordered_events = true,
        .ordering_requires_timestamps = true,
 };
@@ -1320,7 +1323,7 @@ static int diff__config(const char *var, const char *value,
        return 0;
 }
 
-int cmd_diff(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_diff(int argc, const char **argv)
 {
        int ret = hists__init();
 
index e09c428..6d210e4 100644 (file)
@@ -46,7 +46,7 @@ static int __cmd_evlist(const char *file_name, struct perf_attr_details *details
        return 0;
 }
 
-int cmd_evlist(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_evlist(int argc, const char **argv)
 {
        struct perf_attr_details details = { .verbose = false, };
        const struct option options[] = {
index c3e6436..9e0b35c 100644 (file)
@@ -9,13 +9,18 @@
 #include "builtin.h"
 #include "perf.h"
 
+#include <errno.h>
 #include <unistd.h>
 #include <signal.h>
+#include <fcntl.h>
+#include <poll.h>
 
 #include "debug.h"
 #include <subcmd/parse-options.h>
+#include <api/fs/tracing_path.h>
 #include "evlist.h"
 #include "target.h"
+#include "cpumap.h"
 #include "thread_map.h"
 #include "util/config.h"
 
@@ -50,11 +55,12 @@ static void ftrace__workload_exec_failed_signal(int signo __maybe_unused,
        done = true;
 }
 
-static int write_tracing_file(const char *name, const char *val)
+static int __write_tracing_file(const char *name, const char *val, bool append)
 {
        char *file;
        int fd, ret = -1;
        ssize_t size = strlen(val);
+       int flags = O_WRONLY;
 
        file = get_tracing_file(name);
        if (!file) {
@@ -62,7 +68,12 @@ static int write_tracing_file(const char *name, const char *val)
                return -1;
        }
 
-       fd = open(file, O_WRONLY);
+       if (append)
+               flags |= O_APPEND;
+       else
+               flags |= O_TRUNC;
+
+       fd = open(file, flags);
        if (fd < 0) {
                pr_debug("cannot open tracing file: %s\n", name);
                goto out;
@@ -79,6 +90,18 @@ out:
        return ret;
 }
 
+/*
+ * Replace the contents of tracing file @name with @val.
+ *
+ * Calls __write_tracing_file() with append == false, which opens the file
+ * with O_TRUNC, and returns that helper's result.
+ */
+static int write_tracing_file(const char *name, const char *val)
+{
+       return __write_tracing_file(name, val, false);
+}
+
+/*
+ * Add @val to tracing file @name without discarding what is already there.
+ *
+ * Calls __write_tracing_file() with append == true, which opens the file
+ * with O_APPEND, and returns that helper's result.
+ */
+static int append_tracing_file(const char *name, const char *val)
+{
+       return __write_tracing_file(name, val, true);
+}
+
+static int reset_tracing_cpu(void);
+
 static int reset_tracing_files(struct perf_ftrace *ftrace __maybe_unused)
 {
        if (write_tracing_file("tracing_on", "0") < 0)
@@ -90,14 +113,78 @@ static int reset_tracing_files(struct perf_ftrace *ftrace __maybe_unused)
        if (write_tracing_file("set_ftrace_pid", " ") < 0)
                return -1;
 
+       if (reset_tracing_cpu() < 0)
+               return -1;
+
+       return 0;
+}
+
+/*
+ * Restrict tracing to the threads in ftrace->evlist->threads by appending
+ * each id to the "set_ftrace_pid" tracing file.
+ *
+ * Nothing is written (and 0 is returned) when target__has_cpu() reports that
+ * the target selects CPUs.  Returns -1 as soon as one append fails, 0 once
+ * every id has been written.
+ */
+static int set_tracing_pid(struct perf_ftrace *ftrace)
+{
+       int i;
+       char buf[16];
+
+       if (target__has_cpu(&ftrace->target))
+               return 0;
+
+       for (i = 0; i < thread_map__nr(ftrace->evlist->threads); i++) {
+               /* one id per write; append mode keeps the earlier ones */
+               scnprintf(buf, sizeof(buf), "%d",
+                         ftrace->evlist->threads->map[i]);
+               if (append_tracing_file("set_ftrace_pid", buf) < 0)
+                       return -1;
+       }
        return 0;
 }
 
+/*
+ * Write the CPUs in @cpumap to the "tracing_cpumask" tracing file.
+ *
+ * The mask text is produced by cpu_map__snprint_mask() into a temporary
+ * heap buffer sized from the last CPU in the map.
+ *
+ * Returns -1 if the buffer cannot be allocated, otherwise the result of
+ * write_tracing_file().
+ */
+static int set_tracing_cpumask(struct cpu_map *cpumap)
+{
+       char *cpumask;
+       size_t mask_size;
+       int ret;
+       int last_cpu;
+
+       last_cpu = cpu_map__cpu(cpumap, cpumap->nr - 1);
+       /*
+        * CPUs 0..last_cpu take last_cpu / 4 + 1 characters at 4 CPUs per
+        * character, plus one byte for the terminating NUL.  The previous
+        * (last_cpu + 3) / 4 + 1 was one byte short whenever last_cpu was a
+        * multiple of 4, e.g. on a single-CPU system.
+        */
+       mask_size = last_cpu / 4 + 2;
+       mask_size += last_cpu / 32; /* a ',' separates every 32 cpus */
+
+       cpumask = malloc(mask_size);
+       if (cpumask == NULL) {
+               pr_debug("failed to allocate cpu mask\n");
+               return -1;
+       }
+
+       cpu_map__snprint_mask(cpumap, cpumask, mask_size);
+
+       ret = write_tracing_file("tracing_cpumask", cpumask);
+
+       free(cpumask);
+       return ret;
+}
+
+/*
+ * Apply the target's CPU selection to the tracer.
+ *
+ * When target__has_cpu() is false nothing is written and 0 is returned;
+ * otherwise the evlist's cpu map is handed to set_tracing_cpumask() and its
+ * result is returned.
+ */
+static int set_tracing_cpu(struct perf_ftrace *ftrace)
+{
+       struct cpu_map *cpumap = ftrace->evlist->cpus;
+
+       if (!target__has_cpu(&ftrace->target))
+               return 0;
+
+       return set_tracing_cpumask(cpumap);
+}
+
+/*
+ * Rewrite "tracing_cpumask" from the map built by cpu_map__new(NULL).
+ * NOTE(review): presumably cpu_map__new(NULL) yields the online CPUs, which
+ * would make this undo an earlier restriction - confirm.
+ *
+ * Returns -1 if the map cannot be created, otherwise the result of
+ * set_tracing_cpumask().
+ */
+static int reset_tracing_cpu(void)
+{
+       struct cpu_map *cpumap = cpu_map__new(NULL);
+       int ret;
+
+       /* set_tracing_cpumask() dereferences the map, so never pass NULL */
+       if (cpumap == NULL)
+               return -1;
+
+       ret = set_tracing_cpumask(cpumap);
+       cpu_map__put(cpumap);
+       return ret;
+}
+
 static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
 {
        char *trace_file;
        int trace_fd;
-       char *trace_pid;
        char buf[4096];
        struct pollfd pollfd = {
                .events = POLLIN,
@@ -108,42 +195,43 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
                return -1;
        }
 
-       if (argc < 1)
-               return -1;
-
        signal(SIGINT, sig_handler);
        signal(SIGUSR1, sig_handler);
        signal(SIGCHLD, sig_handler);
+       signal(SIGPIPE, sig_handler);
 
-       reset_tracing_files(ftrace);
+       if (reset_tracing_files(ftrace) < 0)
+               goto out;
 
        /* reset ftrace buffer */
        if (write_tracing_file("trace", "0") < 0)
                goto out;
 
-       if (perf_evlist__prepare_workload(ftrace->evlist, &ftrace->target,
-                                         argv, false, ftrace__workload_exec_failed_signal) < 0)
+       if (argc && perf_evlist__prepare_workload(ftrace->evlist,
+                               &ftrace->target, argv, false,
+                               ftrace__workload_exec_failed_signal) < 0) {
                goto out;
+       }
 
-       if (write_tracing_file("current_tracer", ftrace->tracer) < 0) {
-               pr_err("failed to set current_tracer to %s\n", ftrace->tracer);
-               goto out;
+       if (set_tracing_pid(ftrace) < 0) {
+               pr_err("failed to set ftrace pid\n");
+               goto out_reset;
        }
 
-       if (asprintf(&trace_pid, "%d", thread_map__pid(ftrace->evlist->threads, 0)) < 0) {
-               pr_err("failed to allocate pid string\n");
-               goto out;
+       if (set_tracing_cpu(ftrace) < 0) {
+               pr_err("failed to set tracing cpumask\n");
+               goto out_reset;
        }
 
-       if (write_tracing_file("set_ftrace_pid", trace_pid) < 0) {
-               pr_err("failed to set pid: %s\n", trace_pid);
-               goto out_free_pid;
+       if (write_tracing_file("current_tracer", ftrace->tracer) < 0) {
+               pr_err("failed to set current_tracer to %s\n", ftrace->tracer);
+               goto out_reset;
        }
 
        trace_file = get_tracing_file("trace_pipe");
        if (!trace_file) {
                pr_err("failed to open trace_pipe\n");
-               goto out_free_pid;
+               goto out_reset;
        }
 
        trace_fd = open(trace_file, O_RDONLY);
@@ -152,7 +240,7 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
 
        if (trace_fd < 0) {
                pr_err("failed to open trace_pipe\n");
-               goto out_free_pid;
+               goto out_reset;
        }
 
        fcntl(trace_fd, F_SETFL, O_NONBLOCK);
@@ -163,6 +251,8 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
                goto out_close_fd;
        }
 
+       setup_pager();
+
        perf_evlist__start_workload(ftrace->evlist);
 
        while (!done) {
@@ -191,11 +281,9 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
 
 out_close_fd:
        close(trace_fd);
-out_free_pid:
-       free(trace_pid);
-out:
+out_reset:
        reset_tracing_files(ftrace);
-
+out:
        return done ? 0 : -1;
 }
 
@@ -219,7 +307,7 @@ static int perf_ftrace_config(const char *var, const char *value, void *cb)
        return -1;
 }
 
-int cmd_ftrace(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_ftrace(int argc, const char **argv)
 {
        int ret;
        struct perf_ftrace ftrace = {
@@ -227,15 +315,21 @@ int cmd_ftrace(int argc, const char **argv, const char *prefix __maybe_unused)
                .target = { .uid = UINT_MAX, },
        };
        const char * const ftrace_usage[] = {
-               "perf ftrace [<options>] <command>",
+               "perf ftrace [<options>] [<command>]",
                "perf ftrace [<options>] -- <command> [<options>]",
                NULL
        };
        const struct option ftrace_options[] = {
        OPT_STRING('t', "tracer", &ftrace.tracer, "tracer",
                   "tracer to use: function_graph(default) or function"),
+       OPT_STRING('p', "pid", &ftrace.target.pid, "pid",
+                  "trace on existing process id"),
        OPT_INCR('v', "verbose", &verbose,
                 "be more verbose"),
+       OPT_BOOLEAN('a', "all-cpus", &ftrace.target.system_wide,
+                   "system-wide collection from all CPUs"),
+       OPT_STRING('C', "cpu", &ftrace.target.cpu_list, "cpu",
+                   "list of cpus to monitor"),
        OPT_END()
        };
 
@@ -245,9 +339,18 @@ int cmd_ftrace(int argc, const char **argv, const char *prefix __maybe_unused)
 
        argc = parse_options(argc, argv, ftrace_options, ftrace_usage,
                            PARSE_OPT_STOP_AT_NON_OPTION);
-       if (!argc)
+       if (!argc && target__none(&ftrace.target))
                usage_with_options(ftrace_usage, ftrace_options);
 
+       ret = target__validate(&ftrace.target);
+       if (ret) {
+               char errbuf[512];
+
+               target__strerror(&ftrace.target, ret, errbuf, 512);
+               pr_err("%s\n", errbuf);
+               return -EINVAL;
+       }
+
        ftrace.evlist = perf_evlist__new();
        if (ftrace.evlist == NULL)
                return -ENOMEM;
index aed0d84..492f8e1 100644 (file)
 #include <subcmd/run-command.h>
 #include <subcmd/help.h>
 #include "util/debug.h"
+#include <linux/kernel.h>
+#include <errno.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
 
 static struct man_viewer_list {
        struct man_viewer_list *next;
-       char name[FLEX_ARRAY];
+       char name[0];
 } *man_viewer_list;
 
 static struct man_viewer_info_list {
        struct man_viewer_info_list *next;
        const char *info;
-       char name[FLEX_ARRAY];
+       char name[0];
 } *man_viewer_info_list;
 
 enum help_format {
@@ -301,12 +307,6 @@ void list_common_cmds_help(void)
        }
 }
 
-static int is_perf_command(const char *s)
-{
-       return is_in_cmdlist(&main_cmds, s) ||
-               is_in_cmdlist(&other_cmds, s);
-}
-
 static const char *cmd_to_page(const char *perf_cmd)
 {
        char *s;
@@ -418,7 +418,7 @@ static int show_html_page(const char *perf_cmd)
        return 0;
 }
 
-int cmd_help(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_help(int argc, const char **argv)
 {
        bool show_all = false;
        enum help_format help_format = HELP_FORMAT_MAN;
@@ -446,7 +446,6 @@ int cmd_help(int argc, const char **argv, const char *prefix __maybe_unused)
                "perf help [--all] [--man|--web|--info] [command]",
                NULL
        };
-       const char *alias;
        int rc;
 
        load_command_list("perf-", &main_cmds, &other_cmds);
@@ -472,12 +471,6 @@ int cmd_help(int argc, const char **argv, const char *prefix __maybe_unused)
                return 0;
        }
 
-       alias = alias_lookup(argv[0]);
-       if (alias && !is_perf_command(argv[0])) {
-               printf("`perf %s' is aliased to `%s'\n", argv[0], alias);
-               return 0;
-       }
-
        switch (help_format) {
        case HELP_FORMAT_MAN:
                rc = show_man_page(argv[0]);
index b9bc7e3..ea8db38 100644 (file)
 #include "util/data.h"
 #include "util/auxtrace.h"
 #include "util/jit.h"
+#include "util/thread.h"
 
 #include <subcmd/parse-options.h>
 
 #include <linux/list.h>
+#include <errno.h>
+#include <signal.h>
 
 struct perf_inject {
        struct perf_tool        tool;
@@ -333,6 +336,18 @@ static int perf_event__repipe_comm(struct perf_tool *tool,
        return err;
 }
 
+/*
+ * Namespaces event handler for the inject tool: run the regular
+ * perf_event__process_namespaces() handler, then forward the event with
+ * perf_event__repipe().
+ *
+ * The event is forwarded even when processing failed, and the result of the
+ * forwarding step is not checked; the value returned is the one from
+ * perf_event__process_namespaces().
+ */
+static int perf_event__repipe_namespaces(struct perf_tool *tool,
+                                        union perf_event *event,
+                                        struct perf_sample *sample,
+                                        struct machine *machine)
+{
+       int err = perf_event__process_namespaces(tool, event, sample, machine);
+
+       perf_event__repipe(tool, event, sample, machine);
+
+       return err;
+}
+
 static int perf_event__repipe_exit(struct perf_tool *tool,
                                   union perf_event *event,
                                   struct perf_sample *sample,
@@ -660,6 +675,7 @@ static int __cmd_inject(struct perf_inject *inject)
                session->itrace_synth_opts = &inject->itrace_synth_opts;
                inject->itrace_synth_opts.inject = true;
                inject->tool.comm           = perf_event__repipe_comm;
+               inject->tool.namespaces     = perf_event__repipe_namespaces;
                inject->tool.exit           = perf_event__repipe_exit;
                inject->tool.id_index       = perf_event__repipe_id_index;
                inject->tool.auxtrace_info  = perf_event__process_auxtrace_info;
@@ -681,6 +697,8 @@ static int __cmd_inject(struct perf_inject *inject)
                lseek(fd, output_data_offset, SEEK_SET);
 
        ret = perf_session__process_events(session);
+       if (ret)
+               return ret;
 
        if (!file_out->is_pipe) {
                if (inject->build_ids)
@@ -725,7 +743,7 @@ static int __cmd_inject(struct perf_inject *inject)
        return ret;
 }
 
-int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_inject(int argc, const char **argv)
 {
        struct perf_inject inject = {
                .tool = {
index 224bfc4..bcfb363 100644 (file)
@@ -7,6 +7,7 @@
  *
  * Released under the GPL v2. (and only v2, not any later version)
  */
+#include <inttypes.h>
 #include "builtin.h"
 #include <linux/compiler.h>
 #include <subcmd/parse-options.h>
@@ -43,7 +44,7 @@ static int __cmd_kallsyms(int argc, const char **argv)
        return 0;
 }
 
-int cmd_kallsyms(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_kallsyms(int argc, const char **argv)
 {
        const struct option options[] = {
        OPT_INCR('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"),
index 6da8d08..9409c94 100644 (file)
 
 #include "util/debug.h"
 
+#include <linux/kernel.h>
 #include <linux/rbtree.h>
 #include <linux/string.h>
+#include <errno.h>
+#include <inttypes.h>
 #include <locale.h>
 #include <regex.h>
 
+#include "sane_ctype.h"
+
 static int     kmem_slab;
 static int     kmem_page;
 
@@ -964,6 +969,7 @@ static struct perf_tool perf_kmem = {
        .comm            = perf_event__process_comm,
        .mmap            = perf_event__process_mmap,
        .mmap2           = perf_event__process_mmap2,
+       .namespaces      = perf_event__process_namespaces,
        .ordered_events  = true,
 };
 
@@ -1865,7 +1871,7 @@ static int __cmd_record(int argc, const char **argv)
        for (j = 1; j < (unsigned int)argc; j++, i++)
                rec_argv[i] = argv[j];
 
-       return cmd_record(i, rec_argv, NULL);
+       return cmd_record(i, rec_argv);
 }
 
 static int kmem_config(const char *var, const char *value, void *cb __maybe_unused)
@@ -1884,7 +1890,7 @@ static int kmem_config(const char *var, const char *value, void *cb __maybe_unus
        return 0;
 }
 
-int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_kmem(int argc, const char **argv)
 {
        const char * const default_slab_sort = "frag,hit,bytes";
        const char * const default_page_sort = "bytes,hit";
index 08fa88f..f309c37 100644 (file)
@@ -3,6 +3,7 @@
 
 #include "util/evsel.h"
 #include "util/evlist.h"
+#include "util/term.h"
 #include "util/util.h"
 #include "util/cache.h"
 #include "util/symbol.h"
 #ifdef HAVE_TIMERFD_SUPPORT
 #include <sys/timerfd.h>
 #endif
+#include <sys/time.h>
 
+#include <linux/kernel.h>
 #include <linux/time64.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <poll.h>
 #include <termios.h>
 #include <semaphore.h>
+#include <signal.h>
 #include <pthread.h>
 #include <math.h>
 
+/*
+ * Pick the perf.data file name matching the perf_host/perf_guest flags:
+ * host only -> "perf.data.host", guest only -> "perf.data.guest", any other
+ * combination -> "perf.data.kvm".
+ *
+ * The name is produced by strdup(), so it is NULL if the copy fails.
+ * NOTE(review): the caller presumably owns the returned string - confirm
+ * who frees it.
+ */
+static const char *get_filename_for_perf_kvm(void)
+{
+       const char *filename;
+
+       if (perf_host && !perf_guest)
+               filename = strdup("perf.data.host");
+       else if (!perf_host && perf_guest)
+               filename = strdup("perf.data.guest");
+       else
+               filename = strdup("perf.data.kvm");
+
+       return filename;
+}
+
 #ifdef HAVE_KVM_STAT_SUPPORT
 #include "util/kvm-stat.h"
 
@@ -1044,6 +1065,7 @@ static int read_events(struct perf_kvm_stat *kvm)
        struct perf_tool eops = {
                .sample                 = process_sample_event,
                .comm                   = perf_event__process_comm,
+               .namespaces             = perf_event__process_namespaces,
                .ordered_events         = true,
        };
        struct perf_data_file file = {
@@ -1208,7 +1230,7 @@ kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv)
        set_option_flag(record_options, 0, "transaction", PARSE_OPT_DISABLED);
 
        record_usage = kvm_stat_record_usage;
-       return cmd_record(i, rec_argv, NULL);
+       return cmd_record(i, rec_argv);
 }
 
 static int
@@ -1348,6 +1370,7 @@ static int kvm_events_live(struct perf_kvm_stat *kvm,
        kvm->tool.exit   = perf_event__process_exit;
        kvm->tool.fork   = perf_event__process_fork;
        kvm->tool.lost   = process_lost_event;
+       kvm->tool.namespaces  = perf_event__process_namespaces;
        kvm->tool.ordered_events = true;
        perf_tool__fill_defaults(&kvm->tool);
 
@@ -1475,7 +1498,7 @@ static int kvm_cmd_stat(const char *file_name, int argc, const char **argv)
 #endif
 
 perf_stat:
-       return cmd_stat(argc, argv, NULL);
+       return cmd_stat(argc, argv);
 }
 #endif /* HAVE_KVM_STAT_SUPPORT */
 
@@ -1494,7 +1517,7 @@ static int __cmd_record(const char *file_name, int argc, const char **argv)
 
        BUG_ON(i != rec_argc);
 
-       return cmd_record(i, rec_argv, NULL);
+       return cmd_record(i, rec_argv);
 }
 
 static int __cmd_report(const char *file_name, int argc, const char **argv)
@@ -1512,7 +1535,7 @@ static int __cmd_report(const char *file_name, int argc, const char **argv)
 
        BUG_ON(i != rec_argc);
 
-       return cmd_report(i, rec_argv, NULL);
+       return cmd_report(i, rec_argv);
 }
 
 static int
@@ -1531,10 +1554,10 @@ __cmd_buildid_list(const char *file_name, int argc, const char **argv)
 
        BUG_ON(i != rec_argc);
 
-       return cmd_buildid_list(i, rec_argv, NULL);
+       return cmd_buildid_list(i, rec_argv);
 }
 
-int cmd_kvm(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_kvm(int argc, const char **argv)
 {
        const char *file_name = NULL;
        const struct option kvm_options[] = {
@@ -1589,9 +1612,9 @@ int cmd_kvm(int argc, const char **argv, const char *prefix __maybe_unused)
        else if (!strncmp(argv[0], "rep", 3))
                return __cmd_report(file_name, argc, argv);
        else if (!strncmp(argv[0], "diff", 4))
-               return cmd_diff(argc, argv, NULL);
+               return cmd_diff(argc, argv);
        else if (!strncmp(argv[0], "top", 3))
-               return cmd_top(argc, argv, NULL);
+               return cmd_top(argc, argv);
        else if (!strncmp(argv[0], "buildid-list", 12))
                return __cmd_buildid_list(file_name, argc, argv);
 #ifdef HAVE_KVM_STAT_SUPPORT
index 3b9d98b..4bf2cb4 100644 (file)
@@ -18,8 +18,9 @@
 #include <subcmd/parse-options.h>
 
 static bool desc_flag = true;
+static bool details_flag;
 
-int cmd_list(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_list(int argc, const char **argv)
 {
        int i;
        bool raw_dump = false;
@@ -30,6 +31,8 @@ int cmd_list(int argc, const char **argv, const char *prefix __maybe_unused)
                            "Print extra event descriptions. --no-desc to not print."),
                OPT_BOOLEAN('v', "long-desc", &long_desc_flag,
                            "Print longer event descriptions."),
+               OPT_BOOLEAN(0, "details", &details_flag,
+                           "Print information on the perf event names and expressions used internally by events."),
                OPT_INCR(0, "debug", &verbose,
                             "Enable debugging output"),
                OPT_END()
@@ -50,7 +53,8 @@ int cmd_list(int argc, const char **argv, const char *prefix __maybe_unused)
                printf("\nList of pre-defined events (to be used in -e):\n\n");
 
        if (argc == 0) {
-               print_events(NULL, raw_dump, !desc_flag, long_desc_flag);
+               print_events(NULL, raw_dump, !desc_flag, long_desc_flag,
+                               details_flag);
                return 0;
        }
 
@@ -72,7 +76,7 @@ int cmd_list(int argc, const char **argv, const char *prefix __maybe_unused)
                        print_hwcache_events(NULL, raw_dump);
                else if (strcmp(argv[i], "pmu") == 0)
                        print_pmu_events(NULL, raw_dump, !desc_flag,
-                                               long_desc_flag);
+                                               long_desc_flag, details_flag);
                else if (strcmp(argv[i], "sdt") == 0)
                        print_sdt_events(NULL, NULL, raw_dump);
                else if ((sep = strchr(argv[i], ':')) != NULL) {
@@ -80,7 +84,8 @@ int cmd_list(int argc, const char **argv, const char *prefix __maybe_unused)
 
                        if (sep == NULL) {
                                print_events(argv[i], raw_dump, !desc_flag,
-                                                       long_desc_flag);
+                                                       long_desc_flag,
+                                                       details_flag);
                                continue;
                        }
                        sep_idx = sep - argv[i];
@@ -103,7 +108,8 @@ int cmd_list(int argc, const char **argv, const char *prefix __maybe_unused)
                                            event_symbols_sw, PERF_COUNT_SW_MAX, raw_dump);
                        print_hwcache_events(s, raw_dump);
                        print_pmu_events(s, raw_dump, !desc_flag,
-                                               long_desc_flag);
+                                               long_desc_flag,
+                                               details_flag);
                        print_tracepoint_events(NULL, s, raw_dump);
                        print_sdt_events(NULL, s, raw_dump);
                        free(s);
index ce3bfb4..ff98652 100644 (file)
@@ -1,3 +1,5 @@
+#include <errno.h>
+#include <inttypes.h>
 #include "builtin.h"
 #include "perf.h"
 
@@ -26,6 +28,7 @@
 
 #include <linux/list.h>
 #include <linux/hash.h>
+#include <linux/kernel.h>
 
 static struct perf_session *session;
 
@@ -858,6 +861,7 @@ static int __cmd_report(bool display_info)
        struct perf_tool eops = {
                .sample          = process_sample_event,
                .comm            = perf_event__process_comm,
+               .namespaces      = perf_event__process_namespaces,
                .ordered_events  = true,
        };
        struct perf_data_file file = {
@@ -940,34 +944,36 @@ static int __cmd_record(int argc, const char **argv)
 
        BUG_ON(i != rec_argc);
 
-       ret = cmd_record(i, rec_argv, NULL);
+       ret = cmd_record(i, rec_argv);
        free(rec_argv);
        return ret;
 }
 
-int cmd_lock(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_lock(int argc, const char **argv)
 {
-       const struct option info_options[] = {
-       OPT_BOOLEAN('t', "threads", &info_threads,
-                   "dump thread list in perf.data"),
-       OPT_BOOLEAN('m', "map", &info_map,
-                   "map of lock instances (address:name table)"),
-       OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
-       OPT_END()
-       };
        const struct option lock_options[] = {
        OPT_STRING('i', "input", &input_name, "file", "input file name"),
        OPT_INCR('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"),
        OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"),
+       OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
        OPT_END()
        };
+
+       const struct option info_options[] = {
+       OPT_BOOLEAN('t', "threads", &info_threads,
+                   "dump thread list in perf.data"),
+       OPT_BOOLEAN('m', "map", &info_map,
+                   "map of lock instances (address:name table)"),
+       OPT_PARENT(lock_options)
+       };
+
        const struct option report_options[] = {
        OPT_STRING('k', "key", &sort_key, "acquired",
                    "key for sorting (acquired / contended / avg_wait / wait_total / wait_max / wait_min)"),
-       OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
        /* TODO: type */
-       OPT_END()
+       OPT_PARENT(lock_options)
        };
+
        const char * const info_usage[] = {
                "perf lock info [<options>]",
                NULL
@@ -1006,7 +1012,7 @@ int cmd_lock(int argc, const char **argv, const char *prefix __maybe_unused)
                rc = __cmd_report(false);
        } else if (!strcmp(argv[0], "script")) {
                /* Aliased to 'perf script' */
-               return cmd_script(argc, argv, prefix);
+               return cmd_script(argc, argv);
        } else if (!strcmp(argv[0], "info")) {
                if (argc) {
                        argc = parse_options(argc, argv,
index 6114e07..e001c02 100644 (file)
@@ -1,3 +1,7 @@
+#include <inttypes.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
 #include "builtin.h"
 #include "perf.h"
 
@@ -8,6 +12,7 @@
 #include "util/data.h"
 #include "util/mem-events.h"
 #include "util/debug.h"
+#include "util/symbol.h"
 
 #define MEM_OPERATION_LOAD     0x1
 #define MEM_OPERATION_STORE    0x2
@@ -129,7 +134,7 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
                pr_debug("\n");
        }
 
-       ret = cmd_record(i, rec_argv, NULL);
+       ret = cmd_record(i, rec_argv);
        free(rec_argv);
        return ret;
 }
@@ -256,7 +261,7 @@ static int report_events(int argc, const char **argv, struct perf_mem *mem)
        for (j = 1; j < argc; j++, i++)
                rep_argv[i] = argv[j];
 
-       ret = cmd_report(i, rep_argv, NULL);
+       ret = cmd_report(i, rep_argv);
        free(rep_argv);
        return ret;
 }
@@ -330,7 +335,7 @@ error:
        return ret;
 }
 
-int cmd_mem(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_mem(int argc, const char **argv)
 {
        struct stat st;
        struct perf_mem mem = {
@@ -342,6 +347,7 @@ int cmd_mem(int argc, const char **argv, const char *prefix __maybe_unused)
                        .lost           = perf_event__process_lost,
                        .fork           = perf_event__process_fork,
                        .build_id       = perf_event__process_build_id,
+                       .namespaces     = perf_event__process_namespaces,
                        .ordered_events = true,
                },
                .input_name              = "perf.data",
index 1fcebc3..d7360c2 100644 (file)
@@ -442,9 +442,9 @@ static int perf_del_probe_events(struct strfilter *filter)
        }
 
        if (ret == -ENOENT && ret2 == -ENOENT)
-               pr_debug("\"%s\" does not hit any event.\n", str);
-               /* Note that this is silently ignored */
-       ret = 0;
+               pr_warning("\"%s\" does not hit any event.\n", str);
+       else
+               ret = 0;
 
 error:
        if (kfd >= 0)
@@ -468,7 +468,7 @@ out:
 
 
 static int
-__cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
+__cmd_probe(int argc, const char **argv)
 {
        const char * const probe_usage[] = {
                "perf probe [<options>] 'PROBEDEF' ['PROBEDEF' ...]",
@@ -687,13 +687,13 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
        return 0;
 }
 
-int cmd_probe(int argc, const char **argv, const char *prefix)
+int cmd_probe(int argc, const char **argv)
 {
        int ret;
 
        ret = init_params();
        if (!ret) {
-               ret = __cmd_probe(argc, argv, prefix);
+               ret = __cmd_probe(argc, argv);
                cleanup_params();
        }
 
index bc84a37..ee7d0a8 100644 (file)
 #include "util/bpf-loader.h"
 #include "util/trigger.h"
 #include "util/perf-hooks.h"
+#include "util/time-utils.h"
+#include "util/units.h"
 #include "asm/bug.h"
 
+#include <errno.h>
+#include <inttypes.h>
+#include <poll.h>
 #include <unistd.h>
 #include <sched.h>
+#include <signal.h>
 #include <sys/mman.h>
+#include <sys/wait.h>
 #include <asm/bug.h>
 #include <linux/time64.h>
 
@@ -876,6 +883,9 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
        signal(SIGTERM, sig_handler);
        signal(SIGSEGV, sigsegv_handler);
 
+       if (rec->opts.record_namespaces)
+               tool->namespace_events = true;
+
        if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
                signal(SIGUSR2, snapshot_sig_handler);
                if (rec->opts.auxtrace_snapshot_mode)
@@ -983,6 +993,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
         */
        if (forks) {
                union perf_event *event;
+               pid_t tgid;
 
                event = malloc(sizeof(event->comm) + machine->id_hdr_size);
                if (event == NULL) {
@@ -996,10 +1007,30 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
                 * cannot see a correct process name for those events.
                 * Synthesize COMM event to prevent it.
                 */
-               perf_event__synthesize_comm(tool, event,
-                                           rec->evlist->workload.pid,
-                                           process_synthesized_event,
-                                           machine);
+               tgid = perf_event__synthesize_comm(tool, event,
+                                                  rec->evlist->workload.pid,
+                                                  process_synthesized_event,
+                                                  machine);
+               free(event);
+
+               if (tgid == -1)
+                       goto out_child;
+
+               event = malloc(sizeof(event->namespaces) +
+                              (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
+                              machine->id_hdr_size);
+               if (event == NULL) {
+                       err = -ENOMEM;
+                       goto out_child;
+               }
+
+               /*
+                * Synthesize NAMESPACES event for the command specified.
+                */
+               perf_event__synthesize_namespaces(tool, event,
+                                                 rec->evlist->workload.pid,
+                                                 tgid, process_synthesized_event,
+                                                 machine);
                free(event);
 
                perf_evlist__start_workload(rec->evlist);
@@ -1497,6 +1528,7 @@ static struct record record = {
                .fork           = perf_event__process_fork,
                .exit           = perf_event__process_exit,
                .comm           = perf_event__process_comm,
+               .namespaces     = perf_event__process_namespaces,
                .mmap           = perf_event__process_mmap,
                .mmap2          = perf_event__process_mmap2,
                .ordered_events = true,
@@ -1611,6 +1643,8 @@ static struct option __record_options[] = {
                          "opts", "AUX area tracing Snapshot Mode", ""),
        OPT_UINTEGER(0, "proc-map-timeout", &record.opts.proc_map_timeout,
                        "per thread proc mmap processing timeout in ms"),
+       OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
+                   "Record namespaces events"),
        OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
                    "Record context switch events"),
        OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
@@ -1640,7 +1674,7 @@ static struct option __record_options[] = {
 
 struct option *record_options = __record_options;
 
-int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_record(int argc, const char **argv)
 {
        int err;
        struct record *rec = &record;
index 0a88670..22478ff 100644 (file)
@@ -16,7 +16,6 @@
 #include <linux/rbtree.h>
 #include "util/symbol.h"
 #include "util/callchain.h"
-#include "util/strlist.h"
 #include "util/values.h"
 
 #include "perf.h"
 #include "arch/common.h"
 #include "util/time-utils.h"
 #include "util/auxtrace.h"
+#include "util/units.h"
 
 #include <dlfcn.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <regex.h>
+#include <signal.h>
 #include <linux/bitmap.h>
 #include <linux/stringify.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
 
 struct report {
        struct perf_tool        tool;
@@ -394,8 +401,7 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
                fprintf(stdout, "\n\n");
        }
 
-       if (sort_order == NULL &&
-           parent_pattern == default_parent_pattern)
+       if (!quiet)
                fprintf(stdout, "#\n# (%s)\n#\n", help);
 
        if (rep->show_threads) {
@@ -682,7 +688,7 @@ const char report_callchain_help[] = "Display call graph (stack chain/backtrace)
                                     CALLCHAIN_REPORT_HELP
                                     "\n\t\t\t\tDefault: " CALLCHAIN_DEFAULT_OPT;
 
-int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_report(int argc, const char **argv)
 {
        struct perf_session *session;
        struct itrace_synth_opts itrace_synth_opts = { .set = 0, };
@@ -701,6 +707,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                        .mmap            = perf_event__process_mmap,
                        .mmap2           = perf_event__process_mmap2,
                        .comm            = perf_event__process_comm,
+                       .namespaces      = perf_event__process_namespaces,
                        .exit            = perf_event__process_exit,
                        .fork            = perf_event__process_fork,
                        .lost            = perf_event__process_lost,
@@ -845,6 +852,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
                             stdio__config_color, "always"),
        OPT_STRING(0, "time", &report.time_str, "str",
                   "Time span of interest (start,stop)"),
+       OPT_BOOLEAN(0, "inline", &symbol_conf.inline_name,
+                   "Show inline function"),
        OPT_END()
        };
        struct perf_data_file file = {
index b94cf0d..39996c5 100644 (file)
 
 #include "util/debug.h"
 
+#include <linux/kernel.h>
 #include <linux/log2.h>
 #include <sys/prctl.h>
 #include <sys/resource.h>
+#include <inttypes.h>
 
+#include <errno.h>
 #include <semaphore.h>
 #include <pthread.h>
 #include <math.h>
 #include <api/fs/fs.h>
 #include <linux/time64.h>
 
+#include "sane_ctype.h"
+
 #define PR_SET_NAME            15               /* Set process name */
 #define MAX_CPUS               4096
 #define COMM_LEN               20
@@ -221,6 +226,7 @@ struct perf_sched {
        unsigned int    max_stack;
        bool            show_cpu_visual;
        bool            show_wakeups;
+       bool            show_next;
        bool            show_migrations;
        bool            show_state;
        u64             skipped_samples;
@@ -1897,14 +1903,18 @@ static char task_state_char(struct thread *thread, int state)
 }
 
 static void timehist_print_sample(struct perf_sched *sched,
+                                 struct perf_evsel *evsel,
                                  struct perf_sample *sample,
                                  struct addr_location *al,
                                  struct thread *thread,
                                  u64 t, int state)
 {
        struct thread_runtime *tr = thread__priv(thread);
+       const char *next_comm = perf_evsel__strval(evsel, sample, "next_comm");
+       const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
        u32 max_cpus = sched->max_cpu + 1;
        char tstr[64];
+       char nstr[30];
        u64 wait_time;
 
        timestamp__scnprintf_usec(t, tstr, sizeof(tstr));
@@ -1937,7 +1947,12 @@ static void timehist_print_sample(struct perf_sched *sched,
        if (sched->show_state)
                printf(" %5c ", task_state_char(thread, state));
 
-       if (sched->show_wakeups)
+       if (sched->show_next) {
+               snprintf(nstr, sizeof(nstr), "next: %s[%d]", next_comm, next_pid);
+               printf(" %-*s", comm_width, nstr);
+       }
+
+       if (sched->show_wakeups && !sched->show_next)
                printf("  %-*s", comm_width, "");
 
        if (thread->tid == 0)
@@ -2531,7 +2546,7 @@ static int timehist_sched_change_event(struct perf_tool *tool,
        }
 
        if (!sched->summary_only)
-               timehist_print_sample(sched, sample, &al, thread, t, state);
+               timehist_print_sample(sched, evsel, sample, &al, thread, t, state);
 
 out:
        if (sched->hist_time.start == 0 && t >= ptime->start)
@@ -3262,16 +3277,17 @@ static int __cmd_record(int argc, const char **argv)
 
        BUG_ON(i != rec_argc);
 
-       return cmd_record(i, rec_argv, NULL);
+       return cmd_record(i, rec_argv);
 }
 
-int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_sched(int argc, const char **argv)
 {
        const char default_sort_order[] = "avg, max, switch, runtime";
        struct perf_sched sched = {
                .tool = {
                        .sample          = perf_sched__process_tracepoint_sample,
                        .comm            = perf_event__process_comm,
+                       .namespaces      = perf_event__process_namespaces,
                        .lost            = perf_event__process_lost,
                        .fork            = perf_sched__process_fork_event,
                        .ordered_events = true,
@@ -3340,6 +3356,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
        OPT_BOOLEAN('S', "with-summary", &sched.summary,
                    "Show all syscalls and summary with statistics"),
        OPT_BOOLEAN('w', "wakeups", &sched.show_wakeups, "Show wakeup events"),
+       OPT_BOOLEAN('n', "next", &sched.show_next, "Show next task"),
        OPT_BOOLEAN('M', "migrations", &sched.show_migrations, "Show migration events"),
        OPT_BOOLEAN('V', "cpu-visual", &sched.show_cpu_visual, "Add CPU visual"),
        OPT_BOOLEAN('I', "idle-hist", &sched.idle_hist, "Show idle events only"),
@@ -3400,7 +3417,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
         * Aliased to 'perf script' for now:
         */
        if (!strcmp(argv[0], "script"))
-               return cmd_script(argc, argv, prefix);
+               return cmd_script(argc, argv);
 
        if (!strncmp(argv[0], "rec", 3)) {
                return __cmd_record(argc, argv);
@@ -3437,10 +3454,14 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
                        if (argc)
                                usage_with_options(timehist_usage, timehist_options);
                }
-               if (sched.show_wakeups && sched.summary_only) {
-                       pr_err(" Error: -s and -w are mutually exclusive.\n");
+               if ((sched.show_wakeups || sched.show_next) &&
+                   sched.summary_only) {
+                       pr_err(" Error: -s and -[n|w] are mutually exclusive.\n");
                        parse_options_usage(timehist_usage, timehist_options, "s", true);
-                       parse_options_usage(NULL, timehist_options, "w", true);
+                       if (sched.show_wakeups)
+                               parse_options_usage(NULL, timehist_options, "w", true);
+                       if (sched.show_next)
+                               parse_options_usage(NULL, timehist_options, "n", true);
                        return -EINVAL;
                }
 
index c0783b4..d05aec4 100644 (file)
 #include "util/cpumap.h"
 #include "util/thread_map.h"
 #include "util/stat.h"
+#include "util/string2.h"
 #include "util/thread-stack.h"
 #include "util/time-utils.h"
+#include "print_binary.h"
 #include <linux/bitmap.h>
+#include <linux/kernel.h>
 #include <linux/stringify.h>
 #include <linux/time64.h>
 #include "asm/bug.h"
 #include "util/mem-events.h"
+#include "util/dump-insn.h"
+#include <dirent.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "sane_ctype.h"
 
 static char const              *script_name;
 static char const              *generate_script_lang;
@@ -42,6 +56,7 @@ static bool                   nanosecs;
 static const char              *cpu_list;
 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 static struct perf_stat_config stat_config;
+static int                     max_blocks;
 
 unsigned int scripting_max_stack = PERF_MAX_STACK_DEPTH;
 
@@ -69,6 +84,7 @@ enum perf_output_field {
        PERF_OUTPUT_CALLINDENT      = 1U << 20,
        PERF_OUTPUT_INSN            = 1U << 21,
        PERF_OUTPUT_INSNLEN         = 1U << 22,
+       PERF_OUTPUT_BRSTACKINSN     = 1U << 23,
 };
 
 struct output_option {
@@ -98,6 +114,7 @@ struct output_option {
        {.str = "callindent", .field = PERF_OUTPUT_CALLINDENT},
        {.str = "insn", .field = PERF_OUTPUT_INSN},
        {.str = "insnlen", .field = PERF_OUTPUT_INSNLEN},
+       {.str = "brstackinsn", .field = PERF_OUTPUT_BRSTACKINSN},
 };
 
 /* default set to maintain compatibility with current format */
@@ -292,7 +309,13 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
                       "selected. Hence, no address to lookup the source line number.\n");
                return -EINVAL;
        }
-
+       if (PRINT_FIELD(BRSTACKINSN) &&
+           !(perf_evlist__combined_branch_type(session->evlist) &
+             PERF_SAMPLE_BRANCH_ANY)) {
+               pr_err("Display of branch stack assembler requested, but non all-branch filter set\n"
+                      "Hint: run 'perf record -b ...'\n");
+               return -EINVAL;
+       }
        if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) &&
                perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID",
                                        PERF_OUTPUT_TID|PERF_OUTPUT_PID))
@@ -546,6 +569,233 @@ static void print_sample_brstacksym(struct perf_sample *sample,
        }
 }
 
+#define MAXBB 16384UL
+
+/*
+ * Read the code bytes for the basic block [start, end] into @buffer.
+ *
+ * Returns 0 without touching @cpumode or @is64bit when either address is
+ * zero.  Otherwise @cpumode is set from machine__kernel_ip(start) and:
+ *   - -ENXIO is returned when start and end lie on different sides of the
+ *     kernel/user boundary (the caller is expected to patch that up);
+ *   - 0 is returned when the block is longer than MAXBB - MAXINSN bytes or
+ *     when start cannot be resolved to a map with a usable dso;
+ *   - otherwise end - start + MAXINSN bytes are requested from the dso,
+ *     @is64bit is set from the dso and the read result is returned.
+ *
+ * @last only selects which "too long" message is printed.
+ */
+static int grab_bb(u8 *buffer, u64 start, u64 end,
+                   struct machine *machine, struct thread *thread,
+                   bool *is64bit, u8 *cpumode, bool last)
+{
+       struct addr_location al;
+       bool in_kernel;
+       long map_off, nread;
+
+       if (!start || !end)
+               return 0;
+
+       in_kernel = machine__kernel_ip(machine, start);
+       *cpumode = in_kernel ? PERF_RECORD_MISC_KERNEL : PERF_RECORD_MISC_USER;
+
+       /*
+        * The block straddles the kernel/user boundary.
+        * This can happen due to ring filtering: on Intel CPUs the entry
+        * into the kernel is filtered, but the exit is not.
+        * Report it and let the caller patch it up.
+        */
+       if (machine__kernel_ip(machine, end) != in_kernel) {
+               printf("\tblock %" PRIx64 "-%" PRIx64 " transfers between kernel and user\n",
+                               start, end);
+               return -ENXIO;
+       }
+
+       /* Keep the read request (block + MAXINSN) within MAXBB bytes. */
+       if (end - start > MAXBB - MAXINSN) {
+               if (last)
+                       printf("\tbrstack does not reach to final jump (%" PRIx64 "-%" PRIx64 ")\n", start, end);
+               else
+                       printf("\tblock %" PRIx64 "-%" PRIx64 " (%" PRIu64 ") too long to dump\n", start, end, end - start);
+               return 0;
+       }
+
+       memset(&al, 0, sizeof(al));
+       thread__find_addr_map(thread, *cpumode, MAP__FUNCTION, start, &al);
+
+       /* No map, no dso, or a dso whose data is known to be unreadable. */
+       if (!al.map || !al.map->dso ||
+           al.map->dso->data.status == DSO_DATA_STATUS_ERROR) {
+               printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end);
+               return 0;
+       }
+
+       /* Load maps to ensure dso->is_64_bit has been updated */
+       map__load(al.map);
+
+       map_off = al.map->map_ip(al.map, start);
+       nread = dso__data_read_offset(al.map->dso, machine, map_off,
+                                     (u8 *)buffer, end - start + MAXINSN);
+
+       *is64bit = al.map->dso->is_64_bit;
+       if (nread <= 0)
+               printf("\tcannot fetch code for block at %" PRIx64 "-%" PRIx64 "\n",
+                       start, end);
+       return nread;
+}
+
+/*
+ * Print one branch instruction at @ip: its address, the disassembly
+ * returned by dump_insn() and the branch flags recorded in @en.  When the
+ * entry carries a cycle count that is printed too, followed by an IPC
+ * figure (@insn / cycles) if @insn is non-zero.
+ */
+static void print_jump(uint64_t ip, struct branch_entry *en,
+                      struct perf_insn *x, u8 *inbuf, int len,
+                      int insn)
+{
+       const char *disasm = dump_insn(x, ip, inbuf, len, NULL);
+       const char *pred = en->flags.predicted ? " PRED" : "";
+       const char *mispred = en->flags.mispred ? " MISPRED" : "";
+       const char *intx = en->flags.in_tx ? " INTX" : "";
+       const char *aborted = en->flags.abort ? " ABORT" : "";
+
+       printf("\t%016" PRIx64 "\t%-30s\t#%s%s%s%s",
+              ip, disasm, pred, mispred, intx, aborted);
+
+       /* No cycle information recorded: just finish the line. */
+       if (!en->flags.cycles) {
+               putchar('\n');
+               return;
+       }
+
+       printf(" %d cycles", en->flags.cycles);
+       if (insn)
+               printf(" %.2f IPC", (float)insn / en->flags.cycles);
+       putchar('\n');
+}
+
+/*
+ * Print a "symbol[+offset]:" label line for @addr, optionally followed by
+ * source line information when the SRCLINE field is selected.
+ *
+ * Nothing is printed when @addr still falls inside *@lastsym or when no
+ * symbol can be found.  On output *@lastsym is updated to the symbol that
+ * was printed.  @attr is only consumed by the PRINT_FIELD() check.
+ */
+static void print_ip_sym(struct thread *thread, u8 cpumode, int cpu,
+                        uint64_t addr, struct symbol **lastsym,
+                        struct perf_event_attr *attr)
+{
+       struct addr_location al;
+       struct symbol *prev = *lastsym;
+       int off;
+
+       memset(&al, 0, sizeof(al));
+
+       /* Try the function maps first, then fall back to the variable maps. */
+       thread__find_addr_map(thread, cpumode, MAP__FUNCTION, addr, &al);
+       if (!al.map)
+               thread__find_addr_map(thread, cpumode, MAP__VARIABLE,
+                                     addr, &al);
+
+       /* Still inside the symbol that was labelled last: no new label. */
+       if (prev && al.addr >= prev->start && al.addr < prev->end)
+               return;
+
+       al.cpu = cpu;
+       al.sym = al.map ? map__find_symbol(al.map, al.addr) : NULL;
+       if (!al.sym)
+               return;
+
+       off = (al.addr < al.sym->end) ?
+               al.addr - al.sym->start :
+               al.addr - al.map->start - al.sym->start;
+
+       printf("\t%s", al.sym->name);
+       if (off)
+               printf("%+d", off);
+       putchar(':');
+       if (PRINT_FIELD(SRCLINE))
+               map__fprintf_srcline(al.map, al.addr, "\t", stdout);
+       putchar('\n');
+
+       *lastsym = al.sym;
+}
+
+/*
+ * Dump the instructions covered by the branch stack of @sample.
+ *
+ * The branch entries are walked from index nr - 1 down to 0.  For each pair
+ * the code between entries[i + 1].to and entries[i].from is fetched with
+ * grab_bb() and printed one instruction per line, ending with the jump
+ * itself (see print_jump()).  Finally the code between entries[0].to and
+ * the sample IP is printed, unless the sample hit the branch itself or the
+ * last entry is flagged as an abort.
+ *
+ * When max_blocks is non-zero only the first max_blocks + 1 entries are
+ * considered.
+ */
+static void print_sample_brstackinsn(struct perf_sample *sample,
+                                    struct thread *thread,
+                                    struct perf_event_attr *attr,
+                                    struct machine *machine)
+{
+       struct branch_stack *br = sample->branch_stack;
+       u64 start, end;
+       int i, insn, len, nr, ilen;
+       struct perf_insn x;
+       u8 buffer[MAXBB];
+       unsigned off;
+       struct symbol *lastsym = NULL;
+
+       if (!(br && br->nr))
+               return;
+       nr = br->nr;
+       if (max_blocks && nr > max_blocks + 1)
+               nr = max_blocks + 1;
+
+       /*
+        * grab_bb() returns early without writing cpumode/is64bit when an
+        * address is zero, and x.cpumode is read further down regardless of
+        * its result.  Start from a zeroed state so those reads are defined.
+        */
+       memset(&x, 0, sizeof(x));
+       x.thread = thread;
+       x.cpu = sample->cpu;
+
+       putchar('\n');
+
+       /* Handle first from jump, of which we don't know the entry. */
+       len = grab_bb(buffer, br->entries[nr-1].from,
+                       br->entries[nr-1].from,
+                       machine, thread, &x.is64bit, &x.cpumode, false);
+       if (len > 0) {
+               print_ip_sym(thread, x.cpumode, x.cpu,
+                            br->entries[nr - 1].from, &lastsym, attr);
+               print_jump(br->entries[nr - 1].from, &br->entries[nr - 1],
+                           &x, buffer, len, 0);
+       }
+
+       /* Print all blocks */
+       for (i = nr - 2; i >= 0; i--) {
+               if (br->entries[i].from || br->entries[i].to)
+                       pr_debug("%d: %" PRIx64 "-%" PRIx64 "\n", i,
+                                br->entries[i].from,
+                                br->entries[i].to);
+               start = br->entries[i + 1].to;
+               end   = br->entries[i].from;
+
+               len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, false);
+               /* Patch up missing kernel transfers due to ring filters */
+               if (len == -ENXIO && i > 0) {
+                       end = br->entries[--i].from;
+                       pr_debug("\tpatching up to %" PRIx64 "-%" PRIx64 "\n", start, end);
+                       len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, false);
+               }
+               if (len <= 0)
+                       continue;
+
+               insn = 0;
+               /*
+                * Never decode past the bytes that were actually fetched: if
+                * the decoded instruction boundaries do not land exactly on
+                * 'end', an unbounded walk would run off the buffer.
+                */
+               for (off = 0; off < (unsigned)len; off += ilen) {
+                       uint64_t ip = start + off;
+
+                       print_ip_sym(thread, x.cpumode, x.cpu, ip, &lastsym, attr);
+                       if (ip == end) {
+                               print_jump(ip, &br->entries[i], &x, buffer + off, len - off, insn);
+                               break;
+                       } else {
+                               /*
+                                * Pre-set the length so the stop test below
+                                * is well defined even if dump_insn() does
+                                * not write it.
+                                */
+                               ilen = 0;
+                               printf("\t%016" PRIx64 "\t%s\n", ip,
+                                       dump_insn(&x, ip, buffer + off, len - off, &ilen));
+                               if (ilen == 0)
+                                       break;
+                               insn++;
+                       }
+               }
+       }
+
+       /*
+        * Hit the branch? In this case we are already done, and the target
+        * has not been executed yet.
+        */
+       if (br->entries[0].from == sample->ip)
+               return;
+       if (br->entries[0].flags.abort)
+               return;
+
+       /*
+        * Print final block up to sample
+        */
+       start = br->entries[0].to;
+       end = sample->ip;
+       len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, true);
+       print_ip_sym(thread, x.cpumode, x.cpu, start, &lastsym, attr);
+       if (len <= 0) {
+               /* Print at least last IP if basic block did not work */
+               len = grab_bb(buffer, sample->ip, sample->ip,
+                             machine, thread, &x.is64bit, &x.cpumode, false);
+               if (len <= 0)
+                       return;
+
+               printf("\t%016" PRIx64 "\t%s\n", sample->ip,
+                       dump_insn(&x, sample->ip, buffer, len, NULL));
+               return;
+       }
+       /* Stop at the sample IP or at the end of the fetched bytes. */
+       for (off = 0; off <= end - start && off < (unsigned)len; off += ilen) {
+               ilen = 0;
+               printf("\t%016" PRIx64 "\t%s\n", start + off,
+                       dump_insn(&x, start + off, buffer + off, len - off, &ilen));
+               if (ilen == 0)
+                       break;
+       }
+}
 
 static void print_sample_addr(struct perf_sample *sample,
                          struct thread *thread,
@@ -632,7 +882,9 @@ static void print_sample_callindent(struct perf_sample *sample,
 }
 
 static void print_insn(struct perf_sample *sample,
-                      struct perf_event_attr *attr)
+                      struct perf_event_attr *attr,
+                      struct thread *thread,
+                      struct machine *machine)
 {
        if (PRINT_FIELD(INSNLEN))
                printf(" ilen: %d", sample->insn_len);
@@ -643,12 +895,15 @@ static void print_insn(struct perf_sample *sample,
                for (i = 0; i < sample->insn_len; i++)
                        printf(" %02x", (unsigned char)sample->insn[i]);
        }
+       if (PRINT_FIELD(BRSTACKINSN))
+               print_sample_brstackinsn(sample, thread, attr, machine);
 }
 
 static void print_sample_bts(struct perf_sample *sample,
                             struct perf_evsel *evsel,
                             struct thread *thread,
-                            struct addr_location *al)
+                            struct addr_location *al,
+                            struct machine *machine)
 {
        struct perf_event_attr *attr = &evsel->attr;
        bool print_srcline_last = false;
@@ -689,7 +944,7 @@ static void print_sample_bts(struct perf_sample *sample,
        if (print_srcline_last)
                map__fprintf_srcline(al->map, al->addr, "\n  ", stdout);
 
-       print_insn(sample, attr);
+       print_insn(sample, attr, thread, machine);
 
        printf("\n");
 }
@@ -830,6 +1085,7 @@ struct perf_script {
        bool                    show_task_events;
        bool                    show_mmap_events;
        bool                    show_switch_events;
+       bool                    show_namespace_events;
        bool                    allocated;
        struct cpu_map          *cpus;
        struct thread_map       *threads;
@@ -871,7 +1127,8 @@ static size_t data_src__printf(u64 data_src)
 
 static void process_event(struct perf_script *script,
                          struct perf_sample *sample, struct perf_evsel *evsel,
-                         struct addr_location *al)
+                         struct addr_location *al,
+                         struct machine *machine)
 {
        struct thread *thread = al->thread;
        struct perf_event_attr *attr = &evsel->attr;
@@ -898,7 +1155,7 @@ static void process_event(struct perf_script *script,
                print_sample_flags(sample->flags);
 
        if (is_bts_event(attr)) {
-               print_sample_bts(sample, evsel, thread, al);
+               print_sample_bts(sample, evsel, thread, al, machine);
                return;
        }
 
@@ -936,7 +1193,7 @@ static void process_event(struct perf_script *script,
 
        if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
                print_sample_bpf_output(sample);
-       print_insn(sample, attr);
+       print_insn(sample, attr, thread, machine);
        printf("\n");
 }
 
@@ -1046,7 +1303,7 @@ static int process_sample_event(struct perf_tool *tool,
        if (scripting_ops)
                scripting_ops->process_event(event, sample, evsel, &al);
        else
-               process_event(scr, sample, evsel, &al);
+               process_event(scr, sample, evsel, &al, machine);
 
 out_put:
        addr_location__put(&al);
@@ -1118,6 +1375,41 @@ out:
        return ret;
 }
 
+/*
+ * Handler for NAMESPACES events in 'perf script': runs the generic
+ * perf_event__process_namespaces() and then prints the event to stdout,
+ * prefixed by the usual sample header.
+ *
+ * Returns 0 on success, -1 if the thread cannot be found/created or the
+ * generic processing fails.
+ */
+static int process_namespaces_event(struct perf_tool *tool,
+                                   union perf_event *event,
+                                   struct perf_sample *sample,
+                                   struct machine *machine)
+{
+       struct perf_script *scr = container_of(tool, struct perf_script, tool);
+       struct perf_evsel *evsel = perf_evlist__id2evsel(scr->session->evlist,
+                                                        sample->id);
+       struct thread *thr;
+       int err = -1;
+
+       thr = machine__findnew_thread(machine, event->namespaces.pid,
+                                     event->namespaces.tid);
+       if (thr == NULL) {
+               pr_debug("problem processing NAMESPACES event, skipping it.\n");
+               return -1;
+       }
+
+       if (perf_event__process_namespaces(tool, event, sample, machine) >= 0) {
+               /*
+                * Without sample_id_all, fill in the fields used for the
+                * header from the event itself.
+                */
+               if (!evsel->attr.sample_id_all) {
+                       sample->pid = event->namespaces.pid;
+                       sample->tid = event->namespaces.tid;
+                       sample->time = 0;
+                       sample->cpu = 0;
+               }
+               print_sample_start(sample, thr, evsel);
+               perf_event__fprintf(event, stdout);
+               err = 0;
+       }
+
+       /* Drop the reference taken by machine__findnew_thread(). */
+       thread__put(thr);
+       return err;
+}
+
 static int process_fork_event(struct perf_tool *tool,
                              union perf_event *event,
                              struct perf_sample *sample,
@@ -1293,6 +1585,8 @@ static int __cmd_script(struct perf_script *script)
        }
        if (script->show_switch_events)
                script->tool.context_switch = process_switch_event;
+       if (script->show_namespace_events)
+               script->tool.namespaces = process_namespaces_event;
 
        ret = perf_session__process_events(script->session);
 
@@ -1427,7 +1721,7 @@ static int parse_scriptname(const struct option *opt __maybe_unused,
 static int parse_output_fields(const struct option *opt __maybe_unused,
                            const char *arg, int unset __maybe_unused)
 {
-       char *tok;
+       char *tok, *strtok_saveptr = NULL;
        int i, imax = ARRAY_SIZE(all_output_options);
        int j;
        int rc = 0;
@@ -1488,7 +1782,7 @@ static int parse_output_fields(const struct option *opt __maybe_unused,
                }
        }
 
-       for (tok = strtok(tok, ","); tok; tok = strtok(NULL, ",")) {
+       for (tok = strtok_r(tok, ",", &strtok_saveptr); tok; tok = strtok_r(NULL, ",", &strtok_saveptr)) {
                for (i = 0; i < imax; ++i) {
                        if (strcmp(tok, all_output_options[i].str) == 0)
                                break;
@@ -2078,7 +2372,7 @@ int process_cpu_map_event(struct perf_tool *tool __maybe_unused,
        return set_maps(script);
 }
 
-int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_script(int argc, const char **argv)
 {
        bool show_full_info = false;
        bool header = false;
@@ -2097,6 +2391,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
                        .mmap            = perf_event__process_mmap,
                        .mmap2           = perf_event__process_mmap2,
                        .comm            = perf_event__process_comm,
+                       .namespaces      = perf_event__process_namespaces,
                        .exit            = perf_event__process_exit,
                        .fork            = perf_event__process_fork,
                        .attr            = process_attr,
@@ -2152,7 +2447,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
                     "Valid types: hw,sw,trace,raw. "
                     "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
                     "addr,symoff,period,iregs,brstack,brstacksym,flags,"
-                    "bpf-output,callindent,insn,insnlen", parse_output_fields),
+                    "bpf-output,callindent,insn,insnlen,brstackinsn",
+                    parse_output_fields),
        OPT_BOOLEAN('a', "all-cpus", &system_wide,
                    "system-wide collection from all CPUs"),
        OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
@@ -2180,7 +2476,11 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
                    "Show the mmap events"),
        OPT_BOOLEAN('\0', "show-switch-events", &script.show_switch_events,
                    "Show context switch events (if recorded)"),
+       OPT_BOOLEAN('\0', "show-namespace-events", &script.show_namespace_events,
+                   "Show namespace events (if recorded)"),
        OPT_BOOLEAN('f', "force", &symbol_conf.force, "don't complain, do it"),
+       OPT_INTEGER(0, "max-blocks", &max_blocks,
+                   "Maximum number of code blocks to dump with brstackinsn"),
        OPT_BOOLEAN(0, "ns", &nanosecs,
                    "Use 9 decimal places when displaying time"),
        OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts",
@@ -2217,7 +2517,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
        if (argc > 1 && !strncmp(argv[0], "rec", strlen("rec"))) {
                rec_script_path = get_script_path(argv[1], RECORD_SUFFIX);
                if (!rec_script_path)
-                       return cmd_record(argc, argv, NULL);
+                       return cmd_record(argc, argv);
        }
 
        if (argc > 1 && !strncmp(argv[0], "rep", strlen("rep"))) {
index 13b5499..a935b50 100644 (file)
 #include "util/session.h"
 #include "util/tool.h"
 #include "util/group.h"
+#include "util/string2.h"
 #include "asm/bug.h"
 
 #include <linux/time64.h>
 #include <api/fs/fs.h>
+#include <errno.h>
+#include <signal.h>
 #include <stdlib.h>
 #include <sys/prctl.h>
+#include <inttypes.h>
 #include <locale.h>
 #include <math.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "sane_ctype.h"
 
 #define DEFAULT_SEPARATOR      " "
 #define CNTR_NOT_SUPPORTED     "<not supported>"
@@ -140,12 +150,14 @@ static unsigned int               unit_width                      = 4; /* strlen("unit") */
 static bool                    forever                         = false;
 static bool                    metric_only                     = false;
 static bool                    force_metric_only               = false;
+static bool                    no_merge                        = false;
 static struct timespec         ref_time;
 static struct cpu_map          *aggr_map;
 static aggr_get_id_t           aggr_get_id;
 static bool                    append_file;
 static const char              *output_name;
 static int                     output_fd;
+static int                     print_free_counters_hint;
 
 struct perf_stat {
        bool                     record;
@@ -310,8 +322,12 @@ static int read_counter(struct perf_evsel *counter)
                        struct perf_counts_values *count;
 
                        count = perf_counts(counter->counts, cpu, thread);
-                       if (perf_evsel__read(counter, cpu, thread, count))
+                       if (perf_evsel__read(counter, cpu, thread, count)) {
+                               counter->counts->scaled = -1;
+                               perf_counts(counter->counts, cpu, thread)->ena = 0;
+                               perf_counts(counter->counts, cpu, thread)->run = 0;
                                return -1;
+                       }
 
                        if (STAT_RECORD) {
                                if (perf_evsel__write_stat_event(counter, cpu, thread, count)) {
@@ -336,12 +352,14 @@ static int read_counter(struct perf_evsel *counter)
 static void read_counters(void)
 {
        struct perf_evsel *counter;
+       int ret;
 
        evlist__for_each_entry(evsel_list, counter) {
-               if (read_counter(counter))
+               ret = read_counter(counter);
+               if (ret)
                        pr_debug("failed to read counter %s\n", counter->name);
 
-               if (perf_stat_process_counter(&stat_config, counter))
+               if (ret == 0 && perf_stat_process_counter(&stat_config, counter))
                        pr_warning("failed to process counter %s\n", counter->name);
        }
 }
@@ -873,10 +891,7 @@ static void print_metric_csv(void *ctx,
                return;
        }
        snprintf(buf, sizeof(buf), fmt, val);
-       vals = buf;
-       while (isspace(*vals))
-               vals++;
-       ends = vals;
+       ends = vals = ltrim(buf);
        while (isdigit(*ends) || *ends == '.')
                ends++;
        *ends = 0;
@@ -948,10 +963,7 @@ static void print_metric_only_csv(void *ctx, const char *color __maybe_unused,
                return;
        unit = fixunit(tbuf, os->evsel, unit);
        snprintf(buf, sizeof buf, fmt, val);
-       vals = buf;
-       while (isspace(*vals))
-               vals++;
-       ends = vals;
+       ends = vals = ltrim(buf);
        while (isdigit(*ends) || *ends == '.')
                ends++;
        *ends = 0;
@@ -1109,6 +1121,9 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval,
                        counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
                        csv_sep);
 
+               if (counter->supported)
+                       print_free_counters_hint = 1;
+
                fprintf(stat_config.output, "%-*s%s",
                        csv_output ? 0 : unit_width,
                        counter->unit, csv_sep);
@@ -1140,6 +1155,7 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval,
        out.print_metric = pm;
        out.new_line = nl;
        out.ctx = &os;
+       out.force_header = false;
 
        if (csv_output && !metric_only) {
                print_noise(counter, noise);
@@ -1178,11 +1194,81 @@ static void aggr_update_shadow(void)
        }
 }
 
+static void collect_all_aliases(struct perf_evsel *counter,
+                           void (*cb)(struct perf_evsel *counter, void *data,
+                                      bool first),
+                           void *data)
+{
+       struct perf_evsel *alias;
+
+       alias = list_prepare_entry(counter, &(evsel_list->entries), node);
+       list_for_each_entry_continue (alias, &evsel_list->entries, node) {
+               if (strcmp(perf_evsel__name(alias), perf_evsel__name(counter)) ||
+                   alias->scale != counter->scale ||
+                   alias->cgrp != counter->cgrp ||
+                   strcmp(alias->unit, counter->unit) ||
+                   nsec_counter(alias) != nsec_counter(counter))
+                       break;
+               alias->merged_stat = true;
+               cb(alias, data, false);
+       }
+}
+
+static bool collect_data(struct perf_evsel *counter,
+                           void (*cb)(struct perf_evsel *counter, void *data,
+                                      bool first),
+                           void *data)
+{
+       if (counter->merged_stat)
+               return false;
+       cb(counter, data, true);
+       if (!no_merge)
+               collect_all_aliases(counter, cb, data);
+       return true;
+}
+
+struct aggr_data {
+       u64 ena, run, val;
+       int id;
+       int nr;
+       int cpu;
+};
+
+static void aggr_cb(struct perf_evsel *counter, void *data, bool first)
+{
+       struct aggr_data *ad = data;
+       int cpu, s2;
+
+       for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
+               struct perf_counts_values *counts;
+
+               s2 = aggr_get_id(perf_evsel__cpus(counter), cpu);
+               if (s2 != ad->id)
+                       continue;
+               if (first)
+                       ad->nr++;
+               counts = perf_counts(counter->counts, cpu, 0);
+               /*
+                * When any result is bad, mark them all as bad to give
+                * consistent output in interval mode.
+                */
+               if (counts->ena == 0 || counts->run == 0 ||
+                   counter->counts->scaled == -1) {
+                       ad->ena = 0;
+                       ad->run = 0;
+                       break;
+               }
+               ad->val += counts->val;
+               ad->ena += counts->ena;
+               ad->run += counts->run;
+       }
+}
+
 static void print_aggr(char *prefix)
 {
        FILE *output = stat_config.output;
        struct perf_evsel *counter;
-       int cpu, s, s2, id, nr;
+       int s, id, nr;
        double uval;
        u64 ena, run, val;
        bool first;
@@ -1197,23 +1283,21 @@ static void print_aggr(char *prefix)
         * Without each counter has its own line.
         */
        for (s = 0; s < aggr_map->nr; s++) {
+               struct aggr_data ad;
                if (prefix && metric_only)
                        fprintf(output, "%s", prefix);
 
-               id = aggr_map->map[s];
+               ad.id = id = aggr_map->map[s];
                first = true;
                evlist__for_each_entry(evsel_list, counter) {
-                       val = ena = run = 0;
-                       nr = 0;
-                       for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
-                               s2 = aggr_get_id(perf_evsel__cpus(counter), cpu);
-                               if (s2 != id)
-                                       continue;
-                               val += perf_counts(counter->counts, cpu, 0)->val;
-                               ena += perf_counts(counter->counts, cpu, 0)->ena;
-                               run += perf_counts(counter->counts, cpu, 0)->run;
-                               nr++;
-                       }
+                       ad.val = ad.ena = ad.run = 0;
+                       ad.nr = 0;
+                       if (!collect_data(counter, aggr_cb, &ad))
+                               continue;
+                       nr = ad.nr;
+                       ena = ad.ena;
+                       run = ad.run;
+                       val = ad.val;
                        if (first && metric_only) {
                                first = false;
                                aggr_printout(counter, id, nr);
@@ -1257,6 +1341,21 @@ static void print_aggr_thread(struct perf_evsel *counter, char *prefix)
        }
 }
 
+struct caggr_data {
+       double avg, avg_enabled, avg_running;
+};
+
+static void counter_aggr_cb(struct perf_evsel *counter, void *data,
+                           bool first __maybe_unused)
+{
+       struct caggr_data *cd = data;
+       struct perf_stat_evsel *ps = counter->priv;
+
+       cd->avg += avg_stats(&ps->res_stats[0]);
+       cd->avg_enabled += avg_stats(&ps->res_stats[1]);
+       cd->avg_running += avg_stats(&ps->res_stats[2]);
+}
+
 /*
  * Print out the results of a single counter:
  * aggregated counts in system-wide mode
@@ -1264,23 +1363,31 @@ static void print_aggr_thread(struct perf_evsel *counter, char *prefix)
 static void print_counter_aggr(struct perf_evsel *counter, char *prefix)
 {
        FILE *output = stat_config.output;
-       struct perf_stat_evsel *ps = counter->priv;
-       double avg = avg_stats(&ps->res_stats[0]);
        double uval;
-       double avg_enabled, avg_running;
+       struct caggr_data cd = { .avg = 0.0 };
 
-       avg_enabled = avg_stats(&ps->res_stats[1]);
-       avg_running = avg_stats(&ps->res_stats[2]);
+       if (!collect_data(counter, counter_aggr_cb, &cd))
+               return;
 
        if (prefix && !metric_only)
                fprintf(output, "%s", prefix);
 
-       uval = avg * counter->scale;
-       printout(-1, 0, counter, uval, prefix, avg_running, avg_enabled, avg);
+       uval = cd.avg * counter->scale;
+       printout(-1, 0, counter, uval, prefix, cd.avg_running, cd.avg_enabled, cd.avg);
        if (!metric_only)
                fprintf(output, "\n");
 }
 
+static void counter_cb(struct perf_evsel *counter, void *data,
+                      bool first __maybe_unused)
+{
+       struct aggr_data *ad = data;
+
+       ad->val += perf_counts(counter->counts, ad->cpu, 0)->val;
+       ad->ena += perf_counts(counter->counts, ad->cpu, 0)->ena;
+       ad->run += perf_counts(counter->counts, ad->cpu, 0)->run;
+}
+
 /*
  * Print out the results of a single counter:
  * does not use aggregated count in system-wide
@@ -1293,9 +1400,13 @@ static void print_counter(struct perf_evsel *counter, char *prefix)
        int cpu;
 
        for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
-               val = perf_counts(counter->counts, cpu, 0)->val;
-               ena = perf_counts(counter->counts, cpu, 0)->ena;
-               run = perf_counts(counter->counts, cpu, 0)->run;
+               struct aggr_data ad = { .cpu = cpu };
+
+               if (!collect_data(counter, counter_cb, &ad))
+                       return;
+               val = ad.val;
+               ena = ad.ena;
+               run = ad.run;
 
                if (prefix)
                        fprintf(output, "%s", prefix);
@@ -1380,6 +1491,7 @@ static void print_metric_headers(const char *prefix, bool no_indent)
                out.ctx = &os;
                out.print_metric = print_metric_header;
                out.new_line = new_line_metric;
+               out.force_header = true;
                os.evsel = counter;
                perf_stat__print_shadow_stats(counter, 0,
                                              0,
@@ -1477,6 +1589,13 @@ static void print_footer(void)
                                avg_stats(&walltime_nsecs_stats));
        }
        fprintf(output, "\n\n");
+
+       if (print_free_counters_hint)
+               fprintf(output,
+"Some events weren't counted. Try disabling the NMI watchdog:\n"
+"      echo 0 > /proc/sys/kernel/nmi_watchdog\n"
+"      perf stat ...\n"
+"      echo 1 > /proc/sys/kernel/nmi_watchdog\n");
 }
 
 static void print_counters(struct timespec *ts, int argc, const char **argv)
@@ -1633,6 +1752,7 @@ static const struct option stat_options[] = {
                    "list of cpus to monitor in system-wide"),
        OPT_SET_UINT('A', "no-aggr", &stat_config.aggr_mode,
                    "disable CPU count aggregation", AGGR_NONE),
+       OPT_BOOLEAN(0, "no-merge", &no_merge, "Do not merge identical named events"),
        OPT_STRING('x', "field-separator", &csv_sep, "separator",
                   "print counts with custom separator"),
        OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
@@ -2339,7 +2459,36 @@ static int __cmd_report(int argc, const char **argv)
        return 0;
 }
 
-int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
+static void setup_system_wide(int forks)
+{
+       /*
+        * Make system wide (-a) the default target if
+        * no target was specified and one of the following
+        * conditions is met:
+        *
+        *   - there's no workload specified
+        *   - there is a workload specified but all requested
+        *     events are system wide events
+        */
+       if (!target__none(&target))
+               return;
+
+       if (!forks)
+               target.system_wide = true;
+       else {
+               struct perf_evsel *counter;
+
+               evlist__for_each_entry(evsel_list, counter) {
+                       if (!counter->system_wide)
+                               return;
+               }
+
+               if (evsel_list->nr_entries)
+                       target.system_wide = true;
+       }
+}
+
+int cmd_stat(int argc, const char **argv)
 {
        const char * const stat_usage[] = {
                "perf stat [<options>] [<command>]",
@@ -2361,6 +2510,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
        argc = parse_options_subcommand(argc, argv, stat_options, stat_subcommands,
                                        (const char **) stat_usage,
                                        PARSE_OPT_STOP_AT_NON_OPTION);
+       perf_stat__collect_metric_expr(evsel_list);
        perf_stat__init_shadow_stats();
 
        if (csv_sep) {
@@ -2445,9 +2595,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
        } else if (big_num_opt == 0) /* User passed --no-big-num */
                big_num = false;
 
-       /* Make system wide (-a) the default target. */
-       if (!argc && target__none(&target))
-               target.system_wide = true;
+       setup_system_wide(argc);
 
        if (run_count < 0) {
                pr_err("Run count must be a positive number\n");
index e7eaa29..4e2e616 100644 (file)
@@ -12,6 +12,8 @@
  * of the License.
  */
 
+#include <errno.h>
+#include <inttypes.h>
 #include <traceevent/event-parse.h>
 
 #include "builtin.h"
 #include "util/cache.h"
 #include "util/evlist.h"
 #include "util/evsel.h"
+#include <linux/kernel.h>
 #include <linux/rbtree.h>
 #include <linux/time64.h>
 #include "util/symbol.h"
+#include "util/thread.h"
 #include "util/callchain.h"
-#include "util/strlist.h"
 
 #include "perf.h"
 #include "util/header.h"
@@ -1773,7 +1776,7 @@ static int timechart__io_record(int argc, const char **argv)
        for (i = 0; i < (unsigned int)argc; i++)
                *p++ = argv[i];
 
-       return cmd_record(rec_argc, rec_argv, NULL);
+       return cmd_record(rec_argc, rec_argv);
 }
 
 
@@ -1864,7 +1867,7 @@ static int timechart__record(struct timechart *tchart, int argc, const char **ar
        for (j = 0; j < (unsigned int)argc; j++)
                *p++ = argv[j];
 
-       return cmd_record(rec_argc, rec_argv, NULL);
+       return cmd_record(rec_argc, rec_argv);
 }
 
 static int
@@ -1917,8 +1920,7 @@ parse_time(const struct option *opt, const char *arg, int __maybe_unused unset)
        return 0;
 }
 
-int cmd_timechart(int argc, const char **argv,
-                 const char *prefix __maybe_unused)
+int cmd_timechart(int argc, const char **argv)
 {
        struct timechart tchart = {
                .tool = {
@@ -1933,6 +1935,11 @@ int cmd_timechart(int argc, const char **argv,
                .merge_dist = 1000,
        };
        const char *output_name = "output.svg";
+       const struct option timechart_common_options[] = {
+       OPT_BOOLEAN('P', "power-only", &tchart.power_only, "output power data only"),
+       OPT_BOOLEAN('T', "tasks-only", &tchart.tasks_only, "output processes data only"),
+       OPT_END()
+       };
        const struct option timechart_options[] = {
        OPT_STRING('i', "input", &input_name, "file", "input file name"),
        OPT_STRING('o', "output", &output_name, "file", "output file name"),
@@ -1940,9 +1947,6 @@ int cmd_timechart(int argc, const char **argv,
        OPT_CALLBACK(0, "highlight", NULL, "duration or task name",
                      "highlight tasks. Pass duration in ns or process name.",
                       parse_highlight),
-       OPT_BOOLEAN('P', "power-only", &tchart.power_only, "output power data only"),
-       OPT_BOOLEAN('T', "tasks-only", &tchart.tasks_only,
-                   "output processes data only"),
        OPT_CALLBACK('p', "process", NULL, "process",
                      "process selector. Pass a pid or process name.",
                       parse_process),
@@ -1962,22 +1966,18 @@ int cmd_timechart(int argc, const char **argv,
                     "merge events that are merge-dist us apart",
                     parse_time),
        OPT_BOOLEAN('f', "force", &tchart.force, "don't complain, do it"),
-       OPT_END()
+       OPT_PARENT(timechart_common_options),
        };
        const char * const timechart_subcommands[] = { "record", NULL };
        const char *timechart_usage[] = {
                "perf timechart [<options>] {record}",
                NULL
        };
-
        const struct option timechart_record_options[] = {
-       OPT_BOOLEAN('P', "power-only", &tchart.power_only, "output power data only"),
-       OPT_BOOLEAN('T', "tasks-only", &tchart.tasks_only,
-                   "output processes data only"),
        OPT_BOOLEAN('I', "io-only", &tchart.io_only,
                    "record only IO data"),
        OPT_BOOLEAN('g', "callchain", &tchart.with_backtrace, "record callchain"),
-       OPT_END()
+       OPT_PARENT(timechart_common_options),
        };
        const char * const timechart_record_usage[] = {
                "perf timechart record [<options>]",
index ab90779..7ab42b8 100644 (file)
@@ -40,6 +40,7 @@
 #include "util/cpumap.h"
 #include "util/xyarray.h"
 #include "util/sort.h"
+#include "util/term.h"
 #include "util/intlist.h"
 #include "util/parse-branch-options.h"
 #include "arch/common.h"
@@ -58,6 +59,7 @@
 #include <errno.h>
 #include <time.h>
 #include <sched.h>
+#include <signal.h>
 
 #include <sys/syscall.h>
 #include <sys/ioctl.h>
@@ -72,6 +74,8 @@
 #include <linux/time64.h>
 #include <linux/types.h>
 
+#include "sane_ctype.h"
+
 static volatile int done;
 
 #define HEADER_LINE_NR  5
@@ -1075,7 +1079,7 @@ parse_percent_limit(const struct option *opt, const char *arg,
 const char top_callchain_help[] = CALLCHAIN_RECORD_HELP CALLCHAIN_REPORT_HELP
        "\n\t\t\t\tDefault: fp,graph,0.5,caller,function";
 
-int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_top(int argc, const char **argv)
 {
        char errbuf[BUFSIZ];
        struct perf_top top = {
index 256f1fa..eaa66fb 100644 (file)
@@ -24,6 +24,7 @@
 #include "util/evlist.h"
 #include <subcmd/exec-cmd.h>
 #include "util/machine.h"
+#include "util/path.h"
 #include "util/session.h"
 #include "util/thread.h"
 #include <subcmd/parse-options.h>
 #include "util/intlist.h"
 #include "util/thread_map.h"
 #include "util/stat.h"
+#include "trace/beauty/beauty.h"
 #include "trace-event.h"
 #include "util/parse-events.h"
 #include "util/bpf-loader.h"
 #include "callchain.h"
+#include "print_binary.h"
+#include "string2.h"
 #include "syscalltbl.h"
 #include "rb_resort.h"
 
+#include <errno.h>
+#include <inttypes.h>
 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
+#include <poll.h>
+#include <signal.h>
 #include <stdlib.h>
 #include <string.h>
 #include <linux/err.h>
 #include <linux/filter.h>
 #include <linux/audit.h>
+#include <linux/kernel.h>
 #include <linux/random.h>
 #include <linux/stringify.h>
 #include <linux/time64.h>
 
+#include "sane_ctype.h"
+
 #ifndef O_CLOEXEC
 # define O_CLOEXEC             02000000
 #endif
@@ -267,15 +278,6 @@ out_delete:
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.pointer(&fields->name, sample); })
 
-struct syscall_arg {
-       unsigned long val;
-       struct thread *thread;
-       struct trace  *trace;
-       void          *parm;
-       u8            idx;
-       u8            mask;
-};
-
 struct strarray {
        int         offset;
        int         nr_entries;
@@ -771,6 +773,10 @@ static struct syscall_fmt {
          .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
        { .name     = "stat",       .errmsg = true, .alias = "newstat", },
        { .name     = "statfs",     .errmsg = true, },
+       { .name     = "statx",      .errmsg = true,
+         .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
+                            [2] = SCA_STATX_FLAGS, /* flags */
+                            [3] = SCA_STATX_MASK, /* mask */ }, },
        { .name     = "swapoff",    .errmsg = true,
          .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
        { .name     = "swapon",     .errmsg = true,
@@ -821,12 +827,21 @@ struct syscall {
        void                **arg_parm;
 };
 
-static size_t fprintf_duration(unsigned long t, FILE *fp)
+/*
+ * We need to have this 'calculated' boolean because in some cases we really
+ * don't know what the duration of a syscall is, for instance, when we start
+ * a session and some threads are waiting for a syscall to finish, say 'poll',
+ * in which case all we can do is to print "( ? )" for the duration and "?" for
+ * the start timestamp.
+ */
+static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
 {
        double duration = (double)t / NSEC_PER_MSEC;
        size_t printed = fprintf(fp, "(");
 
-       if (duration >= 1.0)
+       if (!calculated)
+               printed += fprintf(fp, "     ?   ");
+       else if (duration >= 1.0)
                printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
        else if (duration >= 0.01)
                printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
@@ -1028,13 +1043,27 @@ static bool trace__filter_duration(struct trace *trace, double t)
        return t < (trace->duration_filter * NSEC_PER_MSEC);
 }
 
-static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
+static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
 {
        double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
 
        return fprintf(fp, "%10.3f ", ts);
 }
 
+/*
+ * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
+ * using ttrace->entry_time for a thread that receives a sys_exit without
+ * first having received a sys_enter ("poll" issued before the tracing session
+ * starts, or a sys_enter lost due to a ring buffer overflow).
+ */
+static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
+{
+       if (tstamp > 0)
+               return __trace__fprintf_tstamp(trace, tstamp, fp);
+
+       return fprintf(fp, "         ? ");
+}
+
 static bool done = false;
 static bool interrupted = false;
 
@@ -1045,10 +1074,10 @@ static void sig_handler(int sig)
 }
 
 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
-                                       u64 duration, u64 tstamp, FILE *fp)
+                                       u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
 {
        size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
-       printed += fprintf_duration(duration, fp);
+       printed += fprintf_duration(duration, duration_calculated, fp);
 
        if (trace->multiple_threads) {
                if (trace->show_comm)
@@ -1450,7 +1479,7 @@ static int trace__printf_interrupted_entry(struct trace *trace, struct perf_samp
 
        duration = sample->time - ttrace->entry_time;
 
-       printed  = trace__fprintf_entry_head(trace, trace->current, duration, ttrace->entry_time, trace->output);
+       printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
        printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
        ttrace->entry_pending = false;
 
@@ -1497,7 +1526,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
 
        if (sc->is_exit) {
                if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
-                       trace__fprintf_entry_head(trace, thread, 1, ttrace->entry_time, trace->output);
+                       trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
                        fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
                }
        } else {
@@ -1545,6 +1574,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
 {
        long ret;
        u64 duration = 0;
+       bool duration_calculated = false;
        struct thread *thread;
        int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
        struct syscall *sc = trace__syscall_info(trace, evsel, id);
@@ -1573,6 +1603,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
                duration = sample->time - ttrace->entry_time;
                if (trace__filter_duration(trace, duration))
                        goto out;
+               duration_calculated = true;
        } else if (trace->duration_filter)
                goto out;
 
@@ -1588,7 +1619,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
        if (trace->summary_only)
                goto out;
 
-       trace__fprintf_entry_head(trace, thread, duration, ttrace->entry_time, trace->output);
+       trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
 
        if (ttrace->entry_pending) {
                fprintf(trace->output, "%-70s", ttrace->entry_str);
@@ -1653,15 +1684,17 @@ static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
 
        ttrace = thread__priv(thread);
        if (!ttrace)
-               goto out;
+               goto out_put;
 
        filename_len = strlen(filename);
+       if (filename_len == 0)
+               goto out_put;
 
        if (ttrace->filename.namelen < filename_len) {
                char *f = realloc(ttrace->filename.name, filename_len + 1);
 
                if (f == NULL)
-                               goto out;
+                       goto out_put;
 
                ttrace->filename.namelen = filename_len;
                ttrace->filename.name = f;
@@ -1671,12 +1704,12 @@ static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
        ttrace->filename.pending_open = true;
 
        if (!ttrace->filename.ptr)
-               goto out;
+               goto out_put;
 
        entry_str_len = strlen(ttrace->entry_str);
        remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
        if (remaining_space <= 0)
-               goto out;
+               goto out_put;
 
        if (filename_len > (size_t)remaining_space) {
                filename += filename_len - remaining_space;
@@ -1690,6 +1723,8 @@ static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
 
        ttrace->filename.ptr = 0;
        ttrace->filename.entry_str_pos = 0;
+out_put:
+       thread__put(thread);
 out:
        return 0;
 }
@@ -1710,6 +1745,7 @@ static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evs
 
        ttrace->runtime_ms += runtime_ms;
        trace->runtime_ms += runtime_ms;
+out_put:
        thread__put(thread);
        return 0;
 
@@ -1720,8 +1756,7 @@ out_dump:
               (pid_t)perf_evsel__intval(evsel, sample, "pid"),
               runtime,
               perf_evsel__intval(evsel, sample, "vruntime"));
-       thread__put(thread);
-       return 0;
+       goto out_put;
 }
 
 static void bpf_output__printer(enum binary_printer_ops op,
@@ -1851,7 +1886,7 @@ static int trace__pgfault(struct trace *trace,
        thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
                              sample->ip, &al);
 
-       trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
+       trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
 
        fprintf(trace->output, "%sfault [",
                evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
@@ -1920,7 +1955,7 @@ static int trace__process_sample(struct perf_tool *tool,
 
        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
        if (thread && thread__is_filtered(thread))
-               return 0;
+               goto out;
 
        trace__set_base_time(trace, evsel, sample);
 
@@ -1928,7 +1963,8 @@ static int trace__process_sample(struct perf_tool *tool,
                ++trace->nr_events;
                handler(trace, evsel, event, sample);
        }
-
+out:
+       thread__put(thread);
        return err;
 }
 
@@ -1988,7 +2024,7 @@ static int trace__record(struct trace *trace, int argc, const char **argv)
        for (i = 0; i < (unsigned int)argc; i++)
                rec_argv[j++] = argv[i];
 
-       return cmd_record(j, rec_argv, NULL);
+       return cmd_record(j, rec_argv);
 }
 
 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
@@ -2415,8 +2451,9 @@ static int trace__replay(struct trace *trace)
        trace->tool.exit          = perf_event__process_exit;
        trace->tool.fork          = perf_event__process_fork;
        trace->tool.attr          = perf_event__process_attr;
-       trace->tool.tracing_data = perf_event__process_tracing_data;
+       trace->tool.tracing_data  = perf_event__process_tracing_data;
        trace->tool.build_id      = perf_event__process_build_id;
+       trace->tool.namespaces    = perf_event__process_namespaces;
 
        trace->tool.ordered_events = true;
        trace->tool.ordering_requires_timestamps = true;
@@ -2785,7 +2822,7 @@ out:
        return err;
 }
 
-int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_trace(int argc, const char **argv)
 {
        const char *trace_usage[] = {
                "perf trace [<options>] [<command>]",
index 9b10cda..d251494 100644 (file)
@@ -1,9 +1,9 @@
-#include "util/util.h"
 #include "builtin.h"
 #include "perf.h"
+#include <linux/compiler.h>
+#include <stdio.h>
 
-int cmd_version(int argc __maybe_unused, const char **argv __maybe_unused,
-               const char *prefix __maybe_unused)
+int cmd_version(int argc __maybe_unused, const char **argv __maybe_unused)
 {
        printf("perf version %s\n", perf_version_string);
        return 0;
index 036e1e3..d4d19fe 100644 (file)
@@ -2,46 +2,42 @@
 #define BUILTIN_H
 
 #include "util/util.h"
-#include "util/strbuf.h"
 
 extern const char perf_usage_string[];
 extern const char perf_more_info_string[];
 
 void list_common_cmds_help(void);
 const char *help_unknown_cmd(const char *cmd);
-void prune_packed_objects(int);
-int read_line_with_nul(char *buf, int size, FILE *file);
-int check_pager_config(const char *cmd);
 
-int cmd_annotate(int argc, const char **argv, const char *prefix);
-int cmd_bench(int argc, const char **argv, const char *prefix);
-int cmd_buildid_cache(int argc, const char **argv, const char *prefix);
-int cmd_buildid_list(int argc, const char **argv, const char *prefix);
-int cmd_config(int argc, const char **argv, const char *prefix);
-int cmd_c2c(int argc, const char **argv, const char *prefix);
-int cmd_diff(int argc, const char **argv, const char *prefix);
-int cmd_evlist(int argc, const char **argv, const char *prefix);
-int cmd_help(int argc, const char **argv, const char *prefix);
-int cmd_sched(int argc, const char **argv, const char *prefix);
-int cmd_kallsyms(int argc, const char **argv, const char *prefix);
-int cmd_list(int argc, const char **argv, const char *prefix);
-int cmd_record(int argc, const char **argv, const char *prefix);
-int cmd_report(int argc, const char **argv, const char *prefix);
-int cmd_stat(int argc, const char **argv, const char *prefix);
-int cmd_timechart(int argc, const char **argv, const char *prefix);
-int cmd_top(int argc, const char **argv, const char *prefix);
-int cmd_script(int argc, const char **argv, const char *prefix);
-int cmd_version(int argc, const char **argv, const char *prefix);
-int cmd_probe(int argc, const char **argv, const char *prefix);
-int cmd_kmem(int argc, const char **argv, const char *prefix);
-int cmd_lock(int argc, const char **argv, const char *prefix);
-int cmd_kvm(int argc, const char **argv, const char *prefix);
-int cmd_test(int argc, const char **argv, const char *prefix);
-int cmd_trace(int argc, const char **argv, const char *prefix);
-int cmd_inject(int argc, const char **argv, const char *prefix);
-int cmd_mem(int argc, const char **argv, const char *prefix);
-int cmd_data(int argc, const char **argv, const char *prefix);
-int cmd_ftrace(int argc, const char **argv, const char *prefix);
+int cmd_annotate(int argc, const char **argv);
+int cmd_bench(int argc, const char **argv);
+int cmd_buildid_cache(int argc, const char **argv);
+int cmd_buildid_list(int argc, const char **argv);
+int cmd_config(int argc, const char **argv);
+int cmd_c2c(int argc, const char **argv);
+int cmd_diff(int argc, const char **argv);
+int cmd_evlist(int argc, const char **argv);
+int cmd_help(int argc, const char **argv);
+int cmd_sched(int argc, const char **argv);
+int cmd_kallsyms(int argc, const char **argv);
+int cmd_list(int argc, const char **argv);
+int cmd_record(int argc, const char **argv);
+int cmd_report(int argc, const char **argv);
+int cmd_stat(int argc, const char **argv);
+int cmd_timechart(int argc, const char **argv);
+int cmd_top(int argc, const char **argv);
+int cmd_script(int argc, const char **argv);
+int cmd_version(int argc, const char **argv);
+int cmd_probe(int argc, const char **argv);
+int cmd_kmem(int argc, const char **argv);
+int cmd_lock(int argc, const char **argv);
+int cmd_kvm(int argc, const char **argv);
+int cmd_test(int argc, const char **argv);
+int cmd_trace(int argc, const char **argv);
+int cmd_inject(int argc, const char **argv);
+int cmd_mem(int argc, const char **argv);
+int cmd_data(int argc, const char **argv);
+int cmd_ftrace(int argc, const char **argv);
 
 int find_scripts(char **scripts_array, char **scripts_path_array);
 #endif
index c747bfd..83fe220 100755 (executable)
@@ -1,7 +1,9 @@
 #!/bin/sh
 
 HEADERS='
+include/uapi/linux/fcntl.h
 include/uapi/linux/perf_event.h
+include/uapi/linux/stat.h
 include/linux/hash.h
 include/uapi/linux/hw_breakpoint.h
 arch/x86/include/asm/disabled-features.h
index ac3efd3..2d0caf2 100644 (file)
@@ -9,6 +9,7 @@ perf-buildid-cache              mainporcelain common
 perf-buildid-list              mainporcelain common
 perf-data                      mainporcelain common
 perf-diff                      mainporcelain common
+perf-c2c                       mainporcelain common
 perf-config                    mainporcelain common
 perf-evlist                    mainporcelain common
 perf-ftrace                    mainporcelain common
index 6d5479e..4cc6960 100644 (file)
 #include "util/debug.h"
 #include <api/fs/fs.h>
 #include <api/fs/tracing_path.h>
+#include <errno.h>
 #include <pthread.h>
+#include <signal.h>
 #include <stdlib.h>
 #include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <linux/kernel.h>
 
 const char perf_usage_string[] =
        "perf [--version] [--help] [OPTIONS] COMMAND [ARGS]";
@@ -34,7 +40,7 @@ const char *input_name;
 
 struct cmd_struct {
        const char *cmd;
-       int (*fn)(int, const char **, const char *);
+       int (*fn)(int, const char **);
        int option;
 };
 
@@ -88,7 +94,7 @@ static int pager_command_config(const char *var, const char *value, void *data)
 }
 
 /* returns 0 for "no pager", 1 for "use pager", and -1 for "not specified" */
-int check_pager_config(const char *cmd)
+static int check_pager_config(const char *cmd)
 {
        int err;
        struct pager_config c;
@@ -267,71 +273,6 @@ static int handle_options(const char ***argv, int *argc, int *envchanged)
        return handled;
 }
 
-static int handle_alias(int *argcp, const char ***argv)
-{
-       int envchanged = 0, ret = 0, saved_errno = errno;
-       int count, option_count;
-       const char **new_argv;
-       const char *alias_command;
-       char *alias_string;
-
-       alias_command = (*argv)[0];
-       alias_string = alias_lookup(alias_command);
-       if (alias_string) {
-               if (alias_string[0] == '!') {
-                       if (*argcp > 1) {
-                               struct strbuf buf;
-
-                               if (strbuf_init(&buf, PATH_MAX) < 0 ||
-                                   strbuf_addstr(&buf, alias_string) < 0 ||
-                                   sq_quote_argv(&buf, (*argv) + 1,
-                                                 PATH_MAX) < 0)
-                                       die("Failed to allocate memory.");
-                               free(alias_string);
-                               alias_string = buf.buf;
-                       }
-                       ret = system(alias_string + 1);
-                       if (ret >= 0 && WIFEXITED(ret) &&
-                           WEXITSTATUS(ret) != 127)
-                               exit(WEXITSTATUS(ret));
-                       die("Failed to run '%s' when expanding alias '%s'",
-                           alias_string + 1, alias_command);
-               }
-               count = split_cmdline(alias_string, &new_argv);
-               if (count < 0)
-                       die("Bad alias.%s string", alias_command);
-               option_count = handle_options(&new_argv, &count, &envchanged);
-               if (envchanged)
-                       die("alias '%s' changes environment variables\n"
-                                "You can use '!perf' in the alias to do this.",
-                                alias_command);
-               memmove(new_argv - option_count, new_argv,
-                               count * sizeof(char *));
-               new_argv -= option_count;
-
-               if (count < 1)
-                       die("empty alias for %s", alias_command);
-
-               if (!strcmp(alias_command, new_argv[0]))
-                       die("recursive alias: %s", alias_command);
-
-               new_argv = realloc(new_argv, sizeof(char *) *
-                                   (count + *argcp + 1));
-               /* insert after command name */
-               memcpy(new_argv + count, *argv + 1, sizeof(char *) * *argcp);
-               new_argv[count + *argcp] = NULL;
-
-               *argv = new_argv;
-               *argcp += count - 1;
-
-               ret = 1;
-       }
-
-       errno = saved_errno;
-
-       return ret;
-}
-
 #define RUN_SETUP      (1<<0)
 #define USE_PAGER      (1<<1)
 
@@ -339,13 +280,8 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
 {
        int status;
        struct stat st;
-       const char *prefix;
        char sbuf[STRERR_BUFSIZE];
 
-       prefix = NULL;
-       if (p->option & RUN_SETUP)
-               prefix = NULL; /* setup_perf_directory(); */
-
        if (use_browser == -1)
                use_browser = check_browser_config(p->cmd);
 
@@ -356,7 +292,7 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
        commit_pager_choice();
 
        perf_env__set_cmdline(&perf_env, argc, argv);
-       status = p->fn(argc, argv, prefix);
+       status = p->fn(argc, argv);
        perf_config__exit();
        exit_browser(status);
        perf_env__exit(&perf_env);
@@ -397,16 +333,6 @@ static void handle_internal_command(int argc, const char **argv)
 {
        const char *cmd = argv[0];
        unsigned int i;
-       static const char ext[] = STRIP_EXTENSION;
-
-       if (sizeof(ext) > 1) {
-               i = strlen(argv[0]) - strlen(ext);
-               if (i > 0 && !strcmp(argv[0] + i, ext)) {
-                       char *argv0 = strdup(argv[0]);
-                       argv[0] = cmd = argv0;
-                       argv0[i] = '\0';
-               }
-       }
 
        /* Turn "perf cmd --help" into "perf help cmd" */
        if (argc > 1 && !strcmp(argv[1], "--help")) {
@@ -448,7 +374,8 @@ static void execv_dashed_external(const char **argv)
        if (status != -ERR_RUN_COMMAND_EXEC) {
                if (IS_RUN_COMMAND_ERR(status)) {
 do_die:
-                       die("unable to run '%s'", argv[0]);
+                       pr_err("FATAL: unable to run '%s'", argv[0]);
+                       status = -128;
                }
                exit(-status);
        }
@@ -460,25 +387,12 @@ do_die:
 
 static int run_argv(int *argcp, const char ***argv)
 {
-       int done_alias = 0;
+       /* See if it's an internal command */
+       handle_internal_command(*argcp, *argv);
 
-       while (1) {
-               /* See if it's an internal command */
-               handle_internal_command(*argcp, *argv);
-
-               /* .. then try the external ones */
-               execv_dashed_external(*argv);
-
-               /* It could be an alias -- this works around the insanity
-                * of overriding "perf log" with "perf show" by having
-                * alias.log = show
-                */
-               if (done_alias || !handle_alias(argcp, argv))
-                       break;
-               done_alias = 1;
-       }
-
-       return done_alias;
+       /* .. then try the external ones */
+       execv_dashed_external(*argv);
+       return 0;
 }
 
 static void pthread__block_sigwinch(void)
@@ -566,7 +480,7 @@ int main(int argc, const char **argv)
 #ifdef HAVE_LIBAUDIT_SUPPORT
                setup_path();
                argv[0] = "trace";
-               return cmd_trace(argc, argv, NULL);
+               return cmd_trace(argc, argv);
 #else
                fprintf(stderr,
                        "trace command not available: missing audit-libs devel package at build time.\n");
@@ -611,17 +525,12 @@ int main(int argc, const char **argv)
 
        while (1) {
                static int done_help;
-               int was_alias = run_argv(&argc, &argv);
+
+               run_argv(&argc, &argv);
 
                if (errno != ENOENT)
                        break;
 
-               if (was_alias) {
-                       fprintf(stderr, "Expansion of alias '%s' failed; "
-                               "'%s' is not a perf-command\n",
-                               cmd, argv[0]);
-                       goto out;
-               }
                if (!done_help) {
                        cmd = argv[0] = help_unknown_cmd(cmd);
                        done_help = 1;
index 1c27d94..806c216 100644 (file)
@@ -50,6 +50,7 @@ struct record_opts {
        bool         running_time;
        bool         full_auxtrace;
        bool         auxtrace_snapshot_mode;
+       bool         record_namespaces;
        bool         record_switch_events;
        bool         all_kernel;
        bool         all_user;
diff --git a/tools/perf/pmu-events/arch/x86/broadwell/uncore.json b/tools/perf/pmu-events/arch/x86/broadwell/uncore.json
new file mode 100644 (file)
index 0000000..28e1e15
--- /dev/null
@@ -0,0 +1,278 @@
+[
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x41",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.MISS_XCORE",
+    "BriefDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which misses in some processor core.",
+    "PublicDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which misses in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x81",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.MISS_EVICTION",
+    "BriefDescription": "A cross-core snoop resulted from L3 Eviction which misses in some processor core.",
+    "PublicDescription": "A cross-core snoop resulted from L3 Eviction which misses in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x44",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.HIT_XCORE",
+    "BriefDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a non-modified line in some processor core.",
+    "PublicDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a non-modified line in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x48",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.HITM_XCORE",
+    "BriefDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a modified line in some processor core.",
+    "PublicDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a modified line in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x11",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.READ_M",
+    "BriefDescription": "L3 Lookup read request that access cache and found line in M-state",
+    "PublicDescription": "L3 Lookup read request that access cache and found line in M-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x21",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_M",
+    "BriefDescription": "L3 Lookup write request that access cache and found line in M-state",
+    "PublicDescription": "L3 Lookup write request that access cache and found line in M-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x81",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_M",
+    "BriefDescription": "L3 Lookup any request that access cache and found line in M-state",
+    "PublicDescription": "L3 Lookup any request that access cache and found line in M-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x18",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.READ_I",
+    "BriefDescription": "L3 Lookup read request that access cache and found line in I-state",
+    "PublicDescription": "L3 Lookup read request that access cache and found line in I-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x88",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_I",
+    "BriefDescription": "L3 Lookup any request that access cache and found line in I-state",
+    "PublicDescription": "L3 Lookup any request that access cache and found line in I-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x1f",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.READ_MESI",
+    "BriefDescription": "L3 Lookup read request that access cache and found line in any MESI-state",
+    "PublicDescription": "L3 Lookup read request that access cache and found line in any MESI-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x2f",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_MESI",
+    "BriefDescription": "L3 Lookup write request that access cache and found line in MESI-state",
+    "PublicDescription": "L3 Lookup write request that access cache and found line in MESI-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x8f",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_MESI",
+    "BriefDescription": "L3 Lookup any request that access cache and found line in MESI-state",
+    "PublicDescription": "L3 Lookup any request that access cache and found line in MESI-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x86",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_ES",
+    "BriefDescription": "L3 Lookup any request that access cache and found line in E or S-state",
+    "PublicDescription": "L3 Lookup any request that access cache and found line in E or S-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x16",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.READ_ES",
+    "BriefDescription": "L3 Lookup read request that access cache and found line in E or S-state",
+    "PublicDescription": "L3 Lookup read request that access cache and found line in E or S-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x26",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_ES",
+    "BriefDescription": "L3 Lookup write request that access cache and found line in E or S-state",
+    "PublicDescription": "L3 Lookup write request that access cache and found line in E or S-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "iMPH-U",
+    "EventCode": "0x80",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_TRK_OCCUPANCY.ALL",
+    "BriefDescription": "Each cycle count number of all Core outgoing valid entries. Such entry is defined as valid from its allocation till first of IDI0 or DRS0 messages is sent out. Accounts for Coherent and non-coherent traffic.",
+    "PublicDescription": "Each cycle count number of all Core outgoing valid entries. Such entry is defined as valid from its allocation till first of IDI0 or DRS0 messages is sent out. Accounts for Coherent and non-coherent traffic.",
+    "Counter": "0,",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "iMPH-U",
+    "EventCode": "0x80",
+    "UMask": "0x02",
+    "EventName": "UNC_ARB_TRK_OCCUPANCY.DRD_DIRECT",
+    "BriefDescription": "Each cycle count number of 'valid' coherent Data Read entries that are in DirectData mode. Such entry is defined as valid when it is allocated till data sent to Core (first chunk, IDI0). Applicable for IA Cores' requests in normal case.",
+    "PublicDescription": "Each cycle count number of 'valid' coherent Data Read entries that are in DirectData mode. Such entry is defined as valid when it is allocated till data sent to Core (first chunk, IDI0). Applicable for IA Cores' requests in normal case.",
+    "Counter": "0,",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "iMPH-U",
+    "EventCode": "0x81",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_TRK_REQUESTS.ALL",
+    "BriefDescription": "Total number of Core outgoing entries allocated. Accounts for Coherent and non-coherent traffic.",
+    "PublicDescription": "Total number of Core outgoing entries allocated. Accounts for Coherent and non-coherent traffic.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "iMPH-U",
+    "EventCode": "0x81",
+    "UMask": "0x02",
+    "EventName": "UNC_ARB_TRK_REQUESTS.DRD_DIRECT",
+    "BriefDescription": "Number of Core coherent Data Read entries allocated in DirectData mode",
+    "PublicDescription": "Number of Core coherent Data Read entries allocated in DirectData mode.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "iMPH-U",
+    "EventCode": "0x81",
+    "UMask": "0x20",
+    "EventName": "UNC_ARB_TRK_REQUESTS.WRITES",
+    "BriefDescription": "Number of Writes allocated - any write transactions: full/partials writes and evictions.",
+    "PublicDescription": "Number of Writes allocated - any write transactions: full/partials writes and evictions.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "iMPH-U",
+    "EventCode": "0x84",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_COH_TRK_REQUESTS.ALL",
+    "BriefDescription": "Number of entries allocated. Account for Any type: e.g. Snoop, Core aperture, etc.",
+    "PublicDescription": "Number of entries allocated. Account for Any type: e.g. Snoop, Core aperture, etc.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "iMPH-U",
+    "EventCode": "0x80",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST",
+    "BriefDescription": "Cycles with at least one request outstanding is waiting for data return from memory controller. Account for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+    "PublicDescription": "Cycles with at least one request outstanding is waiting for data return from memory controller. Account for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+    "Counter": "0,",
+    "CounterMask": "1",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "NCU",
+    "EventCode": "0x0",
+    "UMask": "0x01",
+    "EventName": "UNC_CLOCK.SOCKET",
+    "BriefDescription": "This 48-bit fixed counter counts the UCLK cycles",
+    "PublicDescription": "This 48-bit fixed counter counts the UCLK cycles.",
+    "Counter": "FIXED",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  }
+]
\ No newline at end of file
index 076459c..58ed6d3 100644 (file)
@@ -1,13 +1,13 @@
 [
     {
-        "BriefDescription": "Uncore cache clock ticks. Derived from unc_c_clockticks",
+        "BriefDescription": "Uncore cache clock ticks",
         "Counter": "0,1,2,3",
         "EventName": "UNC_C_CLOCKTICKS",
         "PerPkg": "1",
         "Unit": "CBO"
     },
     {
-        "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch). Derived from unc_c_llc_lookup.any",
+        "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch)",
         "Counter": "0,1,2,3",
         "EventCode": "0x34",
         "EventName": "UNC_C_LLC_LOOKUP.ANY",
@@ -18,7 +18,7 @@
         "Unit": "CBO"
     },
     {
-        "BriefDescription": "M line evictions from LLC (writebacks to memory). Derived from unc_c_llc_victims.m_state",
+        "BriefDescription": "M line evictions from LLC (writebacks to memory)",
         "Counter": "0,1,2,3",
         "EventCode": "0x37",
         "EventName": "UNC_C_LLC_VICTIMS.M_STATE",
         "Unit": "CBO"
     },
     {
-        "BriefDescription": "read requests to home agent. Derived from unc_h_requests.reads",
+        "BriefDescription": "read requests to home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.READS",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "read requests to local home agent. Derived from unc_h_requests.reads_local",
+        "BriefDescription": "read requests to local home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.READS_LOCAL",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "read requests to remote home agent. Derived from unc_h_requests.reads_remote",
+        "BriefDescription": "read requests to remote home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.READS_REMOTE",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "write requests to home agent. Derived from unc_h_requests.writes",
+        "BriefDescription": "write requests to home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.WRITES",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "write requests to local home agent. Derived from unc_h_requests.writes_local",
+        "BriefDescription": "write requests to local home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.WRITES_LOCAL",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "write requests to remote home agent. Derived from unc_h_requests.writes_remote",
+        "BriefDescription": "write requests to remote home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.WRITES_REMOTE",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "Conflict requests (requests for same address from multiple agents simultaneously). Derived from unc_h_snoop_resp.rspcnflct",
+        "BriefDescription": "Conflict requests (requests for same address from multiple agents simultaneously)",
         "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "UNC_H_SNOOP_RESP.RSPCNFLCT",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "M line forwarded from remote cache along with writeback to memory. Derived from unc_h_snoop_resp.rsp_fwd_wb",
+        "BriefDescription": "M line forwarded from remote cache along with writeback to memory",
         "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "UNC_H_SNOOP_RESP.RSP_FWD_WB",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "M line forwarded from remote cache with no writeback to memory. Derived from unc_h_snoop_resp.rspifwd",
+        "BriefDescription": "M line forwarded from remote cache with no writeback to memory",
         "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "UNC_H_SNOOP_RESP.RSPIFWD",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "Shared line response from remote cache. Derived from unc_h_snoop_resp.rsps",
+        "BriefDescription": "Shared line response from remote cache",
         "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "UNC_H_SNOOP_RESP.RSPS",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "Shared line forwarded from remote cache. Derived from unc_h_snoop_resp.rspsfwd",
+        "BriefDescription": "Shared line forwarded from remote cache",
         "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "UNC_H_SNOOP_RESP.RSPSFWD",
index d17dc23..f4b0745 100644 (file)
@@ -3,7 +3,7 @@
         "BriefDescription": "read requests to memory controller. Derived from unc_m_cas_count.rd",
         "Counter": "0,1,2,3",
         "EventCode": "0x4",
-        "EventName": "UNC_M_CAS_COUNT.RD",
+        "EventName": "LLC_MISSES.MEM_READ",
         "PerPkg": "1",
         "ScaleUnit": "64Bytes",
         "UMask": "0x3",
         "BriefDescription": "write requests to memory controller. Derived from unc_m_cas_count.wr",
         "Counter": "0,1,2,3",
         "EventCode": "0x4",
-        "EventName": "UNC_M_CAS_COUNT.WR",
+        "EventName": "LLC_MISSES.MEM_WRITE",
         "PerPkg": "1",
         "ScaleUnit": "64Bytes",
         "UMask": "0xC",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Memory controller clock ticks. Derived from unc_m_clockticks",
+        "BriefDescription": "Memory controller clock ticks",
         "Counter": "0,1,2,3",
-        "EventName": "UNC_M_CLOCKTICKS",
+        "EventName": "UNC_M_DCLOCKTICKS",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode. Derived from unc_m_power_channel_ppd",
+        "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode",
         "Counter": "0,1,2,3",
         "EventCode": "0x85",
         "EventName": "UNC_M_POWER_CHANNEL_PPD",
-        "MetricExpr": "(UNC_M_POWER_CHANNEL_PPD / UNC_M_CLOCKTICKS) * 100.",
+        "MetricExpr": "(UNC_M_POWER_CHANNEL_PPD / UNC_M_DCLOCKTICKS) * 100.",
+        "MetricName": "power_channel_ppd %",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Cycles all ranks are in critical thermal throttle. Derived from unc_m_power_critical_throttle_cycles",
+        "BriefDescription": "Cycles all ranks are in critical thermal throttle",
         "Counter": "0,1,2,3",
         "EventCode": "0x86",
         "EventName": "UNC_M_POWER_CRITICAL_THROTTLE_CYCLES",
-        "MetricExpr": "(UNC_M_POWER_CRITICAL_THROTTLE_CYCLES / UNC_M_CLOCKTICKS) * 100.",
+        "MetricExpr": "(UNC_M_POWER_CRITICAL_THROTTLE_CYCLES / UNC_M_DCLOCKTICKS) * 100.",
+        "MetricName": "power_critical_throttle_cycles %",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Cycles Memory is in self refresh power mode. Derived from unc_m_power_self_refresh",
+        "BriefDescription": "Cycles Memory is in self refresh power mode",
         "Counter": "0,1,2,3",
         "EventCode": "0x43",
         "EventName": "UNC_M_POWER_SELF_REFRESH",
-        "MetricExpr": "(UNC_M_POWER_SELF_REFRESH / UNC_M_CLOCKTICKS) * 100.",
+        "MetricExpr": "(UNC_M_POWER_SELF_REFRESH / UNC_M_DCLOCKTICKS) * 100.",
+        "MetricName": "power_self_refresh %",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Pre-charges due to page misses. Derived from unc_m_pre_count.page_miss",
+        "BriefDescription": "Pre-charges due to page misses",
         "Counter": "0,1,2,3",
         "EventCode": "0x2",
         "EventName": "UNC_M_PRE_COUNT.PAGE_MISS",
@@ -63,7 +66,7 @@
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Pre-charge for reads. Derived from unc_m_pre_count.rd",
+        "BriefDescription": "Pre-charge for reads",
         "Counter": "0,1,2,3",
         "EventCode": "0x2",
         "EventName": "UNC_M_PRE_COUNT.RD",
@@ -72,7 +75,7 @@
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Pre-charge for writes. Derived from unc_m_pre_count.wr",
+        "BriefDescription": "Pre-charge for writes",
         "Counter": "0,1,2,3",
         "EventCode": "0x2",
         "EventName": "UNC_M_PRE_COUNT.WR",
index b44d430..dd1b956 100644 (file)
@@ -1,83 +1,91 @@
 [
     {
-        "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events. Derived from unc_p_clockticks",
+        "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events",
         "Counter": "0,1,2,3",
         "EventName": "UNC_P_CLOCKTICKS",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "C0 and C1. Derived from unc_p_power_state_occupancy.cores_c0",
+        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C0.  It can be used by itself to get the average number of cores in C0, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
         "Counter": "0,1,2,3",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0",
         "Filter": "occ_sel=1",
         "MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C0 / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "power_state_occupancy.cores_c0 %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "C3. Derived from unc_p_power_state_occupancy.cores_c3",
+        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C3.  It can be used by itself to get the average number of cores in C3, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
         "Counter": "0,1,2,3",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3",
         "Filter": "occ_sel=2",
         "MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C3 / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "power_state_occupancy.cores_c3 %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "C6 and C7. Derived from unc_p_power_state_occupancy.cores_c6",
+        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C6.  It can be used by itself to get the average number of cores in C6, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
         "Counter": "0,1,2,3",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6",
         "Filter": "occ_sel=3",
         "MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C6 / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "power_state_occupancy.cores_c6 %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "External Prochot. Derived from unc_p_prochot_external_cycles",
+        "BriefDescription": "Counts the number of cycles that we are in external PROCHOT mode.  This mode is triggered when a sensor off the die determines that something off-die (like DRAM) is too hot and must throttle to avoid damaging the chip",
         "Counter": "0,1,2,3",
         "EventCode": "0xA",
         "EventName": "UNC_P_PROCHOT_EXTERNAL_CYCLES",
         "MetricExpr": "(UNC_P_PROCHOT_EXTERNAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "prochot_external_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Thermal Strongest Upper Limit Cycles. Derived from unc_p_freq_max_limit_thermal_cycles",
+        "BriefDescription": "Counts the number of cycles when temperature is the upper limit on frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x4",
         "EventName": "UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_max_limit_thermal_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "OS Strongest Upper Limit Cycles. Derived from unc_p_freq_max_os_cycles",
+        "BriefDescription": "Counts the number of cycles when the OS is the upper limit on frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x6",
         "EventName": "UNC_P_FREQ_MAX_OS_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_MAX_OS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_max_os_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Power Strongest Upper Limit Cycles. Derived from unc_p_freq_max_power_cycles",
+        "BriefDescription": "Counts the number of cycles when power is the upper limit on frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x5",
         "EventName": "UNC_P_FREQ_MAX_POWER_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_MAX_POWER_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_max_power_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Cycles spent changing Frequency. Derived from unc_p_freq_trans_cycles",
+        "BriefDescription": "Counts the number of cycles when the system is changing frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x74",
         "EventName": "UNC_P_FREQ_TRANS_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_TRANS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_trans_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     }
index 076459c..58ed6d3 100644 (file)
@@ -1,13 +1,13 @@
 [
     {
-        "BriefDescription": "Uncore cache clock ticks. Derived from unc_c_clockticks",
+        "BriefDescription": "Uncore cache clock ticks",
         "Counter": "0,1,2,3",
         "EventName": "UNC_C_CLOCKTICKS",
         "PerPkg": "1",
         "Unit": "CBO"
     },
     {
-        "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch). Derived from unc_c_llc_lookup.any",
+        "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch)",
         "Counter": "0,1,2,3",
         "EventCode": "0x34",
         "EventName": "UNC_C_LLC_LOOKUP.ANY",
@@ -18,7 +18,7 @@
         "Unit": "CBO"
     },
     {
-        "BriefDescription": "M line evictions from LLC (writebacks to memory). Derived from unc_c_llc_victims.m_state",
+        "BriefDescription": "M line evictions from LLC (writebacks to memory)",
         "Counter": "0,1,2,3",
         "EventCode": "0x37",
         "EventName": "UNC_C_LLC_VICTIMS.M_STATE",
         "Unit": "CBO"
     },
     {
-        "BriefDescription": "read requests to home agent. Derived from unc_h_requests.reads",
+        "BriefDescription": "read requests to home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.READS",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "read requests to local home agent. Derived from unc_h_requests.reads_local",
+        "BriefDescription": "read requests to local home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.READS_LOCAL",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "read requests to remote home agent. Derived from unc_h_requests.reads_remote",
+        "BriefDescription": "read requests to remote home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.READS_REMOTE",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "write requests to home agent. Derived from unc_h_requests.writes",
+        "BriefDescription": "write requests to home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.WRITES",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "write requests to local home agent. Derived from unc_h_requests.writes_local",
+        "BriefDescription": "write requests to local home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.WRITES_LOCAL",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "write requests to remote home agent. Derived from unc_h_requests.writes_remote",
+        "BriefDescription": "write requests to remote home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.WRITES_REMOTE",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "Conflict requests (requests for same address from multiple agents simultaneously). Derived from unc_h_snoop_resp.rspcnflct",
+        "BriefDescription": "Conflict requests (requests for same address from multiple agents simultaneously)",
         "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "UNC_H_SNOOP_RESP.RSPCNFLCT",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "M line forwarded from remote cache along with writeback to memory. Derived from unc_h_snoop_resp.rsp_fwd_wb",
+        "BriefDescription": "M line forwarded from remote cache along with writeback to memory",
         "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "UNC_H_SNOOP_RESP.RSP_FWD_WB",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "M line forwarded from remote cache with no writeback to memory. Derived from unc_h_snoop_resp.rspifwd",
+        "BriefDescription": "M line forwarded from remote cache with no writeback to memory",
         "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "UNC_H_SNOOP_RESP.RSPIFWD",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "Shared line response from remote cache. Derived from unc_h_snoop_resp.rsps",
+        "BriefDescription": "Shared line response from remote cache",
         "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "UNC_H_SNOOP_RESP.RSPS",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "Shared line forwarded from remote cache. Derived from unc_h_snoop_resp.rspsfwd",
+        "BriefDescription": "Shared line forwarded from remote cache",
         "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "UNC_H_SNOOP_RESP.RSPSFWD",
index 39387f7..8249613 100644 (file)
@@ -1,6 +1,6 @@
 [
     {
-        "BriefDescription": "QPI clock ticks. Derived from unc_q_clockticks",
+        "BriefDescription": "QPI clock ticks",
         "Counter": "0,1,2,3",
         "EventCode": "0x14",
         "EventName": "UNC_Q_CLOCKTICKS",
@@ -10,7 +10,7 @@
     {
         "BriefDescription": "Number of data flits transmitted . Derived from unc_q_txl_flits_g0.data",
         "Counter": "0,1,2,3",
-        "EventName": "UNC_Q_TxL_FLITS_G0.DATA",
+        "EventName": "QPI_DATA_BANDWIDTH_TX",
         "PerPkg": "1",
         "ScaleUnit": "8Bytes",
         "UMask": "0x2",
@@ -19,7 +19,7 @@
     {
         "BriefDescription": "Number of non data (control) flits transmitted . Derived from unc_q_txl_flits_g0.non_data",
         "Counter": "0,1,2,3",
-        "EventName": "UNC_Q_TxL_FLITS_G0.NON_DATA",
+        "EventName": "QPI_CTL_BANDWIDTH_TX",
         "PerPkg": "1",
         "ScaleUnit": "8Bytes",
         "UMask": "0x4",
index d17dc23..66eed39 100644 (file)
@@ -3,7 +3,7 @@
         "BriefDescription": "read requests to memory controller. Derived from unc_m_cas_count.rd",
         "Counter": "0,1,2,3",
         "EventCode": "0x4",
-        "EventName": "UNC_M_CAS_COUNT.RD",
+        "EventName": "LLC_MISSES.MEM_READ",
         "PerPkg": "1",
         "ScaleUnit": "64Bytes",
         "UMask": "0x3",
         "BriefDescription": "write requests to memory controller. Derived from unc_m_cas_count.wr",
         "Counter": "0,1,2,3",
         "EventCode": "0x4",
-        "EventName": "UNC_M_CAS_COUNT.WR",
+        "EventName": "LLC_MISSES.MEM_WRITE",
         "PerPkg": "1",
         "ScaleUnit": "64Bytes",
         "UMask": "0xC",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Memory controller clock ticks. Derived from unc_m_clockticks",
+        "BriefDescription": "Memory controller clock ticks",
         "Counter": "0,1,2,3",
         "EventName": "UNC_M_CLOCKTICKS",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode. Derived from unc_m_power_channel_ppd",
+        "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode",
         "Counter": "0,1,2,3",
         "EventCode": "0x85",
         "EventName": "UNC_M_POWER_CHANNEL_PPD",
         "MetricExpr": "(UNC_M_POWER_CHANNEL_PPD / UNC_M_CLOCKTICKS) * 100.",
+        "MetricName": "power_channel_ppd %",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Cycles all ranks are in critical thermal throttle. Derived from unc_m_power_critical_throttle_cycles",
+        "BriefDescription": "Cycles all ranks are in critical thermal throttle",
         "Counter": "0,1,2,3",
         "EventCode": "0x86",
         "EventName": "UNC_M_POWER_CRITICAL_THROTTLE_CYCLES",
         "MetricExpr": "(UNC_M_POWER_CRITICAL_THROTTLE_CYCLES / UNC_M_CLOCKTICKS) * 100.",
+        "MetricName": "power_critical_throttle_cycles %",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Cycles Memory is in self refresh power mode. Derived from unc_m_power_self_refresh",
+        "BriefDescription": "Cycles Memory is in self refresh power mode",
         "Counter": "0,1,2,3",
         "EventCode": "0x43",
         "EventName": "UNC_M_POWER_SELF_REFRESH",
         "MetricExpr": "(UNC_M_POWER_SELF_REFRESH / UNC_M_CLOCKTICKS) * 100.",
+        "MetricName": "power_self_refresh %",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Pre-charges due to page misses. Derived from unc_m_pre_count.page_miss",
+        "BriefDescription": "Pre-charges due to page misses",
         "Counter": "0,1,2,3",
         "EventCode": "0x2",
         "EventName": "UNC_M_PRE_COUNT.PAGE_MISS",
@@ -63,7 +66,7 @@
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Pre-charge for reads. Derived from unc_m_pre_count.rd",
+        "BriefDescription": "Pre-charge for reads",
         "Counter": "0,1,2,3",
         "EventCode": "0x2",
         "EventName": "UNC_M_PRE_COUNT.RD",
@@ -72,7 +75,7 @@
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Pre-charge for writes. Derived from unc_m_pre_count.wr",
+        "BriefDescription": "Pre-charge for writes",
         "Counter": "0,1,2,3",
         "EventCode": "0x2",
         "EventName": "UNC_M_PRE_COUNT.WR",
index b44d430..dd1b956 100644 (file)
@@ -1,83 +1,91 @@
 [
     {
-        "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events. Derived from unc_p_clockticks",
+        "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events",
         "Counter": "0,1,2,3",
         "EventName": "UNC_P_CLOCKTICKS",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "C0 and C1. Derived from unc_p_power_state_occupancy.cores_c0",
+        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C0.  It can be used by itself to get the average number of cores in C0, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
         "Counter": "0,1,2,3",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0",
         "Filter": "occ_sel=1",
         "MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C0 / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "power_state_occupancy.cores_c0 %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "C3. Derived from unc_p_power_state_occupancy.cores_c3",
+        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C3.  It can be used by itself to get the average number of cores in C3, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
         "Counter": "0,1,2,3",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3",
         "Filter": "occ_sel=2",
         "MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C3 / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "power_state_occupancy.cores_c3 %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "C6 and C7. Derived from unc_p_power_state_occupancy.cores_c6",
+        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C6.  It can be used by itself to get the average number of cores in C6, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
         "Counter": "0,1,2,3",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6",
         "Filter": "occ_sel=3",
         "MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C6 / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "power_state_occupancy.cores_c6 %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "External Prochot. Derived from unc_p_prochot_external_cycles",
+        "BriefDescription": "Counts the number of cycles that we are in external PROCHOT mode.  This mode is triggered when a sensor off the die determines that something off-die (like DRAM) is too hot and must throttle to avoid damaging the chip",
         "Counter": "0,1,2,3",
         "EventCode": "0xA",
         "EventName": "UNC_P_PROCHOT_EXTERNAL_CYCLES",
         "MetricExpr": "(UNC_P_PROCHOT_EXTERNAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "prochot_external_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Thermal Strongest Upper Limit Cycles. Derived from unc_p_freq_max_limit_thermal_cycles",
+        "BriefDescription": "Counts the number of cycles when temperature is the upper limit on frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x4",
         "EventName": "UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_max_limit_thermal_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "OS Strongest Upper Limit Cycles. Derived from unc_p_freq_max_os_cycles",
+        "BriefDescription": "Counts the number of cycles when the OS is the upper limit on frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x6",
         "EventName": "UNC_P_FREQ_MAX_OS_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_MAX_OS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_max_os_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Power Strongest Upper Limit Cycles. Derived from unc_p_freq_max_power_cycles",
+        "BriefDescription": "Counts the number of cycles when power is the upper limit on frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x5",
         "EventName": "UNC_P_FREQ_MAX_POWER_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_MAX_POWER_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_max_power_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Cycles spent changing Frequency. Derived from unc_p_freq_trans_cycles",
+        "BriefDescription": "Counts the number of cycles when the system is changing frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x74",
         "EventName": "UNC_P_FREQ_TRANS_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_TRANS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_trans_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     }
diff --git a/tools/perf/pmu-events/arch/x86/haswell/uncore.json b/tools/perf/pmu-events/arch/x86/haswell/uncore.json
new file mode 100644 (file)
index 0000000..3ef5c21
--- /dev/null
@@ -0,0 +1,374 @@
+[
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x21",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.MISS_EXTERNAL",
+    "BriefDescription": "An external snoop misses in some processor core.",
+    "PublicDescription": "An external snoop misses in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x41",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.MISS_XCORE",
+    "BriefDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which misses in some processor core.",
+    "PublicDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which misses in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x81",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.MISS_EVICTION",
+    "BriefDescription": "A cross-core snoop resulted from L3 Eviction which misses in some processor core.",
+    "PublicDescription": "A cross-core snoop resulted from L3 Eviction which misses in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x24",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.HIT_EXTERNAL",
+    "BriefDescription": "An external snoop hits a non-modified line in some processor core.",
+    "PublicDescription": "An external snoop hits a non-modified line in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x44",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.HIT_XCORE",
+    "BriefDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a non-modified line in some processor core.",
+    "PublicDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a non-modified line in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x84",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.HIT_EVICTION",
+    "BriefDescription": "A cross-core snoop resulted from L3 Eviction which hits a non-modified line in some processor core.",
+    "PublicDescription": "A cross-core snoop resulted from L3 Eviction which hits a non-modified line in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x28",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.HITM_EXTERNAL",
+    "BriefDescription": "An external snoop hits a modified line in some processor core.",
+    "PublicDescription": "An external snoop hits a modified line in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x48",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.HITM_XCORE",
+    "BriefDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a modified line in some processor core.",
+    "PublicDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a modified line in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x88",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.HITM_EVICTION",
+    "BriefDescription": "A cross-core snoop resulted from L3 Eviction which hits a modified line in some processor core.",
+    "PublicDescription": "A cross-core snoop resulted from L3 Eviction which hits a modified line in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x11",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.READ_M",
+    "BriefDescription": "L3 Lookup read request that access cache and found line in M-state.",
+    "PublicDescription": "L3 Lookup read request that access cache and found line in M-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x21",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_M",
+    "BriefDescription": "L3 Lookup write request that access cache and found line in M-state.",
+    "PublicDescription": "L3 Lookup write request that access cache and found line in M-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x41",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.EXTSNP_M",
+    "BriefDescription": "L3 Lookup external snoop request that access cache and found line in M-state.",
+    "PublicDescription": "L3 Lookup external snoop request that access cache and found line in M-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x81",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_M",
+    "BriefDescription": "L3 Lookup any request that access cache and found line in M-state.",
+    "PublicDescription": "L3 Lookup any request that access cache and found line in M-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x18",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.READ_I",
+    "BriefDescription": "L3 Lookup read request that access cache and found line in I-state.",
+    "PublicDescription": "L3 Lookup read request that access cache and found line in I-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x28",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_I",
+    "BriefDescription": "L3 Lookup write request that access cache and found line in I-state.",
+    "PublicDescription": "L3 Lookup write request that access cache and found line in I-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x48",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.EXTSNP_I",
+    "BriefDescription": "L3 Lookup external snoop request that access cache and found line in I-state.",
+    "PublicDescription": "L3 Lookup external snoop request that access cache and found line in I-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x88",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_I",
+    "BriefDescription": "L3 Lookup any request that access cache and found line in I-state.",
+    "PublicDescription": "L3 Lookup any request that access cache and found line in I-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x1f",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.READ_MESI",
+    "BriefDescription": "L3 Lookup read request that access cache and found line in any MESI-state.",
+    "PublicDescription": "L3 Lookup read request that access cache and found line in any MESI-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x2f",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_MESI",
+    "BriefDescription": "L3 Lookup write request that access cache and found line in MESI-state.",
+    "PublicDescription": "L3 Lookup write request that access cache and found line in MESI-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x4f",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.EXTSNP_MESI",
+    "BriefDescription": "L3 Lookup external snoop request that access cache and found line in MESI-state.",
+    "PublicDescription": "L3 Lookup external snoop request that access cache and found line in MESI-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x8f",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_MESI",
+    "BriefDescription": "L3 Lookup any request that access cache and found line in MESI-state.",
+    "PublicDescription": "L3 Lookup any request that access cache and found line in MESI-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x86",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_ES",
+    "BriefDescription": "L3 Lookup any request that access cache and found line in E or S-state.",
+    "PublicDescription": "L3 Lookup any request that access cache and found line in E or S-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x46",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.EXTSNP_ES",
+    "BriefDescription": "L3 Lookup external snoop request that access cache and found line in E or S-state.",
+    "PublicDescription": "L3 Lookup external snoop request that access cache and found line in E or S-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x16",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.READ_ES",
+    "BriefDescription": "L3 Lookup read request that access cache and found line in E or S-state.",
+    "PublicDescription": "L3 Lookup read request that access cache and found line in E or S-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x26",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_ES",
+    "BriefDescription": "L3 Lookup write request that access cache and found line in E or S-state.",
+    "PublicDescription": "L3 Lookup write request that access cache and found line in E or S-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "iMPH-U",
+    "EventCode": "0x80",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_TRK_OCCUPANCY.ALL",
+    "BriefDescription": "Each cycle count number of all Core outgoing valid entries. Such entry is defined as valid from its allocation till first of IDI0 or DRS0 messages is sent out. Accounts for Coherent and non-coherent traffic.",
+    "PublicDescription": "Each cycle count number of all Core outgoing valid entries. Such entry is defined as valid from its allocation till first of IDI0 or DRS0 messages is sent out. Accounts for Coherent and non-coherent traffic.",
+    "Counter": "0",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "iMPH-U",
+    "EventCode": "0x81",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_TRK_REQUESTS.ALL",
+    "BriefDescription": "Total number of Core outgoing entries allocated. Accounts for Coherent and non-coherent traffic.",
+    "PublicDescription": "Total number of Core outgoing entries allocated. Accounts for Coherent and non-coherent traffic.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "iMPH-U",
+    "EventCode": "0x81",
+    "UMask": "0x20",
+    "EventName": "UNC_ARB_TRK_REQUESTS.WRITES",
+    "BriefDescription": "Number of Writes allocated - any write transactions: full/partials writes and evictions.",
+    "PublicDescription": "Number of Writes allocated - any write transactions: full/partials writes and evictions.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "iMPH-U",
+    "EventCode": "0x83",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_COH_TRK_OCCUPANCY.All",
+    "BriefDescription": "Each cycle count number of valid entries in Coherency Tracker queue from allocation till deallocation. Aperture requests (snoops) appear as NC decoded internally and become coherent (snoop L3, access memory)",
+    "PublicDescription": "Each cycle count number of valid entries in Coherency Tracker queue from allocation till deallocation. Aperture requests (snoops) appear as NC decoded internally and become coherent (snoop L3, access memory).",
+    "Counter": "0",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "iMPH-U",
+    "EventCode": "0x84",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_COH_TRK_REQUESTS.ALL",
+    "BriefDescription": "Number of entries allocated. Account for Any type: e.g. Snoop, Core aperture, etc.",
+    "PublicDescription": "Number of entries allocated. Account for Any type: e.g. Snoop, Core aperture, etc.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "NCU",
+    "EventCode": "0x0",
+    "UMask": "0x01",
+    "EventName": "UNC_CLOCK.SOCKET",
+    "BriefDescription": "This 48-bit fixed counter counts the UCLK cycles.",
+    "PublicDescription": "This 48-bit fixed counter counts the UCLK cycles.",
+    "Counter": "FIXED",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  }
+]
\ No newline at end of file
index 076459c..58ed6d3 100644 (file)
@@ -1,13 +1,13 @@
 [
     {
-        "BriefDescription": "Uncore cache clock ticks. Derived from unc_c_clockticks",
+        "BriefDescription": "Uncore cache clock ticks",
         "Counter": "0,1,2,3",
         "EventName": "UNC_C_CLOCKTICKS",
         "PerPkg": "1",
         "Unit": "CBO"
     },
     {
-        "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch). Derived from unc_c_llc_lookup.any",
+        "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch)",
         "Counter": "0,1,2,3",
         "EventCode": "0x34",
         "EventName": "UNC_C_LLC_LOOKUP.ANY",
@@ -18,7 +18,7 @@
         "Unit": "CBO"
     },
     {
-        "BriefDescription": "M line evictions from LLC (writebacks to memory). Derived from unc_c_llc_victims.m_state",
+        "BriefDescription": "M line evictions from LLC (writebacks to memory)",
         "Counter": "0,1,2,3",
         "EventCode": "0x37",
         "EventName": "UNC_C_LLC_VICTIMS.M_STATE",
         "Unit": "CBO"
     },
     {
-        "BriefDescription": "read requests to home agent. Derived from unc_h_requests.reads",
+        "BriefDescription": "read requests to home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.READS",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "read requests to local home agent. Derived from unc_h_requests.reads_local",
+        "BriefDescription": "read requests to local home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.READS_LOCAL",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "read requests to remote home agent. Derived from unc_h_requests.reads_remote",
+        "BriefDescription": "read requests to remote home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.READS_REMOTE",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "write requests to home agent. Derived from unc_h_requests.writes",
+        "BriefDescription": "write requests to home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.WRITES",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "write requests to local home agent. Derived from unc_h_requests.writes_local",
+        "BriefDescription": "write requests to local home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.WRITES_LOCAL",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "write requests to remote home agent. Derived from unc_h_requests.writes_remote",
+        "BriefDescription": "write requests to remote home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.WRITES_REMOTE",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "Conflict requests (requests for same address from multiple agents simultaneously). Derived from unc_h_snoop_resp.rspcnflct",
+        "BriefDescription": "Conflict requests (requests for same address from multiple agents simultaneously)",
         "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "UNC_H_SNOOP_RESP.RSPCNFLCT",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "M line forwarded from remote cache along with writeback to memory. Derived from unc_h_snoop_resp.rsp_fwd_wb",
+        "BriefDescription": "M line forwarded from remote cache along with writeback to memory",
         "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "UNC_H_SNOOP_RESP.RSP_FWD_WB",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "M line forwarded from remote cache with no writeback to memory. Derived from unc_h_snoop_resp.rspifwd",
+        "BriefDescription": "M line forwarded from remote cache with no writeback to memory",
         "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "UNC_H_SNOOP_RESP.RSPIFWD",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "Shared line response from remote cache. Derived from unc_h_snoop_resp.rsps",
+        "BriefDescription": "Shared line response from remote cache",
         "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "UNC_H_SNOOP_RESP.RSPS",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "Shared line forwarded from remote cache. Derived from unc_h_snoop_resp.rspsfwd",
+        "BriefDescription": "Shared line forwarded from remote cache",
         "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "UNC_H_SNOOP_RESP.RSPSFWD",
index 39387f7..8249613 100644 (file)
@@ -1,6 +1,6 @@
 [
     {
-        "BriefDescription": "QPI clock ticks. Derived from unc_q_clockticks",
+        "BriefDescription": "QPI clock ticks",
         "Counter": "0,1,2,3",
         "EventCode": "0x14",
         "EventName": "UNC_Q_CLOCKTICKS",
@@ -10,7 +10,7 @@
     {
         "BriefDescription": "Number of data flits transmitted . Derived from unc_q_txl_flits_g0.data",
         "Counter": "0,1,2,3",
-        "EventName": "UNC_Q_TxL_FLITS_G0.DATA",
+        "EventName": "QPI_DATA_BANDWIDTH_TX",
         "PerPkg": "1",
         "ScaleUnit": "8Bytes",
         "UMask": "0x2",
@@ -19,7 +19,7 @@
     {
         "BriefDescription": "Number of non data (control) flits transmitted . Derived from unc_q_txl_flits_g0.non_data",
         "Counter": "0,1,2,3",
-        "EventName": "UNC_Q_TxL_FLITS_G0.NON_DATA",
+        "EventName": "QPI_CTL_BANDWIDTH_TX",
         "PerPkg": "1",
         "ScaleUnit": "8Bytes",
         "UMask": "0x4",
index d17dc23..66eed39 100644 (file)
@@ -3,7 +3,7 @@
         "BriefDescription": "read requests to memory controller. Derived from unc_m_cas_count.rd",
         "Counter": "0,1,2,3",
         "EventCode": "0x4",
-        "EventName": "UNC_M_CAS_COUNT.RD",
+        "EventName": "LLC_MISSES.MEM_READ",
         "PerPkg": "1",
         "ScaleUnit": "64Bytes",
         "UMask": "0x3",
         "BriefDescription": "write requests to memory controller. Derived from unc_m_cas_count.wr",
         "Counter": "0,1,2,3",
         "EventCode": "0x4",
-        "EventName": "UNC_M_CAS_COUNT.WR",
+        "EventName": "LLC_MISSES.MEM_WRITE",
         "PerPkg": "1",
         "ScaleUnit": "64Bytes",
         "UMask": "0xC",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Memory controller clock ticks. Derived from unc_m_clockticks",
+        "BriefDescription": "Memory controller clock ticks",
         "Counter": "0,1,2,3",
         "EventName": "UNC_M_CLOCKTICKS",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode. Derived from unc_m_power_channel_ppd",
+        "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode",
         "Counter": "0,1,2,3",
         "EventCode": "0x85",
         "EventName": "UNC_M_POWER_CHANNEL_PPD",
         "MetricExpr": "(UNC_M_POWER_CHANNEL_PPD / UNC_M_CLOCKTICKS) * 100.",
+        "MetricName": "power_channel_ppd %",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Cycles all ranks are in critical thermal throttle. Derived from unc_m_power_critical_throttle_cycles",
+        "BriefDescription": "Cycles all ranks are in critical thermal throttle",
         "Counter": "0,1,2,3",
         "EventCode": "0x86",
         "EventName": "UNC_M_POWER_CRITICAL_THROTTLE_CYCLES",
         "MetricExpr": "(UNC_M_POWER_CRITICAL_THROTTLE_CYCLES / UNC_M_CLOCKTICKS) * 100.",
+        "MetricName": "power_critical_throttle_cycles %",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Cycles Memory is in self refresh power mode. Derived from unc_m_power_self_refresh",
+        "BriefDescription": "Cycles Memory is in self refresh power mode",
         "Counter": "0,1,2,3",
         "EventCode": "0x43",
         "EventName": "UNC_M_POWER_SELF_REFRESH",
         "MetricExpr": "(UNC_M_POWER_SELF_REFRESH / UNC_M_CLOCKTICKS) * 100.",
+        "MetricName": "power_self_refresh %",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Pre-charges due to page misses. Derived from unc_m_pre_count.page_miss",
+        "BriefDescription": "Pre-charges due to page misses",
         "Counter": "0,1,2,3",
         "EventCode": "0x2",
         "EventName": "UNC_M_PRE_COUNT.PAGE_MISS",
@@ -63,7 +66,7 @@
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Pre-charge for reads. Derived from unc_m_pre_count.rd",
+        "BriefDescription": "Pre-charge for reads",
         "Counter": "0,1,2,3",
         "EventCode": "0x2",
         "EventName": "UNC_M_PRE_COUNT.RD",
@@ -72,7 +75,7 @@
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Pre-charge for writes. Derived from unc_m_pre_count.wr",
+        "BriefDescription": "Pre-charge for writes",
         "Counter": "0,1,2,3",
         "EventCode": "0x2",
         "EventName": "UNC_M_PRE_COUNT.WR",
index b44d430..dd1b956 100644 (file)
@@ -1,83 +1,91 @@
 [
     {
-        "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events. Derived from unc_p_clockticks",
+        "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events",
         "Counter": "0,1,2,3",
         "EventName": "UNC_P_CLOCKTICKS",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "C0 and C1. Derived from unc_p_power_state_occupancy.cores_c0",
+        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C0.  It can be used by itself to get the average number of cores in C0, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
         "Counter": "0,1,2,3",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0",
         "Filter": "occ_sel=1",
         "MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C0 / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "power_state_occupancy.cores_c0 %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "C3. Derived from unc_p_power_state_occupancy.cores_c3",
+        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C3.  It can be used by itself to get the average number of cores in C3, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
         "Counter": "0,1,2,3",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3",
         "Filter": "occ_sel=2",
         "MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C3 / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "power_state_occupancy.cores_c3 %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "C6 and C7. Derived from unc_p_power_state_occupancy.cores_c6",
+        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C6.  It can be used by itself to get the average number of cores in C6, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
         "Counter": "0,1,2,3",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6",
         "Filter": "occ_sel=3",
         "MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C6 / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "power_state_occupancy.cores_c6 %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "External Prochot. Derived from unc_p_prochot_external_cycles",
+        "BriefDescription": "Counts the number of cycles that we are in external PROCHOT mode.  This mode is triggered when a sensor off the die determines that something off-die (like DRAM) is too hot and must throttle to avoid damaging the chip",
         "Counter": "0,1,2,3",
         "EventCode": "0xA",
         "EventName": "UNC_P_PROCHOT_EXTERNAL_CYCLES",
         "MetricExpr": "(UNC_P_PROCHOT_EXTERNAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "prochot_external_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Thermal Strongest Upper Limit Cycles. Derived from unc_p_freq_max_limit_thermal_cycles",
+        "BriefDescription": "Counts the number of cycles when temperature is the upper limit on frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x4",
         "EventName": "UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_max_limit_thermal_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "OS Strongest Upper Limit Cycles. Derived from unc_p_freq_max_os_cycles",
+        "BriefDescription": "Counts the number of cycles when the OS is the upper limit on frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x6",
         "EventName": "UNC_P_FREQ_MAX_OS_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_MAX_OS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_max_os_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Power Strongest Upper Limit Cycles. Derived from unc_p_freq_max_power_cycles",
+        "BriefDescription": "Counts the number of cycles when power is the upper limit on frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x5",
         "EventName": "UNC_P_FREQ_MAX_POWER_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_MAX_POWER_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_max_power_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Cycles spent changing Frequency. Derived from unc_p_freq_trans_cycles",
+        "BriefDescription": "Counts the number of cycles spent changing frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x74",
         "EventName": "UNC_P_FREQ_TRANS_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_TRANS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_trans_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     }
diff --git a/tools/perf/pmu-events/arch/x86/ivybridge/uncore.json b/tools/perf/pmu-events/arch/x86/ivybridge/uncore.json
new file mode 100644 (file)
index 0000000..42c70ee
--- /dev/null
@@ -0,0 +1,314 @@
+[
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x01",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.MISS",
+    "BriefDescription": "A snoop misses in some processor core.",
+    "PublicDescription": "A snoop misses in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x02",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.INVAL",
+    "BriefDescription": "A snoop invalidates a non-modified line in some processor core.",
+    "PublicDescription": "A snoop invalidates a non-modified line in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x04",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.HIT",
+    "BriefDescription": "A snoop hits a non-modified line in some processor core.",
+    "PublicDescription": "A snoop hits a non-modified line in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x08",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.HITM",
+    "BriefDescription": "A snoop hits a modified line in some processor core.",
+    "PublicDescription": "A snoop hits a modified line in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x10",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.INVAL_M",
+    "BriefDescription": "A snoop invalidates a modified line in some processor core.",
+    "PublicDescription": "A snoop invalidates a modified line in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x20",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.EXTERNAL_FILTER",
+    "BriefDescription": "Filter on cross-core snoops initiated by this Cbox due to external snoop request.",
+    "PublicDescription": "Filter on cross-core snoops initiated by this Cbox due to external snoop request.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x40",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.XCORE_FILTER",
+    "BriefDescription": "Filter on cross-core snoops initiated by this Cbox due to processor core memory request.",
+    "PublicDescription": "Filter on cross-core snoops initiated by this Cbox due to processor core memory request.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x80",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.EVICTION_FILTER",
+    "BriefDescription": "Filter on cross-core snoops initiated by this Cbox due to LLC eviction.",
+    "PublicDescription": "Filter on cross-core snoops initiated by this Cbox due to LLC eviction.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x01",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.M",
+    "BriefDescription": "LLC lookup request that access cache and found line in M-state.",
+    "PublicDescription": "LLC lookup request that access cache and found line in M-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x02",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.E",
+    "BriefDescription": "LLC lookup request that access cache and found line in E-state.",
+    "PublicDescription": "LLC lookup request that access cache and found line in E-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x04",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.S",
+    "BriefDescription": "LLC lookup request that access cache and found line in S-state.",
+    "PublicDescription": "LLC lookup request that access cache and found line in S-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x08",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.I",
+    "BriefDescription": "LLC lookup request that access cache and found line in I-state.",
+    "PublicDescription": "LLC lookup request that access cache and found line in I-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x10",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.READ_FILTER",
+    "BriefDescription": "Filter on processor core initiated cacheable read requests.",
+    "PublicDescription": "Filter on processor core initiated cacheable read requests.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x20",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_FILTER",
+    "BriefDescription": "Filter on processor core initiated cacheable write requests.",
+    "PublicDescription": "Filter on processor core initiated cacheable write requests.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x40",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.EXTSNP_FILTER",
+    "BriefDescription": "Filter on external snoop requests.",
+    "PublicDescription": "Filter on external snoop requests.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x80",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_REQUEST_FILTER",
+    "BriefDescription": "Filter on any IRQ or IPQ initiated requests including uncacheable, non-coherent requests.",
+    "PublicDescription": "Filter on any IRQ or IPQ initiated requests including uncacheable, non-coherent requests.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "ARB",
+    "EventCode": "0x80",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_TRK_OCCUPANCY.ALL",
+    "BriefDescription": "Counts cycles weighted by the number of requests waiting for data returning from the memory controller. Accounts for coherent and non-coherent requests initiated by IA cores, processor graphic units, or LLC.",
+    "PublicDescription": "Counts cycles weighted by the number of requests waiting for data returning from the memory controller. Accounts for coherent and non-coherent requests initiated by IA cores, processor graphic units, or LLC.",
+    "Counter": "0",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "ARB",
+    "EventCode": "0x81",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_TRK_REQUESTS.ALL",
+    "BriefDescription": "Counts the number of coherent and non-coherent requests initiated by IA cores, processor graphic units, or LLC.",
+    "PublicDescription": "Counts the number of coherent and non-coherent requests initiated by IA cores, processor graphic units, or LLC.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "ARB",
+    "EventCode": "0x81",
+    "UMask": "0x20",
+    "EventName": "UNC_ARB_TRK_REQUESTS.WRITES",
+    "BriefDescription": "Counts the number of allocated write entries, including full, partial, and LLC evictions.",
+    "PublicDescription": "Counts the number of allocated write entries, including full, partial, and LLC evictions.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "ARB",
+    "EventCode": "0x81",
+    "UMask": "0x80",
+    "EventName": "UNC_ARB_TRK_REQUESTS.EVICTIONS",
+    "BriefDescription": "Counts the number of LLC evictions allocated.",
+    "PublicDescription": "Counts the number of LLC evictions allocated.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "ARB",
+    "EventCode": "0x83",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_COH_TRK_OCCUPANCY.ALL",
+    "BriefDescription": "Cycles weighted by number of requests pending in Coherency Tracker.",
+    "PublicDescription": "Cycles weighted by number of requests pending in Coherency Tracker.",
+    "Counter": "0",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "ARB",
+    "EventCode": "0x84",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_COH_TRK_REQUESTS.ALL",
+    "BriefDescription": "Number of requests allocated in Coherency Tracker.",
+    "PublicDescription": "Number of requests allocated in Coherency Tracker.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "ARB",
+    "EventCode": "0x80",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST",
+    "BriefDescription": "Cycles with at least one outstanding request waiting for data return from the memory controller. Accounts for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+    "PublicDescription": "Cycles with at least one outstanding request waiting for data return from the memory controller. Accounts for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+    "Counter": "0,1",
+    "CounterMask": "1",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "ARB",
+    "EventCode": "0x80",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_TRK_OCCUPANCY.CYCLES_OVER_HALF_FULL",
+    "BriefDescription": "Cycles where at least half of the requests outstanding are waiting for data return from the memory controller. Accounts for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+    "PublicDescription": "Cycles where at least half of the requests outstanding are waiting for data return from the memory controller. Accounts for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+    "Counter": "0,1",
+    "CounterMask": "10",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "ARB",
+    "EventCode": "0x0",
+    "UMask": "0x01",
+    "EventName": "UNC_CLOCK.SOCKET",
+    "BriefDescription": "This 48-bit fixed counter counts the UCLK cycles.",
+    "PublicDescription": "This 48-bit fixed counter counts the UCLK cycles.",
+    "Counter": "Fixed",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x06",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.ES",
+    "BriefDescription": "LLC lookup request that access cache and found line in E-state or S-state.",
+    "PublicDescription": "LLC lookup request that access cache and found line in E-state or S-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  }
+]
\ No newline at end of file
index 2efdc67..2674105 100644 (file)
@@ -1,13 +1,13 @@
 [
     {
-        "BriefDescription": "Uncore cache clock ticks. Derived from unc_c_clockticks",
+        "BriefDescription": "Uncore cache clock ticks",
         "Counter": "0,1,2,3",
         "EventName": "UNC_C_CLOCKTICKS",
         "PerPkg": "1",
         "Unit": "CBO"
     },
     {
-        "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch). Derived from unc_c_llc_lookup.any",
+        "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch)",
         "Counter": "0,1",
         "EventCode": "0x34",
         "EventName": "UNC_C_LLC_LOOKUP.ANY",
@@ -18,7 +18,7 @@
         "Unit": "CBO"
     },
     {
-        "BriefDescription": "M line evictions from LLC (writebacks to memory). Derived from unc_c_llc_victims.m_state",
+        "BriefDescription": "M line evictions from LLC (writebacks to memory)",
         "Counter": "0,1",
         "EventCode": "0x37",
         "EventName": "UNC_C_LLC_VICTIMS.M_STATE",
         "Unit": "CBO"
     },
     {
-        "BriefDescription": "Occupancy for all LLC misses that are addressed to local memory. Derived from unc_c_tor_occupancy.miss_local",
+        "BriefDescription": "Occupancy for all LLC misses that are addressed to local memory",
         "EventCode": "0x36",
         "EventName": "UNC_C_TOR_OCCUPANCY.MISS_LOCAL",
         "PerPkg": "1",
         "Unit": "CBO"
     },
     {
-        "BriefDescription": "Occupancy for all LLC misses that are addressed to remote memory. Derived from unc_c_tor_occupancy.miss_remote",
+        "BriefDescription": "Occupancy for all LLC misses that are addressed to remote memory",
         "EventCode": "0x36",
         "EventName": "UNC_C_TOR_OCCUPANCY.MISS_REMOTE",
         "PerPkg": "1",
         "Unit": "CBO"
     },
     {
-        "BriefDescription": "Read requests to home agent. Derived from unc_h_requests.reads",
+        "BriefDescription": "Read requests to home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.READS",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "Write requests to home agent. Derived from unc_h_requests.writes",
+        "BriefDescription": "Write requests to home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.WRITES",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "M line forwarded from remote cache along with writeback to memory. Derived from unc_h_snoop_resp.rsp_fwd_wb",
+        "BriefDescription": "M line forwarded from remote cache along with writeback to memory",
         "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "UNC_H_SNOOP_RESP.RSP_FWD_WB",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "M line forwarded from remote cache with no writeback to memory. Derived from unc_h_snoop_resp.rspifwd",
+        "BriefDescription": "M line forwarded from remote cache with no writeback to memory",
         "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "UNC_H_SNOOP_RESP.RSPIFWD",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "Shared line response from remote cache. Derived from unc_h_snoop_resp.rsps",
+        "BriefDescription": "Shared line response from remote cache",
         "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "UNC_H_SNOOP_RESP.RSPS",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "Shared line forwarded from remote cache. Derived from unc_h_snoop_resp.rspsfwd",
+        "BriefDescription": "Shared line forwarded from remote cache",
         "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "UNC_H_SNOOP_RESP.RSPSFWD",
index d7e2fda..b798a86 100644 (file)
@@ -1,6 +1,6 @@
 [
     {
-        "BriefDescription": "QPI clock ticks. Use to get percentages for QPI cycles events. Derived from unc_q_clockticks",
+        "BriefDescription": "QPI clock ticks. Use to get percentages for QPI cycles events",
         "Counter": "0,1,2,3",
         "EventCode": "0x14",
         "EventName": "UNC_Q_CLOCKTICKS",
@@ -8,25 +8,27 @@
         "Unit": "QPI LL"
     },
     {
-        "BriefDescription": "Cycles where receiving QPI link is in half-width mode. Derived from unc_q_rxl0p_power_cycles",
+        "BriefDescription": "Cycles where receiving QPI link is in half-width mode",
         "Counter": "0,1,2,3",
         "EventCode": "0x10",
         "EventName": "UNC_Q_RxL0P_POWER_CYCLES",
         "MetricExpr": "(UNC_Q_RxL0P_POWER_CYCLES / UNC_Q_CLOCKTICKS) * 100.",
+        "MetricName": "rxl0p_power_cycles %",
         "PerPkg": "1",
         "Unit": "QPI LL"
     },
     {
-        "BriefDescription": "Cycles where transmitting QPI link is in half-width mode. Derived from unc_q_txl0p_power_cycles",
+        "BriefDescription": "Cycles where transmitting QPI link is in half-width mode",
         "Counter": "0,1,2,3",
         "EventCode": "0xd",
         "EventName": "UNC_Q_TxL0P_POWER_CYCLES",
         "MetricExpr": "(UNC_Q_TxL0P_POWER_CYCLES / UNC_Q_CLOCKTICKS) * 100.",
+        "MetricName": "txl0p_power_cycles %",
         "PerPkg": "1",
         "Unit": "QPI LL"
     },
     {
-        "BriefDescription": "Number of data flits transmitted . Derived from unc_q_txl_flits_g0.data",
+        "BriefDescription": "Number of data flits transmitted",
         "Counter": "0,1,2,3",
         "EventName": "UNC_Q_TxL_FLITS_G0.DATA",
         "PerPkg": "1",
@@ -35,7 +37,7 @@
         "Unit": "QPI LL"
     },
     {
-        "BriefDescription": "Number of non data (control) flits transmitted . Derived from unc_q_txl_flits_g0.non_data",
+        "BriefDescription": "Number of non data (control) flits transmitted",
         "Counter": "0,1,2,3",
         "EventName": "UNC_Q_TxL_FLITS_G0.NON_DATA",
         "PerPkg": "1",
index ac4ad4d..df4b432 100644 (file)
@@ -1,6 +1,6 @@
 [
     {
-        "BriefDescription": "Memory page activates for reads and writes. Derived from unc_m_act_count.rd",
+        "BriefDescription": "Memory page activates for reads and writes",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_M_ACT_COUNT.RD",
@@ -13,7 +13,7 @@
         "BriefDescription": "Read requests to memory controller. Derived from unc_m_cas_count.rd",
         "Counter": "0,1,2,3",
         "EventCode": "0x4",
-        "EventName": "UNC_M_CAS_COUNT.RD",
+        "EventName": "LLC_MISSES.MEM_READ",
         "PerPkg": "1",
         "ScaleUnit": "64Bytes",
         "UMask": "0x3",
         "BriefDescription": "Write requests to memory controller. Derived from unc_m_cas_count.wr",
         "Counter": "0,1,2,3",
         "EventCode": "0x4",
-        "EventName": "UNC_M_CAS_COUNT.WR",
+        "EventName": "LLC_MISSES.MEM_WRITE",
         "PerPkg": "1",
         "ScaleUnit": "64Bytes",
         "UMask": "0xC",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Memory controller clock ticks. Use to generate percentages for memory controller CYCLES events. Derived from unc_m_clockticks",
+        "BriefDescription": "Memory controller clock ticks. Use to generate percentages for memory controller CYCLES events",
         "Counter": "0,1,2,3",
         "EventName": "UNC_M_CLOCKTICKS",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode. Derived from unc_m_power_channel_ppd",
+        "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode",
         "Counter": "0,1,2,3",
         "EventCode": "0x85",
         "EventName": "UNC_M_POWER_CHANNEL_PPD",
         "MetricExpr": "(UNC_M_POWER_CHANNEL_PPD / UNC_M_CLOCKTICKS) * 100.",
+        "MetricName": "power_channel_ppd %",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Cycles all ranks are in critical thermal throttle. Derived from unc_m_power_critical_throttle_cycles",
+        "BriefDescription": "Cycles all ranks are in critical thermal throttle",
         "Counter": "0,1,2,3",
         "EventCode": "0x86",
         "EventName": "UNC_M_POWER_CRITICAL_THROTTLE_CYCLES",
         "MetricExpr": "(UNC_M_POWER_CRITICAL_THROTTLE_CYCLES / UNC_M_CLOCKTICKS) * 100.",
+        "MetricName": "power_critical_throttle_cycles %",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Cycles Memory is in self refresh power mode. Derived from unc_m_power_self_refresh",
+        "BriefDescription": "Cycles Memory is in self refresh power mode",
         "Counter": "0,1,2,3",
         "EventCode": "0x43",
         "EventName": "UNC_M_POWER_SELF_REFRESH",
         "MetricExpr": "(UNC_M_POWER_SELF_REFRESH / UNC_M_CLOCKTICKS) * 100.",
+        "MetricName": "power_self_refresh %",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Memory page conflicts. Derived from unc_m_pre_count.page_miss",
+        "BriefDescription": "Memory page conflicts",
         "Counter": "0,1,2,3",
         "EventCode": "0x2",
         "EventName": "UNC_M_PRE_COUNT.PAGE_MISS",
index dc2586d..d40498f 100644 (file)
@@ -1,44 +1,48 @@
 [
     {
-        "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events. Derived from unc_p_clockticks",
+        "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events",
         "Counter": "0,1,2,3",
         "EventName": "UNC_P_CLOCKTICKS",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter.  (filter_band0=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency. Derived from unc_p_freq_band0_cycles",
+        "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter.  (filter_band0=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0xb",
         "EventName": "UNC_P_FREQ_BAND0_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_BAND0_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_band0_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter.  (filter_band1=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency. Derived from unc_p_freq_band1_cycles",
+        "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter.  (filter_band1=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0xc",
         "EventName": "UNC_P_FREQ_BAND1_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_BAND1_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_band1_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter.  (filter_band2=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency. Derived from unc_p_freq_band2_cycles",
+        "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter.  (filter_band2=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0xd",
         "EventName": "UNC_P_FREQ_BAND2_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_BAND2_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_band2_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter.  (filter_band3=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency. Derived from unc_p_freq_band3_cycles",
+        "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter.  (filter_band3=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0xe",
         "EventName": "UNC_P_FREQ_BAND3_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_BAND3_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_band3_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
@@ -49,6 +53,7 @@
         "EventName": "UNC_P_FREQ_BAND0_TRANSITIONS",
         "Filter": "edge=1",
         "MetricExpr": "(UNC_P_FREQ_BAND0_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_band0_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
@@ -59,6 +64,7 @@
         "EventName": "UNC_P_FREQ_BAND1_TRANSITIONS",
         "Filter": "edge=1",
         "MetricExpr": "(UNC_P_FREQ_BAND1_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_band1_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
@@ -69,6 +75,7 @@
         "EventName": "UNC_P_FREQ_BAND2_TRANSITIONS",
         "Filter": "edge=1",
         "MetricExpr": "(UNC_P_FREQ_BAND2_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_band2_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
         "EventName": "UNC_P_FREQ_BAND3_TRANSITIONS",
         "Filter": "edge=1",
         "MetricExpr": "(UNC_P_FREQ_BAND3_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_band3_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details. Derived from unc_p_power_state_occupancy.cores_c0",
+        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C0.  It can be used by itself to get the average number of cores in C0, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
         "Counter": "0,1,2,3",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0",
         "Filter": "occ_sel=1",
         "MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C0 / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "power_state_occupancy.cores_c0 %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details. Derived from unc_p_power_state_occupancy.cores_c3",
+        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C3.  It can be used by itself to get the average number of cores in C3, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
         "Counter": "0,1,2,3",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3",
         "Filter": "occ_sel=2",
         "MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C3 / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "power_state_occupancy.cores_c3 %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State.  It can be used by itself to get the average number of cores in that C-state with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details. Derived from unc_p_power_state_occupancy.cores_c6",
+        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C6.  It can be used by itself to get the average number of cores in C6, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
         "Counter": "0,1,2,3",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6",
         "Filter": "occ_sel=3",
         "MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C6 / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "power_state_occupancy.cores_c6 %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Counts the number of cycles that we are in external PROCHOT mode.  This mode is triggered when a sensor off the die determines that something off-die (like DRAM) is too hot and must throttle to avoid damaging the chip. Derived from unc_p_prochot_external_cycles",
+        "BriefDescription": "Counts the number of cycles that we are in external PROCHOT mode.  This mode is triggered when a sensor off the die determines that something off-die (like DRAM) is too hot and must throttle to avoid damaging the chip",
         "Counter": "0,1,2,3",
         "EventCode": "0xa",
         "EventName": "UNC_P_PROCHOT_EXTERNAL_CYCLES",
         "MetricExpr": "(UNC_P_PROCHOT_EXTERNAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "prochot_external_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Counts the number of cycles when thermal conditions are the upper limit on frequency.  This is related to the THERMAL_THROTTLE CYCLES_ABOVE_TEMP event, which always counts cycles when we are above the thermal temperature.  This event (STRONGEST_UPPER_LIMIT) is sampled at the output of the algorithm that determines the actual frequency, while THERMAL_THROTTLE looks at the input. Derived from unc_p_freq_max_limit_thermal_cycles",
+        "BriefDescription": "Counts the number of cycles when temperature is the upper limit on frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x4",
         "EventName": "UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_max_limit_thermal_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Counts the number of cycles when the OS is the upper limit on frequency. Derived from unc_p_freq_max_os_cycles",
+        "BriefDescription": "Counts the number of cycles when the OS is the upper limit on frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x6",
         "EventName": "UNC_P_FREQ_MAX_OS_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_MAX_OS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_max_os_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Counts the number of cycles when power is the upper limit on frequency. Derived from unc_p_freq_max_power_cycles",
+        "BriefDescription": "Counts the number of cycles when power is the upper limit on frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x5",
         "EventName": "UNC_P_FREQ_MAX_POWER_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_MAX_POWER_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_max_power_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Counts the number of cycles when current is the upper limit on frequency. Derived from unc_p_freq_max_current_cycles",
+        "BriefDescription": "Counts the number of cycles when current is the upper limit on frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x7",
         "EventName": "UNC_P_FREQ_MAX_CURRENT_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_MAX_CURRENT_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_max_current_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Counts the number of cycles when the system is changing frequency.  This can not be filtered by thread ID.  One can also use it with the occupancy counter that monitors number of threads in C0 to estimate the performance impact that frequency transitions had on the system. Derived from unc_p_freq_trans_cycles",
+        "BriefDescription": "Cycles spent changing Frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x60",
         "EventName": "UNC_P_FREQ_TRANS_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_TRANS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_trans_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
         "EventName": "UNC_P_FREQ_GE_1200MHZ_CYCLES",
         "Filter": "filter_band0=1200",
         "MetricExpr": "(UNC_P_FREQ_GE_1200MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_ge_1200mhz_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
         "EventName": "UNC_P_FREQ_GE_2000MHZ_CYCLES",
         "Filter": "filter_band1=2000",
         "MetricExpr": "(UNC_P_FREQ_GE_2000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_ge_2000mhz_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
         "EventName": "UNC_P_FREQ_GE_3000MHZ_CYCLES",
         "Filter": "filter_band2=3000",
         "MetricExpr": "(UNC_P_FREQ_GE_3000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_ge_3000mhz_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
         "EventName": "UNC_P_FREQ_GE_4000MHZ_CYCLES",
         "Filter": "filter_band3=4000",
         "MetricExpr": "(UNC_P_FREQ_GE_4000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_ge_4000mhz_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
         "EventName": "UNC_P_FREQ_GE_1200MHZ_TRANSITIONS",
         "Filter": "edge=1,filter_band0=1200",
         "MetricExpr": "(UNC_P_FREQ_GE_1200MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_ge_1200mhz_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
         "EventName": "UNC_P_FREQ_GE_2000MHZ_TRANSITIONS",
         "Filter": "edge=1,filter_band1=2000",
         "MetricExpr": "(UNC_P_FREQ_GE_2000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_ge_2000mhz_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
         "EventName": "UNC_P_FREQ_GE_3000MHZ_TRANSITIONS",
         "Filter": "edge=1,filter_band2=4000",
         "MetricExpr": "(UNC_P_FREQ_GE_3000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_ge_3000mhz_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
         "EventName": "UNC_P_FREQ_GE_4000MHZ_TRANSITIONS",
         "Filter": "edge=1,filter_band3=4000",
         "MetricExpr": "(UNC_P_FREQ_GE_4000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_ge_4000mhz_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     }
index 2f23cf0..3fa61d9 100644 (file)
@@ -1,13 +1,13 @@
 [
     {
-        "BriefDescription": "Uncore cache clock ticks. Derived from unc_c_clockticks",
+        "BriefDescription": "Uncore cache clock ticks",
         "Counter": "0,1,2,3",
         "EventName": "UNC_C_CLOCKTICKS",
         "PerPkg": "1",
         "Unit": "CBO"
     },
     {
-        "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch). Derived from unc_c_llc_lookup.any",
+        "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch)",
         "Counter": "0,1",
         "EventCode": "0x34",
         "EventName": "UNC_C_LLC_LOOKUP.ANY",
@@ -18,7 +18,7 @@
         "Unit": "CBO"
     },
     {
-        "BriefDescription": "M line evictions from LLC (writebacks to memory). Derived from unc_c_llc_victims.m_state",
+        "BriefDescription": "M line evictions from LLC (writebacks to memory)",
         "Counter": "0,1",
         "EventCode": "0x37",
         "EventName": "UNC_C_LLC_VICTIMS.M_STATE",
         "Unit": "CBO"
     },
     {
-        "BriefDescription": "Occupancy counter for all LLC misses; we divide this by UNC_C_CLOCKTICKS to get average Q depth. Derived from unc_c_tor_occupancy.miss_all",
+        "BriefDescription": "Occupancy counter for all LLC misses; we divide this by UNC_C_CLOCKTICKS to get average Q depth",
         "EventCode": "0x36",
         "EventName": "UNC_C_TOR_OCCUPANCY.MISS_ALL",
         "Filter": "filter_opc=0x182",
         "MetricExpr": "(UNC_C_TOR_OCCUPANCY.MISS_ALL / UNC_C_CLOCKTICKS) * 100.",
+        "MetricName": "tor_occupancy.miss_all %",
         "PerPkg": "1",
         "UMask": "0xa",
         "Unit": "CBO"
         "Unit": "CBO"
     },
     {
-        "BriefDescription": "read requests to home agent. Derived from unc_h_requests.reads",
+        "BriefDescription": "read requests to home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.READS",
         "Unit": "HA"
     },
     {
-        "BriefDescription": "write requests to home agent. Derived from unc_h_requests.writes",
+        "BriefDescription": "write requests to home agent",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_H_REQUESTS.WRITES",
index 6335187..1b53c0e 100644 (file)
@@ -1,6 +1,6 @@
 [
     {
-        "BriefDescription": "QPI clock ticks. Used to get percentages of QPI cycles events. Derived from unc_q_clockticks",
+        "BriefDescription": "QPI clock ticks. Used to get percentages of QPI cycles events",
         "Counter": "0,1,2,3",
         "EventCode": "0x14",
         "EventName": "UNC_Q_CLOCKTICKS",
@@ -8,25 +8,27 @@
         "Unit": "QPI LL"
     },
     {
-        "BriefDescription": "Cycles where receiving QPI link is in half-width mode. Derived from unc_q_rxl0p_power_cycles",
+        "BriefDescription": "Cycles where receiving QPI link is in half-width mode",
         "Counter": "0,1,2,3",
         "EventCode": "0x10",
         "EventName": "UNC_Q_RxL0P_POWER_CYCLES",
         "MetricExpr": "(UNC_Q_RxL0P_POWER_CYCLES / UNC_Q_CLOCKTICKS) * 100.",
+        "MetricName": "rxl0p_power_cycles %",
         "PerPkg": "1",
         "Unit": "QPI LL"
     },
     {
-        "BriefDescription": "Cycles where transmitting QPI link is in half-width mode. Derived from unc_q_txl0p_power_cycles",
+        "BriefDescription": "Cycles where transmitting QPI link is in half-width mode",
         "Counter": "0,1,2,3",
         "EventCode": "0xd",
         "EventName": "UNC_Q_TxL0P_POWER_CYCLES",
         "MetricExpr": "(UNC_Q_TxL0P_POWER_CYCLES / UNC_Q_CLOCKTICKS) * 100.",
+        "MetricName": "txl0p_power_cycles %",
         "PerPkg": "1",
         "Unit": "QPI LL"
     },
     {
-        "BriefDescription": "Number of data flits transmitted . Derived from unc_q_txl_flits_g0.data",
+        "BriefDescription": "Number of data flits transmitted",
         "Counter": "0,1,2,3",
         "EventName": "UNC_Q_TxL_FLITS_G0.DATA",
         "PerPkg": "1",
@@ -35,7 +37,7 @@
         "Unit": "QPI LL"
     },
     {
-        "BriefDescription": "Number of non data (control) flits transmitted . Derived from unc_q_txl_flits_g0.non_data",
+        "BriefDescription": "Number of non data (control) flits transmitted",
         "Counter": "0,1,2,3",
         "EventName": "UNC_Q_TxL_FLITS_G0.NON_DATA",
         "PerPkg": "1",
index e2cf6da..8551ceb 100644 (file)
@@ -1,6 +1,6 @@
 [
     {
-        "BriefDescription": "Memory page activates. Derived from unc_m_act_count",
+        "BriefDescription": "Memory page activates",
         "Counter": "0,1,2,3",
         "EventCode": "0x1",
         "EventName": "UNC_M_ACT_COUNT",
@@ -11,7 +11,7 @@
         "BriefDescription": "read requests to memory controller. Derived from unc_m_cas_count.rd",
         "Counter": "0,1,2,3",
         "EventCode": "0x4",
-        "EventName": "UNC_M_CAS_COUNT.RD",
+        "EventName": "LLC_MISSES.MEM_READ",
         "PerPkg": "1",
         "UMask": "0x3",
         "Unit": "iMC"
         "BriefDescription": "write requests to memory controller. Derived from unc_m_cas_count.wr",
         "Counter": "0,1,2,3",
         "EventCode": "0x4",
-        "EventName": "UNC_M_CAS_COUNT.WR",
+        "EventName": "LLC_MISSES.MEM_WRITE",
         "PerPkg": "1",
         "UMask": "0xc",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Memory controller clock ticks. Used to get percentages of memory controller cycles events. Derived from unc_m_clockticks",
+        "BriefDescription": "Memory controller clock ticks. Used to get percentages of memory controller cycles events",
         "Counter": "0,1,2,3",
         "EventName": "UNC_M_CLOCKTICKS",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode. Derived from unc_m_power_channel_ppd",
+        "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode",
         "Counter": "0,1,2,3",
         "EventCode": "0x85",
         "EventName": "UNC_M_POWER_CHANNEL_PPD",
         "MetricExpr": "(UNC_M_POWER_CHANNEL_PPD / UNC_M_CLOCKTICKS) * 100.",
+        "MetricName": "power_channel_ppd %",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Cycles all ranks are in critical thermal throttle. Derived from unc_m_power_critical_throttle_cycles",
+        "BriefDescription": "Cycles all ranks are in critical thermal throttle",
         "Counter": "0,1,2,3",
         "EventCode": "0x86",
         "EventName": "UNC_M_POWER_CRITICAL_THROTTLE_CYCLES",
         "MetricExpr": "(UNC_M_POWER_CRITICAL_THROTTLE_CYCLES / UNC_M_CLOCKTICKS) * 100.",
+        "MetricName": "power_critical_throttle_cycles %",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Cycles Memory is in self refresh power mode. Derived from unc_m_power_self_refresh",
+        "BriefDescription": "Cycles Memory is in self refresh power mode",
         "Counter": "0,1,2,3",
         "EventCode": "0x43",
         "EventName": "UNC_M_POWER_SELF_REFRESH",
         "MetricExpr": "(UNC_M_POWER_SELF_REFRESH / UNC_M_CLOCKTICKS) * 100.",
+        "MetricName": "power_self_refresh %",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Memory page conflicts. Derived from unc_m_pre_count.page_miss",
+        "BriefDescription": "Memory page conflicts",
         "Counter": "0,1,2,3",
         "EventCode": "0x2",
         "EventName": "UNC_M_PRE_COUNT.PAGE_MISS",
@@ -69,7 +72,7 @@
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Occupancy counter for memory read queue. Derived from unc_m_rpq_occupancy",
+        "BriefDescription": "Occupancy counter for memory read queue",
         "Counter": "0,1,2,3",
         "EventCode": "0x80",
         "EventName": "UNC_M_RPQ_OCCUPANCY",
index bbe36d5..16034bf 100644 (file)
@@ -1,44 +1,48 @@
 [
     {
-        "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events. Derived from unc_p_clockticks",
+        "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events",
         "Counter": "0,1,2,3",
         "EventName": "UNC_P_CLOCKTICKS",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter.  (filter_band0=XXX with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency. Derived from unc_p_freq_band0_cycles",
+        "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter.  (filter_band0=XXX with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0xb",
         "EventName": "UNC_P_FREQ_BAND0_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_BAND0_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_band0_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter.  (filter_band1=XXX with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency. Derived from unc_p_freq_band1_cycles",
+        "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter.  (filter_band1=XXX with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0xc",
         "EventName": "UNC_P_FREQ_BAND1_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_BAND1_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_band1_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter.  (filter_band2=XXX with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency. Derived from unc_p_freq_band2_cycles",
+        "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter.  (filter_band2=XXX with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0xd",
         "EventName": "UNC_P_FREQ_BAND2_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_BAND2_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_band2_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter.  (filter_band3=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency. Derived from unc_p_freq_band3_cycles",
+        "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter.  (filter_band3=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0xe",
         "EventName": "UNC_P_FREQ_BAND3_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_BAND3_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_band3_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
@@ -49,6 +53,7 @@
         "EventName": "UNC_P_FREQ_BAND0_TRANSITIONS",
         "Filter": "edge=1",
         "MetricExpr": "(UNC_P_FREQ_BAND0_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_band0_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
@@ -59,6 +64,7 @@
         "EventName": "UNC_P_FREQ_BAND1_TRANSITIONS",
         "Filter": "edge=1",
         "MetricExpr": "(UNC_P_FREQ_BAND1_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_band1_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
@@ -69,6 +75,7 @@
         "EventName": "UNC_P_FREQ_BAND2_TRANSITIONS",
         "Filter": "edge=1",
         "MetricExpr": "(UNC_P_FREQ_BAND2_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_band2_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
         "EventName": "UNC_P_FREQ_BAND3_TRANSITIONS",
         "Filter": "edge=1",
         "MetricExpr": "(UNC_P_FREQ_BAND3_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_band3_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C0.  It can be used by itself to get the average number of cores in C0, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details. Derived from unc_p_power_state_occupancy.cores_c0",
+        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C0.  It can be used by itself to get the average number of cores in C0, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
         "Counter": "0,1,2,3",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0",
         "Filter": "occ_sel=1",
         "MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C0 / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "power_state_occupancy.cores_c0 %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C3.  It can be used by itself to get the average number of cores in C3, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details. Derived from unc_p_power_state_occupancy.cores_c3",
+        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C3.  It can be used by itself to get the average number of cores in C3, with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
         "Counter": "0,1,2,3",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3",
         "Filter": "occ_sel=2",
         "MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C3 / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "power_state_occupancy.cores_c3 %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C6.  It can be used by itself to get the average number of cores in C6, with thresholding to generate histograms, or with other PCU events. Derived from unc_p_power_state_occupancy.cores_c6",
+        "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C6.  It can be used by itself to get the average number of cores in C6, with thresholding to generate histograms, or with other PCU events",
         "Counter": "0,1,2,3",
         "EventCode": "0x80",
         "EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6",
         "Filter": "occ_sel=3",
         "MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C6 / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "power_state_occupancy.cores_c6 %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Counts the number of cycles that we are in external PROCHOT mode.  This mode is triggered when a sensor off the die determines that something off-die (like DRAM) is too hot and must throttle to avoid damaging the chip. Derived from unc_p_prochot_external_cycles",
+        "BriefDescription": "Counts the number of cycles that we are in external PROCHOT mode.  This mode is triggered when a sensor off the die determines that something off-die (like DRAM) is too hot and must throttle to avoid damaging the chip",
         "Counter": "0,1,2,3",
         "EventCode": "0xa",
         "EventName": "UNC_P_PROCHOT_EXTERNAL_CYCLES",
         "MetricExpr": "(UNC_P_PROCHOT_EXTERNAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "prochot_external_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Counts the number of cycles when temperature is the upper limit on frequency. Derived from unc_p_freq_max_limit_thermal_cycles",
+        "BriefDescription": "Counts the number of cycles when temperature is the upper limit on frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x4",
         "EventName": "UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_max_limit_thermal_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Counts the number of cycles when the OS is the upper limit on frequency. Derived from unc_p_freq_max_os_cycles",
+        "BriefDescription": "Counts the number of cycles when the OS is the upper limit on frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x6",
         "EventName": "UNC_P_FREQ_MAX_OS_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_MAX_OS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_max_os_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Counts the number of cycles when power is the upper limit on frequency. Derived from unc_p_freq_max_power_cycles",
+        "BriefDescription": "Counts the number of cycles when power is the upper limit on frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x5",
         "EventName": "UNC_P_FREQ_MAX_POWER_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_MAX_POWER_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_max_power_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Counts the number of cycles when current is the upper limit on frequency. Derived from unc_p_freq_max_current_cycles",
+        "BriefDescription": "Counts the number of cycles when current is the upper limit on frequency",
         "Counter": "0,1,2,3",
         "EventCode": "0x7",
         "EventName": "UNC_P_FREQ_MAX_CURRENT_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_MAX_CURRENT_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_max_current_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
     {
-        "BriefDescription": "Cycles spent changing Frequency. Derived from unc_p_freq_trans_cycles",
+        "BriefDescription": "Cycles spent changing Frequency",
         "Counter": "0,1,2,3",
         "EventName": "UNC_P_FREQ_TRANS_CYCLES",
         "MetricExpr": "(UNC_P_FREQ_TRANS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_trans_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
         "EventName": "UNC_P_FREQ_GE_1200MHZ_CYCLES",
         "Filter": "filter_band0=1200",
         "MetricExpr": "(UNC_P_FREQ_GE_1200MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_ge_1200mhz_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
         "EventName": "UNC_P_FREQ_GE_2000MHZ_CYCLES",
         "Filter": "filter_band1=2000",
         "MetricExpr": "(UNC_P_FREQ_GE_2000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_ge_2000mhz_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
         "EventName": "UNC_P_FREQ_GE_3000MHZ_CYCLES",
         "Filter": "filter_band2=3000",
         "MetricExpr": "(UNC_P_FREQ_GE_3000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_ge_3000mhz_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
         "EventName": "UNC_P_FREQ_GE_4000MHZ_CYCLES",
         "Filter": "filter_band3=4000",
         "MetricExpr": "(UNC_P_FREQ_GE_4000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_ge_4000mhz_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
         "EventName": "UNC_P_FREQ_GE_1200MHZ_TRANSITIONS",
         "Filter": "edge=1,filter_band0=1200",
         "MetricExpr": "(UNC_P_FREQ_GE_1200MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_ge_1200mhz_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
         "EventName": "UNC_P_FREQ_GE_2000MHZ_TRANSITIONS",
         "Filter": "edge=1,filter_band1=2000",
         "MetricExpr": "(UNC_P_FREQ_GE_2000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_ge_2000mhz_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
         "EventName": "UNC_P_FREQ_GE_3000MHZ_TRANSITIONS",
         "Filter": "edge=1,filter_band2=4000",
         "MetricExpr": "(UNC_P_FREQ_GE_3000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_ge_3000mhz_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     },
         "EventName": "UNC_P_FREQ_GE_4000MHZ_TRANSITIONS",
         "Filter": "edge=1,filter_band3=4000",
         "MetricExpr": "(UNC_P_FREQ_GE_4000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+        "MetricName": "freq_ge_4000mhz_cycles %",
         "PerPkg": "1",
         "Unit": "PCU"
     }
index 12181bb..d1a12e5 100644 (file)
@@ -17,6 +17,7 @@ GenuineIntel-6-3A,v18,ivybridge,core
 GenuineIntel-6-3E,v19,ivytown,core
 GenuineIntel-6-2D,v20,jaketown,core
 GenuineIntel-6-57,v9,knightslanding,core
+GenuineIntel-6-85,v9,knightslanding,core
 GenuineIntel-6-1E,v2,nehalemep,core
 GenuineIntel-6-1F,v2,nehalemep,core
 GenuineIntel-6-1A,v2,nehalemep,core
diff --git a/tools/perf/pmu-events/arch/x86/sandybridge/uncore.json b/tools/perf/pmu-events/arch/x86/sandybridge/uncore.json
new file mode 100644 (file)
index 0000000..42c70ee
--- /dev/null
@@ -0,0 +1,314 @@
+[
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x01",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.MISS",
+    "BriefDescription": "A snoop misses in some processor core.",
+    "PublicDescription": "A snoop misses in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x02",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.INVAL",
+    "BriefDescription": "A snoop invalidates a non-modified line in some processor core.",
+    "PublicDescription": "A snoop invalidates a non-modified line in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x04",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.HIT",
+    "BriefDescription": "A snoop hits a non-modified line in some processor core.",
+    "PublicDescription": "A snoop hits a non-modified line in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x08",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.HITM",
+    "BriefDescription": "A snoop hits a modified line in some processor core.",
+    "PublicDescription": "A snoop hits a modified line in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x10",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.INVAL_M",
+    "BriefDescription": "A snoop invalidates a modified line in some processor core.",
+    "PublicDescription": "A snoop invalidates a modified line in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x20",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.EXTERNAL_FILTER",
+    "BriefDescription": "Filter on cross-core snoops initiated by this Cbox due to external snoop request.",
+    "PublicDescription": "Filter on cross-core snoops initiated by this Cbox due to external snoop request.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x40",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.XCORE_FILTER",
+    "BriefDescription": "Filter on cross-core snoops initiated by this Cbox due to processor core memory request.",
+    "PublicDescription": "Filter on cross-core snoops initiated by this Cbox due to processor core memory request.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x80",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.EVICTION_FILTER",
+    "BriefDescription": "Filter on cross-core snoops initiated by this Cbox due to LLC eviction.",
+    "PublicDescription": "Filter on cross-core snoops initiated by this Cbox due to LLC eviction.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x01",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.M",
+    "BriefDescription": "LLC lookup request that access cache and found line in M-state.",
+    "PublicDescription": "LLC lookup request that access cache and found line in M-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x02",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.E",
+    "BriefDescription": "LLC lookup request that access cache and found line in E-state.",
+    "PublicDescription": "LLC lookup request that access cache and found line in E-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x04",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.S",
+    "BriefDescription": "LLC lookup request that access cache and found line in S-state.",
+    "PublicDescription": "LLC lookup request that access cache and found line in S-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x08",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.I",
+    "BriefDescription": "LLC lookup request that access cache and found line in I-state.",
+    "PublicDescription": "LLC lookup request that access cache and found line in I-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x10",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.READ_FILTER",
+    "BriefDescription": "Filter on processor core initiated cacheable read requests.",
+    "PublicDescription": "Filter on processor core initiated cacheable read requests.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x20",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_FILTER",
+    "BriefDescription": "Filter on processor core initiated cacheable write requests.",
+    "PublicDescription": "Filter on processor core initiated cacheable write requests.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x40",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.EXTSNP_FILTER",
+    "BriefDescription": "Filter on external snoop requests.",
+    "PublicDescription": "Filter on external snoop requests.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x80",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_REQUEST_FILTER",
+    "BriefDescription": "Filter on any IRQ or IPQ initiated requests including uncacheable, non-coherent requests.",
+    "PublicDescription": "Filter on any IRQ or IPQ initiated requests including uncacheable, non-coherent requests.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "ARB",
+    "EventCode": "0x80",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_TRK_OCCUPANCY.ALL",
+    "BriefDescription": "Counts cycles weighted by the number of requests waiting for data returning from the memory controller. Accounts for coherent and non-coherent requests initiated by IA cores, processor graphic units, or LLC.",
+    "PublicDescription": "Counts cycles weighted by the number of requests waiting for data returning from the memory controller. Accounts for coherent and non-coherent requests initiated by IA cores, processor graphic units, or LLC.",
+    "Counter": "0",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "ARB",
+    "EventCode": "0x81",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_TRK_REQUESTS.ALL",
+    "BriefDescription": "Counts the number of coherent and non-coherent requests initiated by IA cores, processor graphic units, or LLC.",
+    "PublicDescription": "Counts the number of coherent and non-coherent requests initiated by IA cores, processor graphic units, or LLC.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "ARB",
+    "EventCode": "0x81",
+    "UMask": "0x20",
+    "EventName": "UNC_ARB_TRK_REQUESTS.WRITES",
+    "BriefDescription": "Counts the number of allocated write entries, including full, partial, and LLC evictions.",
+    "PublicDescription": "Counts the number of allocated write entries, including full, partial, and LLC evictions.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "ARB",
+    "EventCode": "0x81",
+    "UMask": "0x80",
+    "EventName": "UNC_ARB_TRK_REQUESTS.EVICTIONS",
+    "BriefDescription": "Counts the number of LLC evictions allocated.",
+    "PublicDescription": "Counts the number of LLC evictions allocated.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "ARB",
+    "EventCode": "0x83",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_COH_TRK_OCCUPANCY.ALL",
+    "BriefDescription": "Cycles weighted by number of requests pending in Coherency Tracker.",
+    "PublicDescription": "Cycles weighted by number of requests pending in Coherency Tracker.",
+    "Counter": "0",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "ARB",
+    "EventCode": "0x84",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_COH_TRK_REQUESTS.ALL",
+    "BriefDescription": "Number of requests allocated in Coherency Tracker.",
+    "PublicDescription": "Number of requests allocated in Coherency Tracker.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "ARB",
+    "EventCode": "0x80",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST",
+    "BriefDescription": "Cycles with at least one outstanding request waiting for data return from the memory controller. Accounts for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+    "PublicDescription": "Cycles with at least one outstanding request waiting for data return from the memory controller. Accounts for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+    "Counter": "0,1",
+    "CounterMask": "1",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "ARB",
+    "EventCode": "0x80",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_TRK_OCCUPANCY.CYCLES_OVER_HALF_FULL",
+    "BriefDescription": "Cycles with at least half of the requests outstanding are waiting for data return from memory controller. Account for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+    "PublicDescription": "Cycles with at least half of the requests outstanding are waiting for data return from memory controller. Account for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+    "Counter": "0,1",
+    "CounterMask": "10",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "ARB",
+    "EventCode": "0x0",
+    "UMask": "0x01",
+    "EventName": "UNC_CLOCK.SOCKET",
+    "BriefDescription": "This 48-bit fixed counter counts the UCLK cycles.",
+    "PublicDescription": "This 48-bit fixed counter counts the UCLK cycles.",
+    "Counter": "Fixed",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x06",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.ES",
+    "BriefDescription": "LLC lookup request that access cache and found line in E-state or S-state.",
+    "PublicDescription": "LLC lookup request that access cache and found line in E-state or S-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  }
+]
\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/x86/skylake/uncore.json b/tools/perf/pmu-events/arch/x86/skylake/uncore.json
new file mode 100644 (file)
index 0000000..dbc1932
--- /dev/null
@@ -0,0 +1,254 @@
+[
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x41",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.MISS_XCORE",
+    "BriefDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which misses in some processor core.",
+    "PublicDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which misses in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x81",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.MISS_EVICTION",
+    "BriefDescription": "A cross-core snoop resulted from L3 Eviction which misses in some processor core.",
+    "PublicDescription": "A cross-core snoop resulted from L3 Eviction which misses in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x44",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.HIT_XCORE",
+    "BriefDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a non-modified line in some processor core.",
+    "PublicDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a non-modified line in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x22",
+    "UMask": "0x48",
+    "EventName": "UNC_CBO_XSNP_RESPONSE.HITM_XCORE",
+    "BriefDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a modified line in some processor core.",
+    "PublicDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a modified line in some processor core.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x21",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_M",
+    "BriefDescription": "L3 Lookup write request that access cache and found line in M-state",
+    "PublicDescription": "L3 Lookup write request that access cache and found line in M-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x81",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_M",
+    "BriefDescription": "L3 Lookup any request that access cache and found line in M-state",
+    "PublicDescription": "L3 Lookup any request that access cache and found line in M-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x18",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.READ_I",
+    "BriefDescription": "L3 Lookup read request that access cache and found line in I-state",
+    "PublicDescription": "L3 Lookup read request that access cache and found line in I-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x88",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_I",
+    "BriefDescription": "L3 Lookup any request that access cache and found line in I-state",
+    "PublicDescription": "L3 Lookup any request that access cache and found line in I-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x1f",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.READ_MESI",
+    "BriefDescription": "L3 Lookup read request that access cache and found line in any MESI-state",
+    "PublicDescription": "L3 Lookup read request that access cache and found line in any MESI-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x2f",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_MESI",
+    "BriefDescription": "L3 Lookup write request that access cache and found line in MESI-state",
+    "PublicDescription": "L3 Lookup write request that access cache and found line in MESI-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x8f",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_MESI",
+    "BriefDescription": "L3 Lookup any request that access cache and found line in MESI-state",
+    "PublicDescription": "L3 Lookup any request that access cache and found line in MESI-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x86",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_ES",
+    "BriefDescription": "L3 Lookup any request that access cache and found line in E or S-state",
+    "PublicDescription": "L3 Lookup any request that access cache and found line in E or S-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x16",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.READ_ES",
+    "BriefDescription": "L3 Lookup read request that access cache and found line in E or S-state",
+    "PublicDescription": "L3 Lookup read request that access cache and found line in E or S-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "CBO",
+    "EventCode": "0x34",
+    "UMask": "0x26",
+    "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_ES",
+    "BriefDescription": "L3 Lookup write request that access cache and found line in E or S-state",
+    "PublicDescription": "L3 Lookup write request that access cache and found line in E or S-state.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "iMPH-U",
+    "EventCode": "0x80",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_TRK_OCCUPANCY.ALL",
+    "BriefDescription": "Each cycle count number of all Core outgoing valid entries. Such entry is defined as valid from its allocation till first of IDI0 or DRS0 messages is sent out. Accounts for Coherent and non-coherent traffic.",
+    "PublicDescription": "Each cycle count number of all Core outgoing valid entries. Such entry is defined as valid from its allocation till first of IDI0 or DRS0 messages is sent out. Accounts for Coherent and non-coherent traffic.",
+    "Counter": "0",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "iMPH-U",
+    "EventCode": "0x81",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_TRK_REQUESTS.ALL",
+    "BriefDescription": "Total number of Core outgoing entries allocated. Accounts for Coherent and non-coherent traffic.",
+    "PublicDescription": "Total number of Core outgoing entries allocated. Accounts for Coherent and non-coherent traffic.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "iMPH-U",
+    "EventCode": "0x81",
+    "UMask": "0x02",
+    "EventName": "UNC_ARB_TRK_REQUESTS.DRD_DIRECT",
+    "BriefDescription": "Number of Core coherent Data Read entries allocated in DirectData mode",
+    "PublicDescription": "Number of Core coherent Data Read entries allocated in DirectData mode.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "iMPH-U",
+    "EventCode": "0x81",
+    "UMask": "0x20",
+    "EventName": "UNC_ARB_TRK_REQUESTS.WRITES",
+    "BriefDescription": "Number of Writes allocated - any write transactions: full/partial writes and evictions.",
+    "PublicDescription": "Number of Writes allocated - any write transactions: full/partial writes and evictions.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "iMPH-U",
+    "EventCode": "0x84",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_COH_TRK_REQUESTS.ALL",
+    "BriefDescription": "Number of entries allocated. Account for Any type: e.g. Snoop, Core aperture, etc.",
+    "PublicDescription": "Number of entries allocated. Account for Any type: e.g. Snoop, Core aperture, etc.",
+    "Counter": "0,1",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "iMPH-U",
+    "EventCode": "0x80",
+    "UMask": "0x01",
+    "EventName": "UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST",
+    "BriefDescription": "Cycles with at least one request outstanding is waiting for data return from memory controller. Account for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+    "PublicDescription": "Cycles with at least one request outstanding is waiting for data return from memory controller. Account for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+    "Counter": "0",
+    "CounterMask": "1",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  },
+  {
+    "Unit": "NCU",
+    "EventCode": "0x0",
+    "UMask": "0x01",
+    "EventName": "UNC_CLOCK.SOCKET",
+    "BriefDescription": "This 48-bit fixed counter counts the UCLK cycles",
+    "PublicDescription": "This 48-bit fixed counter counts the UCLK cycles.",
+    "Counter": "FIXED",
+    "CounterMask": "0",
+    "Invert": "0",
+    "EdgeDetect": "0"
+  }
+]
\ No newline at end of file
index eed0934..baa073f 100644 (file)
@@ -195,6 +195,7 @@ static struct map {
        { "CBO", "uncore_cbox" },
        { "QPI LL", "uncore_qpi" },
        { "SBO", "uncore_sbox" },
+       { "iMPH-U", "uncore_arb" },
        {}
 };
 
@@ -291,7 +292,9 @@ static void print_events_table_prefix(FILE *fp, const char *tblname)
 
 static int print_events_table_entry(void *data, char *name, char *event,
                                    char *desc, char *long_desc,
-                                   char *pmu, char *unit, char *perpkg)
+                                   char *pmu, char *unit, char *perpkg,
+                                   char *metric_expr,
+                                   char *metric_name)
 {
        struct perf_entry_data *pd = data;
        FILE *outfp = pd->outfp;
@@ -315,6 +318,10 @@ static int print_events_table_entry(void *data, char *name, char *event,
                fprintf(outfp, "\t.unit = \"%s\",\n", unit);
        if (perpkg)
                fprintf(outfp, "\t.perpkg = \"%s\",\n", perpkg);
+       if (metric_expr)
+               fprintf(outfp, "\t.metric_expr = \"%s\",\n", metric_expr);
+       if (metric_name)
+               fprintf(outfp, "\t.metric_name = \"%s\",\n", metric_name);
        fprintf(outfp, "},\n");
 
        return 0;
@@ -362,7 +369,9 @@ static char *real_event(const char *name, char *event)
 int json_events(const char *fn,
          int (*func)(void *data, char *name, char *event, char *desc,
                      char *long_desc,
-                     char *pmu, char *unit, char *perpkg),
+                     char *pmu, char *unit, char *perpkg,
+                     char *metric_expr,
+                     char *metric_name),
          void *data)
 {
        int err = -EIO;
@@ -388,6 +397,8 @@ int json_events(const char *fn,
                char *filter = NULL;
                char *perpkg = NULL;
                char *unit = NULL;
+               char *metric_expr = NULL;
+               char *metric_name = NULL;
                unsigned long long eventcode = 0;
                struct msrmap *msr = NULL;
                jsmntok_t *msrval = NULL;
@@ -398,6 +409,7 @@ int json_events(const char *fn,
                for (j = 0; j < obj->size; j += 2) {
                        jsmntok_t *field, *val;
                        int nz;
+                       char *s;
 
                        field = tok + j;
                        EXPECT(field->type == JSMN_STRING, tok + j,
@@ -444,7 +456,6 @@ int json_events(const char *fn,
                                        NULL);
                        } else if (json_streq(map, field, "Unit")) {
                                const char *ppmu;
-                               char *s;
 
                                ppmu = field_to_perf(unit_to_pmu, map, val);
                                if (ppmu) {
@@ -458,12 +469,19 @@ int json_events(const char *fn,
                                }
                                addfield(map, &desc, ". ", "Unit: ", NULL);
                                addfield(map, &desc, "", pmu, NULL);
+                               addfield(map, &desc, "", " ", NULL);
                        } else if (json_streq(map, field, "Filter")) {
                                addfield(map, &filter, "", "", val);
                        } else if (json_streq(map, field, "ScaleUnit")) {
                                addfield(map, &unit, "", "", val);
                        } else if (json_streq(map, field, "PerPkg")) {
                                addfield(map, &perpkg, "", "", val);
+                       } else if (json_streq(map, field, "MetricName")) {
+                               addfield(map, &metric_name, "", "", val);
+                       } else if (json_streq(map, field, "MetricExpr")) {
+                               addfield(map, &metric_expr, "", "", val);
+                               for (s = metric_expr; *s; s++)
+                                       *s = tolower(*s);
                        }
                        /* ignore unknown fields */
                }
@@ -488,7 +506,7 @@ int json_events(const char *fn,
                fixname(name);
 
                err = func(data, name, real_event(name, event), desc, long_desc,
-                               pmu, unit, perpkg);
+                               pmu, unit, perpkg, metric_expr, metric_name);
                free(event);
                free(desc);
                free(name);
@@ -498,6 +516,8 @@ int json_events(const char *fn,
                free(filter);
                free(perpkg);
                free(unit);
+               free(metric_expr);
+               free(metric_name);
                if (err)
                        break;
                tok += j;
index 71e13de..611fac0 100644 (file)
@@ -5,7 +5,8 @@ int json_events(const char *fn,
                int (*func)(void *data, char *name, char *event, char *desc,
                                char *long_desc,
                                char *pmu,
-                               char *unit, char *perpkg),
+                               char *unit, char *perpkg, char *metric_expr,
+                               char *metric_name),
                void *data);
 char *get_cpu_str(void);
 
index c669a3c..569eab3 100644 (file)
@@ -13,6 +13,8 @@ struct pmu_event {
        const char *pmu;
        const char *unit;
        const char *perpkg;
+       const char *metric_expr;
+       const char *metric_name;
 };
 
 /*
index 1cb3d9b..af58ebc 100644 (file)
@@ -38,6 +38,7 @@ perf-y += cpumap.o
 perf-y += stat.o
 perf-y += event_update.o
 perf-y += event-times.o
+perf-y += expr.o
 perf-y += backward-ring-buffer.o
 perf-y += sdt.o
 perf-y += is_printable_array.o
index 88dc51f..0dd7749 100644 (file)
  * permissions. All the event text files are stored there.
  */
 
+#include <errno.h>
+#include <inttypes.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
 #include "../perf.h"
 #include "util.h"
 #include <subcmd/exec-cmd.h>
index 42e892b..50f6d7a 100644 (file)
@@ -8,6 +8,7 @@
 #include <sys/prctl.h>
 #include "tests.h"
 #include "debug.h"
+#include <errno.h>
 
 #define NR_ITERS 111
 
index 1a04fe7..5876da1 100644 (file)
@@ -1,10 +1,14 @@
+#include <errno.h>
 #include <stdio.h>
 #include <sys/epoll.h>
+#include <sys/types.h>
+#include <sys/stat.h>
 #include <util/util.h>
 #include <util/bpf-loader.h>
 #include <util/evlist.h>
 #include <linux/bpf.h>
 #include <linux/filter.h>
+#include <linux/kernel.h>
 #include <api/fs/fs.h>
 #include <bpf/bpf.h>
 #include "tests.h"
index 83c4669..9e08d29 100644 (file)
@@ -3,8 +3,10 @@
  *
  * Builtin regression testing command: ever growing number of sanity tests
  */
+#include <errno.h>
 #include <unistd.h>
 #include <string.h>
+#include <sys/wait.h>
 #include "builtin.h"
 #include "hist.h"
 #include "intlist.h"
@@ -13,6 +15,7 @@
 #include "color.h"
 #include <subcmd/parse-options.h>
 #include "symbol.h"
+#include <linux/kernel.h>
 
 static bool dont_fork;
 
@@ -44,6 +47,10 @@ static struct test generic_tests[] = {
                .func = test__parse_events,
        },
        {
+               .desc = "Simple expression parser",
+               .func = test__expr,
+       },
+       {
                .desc = "PERF_RECORD_* events & perf_sample fields",
                .func = test__PERF_RECORD,
        },
@@ -460,7 +467,7 @@ static int perf_test__list(int argc, const char **argv)
        return 0;
 }
 
-int cmd_test(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_test(int argc, const char **argv)
 {
        const char *test_usage[] = {
        "perf test [<options>] [{list <test-name-fragment>|[<test-name-fragments>|<test-numbers>]}]",
index f853e24..c5bb220 100644 (file)
@@ -2,6 +2,7 @@
 #include "debug.h"
 #include "util.h"
 #include "c++/clang-c.h"
+#include <linux/kernel.h>
 
 static struct {
        int (*func)(void);
index d1f6930..1f14e76 100644 (file)
@@ -1,9 +1,12 @@
+#include <errno.h>
+#include <linux/kernel.h>
 #include <linux/types.h>
+#include <inttypes.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <stdio.h>
-#include <ctype.h>
 #include <string.h>
+#include <sys/param.h>
 
 #include "parse-events.h"
 #include "evlist.h"
@@ -16,6 +19,8 @@
 
 #include "tests.h"
 
+#include "sane_ctype.h"
+
 #define BUFSZ  1024
 #define READLEN        128
 
index f168a85..4478773 100644 (file)
@@ -66,7 +66,7 @@ static int process_event_cpus(struct perf_tool *tool __maybe_unused,
        TEST_ASSERT_VAL("wrong nr",  map->nr == 2);
        TEST_ASSERT_VAL("wrong cpu", map->map[0] == 1);
        TEST_ASSERT_VAL("wrong cpu", map->map[1] == 256);
-       TEST_ASSERT_VAL("wrong refcnt", atomic_read(&map->refcnt) == 1);
+       TEST_ASSERT_VAL("wrong refcnt", refcount_read(&map->refcnt) == 1);
        cpu_map__put(map);
        return 0;
 }
index 13725e0..8f08df5 100644 (file)
@@ -1,4 +1,6 @@
+#include <dirent.h>
 #include <stdlib.h>
+#include <linux/kernel.h>
 #include <linux/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
index 1046491..dfe5c89 100644 (file)
@@ -1,5 +1,6 @@
 #include <linux/compiler.h>
 #include <linux/types.h>
+#include <inttypes.h>
 #include <unistd.h>
 #include "tests.h"
 #include "debug.h"
index 19ef77b..634f20c 100644 (file)
@@ -1,5 +1,8 @@
 #include <linux/compiler.h>
+#include <errno.h>
+#include <inttypes.h>
 #include <string.h>
+#include <sys/wait.h>
 #include "tests.h"
 #include "evlist.h"
 #include "evsel.h"
index 60926a1..d2bea6f 100644 (file)
@@ -3,6 +3,8 @@
 #include "parse-events.h"
 #include "tests.h"
 #include "debug.h"
+#include <errno.h>
+#include <linux/kernel.h>
 
 static int perf_evsel__roundtrip_cache_name_test(void)
 {
diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c
new file mode 100644 (file)
index 0000000..6c6a374
--- /dev/null
@@ -0,0 +1,56 @@
+#include "util/debug.h"
+#include "util/expr.h"
+#include "tests.h"
+#include <stdlib.h>
+
+/*
+ * Parse expression @e against @ctx and check that it evaluates to @val2.
+ * Returns 0 when both checks pass; failures are reported through
+ * TEST_ASSERT_VAL().
+ */
+static int test(struct parse_ctx *ctx, const char *e, double val2)
+{
+       double result;
+       int parse_err = expr__parse(&result, ctx, &e);
+
+       TEST_ASSERT_VAL("parse test failed", parse_err == 0);
+       TEST_ASSERT_VAL("unexpected value", result == val2);
+       return 0;
+}
+
+/*
+ * Self-test for the simple expression parser: evaluates a handful of
+ * expressions with the identifiers FOO=1 and BAR=2, checks that invalid
+ * input makes expr__parse() return 1, and checks the identifier list
+ * produced by expr__find_other().
+ */
+int test__expr(int subtest __maybe_unused)
+{
+       static const struct {
+               const char *expr;
+               double      expected;
+       } cases[] = {
+               { "1+1",                        2 },
+               { "FOO+BAR",                    3 },
+               { "(BAR/2)%2",                  1 },
+               { "1 - -4",                     5 },
+               { "(FOO-1)*2 + (BAR/2)%2 - -4", 5 },
+       };
+       struct parse_ctx ctx;
+       const char **others;
+       const char *input;
+       double result;
+       int num_others;
+       int status = 0;
+       size_t i;
+
+       expr__ctx_init(&ctx);
+       expr__add_id(&ctx, "FOO", 1);
+       expr__add_id(&ctx, "BAR", 2);
+
+       /* Run every case even after a failure, then report once. */
+       for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
+               status |= test(&ctx, cases[i].expr, cases[i].expected);
+
+       if (status != 0)
+               return status;
+
+       input = "FOO/0";
+       status = expr__parse(&result, &ctx, &input);
+       TEST_ASSERT_VAL("division by zero", status == 1);
+
+       input = "BAR/";
+       status = expr__parse(&result, &ctx, &input);
+       TEST_ASSERT_VAL("missing operand", status == 1);
+
+       /* Every identifier except "FOO" is expected, NULL-terminated. */
+       status = expr__find_other("FOO + BAR + BAZ + BOZO", "FOO", &others, &num_others);
+       TEST_ASSERT_VAL("find other", status == 0);
+       TEST_ASSERT_VAL("find other", num_others == 3);
+       TEST_ASSERT_VAL("find other", strcmp(others[0], "BAR") == 0);
+       TEST_ASSERT_VAL("find other", strcmp(others[1], "BAZ") == 0);
+       TEST_ASSERT_VAL("find other", strcmp(others[2], "BOZO") == 0);
+       TEST_ASSERT_VAL("find other", others[3] == NULL);
+
+       /*
+        * NOTE(review): only the array itself is released here.  If
+        * expr__find_other() heap-allocates each name, those strings
+        * leak - confirm against its implementation.
+        */
+       free((void *)others);
+
+       return 0;
+}
index 6b21746..00b8dc5 100644 (file)
@@ -1,3 +1,4 @@
+#include <inttypes.h>
 #include "perf.h"
 #include "util/debug.h"
 #include "util/symbol.h"
@@ -7,6 +8,7 @@
 #include "util/machine.h"
 #include "util/thread.h"
 #include "tests/hists_common.h"
+#include <linux/kernel.h>
 
 static struct {
        u32 pid;
index 9fd54b7..70918b9 100644 (file)
@@ -9,6 +9,7 @@
 #include "util/parse-events.h"
 #include "tests/tests.h"
 #include "tests/hists_common.h"
+#include <linux/kernel.h>
 
 struct sample {
        u32 pid;
index 62efb14..f171b2d 100644 (file)
@@ -9,6 +9,7 @@
 #include "util/parse-events.h"
 #include "tests/tests.h"
 #include "tests/hists_common.h"
+#include <linux/kernel.h>
 
 struct sample {
        u32 pid;
index eddc740..a26cbb7 100644 (file)
@@ -9,6 +9,8 @@
 #include "thread.h"
 #include "parse-events.h"
 #include "hists_common.h"
+#include <errno.h>
+#include <linux/kernel.h>
 
 struct sample {
        u32 pid;
index 63c5efa..cdf0dde 100644 (file)
@@ -9,6 +9,7 @@
 #include "util/parse-events.h"
 #include "tests/tests.h"
 #include "tests/hists_common.h"
+#include <linux/kernel.h>
 
 struct sample {
        u32 cpu;
index 42e1339..a5192f6 100644 (file)
@@ -1,7 +1,8 @@
 #include <linux/compiler.h>
+#include <linux/kernel.h>
 #include "tests.h"
 #include "debug.h"
-#include "util.h"
+#include "print_binary.h"
 
 int test__is_printable_array(int subtest __maybe_unused)
 {
index 634bce9..15c7708 100644 (file)
@@ -1,3 +1,5 @@
+#include <errno.h>
+#include <inttypes.h>
 /* For the CLR_() macros */
 #include <pthread.h>
 
@@ -7,6 +9,7 @@
 #include "cpumap.h"
 #include "tests.h"
 #include <linux/err.h>
+#include <linux/kernel.h>
 
 /*
  * This test will generate random numbers of calls to some getpid syscalls,
index 0c5ce44..6ea4d8a 100644 (file)
@@ -1,3 +1,4 @@
+#include <inttypes.h>
 #include <unistd.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
@@ -11,6 +12,7 @@
 #include "thread_map.h"
 #include "symbol.h"
 #include "thread.h"
+#include "util.h"
 
 #define THREADS 4
 
index c8d9592..1a74dd9 100644 (file)
@@ -1,8 +1,14 @@
+#include <errno.h>
+#include <inttypes.h>
 /* For the CPU_* macros */
 #include <pthread.h>
 
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
 #include <api/fs/fs.h>
 #include <linux/err.h>
+#include <api/fs/tracing_path.h>
 #include "evsel.h"
 #include "tests.h"
 #include "thread_map.h"
index f52239f..9788fac 100644 (file)
@@ -5,6 +5,7 @@
 #include "thread_map.h"
 #include "tests.h"
 #include "debug.h"
+#include <errno.h>
 
 #ifndef O_DIRECTORY
 #define O_DIRECTORY    00200000
index d741412..e44506e 100644 (file)
@@ -1,5 +1,10 @@
+#include <errno.h>
+#include <inttypes.h>
 #include <api/fs/tracing_path.h>
 #include <linux/err.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
 #include "thread_map.h"
 #include "evsel.h"
 #include "debug.h"
index 1dc8380..7fad885 100644 (file)
@@ -1,4 +1,3 @@
-
 #include "parse-events.h"
 #include "evsel.h"
 #include "evlist.h"
@@ -6,8 +5,15 @@
 #include "tests.h"
 #include "debug.h"
 #include "util.h"
+#include <dirent.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <linux/kernel.h>
 #include <linux/hw_breakpoint.h>
 #include <api/fs/fs.h>
+#include <api/fs/tracing_path.h>
 
 #define PERF_TP_SAMPLE_TYPE (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME | \
                             PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD)
index 65dcf48..c6207db 100644 (file)
@@ -1,3 +1,4 @@
+#include <linux/kernel.h>
 #include <linux/types.h>
 #include <stddef.h>
 
index 87893f3..d37cd95 100644 (file)
@@ -1,3 +1,5 @@
+#include <errno.h>
+#include <inttypes.h>
 /* For the CLR_() macros */
 #include <pthread.h>
 
index 1e2ba26..a6d7aef 100644 (file)
@@ -2,6 +2,8 @@
 #include "pmu.h"
 #include "util.h"
 #include "tests.h"
+#include <errno.h>
+#include <linux/kernel.h>
 
 /* Simulated format definitions. */
 static struct test_format {
index 5f23710..bac5c38 100644 (file)
@@ -1,4 +1,6 @@
 #include <stdbool.h>
+#include <inttypes.h>
+#include <linux/kernel.h>
 #include <linux/types.h>
 
 #include "util.h"
index f59d210..f73b3c5 100644 (file)
@@ -1,3 +1,4 @@
+#include <errno.h>
 #include <stdio.h>
 #include <sys/epoll.h>
 #include <util/util.h>
@@ -43,7 +44,7 @@ static char *get_self_path(void)
 {
        char *buf = calloc(PATH_MAX, sizeof(char));
 
-       if (buf && readlink("/proc/self/exe", buf, PATH_MAX) < 0) {
+       if (buf && readlink("/proc/self/exe", buf, PATH_MAX - 1) < 0) {
                pr_debug("Failed to get correct path of perf\n");
                free(buf);
                return NULL;
index 4c9fd04..828494d 100644 (file)
@@ -1,3 +1,5 @@
+#include <errno.h>
+#include <inttypes.h>
 #include <unistd.h>
 #include <stdlib.h>
 #include <signal.h>
index 7ddbe26..65474fd 100644 (file)
@@ -1,5 +1,6 @@
 #include <sys/time.h>
 #include <sys/prctl.h>
+#include <errno.h>
 #include <time.h>
 #include <stdlib.h>
 
index 01a5ba2..32873ec 100644 (file)
@@ -4,6 +4,7 @@
 #include "cpumap.h"
 #include "tests.h"
 
+#include <errno.h>
 #include <signal.h>
 
 static int exited;
index 1fa9b9d..6318596 100644 (file)
@@ -62,6 +62,7 @@ int test__sample_parsing(int subtest);
 int test__keep_tracking(int subtest);
 int test__parse_no_sample_id_all(int subtest);
 int test__dwarf_unwind(int subtest);
+int test__expr(int subtest);
 int test__hists_filter(int subtest);
 int test__mmap_thread_lookup(int subtest);
 int test__thread_mg_share(int subtest);
index f2d2e54..a63d694 100644 (file)
@@ -29,7 +29,7 @@ int test__thread_map(int subtest __maybe_unused)
                        thread_map__comm(map, 0) &&
                        !strcmp(thread_map__comm(map, 0), NAME));
        TEST_ASSERT_VAL("wrong refcnt",
-                       atomic_read(&map->refcnt) == 1);
+                       refcount_read(&map->refcnt) == 1);
        thread_map__put(map);
 
        /* test dummy pid */
@@ -44,7 +44,7 @@ int test__thread_map(int subtest __maybe_unused)
                        thread_map__comm(map, 0) &&
                        !strcmp(thread_map__comm(map, 0), "dummy"));
        TEST_ASSERT_VAL("wrong refcnt",
-                       atomic_read(&map->refcnt) == 1);
+                       refcount_read(&map->refcnt) == 1);
        thread_map__put(map);
        return 0;
 }
@@ -71,7 +71,7 @@ static int process_event(struct perf_tool *tool __maybe_unused,
                        thread_map__comm(threads, 0) &&
                        !strcmp(thread_map__comm(threads, 0), NAME));
        TEST_ASSERT_VAL("wrong refcnt",
-                       atomic_read(&threads->refcnt) == 1);
+                       refcount_read(&threads->refcnt) == 1);
        thread_map__put(threads);
        return 0;
 }
index 188b631..76686dd 100644 (file)
@@ -43,7 +43,7 @@ int test__thread_mg_share(int subtest __maybe_unused)
                        leader && t1 && t2 && t3 && other);
 
        mg = leader->mg;
-       TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 4);
+       TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(&mg->refcnt), 4);
 
        /* test the map groups pointer is shared */
        TEST_ASSERT_VAL("map groups don't match", mg == t1->mg);
@@ -71,25 +71,25 @@ int test__thread_mg_share(int subtest __maybe_unused)
        machine__remove_thread(machine, other_leader);
 
        other_mg = other->mg;
-       TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&other_mg->refcnt), 2);
+       TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(&other_mg->refcnt), 2);
 
        TEST_ASSERT_VAL("map groups don't match", other_mg == other_leader->mg);
 
        /* release thread group */
        thread__put(leader);
-       TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 3);
+       TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(&mg->refcnt), 3);
 
        thread__put(t1);
-       TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 2);
+       TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(&mg->refcnt), 2);
 
        thread__put(t2);
-       TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 1);
+       TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(&mg->refcnt), 1);
 
        thread__put(t3);
 
        /* release other group  */
        thread__put(other_leader);
-       TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&other_mg->refcnt), 1);
+       TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(&other_mg->refcnt), 1);
 
        thread__put(other);
 
index 623c2aa..44589de 100644 (file)
@@ -1,7 +1,8 @@
+#include <inttypes.h>
 #include <linux/compiler.h>
 #include <linux/types.h>
 #include "tests.h"
-#include "util.h"
+#include "units.h"
 #include "debug.h"
 
 int test__unit_number__scnprint(int subtest __maybe_unused)
index 862b043..8456175 100644 (file)
@@ -1,5 +1,6 @@
 #include <linux/compiler.h>
 #include <linux/rbtree.h>
+#include <inttypes.h>
 #include <string.h>
 #include "map.h"
 #include "symbol.h"
diff --git a/tools/perf/trace/beauty/Build b/tools/perf/trace/beauty/Build
new file mode 100644 (file)
index 0000000..be95ac6
--- /dev/null
@@ -0,0 +1 @@
+libperf-y += statx.o
diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h
new file mode 100644 (file)
index 0000000..cf50be3
--- /dev/null
@@ -0,0 +1,24 @@
+#ifndef _PERF_TRACE_BEAUTY_H
+#define _PERF_TRACE_BEAUTY_H
+
+#include <linux/types.h>
+
+struct trace;
+struct thread;
+
+/*
+ * Argument descriptor handed to the syscall_arg__scnprintf_*() formatters
+ * declared below.
+ */
+struct syscall_arg {
+       unsigned long val;      /* value to format; NOTE(review): presumably the raw syscall argument - confirm */
+       struct thread *thread;
+       struct trace  *trace;
+       void          *parm;
+       u8            idx;
+       u8            mask;
+};
+
+size_t syscall_arg__scnprintf_statx_flags(char *bf, size_t size, struct syscall_arg *arg);
+#define SCA_STATX_FLAGS syscall_arg__scnprintf_statx_flags
+
+size_t syscall_arg__scnprintf_statx_mask(char *bf, size_t size, struct syscall_arg *arg);
+#define SCA_STATX_MASK syscall_arg__scnprintf_statx_mask
+
+#endif /* _PERF_TRACE_BEAUTY_H */
index d3b0b1f..fde8f2f 100644 (file)
@@ -1,3 +1,4 @@
+#include <signal.h>
 
 static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
 {
diff --git a/tools/perf/trace/beauty/statx.c b/tools/perf/trace/beauty/statx.c
new file mode 100644 (file)
index 0000000..5643b69
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ * trace/beauty/statx.c
+ *
+ *  Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+
+#include "trace/beauty/beauty.h"
+#include <linux/kernel.h>
+#include <sys/types.h>
+#include <uapi/linux/fcntl.h>
+#include <uapi/linux/stat.h>
+
+/*
+ * Format the flags value held in arg->val into @bf as a '|'-separated
+ * list of AT_* names with the "AT_" prefix dropped.  Zero prints as
+ * "SYNC_AS_STAT"; any bits left without a name are appended in hex.
+ * Returns the sum of the scnprintf() return values.
+ */
+size_t syscall_arg__scnprintf_statx_flags(char *bf, size_t size, struct syscall_arg *arg)
+{
+       static const struct {
+               int         bit;
+               const char *name;
+       } known[] = {
+               { AT_SYMLINK_NOFOLLOW, "SYMLINK_NOFOLLOW" },
+               { AT_REMOVEDIR,        "REMOVEDIR"        },
+               { AT_SYMLINK_FOLLOW,   "SYMLINK_FOLLOW"   },
+               { AT_NO_AUTOMOUNT,     "NO_AUTOMOUNT"     },
+               { AT_EMPTY_PATH,       "EMPTY_PATH"       },
+               { AT_STATX_FORCE_SYNC, "STATX_FORCE_SYNC" },
+               { AT_STATX_DONT_SYNC,  "STATX_DONT_SYNC"  },
+       };
+       int printed = 0, flags = arg->val;
+       size_t i;
+
+       if (flags == 0)
+               return scnprintf(bf, size, "SYNC_AS_STAT");
+
+       for (i = 0; i < ARRAY_SIZE(known); i++) {
+               if (!(flags & known[i].bit))
+                       continue;
+
+               printed += scnprintf(bf + printed, size - printed, "%s%s",
+                                    printed ? "|" : "", known[i].name);
+               /* Clear the bit so only unnamed ones remain for the hex tail. */
+               flags &= ~known[i].bit;
+       }
+
+       if (flags)
+               printed += scnprintf(bf + printed, size - printed, "%s%#x",
+                                    printed ? "|" : "", flags);
+
+       return printed;
+}
+
+/*
+ * Format the mask value held in arg->val into @bf as a '|'-separated
+ * list of STATX_* names with the "STATX_" prefix dropped; any bits left
+ * without a name are appended in hex.  Returns the sum of the scnprintf()
+ * return values.
+ */
+size_t syscall_arg__scnprintf_statx_mask(char *bf, size_t size, struct syscall_arg *arg)
+{
+       static const struct {
+               unsigned int bit;
+               const char  *name;
+       } known[] = {
+               { STATX_TYPE,   "TYPE"   },
+               { STATX_MODE,   "MODE"   },
+               { STATX_NLINK,  "NLINK"  },
+               { STATX_UID,    "UID"    },
+               { STATX_GID,    "GID"    },
+               { STATX_ATIME,  "ATIME"  },
+               { STATX_MTIME,  "MTIME"  },
+               { STATX_CTIME,  "CTIME"  },
+               { STATX_INO,    "INO"    },
+               { STATX_SIZE,   "SIZE"   },
+               { STATX_BLOCKS, "BLOCKS" },
+               { STATX_BTIME,  "BTIME"  },
+       };
+       int printed = 0, flags = arg->val;
+       size_t i;
+
+       for (i = 0; i < ARRAY_SIZE(known); i++) {
+               if (!(flags & known[i].bit))
+                       continue;
+
+               printed += scnprintf(bf + printed, size - printed, "%s%s",
+                                    printed ? "|" : "", known[i].name);
+               /* Clear the bit so only unnamed ones remain for the hex tail. */
+               flags &= ~known[i].bit;
+       }
+
+       if (flags)
+               printed += scnprintf(bf + printed, size - printed, "%s%#x",
+                                    printed ? "|" : "", flags);
+
+       return printed;
+}
index 3eb3edb..a4d3762 100644 (file)
@@ -1,4 +1,5 @@
 #include "../util.h"
+#include "../string2.h"
 #include "../config.h"
 #include "../../perf.h"
 #include "libslang.h"
@@ -13,6 +14,7 @@
 #include "helpline.h"
 #include "keysyms.h"
 #include "../color.h"
+#include "sane_ctype.h"
 
 static int ui_browser__percent_color(struct ui_browser *browser,
                                     double percent, bool current)
@@ -579,7 +581,7 @@ static int ui_browser__color_config(const char *var, const char *value,
                        break;
 
                *bg = '\0';
-               while (isspace(*++bg));
+               bg = ltrim(++bg);
                ui_browser__colorsets[i].bg = bg;
                ui_browser__colorsets[i].fg = fg;
                return 0;
index ba36aac..d990ad0 100644 (file)
@@ -9,7 +9,10 @@
 #include "../../util/symbol.h"
 #include "../../util/evsel.h"
 #include "../../util/config.h"
+#include <inttypes.h>
 #include <pthread.h>
+#include <linux/kernel.h>
+#include <sys/ttydefaults.h>
 
 struct disasm_line_samples {
        double          percent;
index edbeaaf..e2c9390 100644 (file)
@@ -8,6 +8,8 @@
 #include "util/header.h"
 #include "util/session.h"
 
+#include <sys/ttydefaults.h>
+
 static void ui_browser__argv_write(struct ui_browser *browser,
                                   void *entry, int row)
 {
index fc4fb66..69f4570 100644 (file)
@@ -1,7 +1,11 @@
+#include <dirent.h>
+#include <errno.h>
+#include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <linux/rbtree.h>
+#include <sys/ttydefaults.h>
 
 #include "../../util/evsel.h"
 #include "../../util/evlist.h"
@@ -10,6 +14,7 @@
 #include "../../util/sort.h"
 #include "../../util/util.h"
 #include "../../util/top.h"
+#include "../../util/thread.h"
 #include "../../arch/common.h"
 
 #include "../browsers/hists.h"
 #include "../ui.h"
 #include "map.h"
 #include "annotate.h"
+#include "srcline.h"
+#include "string2.h"
+#include "units.h"
+
+#include "sane_ctype.h"
 
 extern void hist_browser__init_hpp(void);
 
@@ -144,9 +154,60 @@ static void callchain_list__set_folding(struct callchain_list *cl, bool unfold)
        cl->unfolded = unfold ? cl->has_children : false;
 }
 
+/*
+ * Build the inline node for address @ip in @map.  Returns NULL when @map
+ * or its dso is missing, or when the dso is not DSO_TYPE_USER; otherwise
+ * returns the result of dso__parse_addr_inlines() for the address
+ * translated by map__rip_2objdump().
+ */
+static struct inline_node *inline_node__create(struct map *map, u64 ip)
+{
+       struct dso *dso = map ? map->dso : NULL;
+
+       if (!dso || dso->kernel != DSO_TYPE_USER)
+               return NULL;
+
+       return dso__parse_addr_inlines(dso, map__rip_2objdump(map, ip));
+}
+
+/*
+ * Count the entries on @node's list that carry a filename or a function
+ * name.  A NULL @node counts as zero rows.
+ */
+static int inline__count_rows(struct inline_node *node)
+{
+       struct inline_list *entry;
+       int rows = 0;
+
+       if (!node)
+               return rows;
+
+       list_for_each_entry(entry, &node->val, list) {
+               /* Entries with neither piece of information are skipped. */
+               if (!entry->filename && !entry->funcname)
+                       continue;
+               rows++;
+       }
+
+       return rows;
+}
+
+/*
+ * Number of inline rows for one callchain entry: a temporary inline node
+ * is created from chain->ms.map and chain->ip, counted, and deleted again.
+ * Returns 0 when no node could be created.
+ */
+static int callchain_list__inline_rows(struct callchain_list *chain)
+{
+       struct inline_node *tmp = inline_node__create(chain->ms.map, chain->ip);
+       int count = 0;
+
+       if (tmp != NULL) {
+               count = inline__count_rows(tmp);
+               inline_node__delete(tmp);
+       }
+
+       return count;
+}
+
 static int callchain_node__count_rows_rb_tree(struct callchain_node *node)
 {
-       int n = 0;
+       int n = 0, inline_rows;
        struct rb_node *nd;
 
        for (nd = rb_first(&node->rb_root); nd; nd = rb_next(nd)) {
@@ -156,6 +217,13 @@ static int callchain_node__count_rows_rb_tree(struct callchain_node *node)
 
                list_for_each_entry(chain, &child->val, list) {
                        ++n;
+
+                       if (symbol_conf.inline_name) {
+                               inline_rows =
+                                       callchain_list__inline_rows(chain);
+                               n += inline_rows;
+                       }
+
                        /* We need this because we may not have children */
                        folded_sign = callchain_list__folded(chain);
                        if (folded_sign == '+')
@@ -207,7 +275,7 @@ static int callchain_node__count_rows(struct callchain_node *node)
 {
        struct callchain_list *chain;
        bool unfolded = false;
-       int n = 0;
+       int n = 0, inline_rows;
 
        if (callchain_param.mode == CHAIN_FLAT)
                return callchain_node__count_flat_rows(node);
@@ -216,6 +284,11 @@ static int callchain_node__count_rows(struct callchain_node *node)
 
        list_for_each_entry(chain, &node->val, list) {
                ++n;
+               if (symbol_conf.inline_name) {
+                       inline_rows = callchain_list__inline_rows(chain);
+                       n += inline_rows;
+               }
+
                unfolded = chain->unfolded;
        }
 
@@ -362,6 +435,19 @@ static void hist_entry__init_have_children(struct hist_entry *he)
        he->init_have_children = true;
 }
 
+/*
+ * Attach an inline node to @he on first use.
+ *
+ * Does nothing when he->inline_node is already set.  Otherwise the node is
+ * created from he->ms.map and he->ip; when that succeeds the entry is marked
+ * as having children.
+ */
+static void hist_entry_init_inline_node(struct hist_entry *he)
+{
+       if (he->inline_node)
+               return;
+
+       he->inline_node = inline_node__create(he->ms.map, he->ip);
+
+       /* No inline information: leave has_children untouched. */
+       if (he->inline_node == NULL)
+               return;
+
+       he->has_children = true;
+}
+
 static bool hist_browser__toggle_fold(struct hist_browser *browser)
 {
        struct hist_entry *he = browser->he_selection;
@@ -393,7 +479,12 @@ static bool hist_browser__toggle_fold(struct hist_browser *browser)
 
                if (he->unfolded) {
                        if (he->leaf)
-                               he->nr_rows = callchain__count_rows(&he->sorted_chain);
+                               if (he->inline_node)
+                                       he->nr_rows = inline__count_rows(
+                                                       he->inline_node);
+                               else
+                                       he->nr_rows = callchain__count_rows(
+                                                       &he->sorted_chain);
                        else
                                he->nr_rows = hierarchy_count_rows(browser, he, false);
 
@@ -753,6 +844,71 @@ static bool hist_browser__check_dump_full(struct hist_browser *browser __maybe_u
 
 #define LEVEL_OFFSET_STEP 3
 
+/*
+ * Draw the entries of @node, one per browser row, starting at @row.
+ *
+ * Entries with neither a filename nor a funcname are skipped.  Each drawn
+ * line is indented by LEVEL_OFFSET_STEP + @offset blanks and marked with
+ * " (inline)".
+ *
+ * Returns the number of rows drawn.
+ */
+static int hist_browser__show_inline(struct hist_browser *browser,
+                                    struct inline_node *node,
+                                    unsigned short row,
+                                    int offset)
+{
+       struct inline_list *ilist;
+       char buf[1024];
+       int color, width, first_row;
+
+       first_row = row;
+       /*
+        * NOTE(review): width subtracts a fixed LEVEL_OFFSET_STEP + 2 while the
+        * indent written below is LEVEL_OFFSET_STEP + @offset - confirm the
+        * text cannot run past the browser width for larger offsets.
+        */
+       width = browser->b.width - (LEVEL_OFFSET_STEP + 2);
+       list_for_each_entry(ilist, &node->val, list) {
+               if ((ilist->filename != NULL) || (ilist->funcname != NULL)) {
+                       color = HE_COLORSET_NORMAL;
+                       if (ui_browser__is_current_entry(&browser->b, row))
+                               color = HE_COLORSET_SELECTED;
+
+                       /*
+                        * With the address/srcline callchain keys show
+                        * file:line; otherwise prefer the function name and
+                        * fall back to file:line.
+                        */
+                       if (callchain_param.key == CCKEY_ADDRESS ||
+                           callchain_param.key == CCKEY_SRCLINE) {
+                               if (ilist->filename != NULL)
+                                       scnprintf(buf, sizeof(buf),
+                                                 "%s:%d (inline)",
+                                                 ilist->filename,
+                                                 ilist->line_nr);
+                               else
+                                       scnprintf(buf, sizeof(buf), "??");
+                       } else if (ilist->funcname != NULL)
+                               scnprintf(buf, sizeof(buf), "%s (inline)",
+                                         ilist->funcname);
+                       else if (ilist->filename != NULL)
+                               scnprintf(buf, sizeof(buf),
+                                         "%s:%d (inline)",
+                                         ilist->filename,
+                                         ilist->line_nr);
+                       else
+                               /* Not reached: the enclosing test guarantees one of the two. */
+                               scnprintf(buf, sizeof(buf), "??");
+
+                       ui_browser__set_color(&browser->b, color);
+                       hist_browser__gotorc(browser, row, 0);
+                       ui_browser__write_nstring(&browser->b, " ",
+                               LEVEL_OFFSET_STEP + offset);
+                       ui_browser__write_nstring(&browser->b, buf, width);
+                       row++;
+               }
+       }
+
+       return row - first_row;
+}
+
+/*
+ * Draw the inline frames found for @ip in @map, starting at browser row @row.
+ *
+ * A temporary inline node is created, drawn with hist_browser__show_inline()
+ * and deleted again.  Returns the number of rows drawn, 0 when no inline node
+ * could be created.
+ */
+static size_t show_inline_list(struct hist_browser *browser, struct map *map,
+                              u64 ip, int row, int offset)
+{
+       struct inline_node *node;
+       int ret;
+
+       node = inline_node__create(map, ip);
+       if (node == NULL)
+               return 0;
+
+       ret = hist_browser__show_inline(browser, node, row, offset);
+
+       inline_node__delete(node);
+       return ret;
+}
+
 static int hist_browser__show_callchain_list(struct hist_browser *browser,
                                             struct callchain_node *node,
                                             struct callchain_list *chain,
@@ -764,6 +920,7 @@ static int hist_browser__show_callchain_list(struct hist_browser *browser,
        char bf[1024], *alloc_str;
        char buf[64], *alloc_str2;
        const char *str;
+       int inline_rows = 0, ret = 1;
 
        if (arg->row_offset != 0) {
                arg->row_offset--;
@@ -801,10 +958,15 @@ static int hist_browser__show_callchain_list(struct hist_browser *browser,
        }
 
        print(browser, chain, str, offset, row, arg);
-
        free(alloc_str);
        free(alloc_str2);
-       return 1;
+
+       if (symbol_conf.inline_name) {
+               inline_rows = show_inline_list(browser, chain->ms.map,
+                                              chain->ip, row + 1, offset);
+       }
+
+       return ret + inline_rows;
 }
 
 static bool check_percent_display(struct rb_node *node, u64 parent_total)
@@ -1228,6 +1390,12 @@ static int hist_browser__show_entry(struct hist_browser *browser,
                folded_sign = hist_entry__folded(entry);
        }
 
+       if (symbol_conf.inline_name &&
+           (!entry->has_children)) {
+               hist_entry_init_inline_node(entry);
+               folded_sign = hist_entry__folded(entry);
+       }
+
        if (row_offset == 0) {
                struct hpp_arg arg = {
                        .b              = &browser->b,
@@ -1259,7 +1427,8 @@ static int hist_browser__show_entry(struct hist_browser *browser,
                        }
 
                        if (first) {
-                               if (symbol_conf.use_callchain) {
+                               if (symbol_conf.use_callchain ||
+                                       symbol_conf.inline_name) {
                                        ui_browser__printf(&browser->b, "%c ", folded_sign);
                                        width -= 2;
                                }
@@ -1301,8 +1470,14 @@ static int hist_browser__show_entry(struct hist_browser *browser,
                        .is_current_entry = current_entry,
                };
 
-               printed += hist_browser__show_callchain(browser, entry, 1, row,
-                                       hist_browser__show_callchain_entry, &arg,
+               if (entry->inline_node)
+                       printed += hist_browser__show_inline(browser,
+                                       entry->inline_node, row, 0);
+               else
+                       printed += hist_browser__show_callchain(browser,
+                                       entry, 1, row,
+                                       hist_browser__show_callchain_entry,
+                                       &arg,
                                        hist_browser__check_output_full);
        }
 
@@ -2308,7 +2483,7 @@ static int switch_data_file(void)
                return ret;
 
        memset(options, 0, sizeof(options));
-       memset(options, 0, sizeof(abs_path));
+       memset(abs_path, 0, sizeof(abs_path));
 
        while ((dent = readdir(pwd_dir))) {
                char path[PATH_MAX];
index 9ce142d..ffa5add 100644 (file)
@@ -11,6 +11,8 @@
 #include "../keysyms.h"
 #include "map.h"
 
+#include "sane_ctype.h"
+
 struct map_browser {
        struct ui_browser b;
        struct map        *map;
index 8c9308a..e99ba86 100644 (file)
@@ -3,7 +3,8 @@
 #include "util/annotate.h"
 #include "util/evsel.h"
 #include "ui/helpline.h"
-
+#include <inttypes.h>
+#include <signal.h>
 
 enum {
        ANN_COL__PERCENT,
index a4f02de..e24f839 100644 (file)
@@ -4,7 +4,9 @@
 #include "../sort.h"
 #include "../hist.h"
 #include "../helpline.h"
+#include "../string2.h"
 #include "gtk.h"
+#include <signal.h>
 
 #define MAX_COLUMNS                    32
 
index 5d632dc..59addd5 100644 (file)
@@ -1,3 +1,4 @@
+#include <inttypes.h>
 #include <math.h>
 #include <linux/compiler.h>
 
index 50d13e5..5ea0b40 100644 (file)
@@ -4,6 +4,7 @@
 #include "../util/cache.h"
 #include "../util/debug.h"
 #include "../util/hist.h"
+#include "../util/util.h"
 
 pthread_mutex_t ui__lock = PTHREAD_MUTEX_INITIALIZER;
 void *perf_gtk_handle;
index 668f4ae..42e432b 100644 (file)
@@ -4,7 +4,10 @@
 #include "../../util/hist.h"
 #include "../../util/sort.h"
 #include "../../util/evsel.h"
-
+#include "../../util/srcline.h"
+#include "../../util/string2.h"
+#include "../../util/thread.h"
+#include "../../util/sane_ctype.h"
 
 static size_t callchain__fprintf_left_margin(FILE *fp, int left_margin)
 {
@@ -17,6 +20,67 @@ static size_t callchain__fprintf_left_margin(FILE *fp, int left_margin)
        return ret;
 }
 
+/*
+ * Print the inline frames found for @ip in @map to @fp, one per line.
+ *
+ * Each line starts with the left margin, followed by one column per @depth
+ * level ("|" when the matching bit of @depth_mask is set, a blank otherwise)
+ * and then the frame text marked with " (inline)".
+ *
+ * Nothing is printed, and 0 is returned, when @map is NULL, has no dso, the
+ * dso is not of type DSO_TYPE_USER, or no inline node could be parsed.
+ * Otherwise the return value is the sum of what the individual print calls
+ * returned.
+ */
+static size_t inline__fprintf(struct map *map, u64 ip, int left_margin,
+                             int depth, int depth_mask, FILE *fp)
+{
+       struct dso *dso;
+       struct inline_node *node;
+       struct inline_list *ilist;
+       int ret = 0, i;
+
+       if (map == NULL)
+               return 0;
+
+       dso = map->dso;
+       if (dso == NULL)
+               return 0;
+
+       if (dso->kernel != DSO_TYPE_USER)
+               return 0;
+
+       node = dso__parse_addr_inlines(dso,
+                                      map__rip_2objdump(map, ip));
+       if (node == NULL)
+               return 0;
+
+       list_for_each_entry(ilist, &node->val, list) {
+               if ((ilist->filename != NULL) || (ilist->funcname != NULL)) {
+                       ret += callchain__fprintf_left_margin(fp, left_margin);
+
+                       /* One graph column per depth level. */
+                       for (i = 0; i < depth; i++) {
+                               if (depth_mask & (1 << i))
+                                       ret += fprintf(fp, "|");
+                               else
+                                       ret += fprintf(fp, " ");
+                               ret += fprintf(fp, "          ");
+                       }
+
+                       /*
+                        * With the address/srcline callchain keys show
+                        * file:line; otherwise prefer the function name and
+                        * fall back to file:line.
+                        */
+                       if (callchain_param.key == CCKEY_ADDRESS ||
+                           callchain_param.key == CCKEY_SRCLINE) {
+                               if (ilist->filename != NULL)
+                                       ret += fprintf(fp, "%s:%d (inline)",
+                                                      ilist->filename,
+                                                      ilist->line_nr);
+                               else
+                                       ret += fprintf(fp, "??");
+                       } else if (ilist->funcname != NULL)
+                               ret += fprintf(fp, "%s (inline)",
+                                              ilist->funcname);
+                       else if (ilist->filename != NULL)
+                               ret += fprintf(fp, "%s:%d (inline)",
+                                              ilist->filename,
+                                              ilist->line_nr);
+                       else
+                               /* Not reached: the enclosing test guarantees one of the two. */
+                               ret += fprintf(fp, "??");
+
+                       ret += fprintf(fp, "\n");
+               }
+       }
+
+       inline_node__delete(node);
+       return ret;
+}
+
 static size_t ipchain__fprintf_graph_line(FILE *fp, int depth, int depth_mask,
                                          int left_margin)
 {
@@ -78,6 +142,10 @@ static size_t ipchain__fprintf_graph(FILE *fp, struct callchain_node *node,
        fputs(str, fp);
        fputc('\n', fp);
        free(alloc_str);
+
+       if (symbol_conf.inline_name)
+               ret += inline__fprintf(chain->ms.map, chain->ip,
+                                      left_margin, depth, depth_mask, fp);
        return ret;
 }
 
@@ -229,6 +297,7 @@ static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root,
                        if (!i++ && field_order == NULL &&
                            sort_order && !prefixcmp(sort_order, "sym"))
                                continue;
+
                        if (!printed) {
                                ret += callchain__fprintf_left_margin(fp, left_margin);
                                ret += fprintf(fp, "|\n");
@@ -251,6 +320,13 @@ static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root,
 
                        if (++entries_printed == callchain_param.print_limit)
                                break;
+
+                       if (symbol_conf.inline_name)
+                               ret += inline__fprintf(chain->ms.map,
+                                                      chain->ip,
+                                                      left_margin,
+                                                      0, 0,
+                                                      fp);
                }
                root = &cnode->rb_root;
        }
@@ -529,6 +605,8 @@ static int hist_entry__fprintf(struct hist_entry *he, size_t size,
                               bool use_callchain)
 {
        int ret;
+       int callchain_ret = 0;
+       int inline_ret = 0;
        struct perf_hpp hpp = {
                .buf            = bf,
                .size           = size,
@@ -547,7 +625,16 @@ static int hist_entry__fprintf(struct hist_entry *he, size_t size,
        ret = fprintf(fp, "%s\n", bf);
 
        if (use_callchain)
-               ret += hist_entry_callchain__fprintf(he, total_period, 0, fp);
+               callchain_ret = hist_entry_callchain__fprintf(he, total_period,
+                                                             0, fp);
+
+       if (callchain_ret == 0 && symbol_conf.inline_name) {
+               inline_ret = inline__fprintf(he->ms.map, he->ip, 0, 0, 0, fp);
+               ret += inline_ret;
+               if (inline_ret > 0)
+                       ret += fprintf(fp, "\n");
+       } else
+               ret += callchain_ret;
 
        return ret;
 }
index 4ea2ba8..d9350a1 100644 (file)
@@ -1,6 +1,7 @@
 #include <errno.h>
 #include <signal.h>
 #include <stdbool.h>
+#include <linux/kernel.h>
 #ifdef HAVE_BACKTRACE_SUPPORT
 #include <execinfo.h>
 #endif
index 5da376b..069583b 100644 (file)
@@ -1,4 +1,3 @@
-libperf-y += alias.o
 libperf-y += annotate.o
 libperf-y += block-range.o
 libperf-y += build-id.o
@@ -17,6 +16,7 @@ libperf-y += llvm-utils.o
 libperf-y += parse-events.o
 libperf-y += perf_regs.o
 libperf-y += path.o
+libperf-y += print_binary.o
 libperf-y += rbtree.o
 libperf-y += libstring.o
 libperf-y += bitmap.o
@@ -42,6 +42,7 @@ libperf-y += pstack.o
 libperf-y += session.o
 libperf-$(CONFIG_AUDIT) += syscalltbl.o
 libperf-y += ordered-events.o
+libperf-y += namespaces.o
 libperf-y += comm.o
 libperf-y += thread.o
 libperf-y += thread_map.o
@@ -81,13 +82,16 @@ libperf-$(CONFIG_AUXTRACE) += intel-pt-decoder/
 libperf-$(CONFIG_AUXTRACE) += intel-pt.o
 libperf-$(CONFIG_AUXTRACE) += intel-bts.o
 libperf-y += parse-branch-options.o
+libperf-y += dump-insn.o
 libperf-y += parse-regs-options.o
 libperf-y += term.o
 libperf-y += help-unknown-cmd.o
 libperf-y += mem-events.o
 libperf-y += vsprintf.o
 libperf-y += drv_configs.o
+libperf-y += units.o
 libperf-y += time-utils.o
+libperf-y += expr-bison.o
 
 libperf-$(CONFIG_LIBBPF) += bpf-loader.o
 libperf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o
@@ -140,6 +144,10 @@ $(OUTPUT)util/parse-events-bison.c: util/parse-events.y
        $(call rule_mkdir)
        $(Q)$(call echo-cmd,bison)$(BISON) -v util/parse-events.y -d $(PARSER_DEBUG_BISON) -o $@ -p parse_events_
 
+$(OUTPUT)util/expr-bison.c: util/expr.y
+       $(call rule_mkdir)
+       $(Q)$(call echo-cmd,bison)$(BISON) -v util/expr.y -d $(PARSER_DEBUG_BISON) -o $@ -p expr__
+
 $(OUTPUT)util/pmu-flex.c: util/pmu.l $(OUTPUT)util/pmu-bison.c
        $(call rule_mkdir)
        $(Q)$(call echo-cmd,flex)$(FLEX) -o $@ --header-file=$(OUTPUT)util/pmu-flex.h util/pmu.l
@@ -152,6 +160,7 @@ CFLAGS_parse-events-flex.o  += -w
 CFLAGS_pmu-flex.o           += -w
 CFLAGS_parse-events-bison.o += -DYYENABLE_NLS=0 -w
 CFLAGS_pmu-bison.o          += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w
+CFLAGS_expr-bison.o         += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w
 
 $(OUTPUT)util/parse-events.o: $(OUTPUT)util/parse-events-flex.c $(OUTPUT)util/parse-events-bison.c
 $(OUTPUT)util/pmu.o: $(OUTPUT)util/pmu-flex.c $(OUTPUT)util/pmu-bison.c
diff --git a/tools/perf/util/alias.c b/tools/perf/util/alias.c
deleted file mode 100644 (file)
index 6455471..0000000
+++ /dev/null
@@ -1,78 +0,0 @@
-#include "cache.h"
-#include "util.h"
-#include "config.h"
-
-static const char *alias_key;
-static char *alias_val;
-
-static int alias_lookup_cb(const char *k, const char *v,
-                          void *cb __maybe_unused)
-{
-       if (!prefixcmp(k, "alias.") && !strcmp(k+6, alias_key)) {
-               if (!v)
-                       return config_error_nonbool(k);
-               alias_val = strdup(v);
-               return 0;
-       }
-       return 0;
-}
-
-char *alias_lookup(const char *alias)
-{
-       alias_key = alias;
-       alias_val = NULL;
-       perf_config(alias_lookup_cb, NULL);
-       return alias_val;
-}
-
-int split_cmdline(char *cmdline, const char ***argv)
-{
-       int src, dst, count = 0, size = 16;
-       char quoted = 0;
-
-       *argv = malloc(sizeof(char*) * size);
-
-       /* split alias_string */
-       (*argv)[count++] = cmdline;
-       for (src = dst = 0; cmdline[src];) {
-               char c = cmdline[src];
-               if (!quoted && isspace(c)) {
-                       cmdline[dst++] = 0;
-                       while (cmdline[++src]
-                                       && isspace(cmdline[src]))
-                               ; /* skip */
-                       if (count >= size) {
-                               size += 16;
-                               *argv = realloc(*argv, sizeof(char*) * size);
-                       }
-                       (*argv)[count++] = cmdline + dst;
-               } else if (!quoted && (c == '\'' || c == '"')) {
-                       quoted = c;
-                       src++;
-               } else if (c == quoted) {
-                       quoted = 0;
-                       src++;
-               } else {
-                       if (c == '\\' && quoted != '\'') {
-                               src++;
-                               c = cmdline[src];
-                               if (!c) {
-                                       zfree(argv);
-                                       return error("cmdline ends with \\");
-                               }
-                       }
-                       cmdline[dst++] = c;
-                       src++;
-               }
-       }
-
-       cmdline[dst] = 0;
-
-       if (quoted) {
-               zfree(argv);
-               return error("unclosed quote");
-       }
-
-       return count;
-}
-
index 273f21f..683f834 100644 (file)
@@ -7,6 +7,8 @@
  * Released under the GPL v2. (and only v2, not any later version)
  */
 
+#include <errno.h>
+#include <inttypes.h>
 #include "util.h"
 #include "ui/ui.h"
 #include "sort.h"
 #include "annotate.h"
 #include "evsel.h"
 #include "block-range.h"
+#include "string2.h"
 #include "arch/common.h"
 #include <regex.h>
 #include <pthread.h>
 #include <linux/bitops.h>
+#include <linux/kernel.h>
 #include <sys/utsname.h>
 
+#include "sane_ctype.h"
+
 const char     *disassembler_style;
 const char     *objdump_path;
 static regex_t  file_lineno;
@@ -108,6 +114,7 @@ static int arch__associate_ins_ops(struct arch* arch, const char *name, struct i
 #include "arch/arm64/annotate/instructions.c"
 #include "arch/x86/annotate/instructions.c"
 #include "arch/powerpc/annotate/instructions.c"
+#include "arch/s390/annotate/instructions.c"
 
 static struct arch architectures[] = {
        {
@@ -130,6 +137,13 @@ static struct arch architectures[] = {
                .name = "powerpc",
                .init = powerpc__annotate_init,
        },
+       {
+               .name = "s390",
+               .init = s390__annotate_init,
+               .objdump =  {
+                       .comment_char = '#',
+               },
+       },
 };
 
 static void ins__delete(struct ins_operands *ops)
@@ -379,9 +393,7 @@ static int mov__parse(struct arch *arch, struct ins_operands *ops, struct map *m
        if (comment == NULL)
                return 0;
 
-       while (comment[0] != '\0' && isspace(comment[0]))
-               ++comment;
-
+       comment = ltrim(comment);
        comment__symbol(ops->source.raw, comment, &ops->source.addr, &ops->source.name);
        comment__symbol(ops->target.raw, comment, &ops->target.addr, &ops->target.name);
 
@@ -426,9 +438,7 @@ static int dec__parse(struct arch *arch __maybe_unused, struct ins_operands *ops
        if (comment == NULL)
                return 0;
 
-       while (comment[0] != '\0' && isspace(comment[0]))
-               ++comment;
-
+       comment = ltrim(comment);
        comment__symbol(ops->target.raw, comment, &ops->target.addr, &ops->target.name);
 
        return 0;
@@ -777,10 +787,7 @@ static void disasm_line__init_ins(struct disasm_line *dl, struct arch *arch, str
 
 static int disasm_line__parse(char *line, const char **namep, char **rawp)
 {
-       char *name = line, tmp;
-
-       while (isspace(name[0]))
-               ++name;
+       char tmp, *name = ltrim(line);
 
        if (name[0] == '\0')
                return -1;
@@ -798,12 +805,7 @@ static int disasm_line__parse(char *line, const char **namep, char **rawp)
                goto out_free_name;
 
        (*rawp)[0] = tmp;
-
-       if ((*rawp)[0] != '\0') {
-               (*rawp)++;
-               while (isspace((*rawp)[0]))
-                       ++(*rawp);
-       }
+       *rawp = ltrim(*rawp);
 
        return 0;
 
@@ -1148,7 +1150,7 @@ static int symbol__parse_objdump_line(struct symbol *sym, struct map *map,
 {
        struct annotation *notes = symbol__annotation(sym);
        struct disasm_line *dl;
-       char *line = NULL, *parsed_line, *tmp, *tmp2, *c;
+       char *line = NULL, *parsed_line, *tmp, *tmp2;
        size_t line_len;
        s64 line_ip, offset = -1;
        regmatch_t match[2];
@@ -1159,32 +1161,16 @@ static int symbol__parse_objdump_line(struct symbol *sym, struct map *map,
        if (!line)
                return -1;
 
-       while (line_len != 0 && isspace(line[line_len - 1]))
-               line[--line_len] = '\0';
-
-       c = strchr(line, '\n');
-       if (c)
-               *c = 0;
-
        line_ip = -1;
-       parsed_line = line;
+       parsed_line = rtrim(line);
 
        /* /filename:linenr ? Save line number and ignore. */
-       if (regexec(&file_lineno, line, 2, match, 0) == 0) {
-               *line_nr = atoi(line + match[1].rm_so);
+       if (regexec(&file_lineno, parsed_line, 2, match, 0) == 0) {
+               *line_nr = atoi(parsed_line + match[1].rm_so);
                return 0;
        }
 
-       /*
-        * Strip leading spaces:
-        */
-       tmp = line;
-       while (*tmp) {
-               if (*tmp != ' ')
-                       break;
-               tmp++;
-       }
-
+       tmp = ltrim(parsed_line);
        if (*tmp) {
                /*
                 * Parse hexa addresses followed by ':'
@@ -1307,6 +1293,7 @@ static int dso__disassemble_filename(struct dso *dso, char *filename, size_t fil
 {
        char linkname[PATH_MAX];
        char *build_id_filename;
+       char *build_id_path = NULL;
 
        if (dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS &&
            !dso__is_kcore(dso))
@@ -1322,8 +1309,14 @@ static int dso__disassemble_filename(struct dso *dso, char *filename, size_t fil
                goto fallback;
        }
 
+       build_id_path = strdup(filename);
+       if (!build_id_path)
+               return -1;
+
+       dirname(build_id_path);
+
        if (dso__is_kcore(dso) ||
-           readlink(filename, linkname, sizeof(linkname)) < 0 ||
+           readlink(build_id_path, linkname, sizeof(linkname)) < 0 ||
            strstr(linkname, DSO__NAME_KALLSYMS) ||
            access(filename, R_OK)) {
 fallback:
@@ -1335,6 +1328,7 @@ fallback:
                __symbol__join_symfs(filename, filename_size, dso->long_name);
        }
 
+       free(build_id_path);
        return 0;
 }
 
@@ -1435,7 +1429,7 @@ int symbol__disassemble(struct symbol *sym, struct map *map, const char *arch_na
        snprintf(command, sizeof(command),
                 "%s %s%s --start-address=0x%016" PRIx64
                 " --stop-address=0x%016" PRIx64
-                " -l -d %s %s -C %s 2>/dev/null|grep -v %s|expand",
+                " -l -d %s %s -C %s 2>/dev/null|grep -v %s:|expand",
                 objdump_path ? objdump_path : "objdump",
                 disassembler_style ? "-M " : "",
                 disassembler_style ? disassembler_style : "",
@@ -1482,6 +1476,12 @@ int symbol__disassemble(struct symbol *sym, struct map *map, const char *arch_na
 
        nline = 0;
        while (!feof(file)) {
+               /*
+                * The source code line number (lineno) needs to be kept
+                * across calls to symbol__parse_objdump_line(), so that it
+                * can associate it with the instructions till the next one.
+                * See disasm_line__new() and struct disasm_line::line_nr.
+                */
                if (symbol__parse_objdump_line(sym, map, arch, file, privsize,
                            &lineno) < 0)
                        break;
@@ -1651,24 +1651,31 @@ static int symbol__get_source_line(struct symbol *sym, struct map *map,
        start = map__rip_2objdump(map, sym->start);
 
        for (i = 0; i < len; i++) {
-               u64 offset;
+               u64 offset, nr_samples;
                double percent_max = 0.0;
 
                src_line->nr_pcnt = nr_pcnt;
 
                for (k = 0; k < nr_pcnt; k++) {
+                       double percent = 0.0;
+
                        h = annotation__histogram(notes, evidx + k);
-                       src_line->samples[k].percent = 100.0 * h->addr[i] / h->sum;
+                       nr_samples = h->addr[i];
+                       if (h->sum)
+                               percent = 100.0 * nr_samples / h->sum;
 
-                       if (src_line->samples[k].percent > percent_max)
-                               percent_max = src_line->samples[k].percent;
+                       if (percent > percent_max)
+                               percent_max = percent;
+                       src_line->samples[k].percent = percent;
+                       src_line->samples[k].nr = nr_samples;
                }
 
                if (percent_max <= 0.5)
                        goto next;
 
                offset = start + i;
-               src_line->path = get_srcline(map->dso, offset, NULL, false);
+               src_line->path = get_srcline(map->dso, offset, NULL,
+                                            false, true);
                insert_source_line(&tmp_root, src_line);
 
        next:
index 09776b5..948aa8e 100644 (file)
@@ -98,7 +98,7 @@ struct cyc_hist {
 struct source_line_samples {
        double          percent;
        double          percent_sum;
-       double          nr;
+       u64             nr;
 };
 
 struct source_line {
index c5a6e0b..0daf63b 100644 (file)
  *
  */
 
+#include <inttypes.h>
 #include <sys/types.h>
 #include <sys/mman.h>
 #include <stdbool.h>
-#include <ctype.h>
 #include <string.h>
 #include <limits.h>
 #include <errno.h>
@@ -46,7 +46,6 @@
 #include "cpumap.h"
 #include "thread_map.h"
 #include "asm/bug.h"
-#include "symbol/kallsyms.h"
 #include "auxtrace.h"
 
 #include <linux/hash.h>
@@ -59,6 +58,9 @@
 #include "intel-pt.h"
 #include "intel-bts.h"
 
+#include "sane_ctype.h"
+#include "symbol/kallsyms.h"
+
 int auxtrace_mmap__mmap(struct auxtrace_mmap *mm,
                        struct auxtrace_mmap_params *mp,
                        void *userpg, int fd)
@@ -1826,7 +1828,7 @@ static int addr_filter__resolve_kernel_syms(struct addr_filter *filt)
                filt->addr = start;
                if (filt->range && !filt->size && !filt->sym_to) {
                        filt->size = size;
-                       no_size = !!size;
+                       no_size = !size;
                }
        }
 
@@ -1840,7 +1842,7 @@ static int addr_filter__resolve_kernel_syms(struct addr_filter *filt)
                if (err)
                        return err;
                filt->size = start + size - filt->addr;
-               no_size = !!size;
+               no_size = !size;
        }
 
        /* The very last symbol in kallsyms does not imply a particular size */
index 26fb1ee..9f0de72 100644 (file)
@@ -17,6 +17,7 @@
 #define __PERF_AUXTRACE_H
 
 #include <sys/types.h>
+#include <errno.h>
 #include <stdbool.h>
 #include <stddef.h>
 #include <linux/list.h>
index bc6bc70..4bd2d1d 100644 (file)
@@ -9,7 +9,9 @@
 #include <bpf/libbpf.h>
 #include <bpf/bpf.h>
 #include <linux/err.h>
+#include <linux/kernel.h>
 #include <linux/string.h>
+#include <errno.h>
 #include "perf.h"
 #include "debug.h"
 #include "bpf-loader.h"
@@ -17,6 +19,7 @@
 #include "probe-event.h"
 #include "probe-finder.h" // for MAX_PROBES
 #include "parse-events.h"
+#include "strfilter.h"
 #include "llvm-utils.h"
 #include "c++/clang-c.h"
 
index f2b737b..4886386 100644 (file)
@@ -85,6 +85,8 @@ int bpf__strerror_setup_stdout(struct perf_evlist *evlist, int err,
                               char *buf, size_t size);
 
 #else
+#include <errno.h>
+
 static inline struct bpf_object *
 bpf__prepare_load(const char *filename __maybe_unused,
                  bool source __maybe_unused)
index 6cdbee1..1356220 100644 (file)
@@ -12,6 +12,7 @@
 #include "bpf-loader.h"
 #include "bpf-prologue.h"
 #include "probe-finder.h"
+#include <errno.h>
 #include <dwarf-regs.h>
 #include <linux/filter.h>
 
index d94cbea..ba56483 100644 (file)
@@ -18,6 +18,8 @@ int bpf__gen_prologue(struct probe_trace_arg *args, int nargs,
                      struct bpf_insn *new_prog, size_t *new_cnt,
                      size_t cnt_space);
 #else
+#include <errno.h>
+
 static inline int
 bpf__gen_prologue(struct probe_trace_arg *args __maybe_unused,
                  int nargs __maybe_unused,
index e528c40..168cc49 100644 (file)
@@ -7,18 +7,26 @@
  * Copyright (C) 2009, 2010 Arnaldo Carvalho de Melo <acme@redhat.com>
  */
 #include "util.h"
+#include <dirent.h>
+#include <errno.h>
 #include <stdio.h>
+#include <sys/stat.h>
+#include <sys/types.h>
 #include "build-id.h"
 #include "event.h"
 #include "symbol.h"
+#include "thread.h"
 #include <linux/kernel.h>
 #include "debug.h"
 #include "session.h"
 #include "tool.h"
 #include "header.h"
 #include "vdso.h"
+#include "path.h"
 #include "probe-file.h"
+#include "strlist.h"
 
+#include "sane_ctype.h"
 
 static bool no_buildid_cache;
 
@@ -182,13 +190,17 @@ char *build_id_cache__origname(const char *sbuild_id)
        char buf[PATH_MAX];
        char *ret = NULL, *p;
        size_t offs = 5;        /* == strlen("../..") */
+       ssize_t len;
 
        linkname = build_id_cache__linkname(sbuild_id, NULL, 0);
        if (!linkname)
                return NULL;
 
-       if (readlink(linkname, buf, PATH_MAX) < 0)
+       len = readlink(linkname, buf, sizeof(buf) - 1);
+       if (len <= 0)
                goto out;
+       buf[len] = '\0';
+
        /* The link should be "../..<origpath>/<sbuild_id>" */
        p = strrchr(buf, '/');  /* Cut off the "/<sbuild_id>" */
        if (p && (p > buf + offs)) {
@@ -443,14 +455,14 @@ void disable_buildid_cache(void)
 }
 
 static bool lsdir_bid_head_filter(const char *name __maybe_unused,
-                                 struct dirent *d __maybe_unused)
+                                 struct dirent *d)
 {
        return (strlen(d->d_name) == 2) &&
                isxdigit(d->d_name[0]) && isxdigit(d->d_name[1]);
 }
 
 static bool lsdir_bid_tail_filter(const char *name __maybe_unused,
-                                 struct dirent *d __maybe_unused)
+                                 struct dirent *d)
 {
        int i = 0;
        while (isxdigit(d->d_name[i]) && i < SBUILD_ID_SIZE - 3)
@@ -690,7 +702,7 @@ int build_id_cache__add_s(const char *sbuild_id, const char *name,
                err = 0;
 
        /* Update SDT cache : error is just warned */
-       if (build_id_cache__add_sdt_cache(sbuild_id, realname) < 0)
+       if (realname && build_id_cache__add_sdt_cache(sbuild_id, realname) < 0)
                pr_debug4("Failed to update/scan SDT cache for %s\n", realname);
 
 out_free:
index d279906..a960811 100644 (file)
@@ -5,7 +5,6 @@
 #define SBUILD_ID_SIZE (BUILD_ID_SIZE * 2 + 1)
 
 #include "tool.h"
-#include "strlist.h"
 #include <linux/types.h>
 
 extern struct perf_tool build_id__mark_dso_hit_ops;
@@ -34,6 +33,9 @@ char *build_id_cache__origname(const char *sbuild_id);
 char *build_id_cache__linkname(const char *sbuild_id, char *bf, size_t size);
 char *build_id_cache__cachedir(const char *sbuild_id, const char *name,
                               bool is_kallsyms, bool is_vdso);
+
+struct strlist;
+
 struct strlist *build_id_cache__list_all(bool validonly);
 char *build_id_cache__complement(const char *incomplete_sbuild_id);
 int build_id_cache__list_build_ids(const char *pathname,
index 0eadd79..ccafcf7 100644 (file)
@@ -20,6 +20,7 @@ extern int perf_clang__compile_bpf(const char *filename,
                                   size_t *p_obj_buf_sz);
 #else
 
+#include <errno.h>
 
 static inline void perf_clang__init(void) { }
 static inline void perf_clang__cleanup(void) { }
index 512c0c8..0328f29 100644 (file)
@@ -15,7 +15,6 @@
 #define PERF_TRACEFS_ENVIRONMENT "PERF_TRACEFS_DIR"
 #define PERF_PAGER_ENVIRONMENT "PERF_PAGER"
 
-char *alias_lookup(const char *alias);
 int split_cmdline(char *cmdline, const char ***argv);
 
 #define alloc_nr(x) (((x)+16)*3/2)
index aba9534..81fc29a 100644 (file)
@@ -9,6 +9,7 @@
  *
  */
 
+#include <inttypes.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <stdbool.h>
 #include "machine.h"
 #include "callchain.h"
 
+/*
+ * Designated initializers shared by the two struct callchain_param
+ * definitions below, so both start out with identical values.
+ */
+#define CALLCHAIN_PARAM_DEFAULT                        \
+       .mode           = CHAIN_GRAPH_ABS,      \
+       .min_percent    = 0.5,                  \
+       .order          = ORDER_CALLEE,         \
+       .key            = CCKEY_FUNCTION,       \
+       .value          = CCVAL_PERCENT,        \
+
+/* Global callchain settings; the option parsers in this file write to it. */
+struct callchain_param callchain_param = {
+       CALLCHAIN_PARAM_DEFAULT
+};
+
+/*
+ * Second instance with the same initial values.
+ * NOTE(review): presumably kept unmodified as a reference copy of the
+ * defaults -- confirm against its users.
+ */
+struct callchain_param callchain_param_default = {
+       CALLCHAIN_PARAM_DEFAULT
+};
+
 __thread struct callchain_cursor callchain_cursor;
 
 int parse_callchain_record_opt(const char *arg, struct callchain_param *param)
@@ -80,6 +96,10 @@ static int parse_callchain_sort_key(const char *value)
                callchain_param.key = CCKEY_ADDRESS;
                return 0;
        }
+       if (!strncmp(value, "srcline", strlen(value))) {
+               callchain_param.key = CCKEY_SRCLINE;
+               return 0;
+       }
        if (!strncmp(value, "branch", strlen(value))) {
                callchain_param.branch_callstack = 1;
                return 0;
@@ -108,11 +128,37 @@ static int parse_callchain_value(const char *value)
        return -1;
 }
 
+/*
+ * Parse a stack dump size given as a string.
+ *
+ * @str:   number in any notation strtoul() accepts with base 0
+ * @_size: on success receives the value passed through
+ *         round_up(size, sizeof(u64)); left untouched on failure
+ *
+ * Returns 0 on success. Returns -1, after logging an error, when @str has
+ * trailing characters, parses to zero, or exceeds
+ * round_down(USHRT_MAX, sizeof(u64)).
+ */
+static int get_stack_size(const char *str, unsigned long *_size)
+{
+       char *endptr;
+       unsigned long size;
+       unsigned long max_size = round_down(USHRT_MAX, sizeof(u64));
+
+       size = strtoul(str, &endptr, 0);
+
+       /*
+        * max_size is itself u64-aligned, so range-checking before rounding
+        * accepts exactly the same inputs as rounding first, without
+        * depending on how round_up() behaves when the value overflows.
+        */
+       if (*endptr || !size || size > max_size) {
+               /* max_size is unsigned long, hence %lu */
+               pr_err("callchain: Incorrect stack dump size (max %lu): %s\n",
+                      max_size, str);
+               return -1;
+       }
+
+       *_size = round_up(size, sizeof(u64));
+       return 0;
+}
+
 static int
 __parse_callchain_report_opt(const char *arg, bool allow_record_opt)
 {
        char *tok;
-       char *endptr;
+       char *endptr, *saveptr = NULL;
        bool minpcnt_set = false;
        bool record_opt_set = false;
        bool try_stack_size = false;
@@ -123,7 +169,7 @@ __parse_callchain_report_opt(const char *arg, bool allow_record_opt)
        if (!arg)
                return 0;
 
-       while ((tok = strtok((char *)arg, ",")) != NULL) {
+       while ((tok = strtok_r((char *)arg, ",", &saveptr)) != NULL) {
                if (!strncmp(tok, "none", strlen(tok))) {
                        callchain_param.mode = CHAIN_NONE;
                        callchain_param.enabled = false;
@@ -191,6 +237,68 @@ int parse_callchain_top_opt(const char *arg)
        return __parse_callchain_report_opt(arg, true);
 }
 
+/*
+ * Parse the record-side call-graph mode string: "fp", "lbr", "dwarf" or
+ * "dwarf,<dump size>".
+ *
+ * @arg:   option value; it is copied, never modified
+ * @param: receives record_mode and, for dwarf, dump_size
+ *
+ * Returns 0 on success, -ENOMEM if the working copy cannot be allocated,
+ * and otherwise either the initial -1 (after an error has been logged) or
+ * whatever get_stack_size() returned for the dwarf dump size.
+ */
+int parse_callchain_record(const char *arg, struct callchain_param *param)
+{
+       const unsigned long default_stack_dump_size = 8192;
+       char *saveptr = NULL;
+       char *copy, *mode, *extra;
+       int ret = -1;
+
+       /* strtok_r() writes into its input, so tokenize a private copy. */
+       copy = malloc(strlen(arg) + 1);
+       if (!copy)
+               return -ENOMEM;
+
+       strcpy(copy, arg);
+
+       /* With no token at all, match against the whole copy instead. */
+       mode = strtok_r(copy, ",", &saveptr);
+       if (!mode)
+               mode = copy;
+
+       if (!strcmp(mode, "fp")) {
+               /* Framepointer style: takes no further arguments */
+               if (strtok_r(NULL, ",", &saveptr)) {
+                       pr_err("callchain: No more arguments "
+                              "needed for --call-graph fp\n");
+               } else {
+                       param->record_mode = CALLCHAIN_FP;
+                       ret = 0;
+               }
+       } else if (!strcmp(mode, "dwarf")) {
+               /* Dwarf style: an optional stack dump size may follow */
+               param->record_mode = CALLCHAIN_DWARF;
+               param->dump_size = default_stack_dump_size;
+               ret = 0;
+
+               extra = strtok_r(NULL, ",", &saveptr);
+               if (extra) {
+                       unsigned long size = 0;
+
+                       ret = get_stack_size(extra, &size);
+                       param->dump_size = size;
+               }
+       } else if (!strcmp(mode, "lbr")) {
+               /* LBR style: takes no further arguments */
+               if (strtok_r(NULL, ",", &saveptr)) {
+                       pr_err("callchain: No more arguments "
+                               "needed for --call-graph lbr\n");
+               } else {
+                       param->record_mode = CALLCHAIN_LBR;
+                       ret = 0;
+               }
+       } else {
+               pr_err("callchain: Unknown --call-graph option "
+                      "value: %s\n", arg);
+       }
+
+       free(copy);
+       return ret;
+}
+
 int perf_callchain_config(const char *var, const char *value)
 {
        char *endptr;
@@ -510,14 +618,51 @@ enum match_result {
        MATCH_GT,
 };
 
+/*
+ * Order a cursor node against an existing callchain entry by source line.
+ *
+ * The srcline string of each side is looked up with get_srcline() and the
+ * two are compared with strcmp(). If exactly one side has a srcline, the
+ * side without one sorts after the other; if neither has one, the raw
+ * instruction pointers are compared instead.
+ *
+ * Returns MATCH_EQ, MATCH_LT or MATCH_GT; this function never returns
+ * MATCH_ERROR.
+ */
+static enum match_result match_chain_srcline(struct callchain_cursor_node *node,
+                                            struct callchain_list *cnode)
+{
+       char *left = NULL;
+       char *right = NULL;
+       enum match_result ret = MATCH_EQ;
+       int cmp;
+
+       /*
+        * Only dereference a map that is actually there: a side without a
+        * map is treated like a side without a srcline, so it falls back
+        * to the comparisons below instead of crashing on map->dso.
+        */
+       if (cnode->ms.map)
+               left = get_srcline(cnode->ms.map->dso,
+                                  map__rip_2objdump(cnode->ms.map, cnode->ip),
+                                  cnode->ms.sym, true, false);
+       if (node->map)
+               right = get_srcline(node->map->dso,
+                                   map__rip_2objdump(node->map, node->ip),
+                                   node->sym, true, false);
+
+       if (left && right)
+               cmp = strcmp(left, right);
+       else if (!left && right)
+               cmp = 1;
+       else if (left && !right)
+               cmp = -1;
+       else if (cnode->ip == node->ip)
+               cmp = 0;
+       else
+               cmp = (cnode->ip < node->ip) ? -1 : 1;
+
+       if (cmp != 0)
+               ret = cmp < 0 ? MATCH_LT : MATCH_GT;
+
+       /* as before, free_srcline() may be handed a NULL srcline here */
+       free_srcline(left);
+       free_srcline(right);
+       return ret;
+}
+
 static enum match_result match_chain(struct callchain_cursor_node *node,
                                     struct callchain_list *cnode)
 {
        struct symbol *sym = node->sym;
        u64 left, right;
 
-       if (cnode->ms.sym && sym &&
-           callchain_param.key == CCKEY_FUNCTION) {
+       if (callchain_param.key == CCKEY_SRCLINE) {
+               enum match_result match = match_chain_srcline(node, cnode);
+
+               if (match != MATCH_ERROR)
+                       return match;
+       }
+
+       if (cnode->ms.sym && sym && callchain_param.key == CCKEY_FUNCTION) {
                left = cnode->ms.sym->start;
                right = sym->start;
        } else {
@@ -911,15 +1056,16 @@ out:
 char *callchain_list__sym_name(struct callchain_list *cl,
                               char *bf, size_t bfsize, bool show_dso)
 {
+       bool show_addr = callchain_param.key == CCKEY_ADDRESS;
+       bool show_srcline = show_addr || callchain_param.key == CCKEY_SRCLINE;
        int printed;
 
        if (cl->ms.sym) {
-               if (callchain_param.key == CCKEY_ADDRESS &&
-                   cl->ms.map && !cl->srcline)
+               if (show_srcline && cl->ms.map && !cl->srcline)
                        cl->srcline = get_srcline(cl->ms.map->dso,
                                                  map__rip_2objdump(cl->ms.map,
                                                                    cl->ip),
-                                                 cl->ms.sym, false);
+                                                 cl->ms.sym, false, show_addr);
                if (cl->srcline)
                        printed = scnprintf(bf, bfsize, "%s %s",
                                        cl->ms.sym->name, cl->srcline);
@@ -1063,63 +1209,100 @@ int callchain_branch_counts(struct callchain_root *root,
                                                  cycles_count);
 }
 
-static int callchain_counts_printf(FILE *fp, char *bf, int bfsize,
-                                  u64 branch_count, u64 predicted_count,
-                                  u64 abort_count, u64 cycles_count,
-                                  u64 iter_count, u64 samples_count)
+static int counts_str_build(char *bf, int bfsize,
+                            u64 branch_count, u64 predicted_count,
+                            u64 abort_count, u64 cycles_count,
+                            u64 iter_count, u64 samples_count)
 {
        double predicted_percent = 0.0;
        const char *null_str = "";
        char iter_str[32];
-       char *str;
-       u64 cycles = 0;
-
-       if (branch_count == 0) {
-               if (fp)
-                       return fprintf(fp, " (calltrace)");
+       char cycle_str[32];
+       char *istr, *cstr;
+       u64 cycles;
 
+       if (branch_count == 0)
                return scnprintf(bf, bfsize, " (calltrace)");
-       }
+
+       cycles = cycles_count / branch_count;
 
        if (iter_count && samples_count) {
-               scnprintf(iter_str, sizeof(iter_str),
-                        ", iterations:%" PRId64 "",
-                        iter_count / samples_count);
-               str = iter_str;
+               if (cycles > 0)
+                       scnprintf(iter_str, sizeof(iter_str),
+                                " iterations:%" PRId64 "",
+                                iter_count / samples_count);
+               else
+                       scnprintf(iter_str, sizeof(iter_str),
+                                "iterations:%" PRId64 "",
+                                iter_count / samples_count);
+               istr = iter_str;
+       } else
+               istr = (char *)null_str;
+
+       if (cycles > 0) {
+               scnprintf(cycle_str, sizeof(cycle_str),
+                         "cycles:%" PRId64 "", cycles);
+               cstr = cycle_str;
        } else
-               str = (char *)null_str;
+               cstr = (char *)null_str;
 
        predicted_percent = predicted_count * 100.0 / branch_count;
-       cycles = cycles_count / branch_count;
 
-       if ((predicted_percent >= 100.0) && (abort_count == 0)) {
-               if (fp)
-                       return fprintf(fp, " (cycles:%" PRId64 "%s)",
-                                      cycles, str);
+       if ((predicted_count == branch_count) && (abort_count == 0)) {
+               if ((cycles > 0) || (istr != (char *)null_str))
+                       return scnprintf(bf, bfsize, " (%s%s)", cstr, istr);
+               else
+                       return scnprintf(bf, bfsize, "%s", (char *)null_str);
+       }
 
-               return scnprintf(bf, bfsize, " (cycles:%" PRId64 "%s)",
-                                cycles, str);
+       if ((predicted_count < branch_count) && (abort_count == 0)) {
+               if ((cycles > 0) || (istr != (char *)null_str))
+                       return scnprintf(bf, bfsize,
+                               " (predicted:%.1f%% %s%s)",
+                               predicted_percent, cstr, istr);
+               else {
+                       return scnprintf(bf, bfsize,
+                               " (predicted:%.1f%%)",
+                               predicted_percent);
+               }
        }
 
-       if ((predicted_percent < 100.0) && (abort_count == 0)) {
-               if (fp)
-                       return fprintf(fp,
-                               " (predicted:%.1f%%, cycles:%" PRId64 "%s)",
-                               predicted_percent, cycles, str);
+       if ((predicted_count == branch_count) && (abort_count > 0)) {
+               if ((cycles > 0) || (istr != (char *)null_str))
+                       return scnprintf(bf, bfsize,
+                               " (abort:%" PRId64 " %s%s)",
+                               abort_count, cstr, istr);
+               else
+                       return scnprintf(bf, bfsize,
+                               " (abort:%" PRId64 ")",
+                               abort_count);
+       }
 
+       if ((cycles > 0) || (istr != (char *)null_str))
                return scnprintf(bf, bfsize,
-                       " (predicted:%.1f%%, cycles:%" PRId64 "%s)",
-                       predicted_percent, cycles, str);
-       }
+                       " (predicted:%.1f%% abort:%" PRId64 " %s%s)",
+                       predicted_percent, abort_count, cstr, istr);
+
+       return scnprintf(bf, bfsize,
+                       " (predicted:%.1f%% abort:%" PRId64 ")",
+                       predicted_percent, abort_count);
+}
+
+static int callchain_counts_printf(FILE *fp, char *bf, int bfsize,
+                                  u64 branch_count, u64 predicted_count,
+                                  u64 abort_count, u64 cycles_count,
+                                  u64 iter_count, u64 samples_count)
+{
+       char str[128];
+
+       counts_str_build(str, sizeof(str), branch_count,
+                        predicted_count, abort_count, cycles_count,
+                        iter_count, samples_count);
 
        if (fp)
-               return fprintf(fp,
-               " (predicted:%.1f%%, abort:%" PRId64 ", cycles:%" PRId64 "%s)",
-                       predicted_percent, abort_count, cycles, str);
+               return fprintf(fp, "%s", str);
 
-       return scnprintf(bf, bfsize,
-               " (predicted:%.1f%%, abort:%" PRId64 ", cycles:%" PRId64 "%s)",
-               predicted_percent, abort_count, cycles, str);
+       return scnprintf(bf, bfsize, "%s", str);
 }
 
 int callchain_list_counts__printf_value(struct callchain_node *node,
index 4f4b60f..c56c23d 100644 (file)
@@ -77,7 +77,8 @@ typedef void (*sort_chain_func_t)(struct rb_root *, struct callchain_root *,
 
 enum chain_key {
        CCKEY_FUNCTION,
-       CCKEY_ADDRESS
+       CCKEY_ADDRESS,
+       CCKEY_SRCLINE
 };
 
 enum chain_value {
index eafbf11..0334774 100644 (file)
@@ -4,6 +4,7 @@
 #include "evsel.h"
 #include "cgroup.h"
 #include "evlist.h"
+#include <linux/stringify.h>
 
 int nr_cgroups;
 
@@ -27,8 +28,8 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
        path_v1[0] = '\0';
        path_v2[0] = '\0';
 
-       while (fscanf(fp, "%*s %"STR(PATH_MAX)"s %"STR(PATH_MAX)"s %"
-                               STR(PATH_MAX)"s %*d %*d\n",
+       while (fscanf(fp, "%*s %"__stringify(PATH_MAX)"s %"__stringify(PATH_MAX)"s %"
+                               __stringify(PATH_MAX)"s %*d %*d\n",
                                mountpoint, type, tokens) == 3) {
 
                if (!path_v1[0] && !strcmp(type, "cgroup")) {
@@ -127,19 +128,19 @@ static int add_cgroup(struct perf_evlist *evlist, char *str)
                        goto found;
                n++;
        }
-       if (atomic_read(&cgrp->refcnt) == 0)
+       if (refcount_read(&cgrp->refcnt) == 0)
                free(cgrp);
 
        return -1;
 found:
-       atomic_inc(&cgrp->refcnt);
+       refcount_inc(&cgrp->refcnt);
        counter->cgrp = cgrp;
        return 0;
 }
 
 void close_cgroup(struct cgroup_sel *cgrp)
 {
-       if (cgrp && atomic_dec_and_test(&cgrp->refcnt)) {
+       if (cgrp && refcount_dec_and_test(&cgrp->refcnt)) {
                close(cgrp->fd);
                zfree(&cgrp->name);
                free(cgrp);
index 31f8dcd..d91966b 100644 (file)
@@ -1,14 +1,14 @@
 #ifndef __CGROUP_H__
 #define __CGROUP_H__
 
-#include <linux/atomic.h>
+#include <linux/refcount.h>
 
 struct option;
 
 struct cgroup_sel {
        char *name;
        int fd;
-       atomic_t refcnt;
+       refcount_t refcnt;
 };
 
 
index f0dcd0e..4b4f00d 100644 (file)
@@ -1,3 +1,4 @@
+#include <errno.h>
 #include <sched.h>
 #include "util.h"
 #include "../perf.h"
index d0d4659..94a5a7d 100644 (file)
@@ -3,10 +3,4 @@
 
 unsigned long perf_event_open_cloexec_flag(void);
 
-#ifdef __GLIBC_PREREQ
-#if !__GLIBC_PREREQ(2, 6) && !defined(__UCLIBC__)
-int sched_getcpu(void) __THROW;
-#endif
-#endif
-
 #endif /* __PERF_CLOEXEC_H */
index a93997f..52122bc 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef __PERF_COLOR_H
 #define __PERF_COLOR_H
 
+#include <stdio.h>
+
 /* "\033[1;38;5;2xx;48;5;2xxm\0" is 23 bytes */
 #define COLOR_MAXLEN 24
 
index 21b7ff3..7bc981b 100644 (file)
@@ -1,13 +1,15 @@
 #include "comm.h"
 #include "util.h"
+#include <errno.h>
 #include <stdlib.h>
 #include <stdio.h>
-#include <linux/atomic.h>
+#include <string.h>
+#include <linux/refcount.h>
 
 struct comm_str {
        char *str;
        struct rb_node rb_node;
-       atomic_t refcnt;
+       refcount_t refcnt;
 };
 
 /* Should perhaps be moved to struct machine */
@@ -16,13 +18,13 @@ static struct rb_root comm_str_root;
 static struct comm_str *comm_str__get(struct comm_str *cs)
 {
        if (cs)
-               atomic_inc(&cs->refcnt);
+               refcount_inc(&cs->refcnt);
        return cs;
 }
 
 static void comm_str__put(struct comm_str *cs)
 {
-       if (cs && atomic_dec_and_test(&cs->refcnt)) {
+       if (cs && refcount_dec_and_test(&cs->refcnt)) {
                rb_erase(&cs->rb_node, &comm_str_root);
                zfree(&cs->str);
                free(cs);
@@ -43,7 +45,7 @@ static struct comm_str *comm_str__alloc(const char *str)
                return NULL;
        }
 
-       atomic_set(&cs->refcnt, 0);
+       refcount_set(&cs->refcnt, 1);
 
        return cs;
 }
@@ -61,7 +63,7 @@ static struct comm_str *comm_str__findnew(const char *str, struct rb_root *root)
 
                cmp = strcmp(str, iter->str);
                if (!cmp)
-                       return iter;
+                       return comm_str__get(iter);
 
                if (cmp < 0)
                        p = &(*p)->rb_left;
@@ -95,8 +97,6 @@ struct comm *comm__new(const char *str, u64 timestamp, bool exec)
                return NULL;
        }
 
-       comm_str__get(comm->comm_str);
-
        return comm;
 }
 
@@ -108,7 +108,6 @@ int comm__override(struct comm *comm, const char *str, u64 timestamp, bool exec)
        if (!new)
                return -ENOMEM;
 
-       comm_str__get(new);
        comm_str__put(old);
        comm->comm_str = new;
        comm->start = timestamp;
diff --git a/tools/perf/util/compress.h b/tools/perf/util/compress.h
new file mode 100644 (file)
index 0000000..67fd1bb
--- /dev/null
@@ -0,0 +1,12 @@
+#ifndef PERF_COMPRESS_H
+#define PERF_COMPRESS_H
+
+/*
+ * Decompression helpers; each prototype is only declared when the matching
+ * HAVE_*_SUPPORT macro is defined.
+ *
+ * NOTE(review): judging by the names, each presumably decompresses the file
+ * at path @input into the descriptor @output_fd; the implementations and
+ * the return convention are not visible from this header -- confirm.
+ */
+#ifdef HAVE_ZLIB_SUPPORT
+int gzip_decompress_to_file(const char *input, int output_fd);
+#endif
+
+#ifdef HAVE_LZMA_SUPPORT
+int lzma_decompress_to_file(const char *input, int output_fd);
+#endif
+
+#endif /* PERF_COMPRESS_H */
index 0c7d5a4..8d724f0 100644 (file)
@@ -8,12 +8,19 @@
  * Copyright (C) Johannes Schindelin, 2005
  *
  */
+#include <errno.h>
+#include <sys/param.h>
 #include "util.h"
 #include "cache.h"
 #include <subcmd/exec-cmd.h>
 #include "util/hist.h"  /* perf_hist_config */
 #include "util/llvm-utils.h"   /* perf_llvm_config */
 #include "config.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "sane_ctype.h"
 
 #define MAXNAME (256)
 
@@ -627,6 +634,8 @@ static int perf_config_set__init(struct perf_config_set *set)
 {
        int ret = -1;
        const char *home = NULL;
+       char *user_config;
+       struct stat st;
 
        /* Setting $PERF_CONFIG makes perf read _only_ the given config file. */
        if (config_exclusive_filename)
@@ -637,35 +646,41 @@ static int perf_config_set__init(struct perf_config_set *set)
        }
 
        home = getenv("HOME");
-       if (perf_config_global() && home) {
-               char *user_config = strdup(mkpath("%s/.perfconfig", home));
-               struct stat st;
 
-               if (user_config == NULL) {
-                       warning("Not enough memory to process %s/.perfconfig, "
-                               "ignoring it.", home);
-                       goto out;
-               }
+       /*
+        * Skip reading user config if:
+        *   - there is no place to read it from (HOME)
+        *   - we are asked not to (PERF_CONFIG_NOGLOBAL=1)
+        */
+       if (!home || !*home || !perf_config_global())
+               return 0;
 
-               if (stat(user_config, &st) < 0) {
-                       if (errno == ENOENT)
-                               ret = 0;
-                       goto out_free;
-               }
+       user_config = strdup(mkpath("%s/.perfconfig", home));
+       if (user_config == NULL) {
+               warning("Not enough memory to process %s/.perfconfig, "
+                       "ignoring it.", home);
+               goto out;
+       }
 
-               ret = 0;
+       if (stat(user_config, &st) < 0) {
+               if (errno == ENOENT)
+                       ret = 0;
+               goto out_free;
+       }
 
-               if (st.st_uid && (st.st_uid != geteuid())) {
-                       warning("File %s not owned by current user or root, "
-                               "ignoring it.", user_config);
-                       goto out_free;
-               }
+       ret = 0;
 
-               if (st.st_size)
-                       ret = perf_config_from_file(collect_config, user_config, set);
-out_free:
-               free(user_config);
+       if (st.st_uid && (st.st_uid != geteuid())) {
+               warning("File %s not owned by current user or root, "
+                       "ignoring it.", user_config);
+               goto out_free;
        }
+
+       if (st.st_size)
+               ret = perf_config_from_file(collect_config, user_config, set);
+
+out_free:
+       free(user_config);
 out:
        return ret;
 }
index e3fde31..c4af82a 100644 (file)
@@ -1,6 +1,8 @@
+#include <errno.h>
 #include <stdlib.h>
 #include "evsel.h"
 #include "counts.h"
+#include "util.h"
 
 struct perf_counts *perf_counts__new(int ncpus, int nthreads)
 {
index 8c75049..37b3bb7 100644 (file)
@@ -3,11 +3,14 @@
 #include "../perf.h"
 #include "cpumap.h"
 #include <assert.h>
+#include <dirent.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <linux/bitmap.h>
 #include "asm/bug.h"
 
+#include "sane_ctype.h"
+
 static int max_cpu_num;
 static int max_present_cpu_num;
 static int max_node_num;
@@ -29,7 +32,7 @@ static struct cpu_map *cpu_map__default_new(void)
                        cpus->map[i] = i;
 
                cpus->nr = nr_cpus;
-               atomic_set(&cpus->refcnt, 1);
+               refcount_set(&cpus->refcnt, 1);
        }
 
        return cpus;
@@ -43,7 +46,7 @@ static struct cpu_map *cpu_map__trim_new(int nr_cpus, int *tmp_cpus)
        if (cpus != NULL) {
                cpus->nr = nr_cpus;
                memcpy(cpus->map, tmp_cpus, payload_size);
-               atomic_set(&cpus->refcnt, 1);
+               refcount_set(&cpus->refcnt, 1);
        }
 
        return cpus;
@@ -252,7 +255,7 @@ struct cpu_map *cpu_map__dummy_new(void)
        if (cpus != NULL) {
                cpus->nr = 1;
                cpus->map[0] = -1;
-               atomic_set(&cpus->refcnt, 1);
+               refcount_set(&cpus->refcnt, 1);
        }
 
        return cpus;
@@ -269,7 +272,7 @@ struct cpu_map *cpu_map__empty_new(int nr)
                for (i = 0; i < nr; i++)
                        cpus->map[i] = -1;
 
-               atomic_set(&cpus->refcnt, 1);
+               refcount_set(&cpus->refcnt, 1);
        }
 
        return cpus;
@@ -278,7 +281,7 @@ struct cpu_map *cpu_map__empty_new(int nr)
 static void cpu_map__delete(struct cpu_map *map)
 {
        if (map) {
-               WARN_ONCE(atomic_read(&map->refcnt) != 0,
+               WARN_ONCE(refcount_read(&map->refcnt) != 0,
                          "cpu_map refcnt unbalanced\n");
                free(map);
        }
@@ -287,13 +290,13 @@ static void cpu_map__delete(struct cpu_map *map)
 struct cpu_map *cpu_map__get(struct cpu_map *map)
 {
        if (map)
-               atomic_inc(&map->refcnt);
+               refcount_inc(&map->refcnt);
        return map;
 }
 
 void cpu_map__put(struct cpu_map *map)
 {
-       if (map && atomic_dec_and_test(&map->refcnt))
+       if (map && refcount_dec_and_test(&map->refcnt))
                cpu_map__delete(map);
 }
 
@@ -357,7 +360,7 @@ int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
        /* ensure we process id in increasing order */
        qsort(c->map, c->nr, sizeof(int), cmp_ids);
 
-       atomic_set(&c->refcnt, 1);
+       refcount_set(&c->refcnt, 1);
        *res = c;
        return 0;
 }
@@ -673,3 +676,49 @@ size_t cpu_map__snprint(struct cpu_map *map, char *buf, size_t size)
        pr_debug("cpumask list: %s\n", buf);
        return ret;
 }
+
+/* Map a value 0..15 to its lowercase hex digit; anything else yields '?'. */
+static char hex_char(unsigned char val)
+{
+       if (val < 10)
+               return val + '0';
+       if (val < 16)
+               return val - 10 + 'a';
+       return '?';
+}
+
+/*
+ * Format @map as a hexadecimal CPU mask, most significant nibble first,
+ * with a ',' between each group of 32 CPUs.
+ *
+ * @map:  CPU map to print; its last entry is taken as the highest CPU
+ *        number (assumes the map is sorted ascending -- TODO confirm)
+ * @buf:  destination buffer
+ * @size: size of @buf in bytes, including room for the terminating NUL
+ *
+ * Never writes more than @size bytes; the output is truncated if @buf is
+ * too small. An empty map, a map whose last entry is negative, or an
+ * allocation failure yields an empty string.
+ *
+ * Returns the number of characters stored, not counting the NUL.
+ */
+size_t cpu_map__snprint_mask(struct cpu_map *map, char *buf, size_t size)
+{
+       int i, cpu;
+       char *ptr = buf;
+       char *end;
+       unsigned char *bitmap;
+       int last_cpu;
+
+       if (size == 0)
+               return 0;
+
+       buf[0] = '\0';
+       end = buf + size - 1;   /* last slot is reserved for the NUL */
+
+       if (map->nr <= 0)
+               return 0;
+
+       last_cpu = cpu_map__cpu(map, map->nr - 1);
+       if (last_cpu < 0)
+               return 0;
+
+       /*
+        * Bit last_cpu lives in byte last_cpu / 8, so that many bytes plus
+        * one are needed; (last_cpu + 7) / 8 is one byte short whenever
+        * last_cpu is a multiple of 8.
+        */
+       bitmap = zalloc(last_cpu / 8 + 1);
+       if (bitmap == NULL)
+               return 0;
+
+       for (i = 0; i < map->nr; i++) {
+               cpu = cpu_map__cpu(map, i);
+               /* skip entries that would fall outside the bitmap */
+               if (cpu < 0 || cpu > last_cpu)
+                       continue;
+               bitmap[cpu / 8] |= 1 << (cpu % 8);
+       }
+
+       /* One hex digit per group of four CPUs, highest group first. */
+       for (cpu = last_cpu / 4 * 4; cpu >= 0 && ptr < end; cpu -= 4) {
+               unsigned char bits = bitmap[cpu / 8];
+
+               if (cpu % 8)
+                       bits >>= 4;
+               else
+                       bits &= 0xf;
+
+               *ptr++ = hex_char(bits);
+               if ((cpu % 32) == 0 && cpu > 0 && ptr < end)
+                       *ptr++ = ',';
+       }
+       *ptr = '\0';
+       free(bitmap);
+
+       return ptr - buf;
+}
index 1a0549a..6b8bff8 100644 (file)
@@ -3,13 +3,13 @@
 
 #include <stdio.h>
 #include <stdbool.h>
-#include <linux/atomic.h>
+#include <linux/refcount.h>
 
 #include "perf.h"
 #include "util/debug.h"
 
 struct cpu_map {
-       atomic_t refcnt;
+       refcount_t refcnt;
        int nr;
        int map[];
 };
@@ -20,6 +20,7 @@ struct cpu_map *cpu_map__dummy_new(void);
 struct cpu_map *cpu_map__new_data(struct cpu_map_data *data);
 struct cpu_map *cpu_map__read(FILE *file);
 size_t cpu_map__snprint(struct cpu_map *map, char *buf, size_t size);
+size_t cpu_map__snprint_mask(struct cpu_map *map, char *buf, size_t size);
 size_t cpu_map__fprintf(struct cpu_map *map, FILE *fp);
 int cpu_map__get_socket_id(int cpu);
 int cpu_map__get_socket(struct cpu_map *map, int idx, void *data);
index d4a5a21..4b261c2 100644 (file)
@@ -3,7 +3,7 @@
  *
  * No surprises, and works with signed and unsigned chars.
  */
-#include "util.h"
+#include "sane_ctype.h"
 
 enum {
        S = GIT_SPACE,
index 4e6cbc9..89d5031 100644 (file)
@@ -7,7 +7,10 @@
  * Released under the GPL v2. (and only v2, not any later version)
  */
 
+#include <errno.h>
+#include <inttypes.h>
 #include <linux/compiler.h>
+#include <linux/kernel.h>
 #include <babeltrace/ctf-writer/writer.h>
 #include <babeltrace/ctf-writer/clock.h>
 #include <babeltrace/ctf-writer/stream.h>
@@ -27,6 +30,7 @@
 #include "evsel.h"
 #include "machine.h"
 #include "config.h"
+#include "sane_ctype.h"
 
 #define pr_N(n, fmt, ...) \
        eprintf(n, debug_data_convert, fmt, ##__VA_ARGS__)
@@ -1468,6 +1472,7 @@ int bt_convert__perf2ctf(const char *input, const char *path,
                        .lost            = perf_event__process_lost,
                        .tracing_data    = perf_event__process_tracing_data,
                        .build_id        = perf_event__process_build_id,
+                       .namespaces      = perf_event__process_namespaces,
                        .ordered_events  = true,
                        .ordering_requires_timestamps = true,
                },
index 60bfc9c..e84bbc8 100644 (file)
@@ -2,6 +2,7 @@
 #include <linux/kernel.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <errno.h>
 #include <unistd.h>
 #include <string.h>
 
index 03eb81f..a5b3777 100644 (file)
@@ -2,19 +2,26 @@
 
 #include "../perf.h"
 
+#include <inttypes.h>
 #include <string.h>
 #include <stdarg.h>
 #include <stdio.h>
+#include <sys/wait.h>
 #include <api/debug.h>
 #include <linux/time64.h>
-
+#ifdef HAVE_BACKTRACE_SUPPORT
+#include <execinfo.h>
+#endif
 #include "cache.h"
 #include "color.h"
 #include "event.h"
 #include "debug.h"
+#include "print_binary.h"
 #include "util.h"
 #include "target.h"
 
+#include "sane_ctype.h"
+
 int verbose;
 bool dump_trace = false, quiet = false;
 int debug_ordered_events;
@@ -244,3 +251,31 @@ void perf_debug_setup(void)
 {
        libapi_set_print(pr_warning_wrapper, pr_warning_wrapper, pr_debug_wrapper);
 }
+
+/* Obtain a backtrace and print it to stdout. */
+#ifdef HAVE_BACKTRACE_SUPPORT
+void dump_stack(void)
+{
+       void *array[16];
+       size_t size = backtrace(array, ARRAY_SIZE(array));
+       char **strings = backtrace_symbols(array, size);
+       size_t i;
+
+       printf("Obtained %zd stack frames.\n", size);
+
+       for (i = 0; i < size; i++)
+               printf("%s\n", strings[i]);
+
+       free(strings);
+}
+#else
+void dump_stack(void) {}
+#endif
+
+void sighandler_dump_stack(int sig)
+{
+       psignal(sig, "perf");
+       dump_stack();
+       signal(sig, SIG_DFL);
+       raise(sig);
+}
index 98832f5..8a23ea1 100644 (file)
@@ -56,4 +56,7 @@ int perf_debug_option(const char *str);
 void perf_debug_setup(void);
 int perf_quiet_option(void);
 
+void dump_stack(void);
+void sighandler_dump_stack(int sig);
+
 #endif /* __PERF_DEBUG_H */
index 3e6062a..cb66d33 100644 (file)
@@ -7,6 +7,8 @@
 
 #include "demangle-java.h"
 
+#include "sane_ctype.h"
+
 enum {
        MODE_PREFIX = 0,
        MODE_CLASS  = 1,
index 1647f28..eec7542 100644 (file)
@@ -17,6 +17,7 @@
 #include "evlist.h"
 #include "evsel.h"
 #include "pmu.h"
+#include <errno.h>
 
 static int
 perf_evsel__apply_drv_configs(struct perf_evsel *evsel,
index d38b62a..a96a99d 100644 (file)
@@ -1,12 +1,20 @@
 #include <asm/bug.h>
+#include <linux/kernel.h>
 #include <sys/time.h>
 #include <sys/resource.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <errno.h>
+#include "compress.h"
+#include "path.h"
 #include "symbol.h"
 #include "dso.h"
 #include "machine.h"
 #include "auxtrace.h"
 #include "util.h"
 #include "debug.h"
+#include "string2.h"
 #include "vdso.h"
 
 static const char * const debuglink_paths[] = {
@@ -1109,7 +1117,7 @@ struct dso *dso__new(const char *name)
                INIT_LIST_HEAD(&dso->node);
                INIT_LIST_HEAD(&dso->data.open_entry);
                pthread_mutex_init(&dso->lock, NULL);
-               atomic_set(&dso->refcnt, 1);
+               refcount_set(&dso->refcnt, 1);
        }
 
        return dso;
@@ -1147,13 +1155,13 @@ void dso__delete(struct dso *dso)
 struct dso *dso__get(struct dso *dso)
 {
        if (dso)
-               atomic_inc(&dso->refcnt);
+               refcount_inc(&dso->refcnt);
        return dso;
 }
 
 void dso__put(struct dso *dso)
 {
-       if (dso && atomic_dec_and_test(&dso->refcnt))
+       if (dso && refcount_dec_and_test(&dso->refcnt))
                dso__delete(dso);
 }
 
index ecc4bbd..12350b1 100644 (file)
@@ -1,7 +1,7 @@
 #ifndef __PERF_DSO
 #define __PERF_DSO
 
-#include <linux/atomic.h>
+#include <linux/refcount.h>
 #include <linux/types.h>
 #include <linux/rbtree.h>
 #include <sys/types.h>
@@ -187,7 +187,7 @@ struct dso {
                void     *priv;
                u64      db_id;
        };
-       atomic_t         refcnt;
+       refcount_t       refcnt;
        char             name[0];
 };
 
diff --git a/tools/perf/util/dump-insn.c b/tools/perf/util/dump-insn.c
new file mode 100644 (file)
index 0000000..ffbdb19
--- /dev/null
@@ -0,0 +1,14 @@
+#include <linux/compiler.h>
+#include "dump-insn.h"
+
+/* Fallback code */
+
+__weak
+const char *dump_insn(struct perf_insn *x __maybe_unused,
+                     u64 ip __maybe_unused, u8 *inbuf __maybe_unused,
+                     int inlen __maybe_unused, int *lenp)
+{
+       if (lenp)
+               *lenp = 0;
+       return "?";
+}
diff --git a/tools/perf/util/dump-insn.h b/tools/perf/util/dump-insn.h
new file mode 100644 (file)
index 0000000..90fb115
--- /dev/null
@@ -0,0 +1,22 @@
+#ifndef __PERF_DUMP_INSN_H
+#define __PERF_DUMP_INSN_H 1
+
+#define MAXINSN 15
+
+#include <linux/types.h>
+
+struct thread;
+
+struct perf_insn {
+       /* Initialized by callers: */
+       struct thread *thread;
+       u8            cpumode;
+       bool          is64bit;
+       int           cpu;
+       /* Temporary */
+       char          out[256];
+};
+
+const char *dump_insn(struct perf_insn *x, u64 ip,
+                     u8 *inbuf, int inlen, int *lenp);
+#endif
index 41e068e..f5acda1 100644 (file)
  *
  */
 
+#include <errno.h>
+#include <inttypes.h>
 #include <stdbool.h>
 #include "util.h"
 #include "debug.h"
 #include "dwarf-aux.h"
+#include "string2.h"
 
 /**
  * cu_find_realpath - Find the realpath of the target file
index 62bc4a8..c708395 100644 (file)
@@ -8,6 +8,7 @@
 #include <debug.h>
 #include <dwarf-regs.h>
 #include <elf.h>
+#include <linux/kernel.h>
 
 #ifndef EM_AARCH64
 #define EM_AARCH64     183  /* ARM 64 bit */
index 075fc77..9e21538 100644 (file)
@@ -1,6 +1,7 @@
 #include "cpumap.h"
 #include "env.h"
 #include "util.h"
+#include <errno.h>
 
 struct perf_env perf_env;
 
index 4ea7ce7..142835c 100644 (file)
@@ -1,15 +1,23 @@
+#include <dirent.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/kernel.h>
 #include <linux/types.h>
-#include <linux/mman.h> /* To get things like MAP_HUGETLB even on older libc headers */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <uapi/linux/mman.h> /* To get things like MAP_HUGETLB even on older libc headers */
 #include <api/fs/fs.h>
 #include "event.h"
 #include "debug.h"
 #include "hist.h"
 #include "machine.h"
 #include "sort.h"
-#include "string.h"
+#include "string2.h"
 #include "strlist.h"
 #include "thread.h"
 #include "thread_map.h"
+#include "sane_ctype.h"
 #include "symbol/kallsyms.h"
 #include "asm/bug.h"
 #include "stat.h"
@@ -31,6 +39,7 @@ static const char *perf_event__names[] = {
        [PERF_RECORD_LOST_SAMPLES]              = "LOST_SAMPLES",
        [PERF_RECORD_SWITCH]                    = "SWITCH",
        [PERF_RECORD_SWITCH_CPU_WIDE]           = "SWITCH_CPU_WIDE",
+       [PERF_RECORD_NAMESPACES]                = "NAMESPACES",
        [PERF_RECORD_HEADER_ATTR]               = "ATTR",
        [PERF_RECORD_HEADER_EVENT_TYPE]         = "EVENT_TYPE",
        [PERF_RECORD_HEADER_TRACING_DATA]       = "TRACING_DATA",
@@ -49,6 +58,16 @@ static const char *perf_event__names[] = {
        [PERF_RECORD_TIME_CONV]                 = "TIME_CONV",
 };
 
+static const char *perf_ns__names[] = {
+       [NET_NS_INDEX]          = "net",
+       [UTS_NS_INDEX]          = "uts",
+       [IPC_NS_INDEX]          = "ipc",
+       [PID_NS_INDEX]          = "pid",
+       [USER_NS_INDEX]         = "user",
+       [MNT_NS_INDEX]          = "mnt",
+       [CGROUP_NS_INDEX]       = "cgroup",
+};
+
 const char *perf_event__name(unsigned int id)
 {
        if (id >= ARRAY_SIZE(perf_event__names))
@@ -58,6 +77,13 @@ const char *perf_event__name(unsigned int id)
        return perf_event__names[id];
 }
 
+static const char *perf_ns__name(unsigned int id)
+{
+       if (id >= ARRAY_SIZE(perf_ns__names))
+               return "UNKNOWN";
+       return perf_ns__names[id];
+}
+
 static int perf_tool__process_synth_event(struct perf_tool *tool,
                                          union perf_event *event,
                                          struct machine *machine,
@@ -88,7 +114,7 @@ static int perf_event__get_comm_ids(pid_t pid, char *comm, size_t len,
        int fd;
        size_t size = 0;
        ssize_t n;
-       char *nl, *name, *tgids, *ppids;
+       char *name, *tgids, *ppids;
 
        *tgid = -1;
        *ppid = -1;
@@ -115,10 +141,10 @@ static int perf_event__get_comm_ids(pid_t pid, char *comm, size_t len,
        ppids = strstr(bf, "PPid:");
 
        if (name) {
-               name += 5;  /* strlen("Name:") */
+               char *nl;
 
-               while (*name && isspace(*name))
-                       ++name;
+               name += 5;  /* strlen("Name:") */
+               name = ltrim(name);
 
                nl = strchr(name, '\n');
                if (nl)
@@ -203,6 +229,58 @@ pid_t perf_event__synthesize_comm(struct perf_tool *tool,
        return tgid;
 }
 
+static void perf_event__get_ns_link_info(pid_t pid, const char *ns,
+                                        struct perf_ns_link_info *ns_link_info)
+{
+       struct stat64 st;
+       char proc_ns[128];
+
+       sprintf(proc_ns, "/proc/%u/ns/%s", pid, ns);
+       if (stat64(proc_ns, &st) == 0) {
+               ns_link_info->dev = st.st_dev;
+               ns_link_info->ino = st.st_ino;
+       }
+}
+
+int perf_event__synthesize_namespaces(struct perf_tool *tool,
+                                     union perf_event *event,
+                                     pid_t pid, pid_t tgid,
+                                     perf_event__handler_t process,
+                                     struct machine *machine)
+{
+       u32 idx;
+       struct perf_ns_link_info *ns_link_info;
+
+       if (!tool || !tool->namespace_events)
+               return 0;
+
+       memset(&event->namespaces, 0, (sizeof(event->namespaces) +
+              (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
+              machine->id_hdr_size));
+
+       event->namespaces.pid = tgid;
+       event->namespaces.tid = pid;
+
+       event->namespaces.nr_namespaces = NR_NAMESPACES;
+
+       ns_link_info = event->namespaces.link_info;
+
+       for (idx = 0; idx < event->namespaces.nr_namespaces; idx++)
+               perf_event__get_ns_link_info(pid, perf_ns__name(idx),
+                                            &ns_link_info[idx]);
+
+       event->namespaces.header.type = PERF_RECORD_NAMESPACES;
+
+       event->namespaces.header.size = (sizeof(event->namespaces) +
+                       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
+                       machine->id_hdr_size);
+
+       if (perf_tool__process_synth_event(tool, event, machine, process) != 0)
+               return -1;
+
+       return 0;
+}
+
 static int perf_event__synthesize_fork(struct perf_tool *tool,
                                       union perf_event *event,
                                       pid_t pid, pid_t tgid, pid_t ppid,
@@ -255,8 +333,8 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
        if (machine__is_default_guest(machine))
                return 0;
 
-       snprintf(filename, sizeof(filename), "%s/proc/%d/maps",
-                machine->root_dir, pid);
+       snprintf(filename, sizeof(filename), "%s/proc/%d/task/%d/maps",
+                machine->root_dir, pid, pid);
 
        fp = fopen(filename, "r");
        if (fp == NULL) {
@@ -434,8 +512,9 @@ int perf_event__synthesize_modules(struct perf_tool *tool,
 static int __event__synthesize_thread(union perf_event *comm_event,
                                      union perf_event *mmap_event,
                                      union perf_event *fork_event,
+                                     union perf_event *namespaces_event,
                                      pid_t pid, int full,
-                                         perf_event__handler_t process,
+                                     perf_event__handler_t process,
                                      struct perf_tool *tool,
                                      struct machine *machine,
                                      bool mmap_data,
@@ -455,6 +534,11 @@ static int __event__synthesize_thread(union perf_event *comm_event,
                if (tgid == -1)
                        return -1;
 
+               if (perf_event__synthesize_namespaces(tool, namespaces_event, pid,
+                                                     tgid, process, machine) < 0)
+                       return -1;
+
+
                return perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid,
                                                          process, machine, mmap_data,
                                                          proc_map_timeout);
@@ -488,6 +572,11 @@ static int __event__synthesize_thread(union perf_event *comm_event,
                if (perf_event__synthesize_fork(tool, fork_event, _pid, tgid,
                                                ppid, process, machine) < 0)
                        break;
+
+               if (perf_event__synthesize_namespaces(tool, namespaces_event, _pid,
+                                                     tgid, process, machine) < 0)
+                       break;
+
                /*
                 * Send the prepared comm event
                 */
@@ -516,6 +605,7 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
                                      unsigned int proc_map_timeout)
 {
        union perf_event *comm_event, *mmap_event, *fork_event;
+       union perf_event *namespaces_event;
        int err = -1, thread, j;
 
        comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
@@ -530,10 +620,16 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
        if (fork_event == NULL)
                goto out_free_mmap;
 
+       namespaces_event = malloc(sizeof(namespaces_event->namespaces) +
+                                 (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
+                                 machine->id_hdr_size);
+       if (namespaces_event == NULL)
+               goto out_free_fork;
+
        err = 0;
        for (thread = 0; thread < threads->nr; ++thread) {
                if (__event__synthesize_thread(comm_event, mmap_event,
-                                              fork_event,
+                                              fork_event, namespaces_event,
                                               thread_map__pid(threads, thread), 0,
                                               process, tool, machine,
                                               mmap_data, proc_map_timeout)) {
@@ -559,7 +655,7 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
                        /* if not, generate events for it */
                        if (need_leader &&
                            __event__synthesize_thread(comm_event, mmap_event,
-                                                      fork_event,
+                                                      fork_event, namespaces_event,
                                                       comm_event->comm.pid, 0,
                                                       process, tool, machine,
                                                       mmap_data, proc_map_timeout)) {
@@ -568,6 +664,8 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
                        }
                }
        }
+       free(namespaces_event);
+out_free_fork:
        free(fork_event);
 out_free_mmap:
        free(mmap_event);
@@ -587,6 +685,7 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
        char proc_path[PATH_MAX];
        struct dirent *dirent;
        union perf_event *comm_event, *mmap_event, *fork_event;
+       union perf_event *namespaces_event;
        int err = -1;
 
        if (machine__is_default_guest(machine))
@@ -604,11 +703,17 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
        if (fork_event == NULL)
                goto out_free_mmap;
 
+       namespaces_event = malloc(sizeof(namespaces_event->namespaces) +
+                                 (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
+                                 machine->id_hdr_size);
+       if (namespaces_event == NULL)
+               goto out_free_fork;
+
        snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir);
        proc = opendir(proc_path);
 
        if (proc == NULL)
-               goto out_free_fork;
+               goto out_free_namespaces;
 
        while ((dirent = readdir(proc)) != NULL) {
                char *end;
@@ -620,13 +725,16 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
                 * We may race with exiting thread, so don't stop just because
                 * one thread couldn't be synthesized.
                 */
-               __event__synthesize_thread(comm_event, mmap_event, fork_event, pid,
-                                          1, process, tool, machine, mmap_data,
+               __event__synthesize_thread(comm_event, mmap_event, fork_event,
+                                          namespaces_event, pid, 1, process,
+                                          tool, machine, mmap_data,
                                           proc_map_timeout);
        }
 
        err = 0;
        closedir(proc);
+out_free_namespaces:
+       free(namespaces_event);
 out_free_fork:
        free(fork_event);
 out_free_mmap:
@@ -1008,6 +1116,33 @@ size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp)
        return fprintf(fp, "%s: %s:%d/%d\n", s, event->comm.comm, event->comm.pid, event->comm.tid);
 }
 
+size_t perf_event__fprintf_namespaces(union perf_event *event, FILE *fp)
+{
+       size_t ret = 0;
+       struct perf_ns_link_info *ns_link_info;
+       u32 nr_namespaces, idx;
+
+       ns_link_info = event->namespaces.link_info;
+       nr_namespaces = event->namespaces.nr_namespaces;
+
+       ret += fprintf(fp, " %d/%d - nr_namespaces: %u\n\t\t[",
+                      event->namespaces.pid,
+                      event->namespaces.tid,
+                      nr_namespaces);
+
+       for (idx = 0; idx < nr_namespaces; idx++) {
+               if (idx && (idx % 4 == 0))
+                       ret += fprintf(fp, "\n\t\t ");
+
+               ret  += fprintf(fp, "%u/%s: %" PRIu64 "/%#" PRIx64 "%s", idx,
+                               perf_ns__name(idx), (u64)ns_link_info[idx].dev,
+                               (u64)ns_link_info[idx].ino,
+                               ((idx + 1) != nr_namespaces) ? ", " : "]\n");
+       }
+
+       return ret;
+}
+
 int perf_event__process_comm(struct perf_tool *tool __maybe_unused,
                             union perf_event *event,
                             struct perf_sample *sample,
@@ -1016,6 +1151,14 @@ int perf_event__process_comm(struct perf_tool *tool __maybe_unused,
        return machine__process_comm_event(machine, event, sample);
 }
 
+int perf_event__process_namespaces(struct perf_tool *tool __maybe_unused,
+                                  union perf_event *event,
+                                  struct perf_sample *sample,
+                                  struct machine *machine)
+{
+       return machine__process_namespaces_event(machine, event, sample);
+}
+
 int perf_event__process_lost(struct perf_tool *tool __maybe_unused,
                             union perf_event *event,
                             struct perf_sample *sample,
@@ -1153,11 +1296,12 @@ int perf_event__process_exit(struct perf_tool *tool __maybe_unused,
 
 size_t perf_event__fprintf_aux(union perf_event *event, FILE *fp)
 {
-       return fprintf(fp, " offset: %#"PRIx64" size: %#"PRIx64" flags: %#"PRIx64" [%s%s]\n",
+       return fprintf(fp, " offset: %#"PRIx64" size: %#"PRIx64" flags: %#"PRIx64" [%s%s%s]\n",
                       event->aux.aux_offset, event->aux.aux_size,
                       event->aux.flags,
                       event->aux.flags & PERF_AUX_FLAG_TRUNCATED ? "T" : "",
-                      event->aux.flags & PERF_AUX_FLAG_OVERWRITE ? "O" : "");
+                      event->aux.flags & PERF_AUX_FLAG_OVERWRITE ? "O" : "",
+                      event->aux.flags & PERF_AUX_FLAG_PARTIAL   ? "P" : "");
 }
 
 size_t perf_event__fprintf_itrace_start(union perf_event *event, FILE *fp)
@@ -1196,6 +1340,9 @@ size_t perf_event__fprintf(union perf_event *event, FILE *fp)
        case PERF_RECORD_MMAP:
                ret += perf_event__fprintf_mmap(event, fp);
                break;
+       case PERF_RECORD_NAMESPACES:
+               ret += perf_event__fprintf_namespaces(event, fp);
+               break;
        case PERF_RECORD_MMAP2:
                ret += perf_event__fprintf_mmap2(event, fp);
                break;
index c735c53..db2de64 100644 (file)
@@ -3,9 +3,9 @@
 
 #include <limits.h>
 #include <stdio.h>
+#include <linux/kernel.h>
 
 #include "../perf.h"
-#include "map.h"
 #include "build-id.h"
 #include "perf_regs.h"
 
@@ -39,6 +39,13 @@ struct comm_event {
        char comm[16];
 };
 
+struct namespaces_event {
+       struct perf_event_header header;
+       u32 pid, tid;
+       u64 nr_namespaces;
+       struct perf_ns_link_info link_info[];
+};
+
 struct fork_event {
        struct perf_event_header header;
        u32 pid, ppid;
@@ -269,6 +276,7 @@ struct events_stats {
        u64 total_lost;
        u64 total_lost_samples;
        u64 total_aux_lost;
+       u64 total_aux_partial;
        u64 total_invalid_chains;
        u32 nr_events[PERF_RECORD_HEADER_MAX];
        u32 nr_non_filtered_samples;
@@ -485,6 +493,7 @@ union perf_event {
        struct mmap_event               mmap;
        struct mmap2_event              mmap2;
        struct comm_event               comm;
+       struct namespaces_event         namespaces;
        struct fork_event               fork;
        struct lost_event               lost;
        struct lost_samples_event       lost_samples;
@@ -587,6 +596,10 @@ int perf_event__process_switch(struct perf_tool *tool,
                               union perf_event *event,
                               struct perf_sample *sample,
                               struct machine *machine);
+int perf_event__process_namespaces(struct perf_tool *tool,
+                                  union perf_event *event,
+                                  struct perf_sample *sample,
+                                  struct machine *machine);
 int perf_event__process_mmap(struct perf_tool *tool,
                             union perf_event *event,
                             struct perf_sample *sample,
@@ -636,6 +649,12 @@ pid_t perf_event__synthesize_comm(struct perf_tool *tool,
                                  perf_event__handler_t process,
                                  struct machine *machine);
 
+int perf_event__synthesize_namespaces(struct perf_tool *tool,
+                                     union perf_event *event,
+                                     pid_t pid, pid_t tgid,
+                                     perf_event__handler_t process,
+                                     struct machine *machine);
+
 int perf_event__synthesize_mmap_events(struct perf_tool *tool,
                                       union perf_event *event,
                                       pid_t pid, pid_t tgid,
@@ -653,6 +672,7 @@ size_t perf_event__fprintf_itrace_start(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_switch(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_thread_map(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_cpu_map(union perf_event *event, FILE *fp);
+size_t perf_event__fprintf_namespaces(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf(union perf_event *event, FILE *fp);
 
 u64 kallsyms__get_function_start(const char *kallsyms_filename,
index b601f28..46c0faf 100644 (file)
@@ -8,6 +8,8 @@
  */
 #include "util.h"
 #include <api/fs/fs.h>
+#include <errno.h>
+#include <inttypes.h>
 #include <poll.h>
 #include "cpumap.h"
 #include "thread_map.h"
 #include "evlist.h"
 #include "evsel.h"
 #include "debug.h"
+#include "units.h"
 #include "asm/bug.h"
+#include <signal.h>
 #include <unistd.h>
 
 #include "parse-events.h"
 #include <subcmd/parse-options.h>
 
+#include <sys/ioctl.h>
 #include <sys/mman.h>
 
 #include <linux/bitops.h>
@@ -777,7 +782,7 @@ union perf_event *perf_mmap__read_forward(struct perf_mmap *md, bool check_messu
        /*
         * Check if event was unmapped due to a POLLHUP/POLLERR.
         */
-       if (!atomic_read(&md->refcnt))
+       if (!refcount_read(&md->refcnt))
                return NULL;
 
        head = perf_mmap__read_head(md);
@@ -794,7 +799,7 @@ perf_mmap__read_backward(struct perf_mmap *md)
        /*
         * Check if event was unmapped due to a POLLHUP/POLLERR.
         */
-       if (!atomic_read(&md->refcnt))
+       if (!refcount_read(&md->refcnt))
                return NULL;
 
        head = perf_mmap__read_head(md);
@@ -856,7 +861,7 @@ void perf_mmap__read_catchup(struct perf_mmap *md)
 {
        u64 head;
 
-       if (!atomic_read(&md->refcnt))
+       if (!refcount_read(&md->refcnt))
                return;
 
        head = perf_mmap__read_head(md);
@@ -875,14 +880,14 @@ static bool perf_mmap__empty(struct perf_mmap *md)
 
 static void perf_mmap__get(struct perf_mmap *map)
 {
-       atomic_inc(&map->refcnt);
+       refcount_inc(&map->refcnt);
 }
 
 static void perf_mmap__put(struct perf_mmap *md)
 {
-       BUG_ON(md->base && atomic_read(&md->refcnt) == 0);
+       BUG_ON(md->base && refcount_read(&md->refcnt) == 0);
 
-       if (atomic_dec_and_test(&md->refcnt))
+       if (refcount_dec_and_test(&md->refcnt))
                perf_mmap__munmap(md);
 }
 
@@ -894,7 +899,7 @@ void perf_mmap__consume(struct perf_mmap *md, bool overwrite)
                perf_mmap__write_tail(md, old);
        }
 
-       if (atomic_read(&md->refcnt) == 1 && perf_mmap__empty(md))
+       if (refcount_read(&md->refcnt) == 1 && perf_mmap__empty(md))
                perf_mmap__put(md);
 }
 
@@ -937,7 +942,7 @@ static void perf_mmap__munmap(struct perf_mmap *map)
                munmap(map->base, perf_mmap__mmap_len(map));
                map->base = NULL;
                map->fd = -1;
-               atomic_set(&map->refcnt, 0);
+               refcount_set(&map->refcnt, 0);
        }
        auxtrace_mmap__munmap(&map->auxtrace_mmap);
 }
@@ -974,8 +979,19 @@ static struct perf_mmap *perf_evlist__alloc_mmap(struct perf_evlist *evlist)
        if (!map)
                return NULL;
 
-       for (i = 0; i < evlist->nr_mmaps; i++)
+       for (i = 0; i < evlist->nr_mmaps; i++) {
                map[i].fd = -1;
+               /*
+                * When the perf_mmap() call is made we grab one refcount, plus
+                * one extra to let perf_evlist__mmap_consume() get the last
+                * events after all real references (perf_mmap__get()) are
+                * dropped.
+                *
+                * Each PERF_EVENT_IOC_SET_OUTPUT points to this mmap and
+                * thus does perf_mmap__get() on it.
+                */
+               refcount_set(&map[i].refcnt, 0);
+       }
        return map;
 }
 
@@ -1001,7 +1017,7 @@ static int perf_mmap__mmap(struct perf_mmap *map,
         * evlist layer can't just drop it when filtering events in
         * perf_evlist__filter_pollfd().
         */
-       atomic_set(&map->refcnt, 2);
+       refcount_set(&map->refcnt, 2);
        map->prev = 0;
        map->mask = mp->mask;
        map->base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot,
index 389b9cc..94cea43 100644 (file)
@@ -1,7 +1,8 @@
 #ifndef __PERF_EVLIST_H
 #define __PERF_EVLIST_H 1
 
-#include <linux/atomic.h>
+#include <linux/kernel.h>
+#include <linux/refcount.h>
 #include <linux/list.h>
 #include <api/fd/array.h>
 #include <stdio.h>
@@ -10,6 +11,7 @@
 #include "evsel.h"
 #include "util.h"
 #include "auxtrace.h"
+#include <signal.h>
 #include <unistd.h>
 
 struct pollfd;
@@ -29,7 +31,7 @@ struct perf_mmap {
        void             *base;
        int              mask;
        int              fd;
-       atomic_t         refcnt;
+       refcount_t       refcnt;
        u64              prev;
        struct auxtrace_mmap auxtrace_mmap;
        char             event_copy[PERF_SAMPLE_MAX_SIZE] __attribute__((aligned(8)));
index ac59710..0e87909 100644 (file)
@@ -8,12 +8,15 @@
  */
 
 #include <byteswap.h>
+#include <errno.h>
+#include <inttypes.h>
 #include <linux/bitops.h>
 #include <api/fs/tracing_path.h>
 #include <traceevent/event-parse.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/perf_event.h>
 #include <linux/err.h>
+#include <sys/ioctl.h>
 #include <sys/resource.h>
 #include "asm/bug.h"
 #include "callchain.h"
@@ -30,6 +33,8 @@
 #include "stat.h"
 #include "util/parse-branch-options.h"
 
+#include "sane_ctype.h"
+
 static struct {
        bool sample_id_all;
        bool exclude_guest;
@@ -236,6 +241,10 @@ void perf_evsel__init(struct perf_evsel *evsel,
        evsel->sample_size = __perf_evsel__sample_size(attr->sample_type);
        perf_evsel__calc_id_pos(evsel);
        evsel->cmdline_group_boundary = false;
+       evsel->metric_expr   = NULL;
+       evsel->metric_name   = NULL;
+       evsel->metric_events = NULL;
+       evsel->collect_stat  = false;
 }
 
 struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx)
@@ -932,6 +941,9 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts,
        attr->mmap2 = track && !perf_missing_features.mmap2;
        attr->comm  = track;
 
+       if (opts->record_namespaces)
+               attr->namespaces  = track;
+
        if (opts->record_switch_events)
                attr->context_switch = track;
 
@@ -1232,7 +1244,7 @@ int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread,
        if (FD(evsel, cpu, thread) < 0)
                return -EINVAL;
 
-       if (readn(FD(evsel, cpu, thread), count, sizeof(*count)) < 0)
+       if (readn(FD(evsel, cpu, thread), count, sizeof(*count)) <= 0)
                return -errno;
 
        return 0;
@@ -1250,7 +1262,7 @@ int __perf_evsel__read_on_cpu(struct perf_evsel *evsel,
        if (evsel->counts == NULL && perf_evsel__alloc_counts(evsel, cpu + 1, thread + 1) < 0)
                return -ENOMEM;
 
-       if (readn(FD(evsel, cpu, thread), &count, nv * sizeof(u64)) < 0)
+       if (readn(FD(evsel, cpu, thread), &count, nv * sizeof(u64)) <= 0)
                return -errno;
 
        perf_evsel__compute_deltas(evsel, cpu, thread, &count);
@@ -2450,11 +2462,17 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
                              int err, char *msg, size_t size)
 {
        char sbuf[STRERR_BUFSIZE];
+       int printed = 0;
 
        switch (err) {
        case EPERM:
        case EACCES:
-               return scnprintf(msg, size,
+               if (err == EPERM)
+                       printed = scnprintf(msg, size,
+                               "No permission to enable %s event.\n\n",
+                               perf_evsel__name(evsel));
+
+               return scnprintf(msg + printed, size - printed,
                 "You may not have permission to collect %sstats.\n\n"
                 "Consider tweaking /proc/sys/kernel/perf_event_paranoid,\n"
                 "which controls use of the performance events system by\n"
index 06ef6f2..d101695 100644 (file)
@@ -131,6 +131,11 @@ struct perf_evsel {
        bool                    cmdline_group_boundary;
        struct list_head        config_terms;
        int                     bpf_fd;
+       bool                    merged_stat;
+       const char *            metric_expr;
+       const char *            metric_name;
+       struct perf_evsel       **metric_events;
+       bool                    collect_stat;
 };
 
 union u64_swap {
index 4ef5184..e415aee 100644 (file)
@@ -1,9 +1,11 @@
+#include <inttypes.h>
 #include <stdio.h>
 #include <stdbool.h>
 #include <traceevent/event-parse.h>
 #include "evsel.h"
 #include "callchain.h"
 #include "map.h"
+#include "strlist.h"
 #include "symbol.h"
 
 static int comma_fprintf(FILE *fp, bool *first, const char *fmt, ...)
diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h
new file mode 100644 (file)
index 0000000..9c2760a
--- /dev/null
@@ -0,0 +1,25 @@
+#ifndef PARSE_CTX_H
+#define PARSE_CTX_H 1
+
+#define EXPR_MAX_OTHER 8
+#define MAX_PARSE_ID EXPR_MAX_OTHER
+
+struct parse_id {
+       const char *name;
+       double val;
+};
+
+struct parse_ctx {
+       int num_ids;
+       struct parse_id ids[MAX_PARSE_ID];
+};
+
+void expr__ctx_init(struct parse_ctx *ctx);
+void expr__add_id(struct parse_ctx *ctx, const char *id, double val);
+#ifndef IN_EXPR_Y
+int expr__parse(double *final_val, struct parse_ctx *ctx, const char **pp);
+#endif
+int expr__find_other(const char *p, const char *one, const char ***other,
+               int *num_other);
+
+#endif
diff --git a/tools/perf/util/expr.y b/tools/perf/util/expr.y
new file mode 100644 (file)
index 0000000..954556b
--- /dev/null
@@ -0,0 +1,173 @@
+/* Simple expression parser */
+%{
+#include "util.h"
+#include "util/debug.h"
+#define IN_EXPR_Y 1
+#include "expr.h"
+#include <string.h>
+
+#define MAXIDLEN 256
+%}
+
+%pure-parser
+%parse-param { double *final_val }
+%parse-param { struct parse_ctx *ctx }
+%parse-param { const char **pp }
+%lex-param { const char **pp }
+
+%union {
+       double num;
+       char id[MAXIDLEN+1];
+}
+
+%token <num> NUMBER
+%token <id> ID
+%left '|'
+%left '^'
+%left '&'
+%left '-' '+'
+%left '*' '/' '%'
+%left NEG NOT
+%type <num> expr
+
+%{
+static int expr__lex(YYSTYPE *res, const char **pp);
+
+/*
+ * Error reporting hook whose leading parameters mirror the %parse-param
+ * list above.  Only the message 's' is used: it is emitted via pr_debug().
+ */
+static void expr__error(double *final_val __maybe_unused,
+                      struct parse_ctx *ctx __maybe_unused,
+                      const char **pp __maybe_unused,
+                      const char *s)
+{
+       pr_debug("%s\n", s);
+}
+
+/*
+ * Search the identifiers registered in ctx for 'id', ignoring case.
+ * On a match the associated value is stored in *val and 0 is returned;
+ * -1 is returned when no entry matches.
+ */
+static int lookup_id(struct parse_ctx *ctx, char *id, double *val)
+{
+       int idx = 0;
+
+       while (idx < ctx->num_ids) {
+               struct parse_id *cur = &ctx->ids[idx++];
+
+               if (strcasecmp(cur->name, id) != 0)
+                       continue;
+               *val = cur->val;
+               return 0;
+       }
+       return -1;
+}
+
+%}
+%%
+
+/* The value of the complete expression is handed back through final_val */
+all_expr: expr                 { *final_val = $1; }
+       ;
+
+/*
+ * Identifiers are resolved through lookup_id(); an unknown name aborts the
+ * parse.  Division and modulo abort on a zero divisor, and modulo operates
+ * on the operands converted to long.
+ */
+expr:    NUMBER
+       | ID                    { if (lookup_id(ctx, $1, &$$) < 0) {
+                                       pr_debug("%s not found\n", $1);
+                                       YYABORT;
+                                 }
+                               }
+       | expr '+' expr         { $$ = $1 + $3; }
+       | expr '-' expr         { $$ = $1 - $3; }
+       | expr '*' expr         { $$ = $1 * $3; }
+       | expr '/' expr         { if ($3 == 0) YYABORT; $$ = $1 / $3; }
+       | expr '%' expr         { if ((long)$3 == 0) YYABORT; $$ = (long)$1 % (long)$3; }
+       | '-' expr %prec NEG    { $$ = -$2; }
+       | '(' expr ')'          { $$ = $2; }
+       ;
+
+%%
+
+/*
+ * Copy the identifier starting at p (letters, digits, '_' and '.') into
+ * res->id, store the position following it in *pp and return ID.
+ *
+ * When the identifier would need more than MAXIDLEN characters, -1 is
+ * returned instead; in that case *pp is left untouched and res->id is not
+ * NUL terminated.
+ */
+static int expr__symbol(YYSTYPE *res, const char *p, const char **pp)
+{
+       int len = 0;
+
+       while (isalnum(p[len]) || p[len] == '_' || p[len] == '.') {
+               if (len >= MAXIDLEN)
+                       return -1;
+               res->id[len] = p[len];
+               len++;
+       }
+       res->id[len] = 0;
+       *pp = p + len;
+       return ID;
+}
+
+/*
+ * Hand-written lexer: read one token starting at *pp.
+ *
+ * Leading whitespace is skipped.  A letter starts an identifier, which is
+ * delegated to expr__symbol().  A digit or '.' starts a number that is
+ * converted with strtod() and returned as NUMBER.  Any other character is
+ * returned as its own token value, so the terminating NUL yields 0.
+ * Except on the identifier path, *pp is advanced past the consumed input.
+ */
+static int expr__lex(YYSTYPE *res, const char **pp)
+{
+       int tok;
+       const char *s;
+       const char *p = *pp;
+
+       while (isspace(*p))
+               p++;
+       s = p;
+       /* p is already one past the inspected character inside the switch */
+       switch (*p++) {
+       case 'a' ... 'z':
+       case 'A' ... 'Z':
+               /* expr__symbol() updates *pp itself (or leaves it on error) */
+               return expr__symbol(res, p - 1, pp);
+       case '0' ... '9': case '.':
+               /* strtod() re-parses from s and moves p to the end of the number */
+               res->num = strtod(s, (char **)&p);
+               tok = NUMBER;
+               break;
+       default:
+               tok = *s;
+               break;
+       }
+       *pp = p;
+       return tok;
+}
+
+/*
+ * Register an identifier and its value in the next free slot of ctx.
+ * Only the pointer is recorded, so the caller has to keep the name
+ * allocated.  Asserts that the table still has room.
+ */
+void expr__add_id(struct parse_ctx *ctx, const char *name, double val)
+{
+       struct parse_id *slot;
+
+       assert(ctx->num_ids < MAX_PARSE_ID);
+       slot = &ctx->ids[ctx->num_ids++];
+       slot->name = name;
+       slot->val = val;
+}
+
+/* Mark the identifier table of ctx as empty; the slots are not cleared */
+void expr__ctx_init(struct parse_ctx *ctx)
+{
+       ctx->num_ids = 0;
+}
+
+/*
+ * Scan the expression string p and collect every identifier that differs
+ * (case-insensitively) from 'one'.  Repeated identifiers are not merged.
+ *
+ * On success 0 is returned, *other points to a malloc()ed, NULL terminated
+ * array of strdup()ed names and *num_otherp holds their count; the caller
+ * owns the array and the strings.
+ *
+ * On failure -1 is returned, *other is NULL and *num_otherp is 0.  Failure
+ * happens when memory runs out, when the expression holds too many extra
+ * identifiers, or when the lexer rejects the input.
+ */
+int expr__find_other(const char *p, const char *one, const char ***other,
+                    int *num_otherp)
+{
+       const char *orig = p;
+       int err = -1;
+       int num_other = 0;
+
+       *num_otherp = 0;
+       *other = malloc((EXPR_MAX_OTHER + 1) * sizeof(char *));
+       if (!*other)
+               return -1;
+
+       for (;;) {
+               YYSTYPE val;
+               int tok = expr__lex(&val, &p);
+
+               if (tok == 0) {
+                       err = 0;
+                       break;
+               }
+               /*
+                * A negative token means the lexer gave up (identifier too
+                * long) without advancing p; continuing would loop forever.
+                */
+               if (tok < 0) {
+                       pr_debug("Cannot parse identifier in %s\n", orig);
+                       break;
+               }
+               if (tok == ID && strcasecmp(one, val.id)) {
+                       if (num_other >= EXPR_MAX_OTHER - 1) {
+                               pr_debug("Too many extra events in %s\n", orig);
+                               break;
+                       }
+                       (*other)[num_other] = strdup(val.id);
+                       if (!(*other)[num_other])
+                               break;
+                       num_other++;
+               }
+       }
+       if (err) {
+               /* Release the names gathered so far, not just the array */
+               while (num_other > 0)
+                       free((void *)(*other)[--num_other]);
+               free(*other);
+               *other = NULL;
+               return err;
+       }
+       (*other)[num_other] = NULL;
+       *num_otherp = num_other;
+       return 0;
+}
index 05714d5..948b2c5 100644 (file)
@@ -1,4 +1,8 @@
+#include <errno.h>
+#include <inttypes.h>
 #include "util.h"
+#include "string2.h"
+#include <sys/param.h>
 #include <sys/types.h>
 #include <byteswap.h>
 #include <unistd.h>
@@ -7,7 +11,10 @@
 #include <linux/list.h>
 #include <linux/kernel.h>
 #include <linux/bitops.h>
+#include <sys/stat.h>
+#include <sys/types.h>
 #include <sys/utsname.h>
+#include <unistd.h>
 
 #include "evlist.h"
 #include "evsel.h"
@@ -26,6 +33,8 @@
 #include <api/fs/fs.h>
 #include "asm/bug.h"
 
+#include "sane_ctype.h"
+
 /*
  * magic2 = "PERFILE2"
  * must be a numerical value to let the endianness
@@ -370,15 +379,11 @@ static int write_cmdline(int fd, struct perf_header *h __maybe_unused,
                         struct perf_evlist *evlist __maybe_unused)
 {
        char buf[MAXPATHLEN];
-       char proc[32];
        u32 n;
        int i, ret;
 
-       /*
-        * actual atual path to perf binary
-        */
-       sprintf(proc, "/proc/%d/exe", getpid());
-       ret = readlink(proc, buf, sizeof(buf));
+       /* actual path to perf binary */
+       ret = readlink("/proc/self/exe", buf, sizeof(buf) - 1);
        if (ret <= 0)
                return -1;
 
@@ -2274,6 +2279,9 @@ int perf_header__fprintf_info(struct perf_session *session, FILE *fp, bool full)
        perf_header__process_sections(header, fd, &hd,
                                      perf_file_section__fprintf_info);
 
+       if (session->file->is_pipe)
+               return 0;
+
        fprintf(fp, "# missing features: ");
        for_each_clear_bit(bit, header->adds_features, HEADER_LAST_FEATURE) {
                if (bit)
index 2821f8d..1c88ad6 100644 (file)
@@ -1,21 +1,18 @@
 #include "cache.h"
 #include "config.h"
+#include <poll.h>
 #include <stdio.h>
 #include <subcmd/help.h>
 #include "../builtin.h"
 #include "levenshtein.h"
 
 static int autocorrect;
-static struct cmdnames aliases;
 
 static int perf_unknown_cmd_config(const char *var, const char *value,
                                   void *cb __maybe_unused)
 {
        if (!strcmp(var, "help.autocorrect"))
                autocorrect = perf_config_int(var,value);
-       /* Also use aliases for command lookup */
-       if (!prefixcmp(var, "alias."))
-               add_cmdname(&aliases, var + 6, strlen(var + 6));
 
        return 0;
 }
@@ -59,14 +56,12 @@ const char *help_unknown_cmd(const char *cmd)
 
        memset(&main_cmds, 0, sizeof(main_cmds));
        memset(&other_cmds, 0, sizeof(main_cmds));
-       memset(&aliases, 0, sizeof(aliases));
 
        perf_config(perf_unknown_cmd_config, NULL);
 
        load_command_list("perf-", &main_cmds, &other_cmds);
 
-       if (add_cmd_list(&main_cmds, &aliases) < 0 ||
-           add_cmd_list(&main_cmds, &other_cmds) < 0) {
+       if (add_cmd_list(&main_cmds, &other_cmds) < 0) {
                fprintf(stderr, "ERROR: Failed to allocate command list for unknown command.\n");
                goto end;
        }
index eaf72a9..cf0186a 100644 (file)
@@ -3,12 +3,17 @@
 #include "hist.h"
 #include "map.h"
 #include "session.h"
+#include "namespaces.h"
 #include "sort.h"
 #include "evlist.h"
 #include "evsel.h"
 #include "annotate.h"
+#include "srcline.h"
+#include "thread.h"
 #include "ui/progress.h"
+#include <errno.h>
 #include <math.h>
+#include <sys/param.h>
 
 static bool hists__filter_entry_by_dso(struct hists *hists,
                                       struct hist_entry *he);
@@ -169,6 +174,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
                hists__set_unres_dso_col_len(hists, HISTC_MEM_DADDR_DSO);
        }
 
+       hists__new_col_len(hists, HISTC_CGROUP_ID, 20);
        hists__new_col_len(hists, HISTC_CPU, 3);
        hists__new_col_len(hists, HISTC_SOCKET, 6);
        hists__new_col_len(hists, HISTC_MEM_LOCKED, 6);
@@ -574,9 +580,14 @@ __hists__add_entry(struct hists *hists,
                   bool sample_self,
                   struct hist_entry_ops *ops)
 {
+       struct namespaces *ns = thread__namespaces(al->thread);
        struct hist_entry entry = {
                .thread = al->thread,
                .comm = thread__comm(al->thread),
+               .cgroup_id = {
+                       .dev = ns ? ns->link_info[CGROUP_NS_INDEX].dev : 0,
+                       .ino = ns ? ns->link_info[CGROUP_NS_INDEX].ino : 0,
+               },
                .ms = {
                        .map    = al->map,
                        .sym    = al->sym,
@@ -1129,6 +1140,11 @@ void hist_entry__delete(struct hist_entry *he)
                zfree(&he->mem_info);
        }
 
+       if (he->inline_node) {
+               inline_node__delete(he->inline_node);
+               he->inline_node = NULL;
+       }
+
        zfree(&he->stat_acc);
        free_srcline(he->srcline);
        if (he->srcfile && he->srcfile[0])
@@ -2447,7 +2463,7 @@ int parse_filter_percentage(const struct option *opt __maybe_unused,
        else if (!strcmp(arg, "absolute"))
                symbol_conf.filter_relative = false;
        else {
-               pr_debug("Invalud percentage: %s\n", arg);
+               pr_debug("Invalid percentage: %s\n", arg);
                return -1;
        }
 
index 28c216e..ee3670a 100644 (file)
@@ -30,6 +30,7 @@ enum hist_column {
        HISTC_DSO,
        HISTC_THREAD,
        HISTC_COMM,
+       HISTC_CGROUP_ID,
        HISTC_PARENT,
        HISTC_CPU,
        HISTC_SOCKET,
@@ -57,6 +58,7 @@ enum hist_column {
        HISTC_SRCLINE_FROM,
        HISTC_SRCLINE_TO,
        HISTC_TRACE,
+       HISTC_SYM_SIZE,
        HISTC_NR_COLS, /* Last entry */
 };
 
index 6c2eb5d..b2834ac 100644 (file)
@@ -14,7 +14,9 @@
  */
 
 #include <endian.h>
+#include <errno.h>
 #include <byteswap.h>
+#include <inttypes.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/bitops.h>
index 4f3c758..5481882 100644 (file)
@@ -26,6 +26,7 @@
 #include "insn.c"
 
 #include "intel-pt-insn-decoder.h"
+#include "dump-insn.h"
 
 #if INTEL_PT_INSN_BUF_SZ < MAX_INSN_SIZE || INTEL_PT_INSN_BUF_SZ > MAX_INSN
 #error Instruction buffer size too small
@@ -39,6 +40,8 @@ static void intel_pt_insn_decoder(struct insn *insn,
        enum intel_pt_insn_branch branch = INTEL_PT_BR_NO_BRANCH;
        int ext;
 
+       intel_pt_insn->rel = 0;
+
        if (insn_is_avx(insn)) {
                intel_pt_insn->op = INTEL_PT_OP_OTHER;
                intel_pt_insn->branch = INTEL_PT_BR_NO_BRANCH;
@@ -177,6 +180,29 @@ int intel_pt_get_insn(const unsigned char *buf, size_t len, int x86_64,
        return 0;
 }
 
+/*
+ * Format the raw bytes of the instruction at inbuf as "insn: xx xx ... "
+ * into x->out and return that buffer.
+ *
+ * The instruction length is determined with the decoder; when it cannot be
+ * decoded or is longer than inlen, the constant string "<bad>" is returned
+ * instead.  If lenp is non-NULL it receives the instruction length.
+ * Output that does not fit into x->out is truncated.
+ */
+const char *dump_insn(struct perf_insn *x, uint64_t ip __maybe_unused,
+                     u8 *inbuf, int inlen, int *lenp)
+{
+       struct insn insn;
+       int n, i;
+       int left;
+
+       insn_init(&insn, inbuf, inlen, x->is64bit);
+       insn_get_length(&insn);
+       if (!insn_complete(&insn) || insn.length > inlen)
+               return "<bad>";
+       if (lenp)
+               *lenp = insn.length;
+       left = sizeof(x->out);
+       n = snprintf(x->out, left, "insn: ");
+       if (n < 0 || n >= left)
+               return x->out;
+       left -= n;
+       for (i = 0; i < insn.length; i++) {
+               int ret = snprintf(x->out + n, left, "%02x ", inbuf[i]);
+
+               /*
+                * Account only for what this call wrote; n is the running
+                * offset and must not be subtracted from left again.
+                */
+               if (ret < 0 || ret >= left)
+                       break;
+               n += ret;
+               left -= ret;
+       }
+       return x->out;
+}
+
 const char *branch_name[] = {
        [INTEL_PT_OP_OTHER]     = "Other",
        [INTEL_PT_OP_CALL]      = "Call",
index da20cd5..bdd4a28 100644 (file)
@@ -13,6 +13,7 @@
  *
  */
 
+#include <inttypes.h>
 #include <stdio.h>
 #include <stdbool.h>
 #include <errno.h>
index c9a941e..9084930 100644 (file)
@@ -1,5 +1,6 @@
 #include <sys/sysmacros.h>
 #include <sys/types.h>
+#include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <byteswap.h>
 #include <sys/stat.h>
 #include <sys/mman.h>
+#include <linux/stringify.h>
 
 #include "util.h"
 #include "event.h"
 #include "debug.h"
 #include "evlist.h"
 #include "symbol.h"
-#include "strlist.h"
 #include <elf.h>
 
 #include "tsc.h"
@@ -25,6 +26,8 @@
 #include "genelf.h"
 #include "../builtin.h"
 
+#include "sane_ctype.h"
+
 struct jit_buf_desc {
        struct perf_data_file *output;
        struct perf_session *session;
@@ -181,7 +184,7 @@ jit_open(struct jit_buf_desc *jd, const char *name)
                        jd->use_arch_timestamp);
 
        if (header.version > JITHEADER_VERSION) {
-               pr_err("wrong jitdump version %u, expected " STR(JITHEADER_VERSION),
+               pr_err("wrong jitdump version %u, expected " __stringify(JITHEADER_VERSION),
                        header.version);
                goto error;
        }
index 8243564..c6a15f2 100644 (file)
@@ -12,6 +12,7 @@
 #include "llvm-utils.h"
 #include "config.h"
 #include "util.h"
+#include <sys/wait.h>
 
 #define CLANG_BPF_CMD_DEFAULT_TEMPLATE                         \
                "$CLANG_EXEC -D__KERNEL__ -D__NR_CPUS__=$NR_CPUS "\
index 9ddea5c..4ca7c5c 100644 (file)
@@ -1,6 +1,8 @@
+#include <errno.h>
 #include <lzma.h>
 #include <stdio.h>
 #include <linux/compiler.h>
+#include "compress.h"
 #include "util.h"
 #include "debug.h"
 
index 71c9720..7a47f52 100644 (file)
@@ -1,3 +1,7 @@
+#include <dirent.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <regex.h>
 #include "callchain.h"
 #include "debug.h"
 #include "event.h"
 #include "thread.h"
 #include "vdso.h"
 #include <stdbool.h>
-#include <symbol/kallsyms.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
 #include "unwind.h"
 #include "linux/hash.h"
+#include "asm/bug.h"
+
+#include "sane_ctype.h"
+#include <symbol/kallsyms.h>
 
 static void __machine__remove_thread(struct machine *machine, struct thread *th, bool lock);
 
@@ -501,6 +511,37 @@ int machine__process_comm_event(struct machine *machine, union perf_event *event
        return err;
 }
 
+/*
+ * Process a namespaces event: obtain the thread for the event's pid/tid
+ * via machine__findnew_thread() and attach the namespace information to it
+ * with thread__set_namespaces().
+ *
+ * A one-time warning is issued when the number of namespaces in the event
+ * differs from NR_NAMESPACES.  Returns 0 on success, or -1 when no thread
+ * could be obtained or the namespaces could not be set.
+ */
+int machine__process_namespaces_event(struct machine *machine __maybe_unused,
+                                     union perf_event *event,
+                                     struct perf_sample *sample __maybe_unused)
+{
+       struct thread *thread;
+       int err = 0;
+
+       thread = machine__findnew_thread(machine, event->namespaces.pid,
+                                        event->namespaces.tid);
+
+       WARN_ONCE(event->namespaces.nr_namespaces > NR_NAMESPACES,
+                 "\nWARNING: kernel seems to support more namespaces than perf"
+                 " tool.\nTry updating the perf tool..\n\n");
+
+       WARN_ONCE(event->namespaces.nr_namespaces < NR_NAMESPACES,
+                 "\nWARNING: perf tool seems to support more namespaces than"
+                 " the kernel.\nTry updating the kernel..\n\n");
+
+       if (dump_trace)
+               perf_event__fprintf_namespaces(event, stdout);
+
+       if (!thread ||
+           thread__set_namespaces(thread, sample->time,
+                                  &event->namespaces) != 0) {
+               dump_printf("problem processing PERF_RECORD_NAMESPACES, skipping event.\n");
+               err = -1;
+       }
+
+       /* Drop the reference on every path, including the failure one */
+       thread__put(thread);
+       return err;
+}
+
 int machine__process_lost_event(struct machine *machine __maybe_unused,
                                union perf_event *event, struct perf_sample *sample __maybe_unused)
 {
@@ -1439,7 +1480,7 @@ static void __machine__remove_thread(struct machine *machine, struct thread *th,
        if (machine->last_match == th)
                machine->last_match = NULL;
 
-       BUG_ON(atomic_read(&th->refcnt) == 0);
+       BUG_ON(refcount_read(&th->refcnt) == 0);
        if (lock)
                pthread_rwlock_wrlock(&machine->threads_lock);
        rb_erase_init(&th->rb_node, &machine->threads);
@@ -1538,6 +1579,8 @@ int machine__process_event(struct machine *machine, union perf_event *event,
                ret = machine__process_comm_event(machine, event, sample); break;
        case PERF_RECORD_MMAP:
                ret = machine__process_mmap_event(machine, event, sample); break;
+       case PERF_RECORD_NAMESPACES:
+               ret = machine__process_namespaces_event(machine, event, sample); break;
        case PERF_RECORD_MMAP2:
                ret = machine__process_mmap2_event(machine, event, sample); break;
        case PERF_RECORD_FORK:
index a283050..3cdb134 100644 (file)
@@ -97,6 +97,9 @@ int machine__process_itrace_start_event(struct machine *machine,
                                        union perf_event *event);
 int machine__process_switch_event(struct machine *machine,
                                  union perf_event *event);
+int machine__process_namespaces_event(struct machine *machine,
+                                     union perf_event *event,
+                                     struct perf_sample *sample);
 int machine__process_mmap_event(struct machine *machine, union perf_event *event,
                                struct perf_sample *sample);
 int machine__process_mmap2_event(struct machine *machine, union perf_event *event,
index 0a943e7..ebfa5d9 100644 (file)
@@ -9,13 +9,13 @@
 #include <uapi/linux/mman.h> /* To get things like MAP_HUGETLB even on older libc headers */
 #include "map.h"
 #include "thread.h"
-#include "strlist.h"
 #include "vdso.h"
 #include "build-id.h"
 #include "util.h"
 #include "debug.h"
 #include "machine.h"
 #include <linux/string.h>
+#include "srcline.h"
 #include "unwind.h"
 
 static void __maps__insert(struct maps *maps, struct map *map);
@@ -141,7 +141,7 @@ void map__init(struct map *map, enum map_type type,
        RB_CLEAR_NODE(&map->rb_node);
        map->groups   = NULL;
        map->erange_warned = false;
-       atomic_set(&map->refcnt, 1);
+       refcount_set(&map->refcnt, 1);
 }
 
 struct map *map__new(struct machine *machine, u64 start, u64 len,
@@ -255,7 +255,7 @@ void map__delete(struct map *map)
 
 void map__put(struct map *map)
 {
-       if (map && atomic_dec_and_test(&map->refcnt))
+       if (map && refcount_dec_and_test(&map->refcnt))
                map__delete(map);
 }
 
@@ -354,7 +354,7 @@ struct map *map__clone(struct map *from)
        struct map *map = memdup(from, sizeof(*map));
 
        if (map != NULL) {
-               atomic_set(&map->refcnt, 1);
+               refcount_set(&map->refcnt, 1);
                RB_CLEAR_NODE(&map->rb_node);
                dso__get(map->dso);
                map->groups = NULL;
@@ -405,7 +405,8 @@ int map__fprintf_srcline(struct map *map, u64 addr, const char *prefix,
 
        if (map && map->dso) {
                srcline = get_srcline(map->dso,
-                                     map__rip_2objdump(map, addr), NULL, true);
+                                     map__rip_2objdump(map, addr), NULL,
+                                     true, true);
                if (srcline != SRCLINE_UNKNOWN)
                        ret = fprintf(fp, "%s%s", prefix, srcline);
                free_srcline(srcline);
@@ -485,7 +486,7 @@ void map_groups__init(struct map_groups *mg, struct machine *machine)
                maps__init(&mg->maps[i]);
        }
        mg->machine = machine;
-       atomic_set(&mg->refcnt, 1);
+       refcount_set(&mg->refcnt, 1);
 }
 
 static void __maps__purge(struct maps *maps)
@@ -547,7 +548,7 @@ void map_groups__delete(struct map_groups *mg)
 
 void map_groups__put(struct map_groups *mg)
 {
-       if (mg && atomic_dec_and_test(&mg->refcnt))
+       if (mg && refcount_dec_and_test(&mg->refcnt))
                map_groups__delete(mg);
 }
 
index abdacf8..c8a5a64 100644 (file)
@@ -1,7 +1,7 @@
 #ifndef __PERF_MAP_H
 #define __PERF_MAP_H
 
-#include <linux/atomic.h>
+#include <linux/refcount.h>
 #include <linux/compiler.h>
 #include <linux/list.h>
 #include <linux/rbtree.h>
@@ -51,7 +51,7 @@ struct map {
 
        struct dso              *dso;
        struct map_groups       *groups;
-       atomic_t                refcnt;
+       refcount_t              refcnt;
 };
 
 struct kmap {
@@ -67,7 +67,7 @@ struct maps {
 struct map_groups {
        struct maps      maps[MAP__NR_TYPES];
        struct machine   *machine;
-       atomic_t         refcnt;
+       refcount_t       refcnt;
 };
 
 struct map_groups *map_groups__new(struct machine *machine);
@@ -77,7 +77,7 @@ bool map_groups__empty(struct map_groups *mg);
 static inline struct map_groups *map_groups__get(struct map_groups *mg)
 {
        if (mg)
-               atomic_inc(&mg->refcnt);
+               refcount_inc(&mg->refcnt);
        return mg;
 }
 
@@ -150,7 +150,7 @@ struct map *map__clone(struct map *map);
 static inline struct map *map__get(struct map *map)
 {
        if (map)
-               atomic_inc(&map->refcnt);
+               refcount_inc(&map->refcnt);
        return map;
 }
 
index 1d4ab53..06f5a3a 100644 (file)
@@ -6,6 +6,7 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #include <api/fs/fs.h>
+#include <linux/kernel.h>
 #include "mem-events.h"
 #include "debug.h"
 #include "symbol.h"
@@ -205,8 +206,8 @@ int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
 static const char * const snoop_access[] = {
        "N/A",
        "None",
-       "Miss",
        "Hit",
+       "Miss",
        "HitM",
 };
 
diff --git a/tools/perf/util/namespaces.c b/tools/perf/util/namespaces.c
new file mode 100644 (file)
index 0000000..67dcbcc
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright (C) 2017 Hari Bathini, IBM Corporation
+ */
+
+#include "namespaces.h"
+#include "util.h"
+#include "event.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/*
+ * Allocate a zeroed struct namespaces with room for the link_info array.
+ *
+ * With an event, the array is sized from event->nr_namespaces and filled
+ * with a copy of event->link_info.  Without one, space for NR_NAMESPACES
+ * entries is reserved and left zeroed.  end_time starts out as -1.
+ * Returns NULL if the allocation fails.
+ */
+struct namespaces *namespaces__new(struct namespaces_event *event)
+{
+       u64 link_info_size = ((event ? event->nr_namespaces : NR_NAMESPACES) *
+                             sizeof(struct perf_ns_link_info));
+       struct namespaces *ns = zalloc(sizeof(*ns) + link_info_size);
+
+       if (ns) {
+               ns->end_time = -1;
+               if (event)
+                       memcpy(ns->link_info, event->link_info,
+                              link_info_size);
+       }
+
+       return ns;
+}
+
+/* Release an object obtained from namespaces__new(); it is one allocation */
+void namespaces__free(struct namespaces *namespaces)
+{
+       free(namespaces);
+}
diff --git a/tools/perf/util/namespaces.h b/tools/perf/util/namespaces.h
new file mode 100644 (file)
index 0000000..468f1e9
--- /dev/null
@@ -0,0 +1,26 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright (C) 2017 Hari Bathini, IBM Corporation
+ */
+
+#ifndef __PERF_NAMESPACES_H
+#define __PERF_NAMESPACES_H
+
+#include "../perf.h"
+#include <linux/list.h>
+
+struct namespaces_event;
+
+/*
+ * Namespace link information, allocated by namespaces__new() together with
+ * its trailing link_info array.
+ */
+struct namespaces {
+       struct list_head list;  /* list linkage; the owning list is not visible here */
+       u64 end_time;           /* namespaces__new() initialises this to -1 */
+       struct perf_ns_link_info link_info[];   /* entry count is fixed at allocation */
+};
+
+struct namespaces *namespaces__new(struct namespaces_event *event);
+void namespaces__free(struct namespaces *namespaces);
+
+#endif  /* __PERF_NAMESPACES_H */
index fe84df1..4de398c 100644 (file)
@@ -1,3 +1,5 @@
+#include <errno.h>
+#include <inttypes.h>
 #include <linux/list.h>
 #include <linux/compiler.h>
 #include <linux/string.h>
@@ -79,7 +81,7 @@ static union perf_event *dup_event(struct ordered_events *oe,
 
 static void free_dup_event(struct ordered_events *oe, union perf_event *event)
 {
-       if (oe->copy_on_queue) {
+       if (event && oe->copy_on_queue) {
                oe->cur_alloc_size -= event->header.size;
                free(event);
        }
@@ -150,6 +152,7 @@ void ordered_events__delete(struct ordered_events *oe, struct ordered_event *eve
        list_move(&event->list, &oe->cache);
        oe->nr_events--;
        free_dup_event(oe, event->event);
+       event->event = NULL;
 }
 
 int ordered_events__queue(struct ordered_events *oe, union perf_event *event,
index 67a8aeb..01e779b 100644 (file)
@@ -1,13 +1,18 @@
 #include <linux/hw_breakpoint.h>
 #include <linux/err.h>
-#include "util.h"
+#include <dirent.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <sys/param.h>
+#include "term.h"
 #include "../perf.h"
 #include "evlist.h"
 #include "evsel.h"
 #include <subcmd/parse-options.h>
 #include "parse-events.h"
 #include <subcmd/exec-cmd.h>
-#include "string.h"
+#include "string2.h"
+#include "strlist.h"
 #include "symbol.h"
 #include "cache.h"
 #include "header.h"
@@ -316,8 +321,9 @@ __add_event(struct list_head *list, int *idx,
                return NULL;
 
        (*idx)++;
-       evsel->cpus     = cpu_map__get(cpus);
-       evsel->own_cpus = cpu_map__get(cpus);
+       evsel->cpus        = cpu_map__get(cpus);
+       evsel->own_cpus    = cpu_map__get(cpus);
+       evsel->system_wide = !!cpus;
 
        if (name)
                evsel->name = strdup(name);
@@ -1254,11 +1260,59 @@ int parse_events_add_pmu(struct parse_events_evlist *data,
                evsel->scale = info.scale;
                evsel->per_pkg = info.per_pkg;
                evsel->snapshot = info.snapshot;
+               evsel->metric_expr = info.metric_expr;
+               evsel->metric_name = info.metric_name;
        }
 
        return evsel ? 0 : -ENOMEM;
 }
 
+/*
+ * Add the event alias 'str' once for every PMU that has an alias of that
+ * name (compared case-insensitively).
+ *
+ * On success 0 is returned and *listp points to a newly allocated list
+ * holding what parse_events_add_pmu() queued.  -1 is returned, with *listp
+ * set to NULL, when no PMU accepted the alias or when memory runs out.
+ */
+int parse_events_multi_pmu_add(struct parse_events_evlist *data,
+                              char *str, struct list_head **listp)
+{
+       struct list_head *head;
+       struct parse_events_term *term;
+       struct list_head *list;
+       struct perf_pmu *pmu = NULL;
+       int ok = 0;
+
+       *listp = NULL;
+       /* Add it for all PMUs that support the alias */
+       list = malloc(sizeof(struct list_head));
+       if (!list)
+               return -1;
+       INIT_LIST_HEAD(list);
+       while ((pmu = perf_pmu__scan(pmu)) != NULL) {
+               struct perf_pmu_alias *alias;
+
+               list_for_each_entry(alias, &pmu->aliases, list) {
+                       if (!strcasecmp(alias->name, str)) {
+                               head = malloc(sizeof(struct list_head));
+                               if (!head)
+                                       goto out_err;
+                               INIT_LIST_HEAD(head);
+                               if (parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
+                                                          str, 1, false, &str, NULL) < 0) {
+                                       /* head is still empty here */
+                                       free(head);
+                                       goto out_err;
+                               }
+                               list_add_tail(&term->list, head);
+
+                               if (!parse_events_add_pmu(data, list,
+                                                 pmu->name, head)) {
+                                       pr_debug("%s -> %s/%s/\n", str,
+                                                pmu->name, alias->str);
+                                       ok++;
+                               }
+
+                               parse_events_terms__delete(head);
+                       }
+               }
+       }
+       if (!ok)
+               goto out_err;
+       *listp = list;
+       return 0;
+
+out_err:
+       /*
+        * Only an empty list head can be released with a plain free().
+        * NOTE(review): entries already queued need an evsel-aware teardown
+        * that is not visible here, so a non-empty list is left alone as
+        * before.
+        */
+       if (list_empty(list))
+               free(list);
+       return -1;
+}
+
 int parse_events__modifier_group(struct list_head *list,
                                 char *event_mod)
 {
@@ -2276,7 +2330,7 @@ out_enomem:
  * Print the help text for the event symbols:
  */
 void print_events(const char *event_glob, bool name_only, bool quiet_flag,
-                       bool long_desc)
+                       bool long_desc, bool details_flag)
 {
        print_symbol_events(event_glob, PERF_TYPE_HARDWARE,
                            event_symbols_hw, PERF_COUNT_HW_MAX, name_only);
@@ -2286,7 +2340,8 @@ void print_events(const char *event_glob, bool name_only, bool quiet_flag,
 
        print_hwcache_events(event_glob, name_only);
 
-       print_pmu_events(event_glob, name_only, quiet_flag, long_desc);
+       print_pmu_events(event_glob, name_only, quiet_flag, long_desc,
+                       details_flag);
 
        if (event_glob != NULL)
                return;
@@ -2415,6 +2470,31 @@ int parse_events_term__clone(struct parse_events_term **new,
        return new_term(new, &temp, term->val.str, term->val.num);
 }
 
+/*
+ * Duplicate a list of parse_events terms.
+ *
+ * A NULL 'old' list yields a NULL copy and a return value of 0.  Otherwise
+ * a new list head is allocated and every term is cloned onto it in order.
+ * Returns 0 on success, -ENOMEM if the list head cannot be allocated, or
+ * the error from parse_events_term__clone().  On any failure *new is NULL
+ * and nothing is left allocated.
+ */
+int parse_events_copy_term_list(struct list_head *old,
+                                struct list_head **new)
+{
+       struct parse_events_term *term, *n;
+       int ret;
+
+       if (!old) {
+               *new = NULL;
+               return 0;
+       }
+
+       *new = malloc(sizeof(struct list_head));
+       if (!*new)
+               return -ENOMEM;
+       INIT_LIST_HEAD(*new);
+
+       list_for_each_entry (term, old, list) {
+               ret = parse_events_term__clone(&n, term);
+               if (ret) {
+                       /* Do not hand a half-built list back to the caller */
+                       parse_events_terms__delete(*new);
+                       *new = NULL;
+                       return ret;
+               }
+               list_add_tail(&n->list, *new);
+       }
+       return 0;
+}
+
 void parse_events_terms__purge(struct list_head *terms)
 {
        struct parse_events_term *term, *h;
index 1af6a26..a235f4d 100644 (file)
@@ -8,6 +8,7 @@
 #include <stdbool.h>
 #include <linux/types.h>
 #include <linux/perf_event.h>
+#include <string.h>
 
 struct list_head;
 struct perf_evsel;
@@ -166,6 +167,14 @@ int parse_events_add_breakpoint(struct list_head *list, int *idx,
 int parse_events_add_pmu(struct parse_events_evlist *data,
                         struct list_head *list, char *name,
                         struct list_head *head_config);
+
+int parse_events_multi_pmu_add(struct parse_events_evlist *data,
+                              char *str,
+                              struct list_head **listp);
+
+int parse_events_copy_term_list(struct list_head *old,
+                                struct list_head **new);
+
 enum perf_pmu_event_symbol_type
 perf_pmu__parse_check(const char *name);
 void parse_events__set_leader(char *name, struct list_head *list);
@@ -175,7 +184,7 @@ void parse_events_evlist_error(struct parse_events_evlist *data,
                               int idx, const char *str);
 
 void print_events(const char *event_glob, bool name_only, bool quiet,
-                 bool long_desc);
+                 bool long_desc, bool details_flag);
 
 struct event_symbol {
        const char      *symbol;
@@ -196,4 +205,23 @@ int is_valid_tracepoint(const char *event_string);
 int valid_event_mount(const char *eventfs);
 char *parse_events_formats_error_string(char *additional_terms);
 
+#ifdef HAVE_LIBELF_SUPPORT
+/*
+ * Tell whether a probe point string should be treated as an SDT/cached
+ * probe point: either it begins with '%', or it begins with "sdt_" and
+ * contains a ':' but no '='.
+ */
+static inline bool is_sdt_event(char *str)
+{
+       if (str[0] == '%')
+               return true;
+       if (strncmp(str, "sdt_", 4) != 0)
+               return false;
+       return strchr(str, ':') != NULL && strchr(str, '=') == NULL;
+}
+#else
+/* Built without HAVE_LIBELF_SUPPORT: no string is treated as an SDT event */
+static inline bool is_sdt_event(char *str __maybe_unused)
+{
+       return false;
+}
+#endif /* HAVE_LIBELF_SUPPORT */
+
 #endif /* __PERF_PARSE_EVENTS_H */
index 30f018e..04fd8c9 100644 (file)
@@ -226,68 +226,55 @@ event_pmu:
 PE_NAME opt_event_config
 {
        struct parse_events_evlist *data = _data;
-       struct list_head *list;
+       struct list_head *list, *orig_terms, *terms;
+
+       if (parse_events_copy_term_list($2, &orig_terms))
+               YYABORT;
 
        ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_pmu(data, list, $1, $2));
+       if (parse_events_add_pmu(data, list, $1, $2)) {
+               struct perf_pmu *pmu = NULL;
+               int ok = 0;
+
+               while ((pmu = perf_pmu__scan(pmu)) != NULL) {
+                       char *name = pmu->name;
+
+                       if (!strncmp(name, "uncore_", 7) &&
+                           strncmp($1, "uncore_", 7))
+                               name += 7;
+                       if (!strncmp($1, name, strlen($1))) {
+                               if (parse_events_copy_term_list(orig_terms, &terms))
+                                       YYABORT;
+                               if (!parse_events_add_pmu(data, list, pmu->name, terms))
+                                       ok++;
+                               parse_events_terms__delete(terms);
+                       }
+               }
+               if (!ok)
+                       YYABORT;
+       }
        parse_events_terms__delete($2);
+       parse_events_terms__delete(orig_terms);
        $$ = list;
 }
 |
 PE_KERNEL_PMU_EVENT sep_dc
 {
-       struct parse_events_evlist *data = _data;
-       struct list_head *head;
-       struct parse_events_term *term;
        struct list_head *list;
-       struct perf_pmu *pmu = NULL;
-       int ok = 0;
-
-       /* Add it for all PMUs that support the alias */
-       ALLOC_LIST(list);
-       while ((pmu = perf_pmu__scan(pmu)) != NULL) {
-               struct perf_pmu_alias *alias;
-
-               list_for_each_entry(alias, &pmu->aliases, list) {
-                       if (!strcasecmp(alias->name, $1)) {
-                               ALLOC_LIST(head);
-                               ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-                                       $1, 1, false, &@1, NULL));
-                               list_add_tail(&term->list, head);
-
-                               if (!parse_events_add_pmu(data, list,
-                                                 pmu->name, head)) {
-                                       pr_debug("%s -> %s/%s/\n", $1,
-                                                pmu->name, alias->str);
-                                       ok++;
-                               }
 
-                               parse_events_terms__delete(head);
-                       }
-               }
-       }
-       if (!ok)
+       /*
+        * The event name is handed to parse_events_multi_pmu_add(); a
+        * negative return aborts the parse.
+        * NOTE(review): 'list' is passed by address and used below, so the
+        * helper is expected to set it on success - confirm in parse-events.c.
+        */
+       if (parse_events_multi_pmu_add(_data, $1, &list) < 0)
                YYABORT;
        $$ = list;
 }
 |
 PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc
 {
-       struct parse_events_evlist *data = _data;
-       struct list_head *head;
-       struct parse_events_term *term;
        struct list_head *list;
        char pmu_name[128];
-       snprintf(&pmu_name, 128, "%s-%s", $1, $3);
 
-       ALLOC_LIST(head);
-       ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-                                       &pmu_name, 1, false, &@1, NULL));
-       list_add_tail(&term->list, head);
-
-       ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_pmu(data, list, "cpu", head));
-       parse_events_terms__delete(head);
+       snprintf(&pmu_name, 128, "%s-%s", $1, $3);
+       if (parse_events_multi_pmu_add(_data, pmu_name, &list) < 0)
+               YYABORT;
        $$ = list;
 }
 
index 7c7630b..50ec3bc 100644 (file)
  * which is what it's designed for.
  */
 #include "cache.h"
-#include "util.h"
+#include "path.h"
+#include <linux/kernel.h>
 #include <limits.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
 
 static char bad_path[] = "/bad-path/";
 /*
@@ -50,3 +55,24 @@ char *mkpath(const char *fmt, ...)
                return bad_path;
        return cleanup_path(pathname);
 }
+
+/*
+ * Format "path1/path2" into bf, passing size through to scnprintf() as
+ * the buffer size. The '/' is left out when path1 is the empty string.
+ * Returns whatever scnprintf() returns.
+ */
+int path__join(char *bf, size_t size, const char *path1, const char *path2)
+{
+       const char *sep = "";
+
+       if (path1[0] != '\0')
+               sep = "/";
+
+       return scnprintf(bf, size, "%s%s%s", path1, sep, path2);
+}
+
+/*
+ * Three-component join: formats "path1/path2/path3" into bf, omitting
+ * the '/' that would follow an empty path1 or an empty path2.
+ * Returns whatever scnprintf() returns.
+ */
+int path__join3(char *bf, size_t size, const char *path1, const char *path2, const char *path3)
+{
+       const char *sep1 = path1[0] ? "/" : "";
+       const char *sep2 = path2[0] ? "/" : "";
+
+       return scnprintf(bf, size, "%s%s%s%s%s", path1, sep1, path2, sep2, path3);
+}
+
+/* Return true only if stat() succeeds on 'file' and reports a regular file. */
+bool is_regular_file(const char *file)
+{
+       struct stat info;
+
+       return stat(file, &info) == 0 && S_ISREG(info.st_mode);
+}
diff --git a/tools/perf/util/path.h b/tools/perf/util/path.h
new file mode 100644 (file)
index 0000000..9a276a5
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef _PERF_PATH_H
+#define _PERF_PATH_H
+
+/* The prototypes below use bool and size_t; keep the header self-contained. */
+#include <stdbool.h>
+#include <stddef.h>
+
+/*
+ * Join two (or three) path components with '/' into bf; size is the
+ * buffer size handed to the formatter.
+ */
+int path__join(char *bf, size_t size, const char *path1, const char *path2);
+int path__join3(char *bf, size_t size, const char *path1, const char *path2, const char *path3);
+
+/* True when 'file' can be stat()ed and is a regular file. */
+bool is_regular_file(const char *file);
+
+#endif /* _PERF_PATH_H */
index cb36830..d550929 100644 (file)
@@ -9,6 +9,7 @@
 #include <stdlib.h>
 #include <setjmp.h>
 #include <linux/err.h>
+#include <linux/kernel.h>
 #include "util/util.h"
 #include "util/debug.h"
 #include "util/perf-hooks.h"
index c4023f2..b2ae039 100644 (file)
@@ -6,6 +6,12 @@ const struct sample_reg __weak sample_reg_masks[] = {
        SMPL_REG_END
 };
 
+/*
+ * Default (__weak) implementation: ignores both arguments and returns
+ * SDT_ARG_SKIP.
+ */
+int __weak arch_sdt_arg_parse_op(char *old_op __maybe_unused,
+                                char **new_op __maybe_unused)
+{
+       return SDT_ARG_SKIP;
+}
+
 #ifdef HAVE_PERF_REGS_SUPPORT
 int perf_reg_value(u64 *valp, struct regs_dump *regs, int id)
 {
index 679d6e4..32b37d1 100644 (file)
@@ -15,6 +15,13 @@ struct sample_reg {
 
 extern const struct sample_reg sample_reg_masks[];
 
+/* Non-negative result codes of arch_sdt_arg_parse_op(). */
+enum {
+       SDT_ARG_VALID = 0,
+       SDT_ARG_SKIP,
+};
+
+/*
+ * NOTE(review): old_op looks like the SDT operand text and *new_op the
+ * converted operand returned to the caller - confirm against the arch
+ * implementations.
+ */
+int arch_sdt_arg_parse_op(char *old_op, char **new_op);
+
 #ifdef HAVE_PERF_REGS_SUPPORT
 #include <perf_regs.h>
 
index 12f84dd..ac16a9d 100644 (file)
@@ -1,6 +1,8 @@
 #include <linux/list.h>
 #include <linux/compiler.h>
 #include <sys/types.h>
+#include <errno.h>
+#include <sys/stat.h>
 #include <unistd.h>
 #include <stdio.h>
 #include <stdbool.h>
@@ -15,6 +17,7 @@
 #include "header.h"
 #include "pmu-events/pmu-events.h"
 #include "cache.h"
+#include "string2.h"
 
 struct perf_pmu_format {
        char *name;
@@ -231,7 +234,9 @@ static int perf_pmu__parse_snapshot(struct perf_pmu_alias *alias,
 static int __perf_pmu__new_alias(struct list_head *list, char *dir, char *name,
                                 char *desc, char *val,
                                 char *long_desc, char *topic,
-                                char *unit, char *perpkg)
+                                char *unit, char *perpkg,
+                                char *metric_expr,
+                                char *metric_name)
 {
        struct perf_pmu_alias *alias;
        int ret;
@@ -265,6 +270,8 @@ static int __perf_pmu__new_alias(struct list_head *list, char *dir, char *name,
                perf_pmu__parse_snapshot(alias, dir, name);
        }
 
+       alias->metric_expr = metric_expr ? strdup(metric_expr) : NULL;
+       alias->metric_name = metric_name ? strdup(metric_name): NULL;
        alias->desc = desc ? strdup(desc) : NULL;
        alias->long_desc = long_desc ? strdup(long_desc) :
                                desc ? strdup(desc) : NULL;
@@ -294,7 +301,7 @@ static int perf_pmu__new_alias(struct list_head *list, char *dir, char *name, FI
        buf[ret] = 0;
 
        return __perf_pmu__new_alias(list, dir, name, NULL, buf, NULL, NULL, NULL,
-                                    NULL);
+                                    NULL, NULL, NULL);
 }
 
 static inline bool pmu_alias_info_file(char *name)
@@ -564,7 +571,9 @@ static void pmu_add_cpu_aliases(struct list_head *head, const char *name)
                __perf_pmu__new_alias(head, NULL, (char *)pe->name,
                                (char *)pe->desc, (char *)pe->event,
                                (char *)pe->long_desc, (char *)pe->topic,
-                               (char *)pe->unit, (char *)pe->perpkg);
+                               (char *)pe->unit, (char *)pe->perpkg,
+                               (char *)pe->metric_expr,
+                               (char *)pe->metric_name);
        }
 
 out:
@@ -991,6 +1000,8 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms,
        info->unit     = NULL;
        info->scale    = 0.0;
        info->snapshot = false;
+       info->metric_expr = NULL;
+       info->metric_name = NULL;
 
        list_for_each_entry_safe(term, h, head_terms, list) {
                alias = pmu_find_alias(pmu, term);
@@ -1006,6 +1017,8 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms,
 
                if (alias->per_pkg)
                        info->per_pkg = true;
+               info->metric_expr = alias->metric_expr;
+               info->metric_name = alias->metric_name;
 
                list_del(&term->list);
                free(term);
@@ -1100,6 +1113,8 @@ struct sevent {
        char *topic;
        char *str;
        char *pmu;
+       char *metric_expr;
+       char *metric_name;
 };
 
 static int cmp_sevent(const void *a, const void *b)
@@ -1136,13 +1151,12 @@ static void wordwrap(char *s, int start, int max, int corr)
                        break;
                s += wlen;
                column += n;
-               while (isspace(*s))
-                       s++;
+               s = ltrim(s);
        }
 }
 
 void print_pmu_events(const char *event_glob, bool name_only, bool quiet_flag,
-                       bool long_desc)
+                       bool long_desc, bool details_flag)
 {
        struct perf_pmu *pmu;
        struct perf_pmu_alias *alias;
@@ -1198,6 +1212,8 @@ void print_pmu_events(const char *event_glob, bool name_only, bool quiet_flag,
                        aliases[j].topic = alias->topic;
                        aliases[j].str = alias->str;
                        aliases[j].pmu = pmu->name;
+                       aliases[j].metric_expr = alias->metric_expr;
+                       aliases[j].metric_name = alias->metric_name;
                        j++;
                }
                if (pmu->selectable &&
@@ -1232,8 +1248,14 @@ void print_pmu_events(const char *event_glob, bool name_only, bool quiet_flag,
                        printf("%*s", 8, "[");
                        wordwrap(aliases[j].desc, 8, columns, 0);
                        printf("]\n");
-                       if (verbose > 0)
-                               printf("%*s%s/%s/\n", 8, "", aliases[j].pmu, aliases[j].str);
+                       if (details_flag) {
+                               printf("%*s%s/%s/ ", 8, "", aliases[j].pmu, aliases[j].str);
+                               if (aliases[j].metric_name)
+                                       printf(" MetricName: %s", aliases[j].metric_name);
+                               if (aliases[j].metric_expr)
+                                       printf(" MetricExpr: %s", aliases[j].metric_expr);
+                               putchar('\n');
+                       }
                } else
                        printf("  %-50s [Kernel PMU event]\n", aliases[j].name);
                printed++;
index 00852dd..ea7f450 100644 (file)
@@ -31,6 +31,8 @@ struct perf_pmu {
 
 struct perf_pmu_info {
        const char *unit;
+       const char *metric_expr;
+       const char *metric_name;
        double scale;
        bool per_pkg;
        bool snapshot;
@@ -50,6 +52,8 @@ struct perf_pmu_alias {
        double scale;
        bool per_pkg;
        bool snapshot;
+       char *metric_expr;
+       char *metric_name;
 };
 
 struct perf_pmu *perf_pmu__find(const char *name);
@@ -76,7 +80,7 @@ int perf_pmu__format_parse(char *dir, struct list_head *head);
 struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu);
 
 void print_pmu_events(const char *event_glob, bool name_only, bool quiet,
-                     bool long_desc);
+                     bool long_desc, bool details_flag);
 bool pmu_have_event(const char *pname, const char *name);
 
 int perf_pmu__scan_file(struct perf_pmu *pmu, const char *name, const char *fmt,
diff --git a/tools/perf/util/print_binary.c b/tools/perf/util/print_binary.c
new file mode 100644 (file)
index 0000000..e908177
--- /dev/null
@@ -0,0 +1,55 @@
+#include "print_binary.h"
+#include <linux/log2.h>
+#include "sane_ctype.h"
+
+/*
+ * Walk 'len' bytes of 'data' and report them to 'printer' one event at a
+ * time, laid out in lines of bytes_per_line bytes (rounded up to a power
+ * of two). Each line is reported as: line begin, address, the numeric
+ * bytes, numeric padding for a short last line, a separator, the same
+ * bytes as characters, character padding, line end. 'extra' is passed
+ * through to every callback untouched. Nothing happens if 'printer' is
+ * NULL.
+ */
+void print_binary(unsigned char *data, size_t len,
+                 size_t bytes_per_line, print_binary_t printer,
+                 void *extra)
+{
+       size_t pos, k, line_mask;
+
+       if (printer == NULL)
+               return;
+
+       line_mask = roundup_pow_of_two(bytes_per_line) - 1;
+
+       printer(BINARY_PRINT_DATA_BEGIN, 0, extra);
+       for (pos = 0; pos < len; pos++) {
+               /* Column of this byte within its line, and bytes missing after it. */
+               size_t col = pos & line_mask;
+               size_t pad = line_mask - col;
+
+               if (col == 0) {
+                       printer(BINARY_PRINT_LINE_BEGIN, -1, extra);
+                       printer(BINARY_PRINT_ADDR, pos, extra);
+               }
+
+               printer(BINARY_PRINT_NUM_DATA, data[pos], extra);
+
+               /* Only the last byte of a line (or of the data) closes the line. */
+               if (col != line_mask && pos != len - 1)
+                       continue;
+
+               for (k = 0; k < pad; k++)
+                       printer(BINARY_PRINT_NUM_PAD, -1, extra);
+
+               printer(BINARY_PRINT_SEP, pos, extra);
+               for (k = pos - col; k <= pos; k++)
+                       printer(BINARY_PRINT_CHAR_DATA, data[k], extra);
+               for (k = 0; k < pad; k++)
+                       printer(BINARY_PRINT_CHAR_PAD, pos, extra);
+               printer(BINARY_PRINT_LINE_END, -1, extra);
+       }
+       printer(BINARY_PRINT_DATA_END, -1, extra);
+}
+
+/*
+ * Return 1 if the len-byte buffer at p ends in a NUL byte and every byte
+ * before that final NUL is printable or whitespace; return 0 otherwise,
+ * including for a NULL pointer or a zero length.
+ */
+int is_printable_array(char *p, unsigned int len)
+{
+       char *cur, *last;
+
+       if (p == NULL || len == 0)
+               return 0;
+
+       /* The final byte must be the terminator. */
+       last = p + (len - 1);
+       if (*last != 0)
+               return 0;
+
+       for (cur = p; cur < last; cur++) {
+               if (isprint(*cur) || isspace(*cur))
+                       continue;
+               return 0;
+       }
+       return 1;
+}
diff --git a/tools/perf/util/print_binary.h b/tools/perf/util/print_binary.h
new file mode 100644 (file)
index 0000000..da04272
--- /dev/null
@@ -0,0 +1,28 @@
+#ifndef PERF_PRINT_BINARY_H
+#define PERF_PRINT_BINARY_H
+
+#include <stddef.h>
+
+/*
+ * Events reported to a print_binary_t callback. print_binary() emits
+ * them in the order listed here while producing one line of output.
+ */
+enum binary_printer_ops {
+       BINARY_PRINT_DATA_BEGIN,        /* once, before the first byte */
+       BINARY_PRINT_LINE_BEGIN,        /* start of each line */
+       BINARY_PRINT_ADDR,              /* val: offset of the line's first byte */
+       BINARY_PRINT_NUM_DATA,          /* val: one data byte */
+       BINARY_PRINT_NUM_PAD,           /* filler for a byte missing from a short line */
+       BINARY_PRINT_SEP,               /* between the numeric and character parts */
+       BINARY_PRINT_CHAR_DATA,         /* val: one data byte, character part */
+       BINARY_PRINT_CHAR_PAD,          /* character-part filler for a short line */
+       BINARY_PRINT_LINE_END,          /* end of each line */
+       BINARY_PRINT_DATA_END,          /* once, after the last byte */
+};
+
+/* Callback type: op selects the event, extra is the caller's own pointer. */
+typedef void (*print_binary_t)(enum binary_printer_ops op,
+                              unsigned int val, void *extra);
+
+/*
+ * Report len bytes of data to printer, bytes_per_line bytes per line;
+ * extra is handed to every callback invocation.
+ */
+void print_binary(unsigned char *data, size_t len,
+                 size_t bytes_per_line, print_binary_t printer,
+                 void *extra);
+
+/* Non-zero when the len-byte buffer p is a NUL-terminated printable string. */
+int is_printable_array(char *p, unsigned int len);
+
+#endif /* PERF_PRINT_BINARY_H */
index 28fb62c..84e7e69 100644 (file)
@@ -19,6 +19,7 @@
  *
  */
 
+#include <inttypes.h>
 #include <sys/utsname.h>
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -35,6 +36,7 @@
 #include "util.h"
 #include "event.h"
 #include "strlist.h"
+#include "strfilter.h"
 #include "debug.h"
 #include "cache.h"
 #include "color.h"
 #include "probe-finder.h"
 #include "probe-file.h"
 #include "session.h"
+#include "string2.h"
+
+#include "sane_ctype.h"
 
-#define MAX_CMDLEN 256
 #define PERFPROBE_GROUP "probe"
 
 bool probe_event_dry_run;      /* Dry run flag */
@@ -757,7 +761,9 @@ post_process_kernel_probe_trace_events(struct probe_trace_event *tevs,
        }
 
        for (i = 0; i < ntevs; i++) {
-               if (!tevs[i].point.address || tevs[i].point.retprobe)
+               if (!tevs[i].point.address)
+                       continue;
+               if (tevs[i].point.retprobe && !kretprobe_offset_is_supported())
                        continue;
                /* If we found a wrong one, mark it by NULL symbol */
                if (kprobe_warn_out_range(tevs[i].point.symbol,
@@ -1339,14 +1345,7 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
        if (!arg)
                return -EINVAL;
 
-       /*
-        * If the probe point starts with '%',
-        * or starts with "sdt_" and has a ':' but no '=',
-        * then it should be a SDT/cached probe point.
-        */
-       if (arg[0] == '%' ||
-           (!strncmp(arg, "sdt_", 4) &&
-            !!strchr(arg, ':') && !strchr(arg, '='))) {
+       if (is_sdt_event(arg)) {
                pev->sdt = true;
                if (arg[0] == '%')
                        arg++;
@@ -1528,11 +1527,6 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
                return -EINVAL;
        }
 
-       if (pp->retprobe && !pp->function) {
-               semantic_error("Return probe requires an entry function.\n");
-               return -EINVAL;
-       }
-
        if ((pp->offset || pp->line || pp->lazy_line) && pp->retprobe) {
                semantic_error("Offset/Line/Lazy pattern can't be used with "
                               "return probe.\n");
@@ -2841,7 +2835,8 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
        }
 
        /* Note that the symbols in the kmodule are not relocated */
-       if (!pev->uprobes && !pp->retprobe && !pev->target) {
+       if (!pev->uprobes && !pev->target &&
+                       (!pp->retprobe || kretprobe_offset_is_supported())) {
                reloc_sym = kernel_get_ref_reloc_sym();
                if (!reloc_sym) {
                        pr_warning("Relocated base symbol is not found!\n");
@@ -3057,7 +3052,7 @@ concat_probe_trace_events(struct probe_trace_event **tevs, int *ntevs,
        struct probe_trace_event *new_tevs;
        int ret = 0;
 
-       if (ntevs == 0) {
+       if (*ntevs == 0) {
                *tevs = *tevs2;
                *ntevs = ntevs2;
                *tevs2 = NULL;
index 5d4e940..3738426 100644 (file)
@@ -3,8 +3,6 @@
 
 #include <stdbool.h>
 #include "intlist.h"
-#include "strlist.h"
-#include "strfilter.h"
 
 /* Probe related configurations */
 struct probe_conf {
@@ -107,6 +105,8 @@ struct line_range {
        struct intlist          *line_list;     /* Visible lines */
 };
 
+struct strlist;
+
 /* List of variables */
 struct variable_list {
        struct probe_trace_point        point;  /* Actual probepoint */
@@ -153,6 +153,9 @@ int convert_perf_probe_events(struct perf_probe_event *pevs, int npevs);
 int apply_perf_probe_events(struct perf_probe_event *pevs, int npevs);
 int show_probe_trace_events(struct perf_probe_event *pevs, int npevs);
 void cleanup_perf_probe_events(struct perf_probe_event *pevs, int npevs);
+
+struct strfilter;
+
 int del_perf_probe_events(struct strfilter *filter);
 
 int show_perf_probe_event(const char *group, const char *event,
index 436b647..d679389 100644 (file)
  * GNU General Public License for more details.
  *
  */
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
 #include <sys/uio.h>
+#include <unistd.h>
 #include "util.h"
 #include "event.h"
 #include "strlist.h"
+#include "strfilter.h"
 #include "debug.h"
 #include "cache.h"
 #include "color.h"
 #include "probe-event.h"
 #include "probe-file.h"
 #include "session.h"
+#include "perf_regs.h"
+#include "string2.h"
 
-#define MAX_CMDLEN 256
+/* 4096 - 2 ('\n' + '\0') */
+#define MAX_CMDLEN 4094
 
 static void print_open_warning(int err, bool uprobe)
 {
@@ -70,7 +78,7 @@ static void print_both_open_warning(int kerr, int uerr)
        }
 }
 
-static int open_probe_events(const char *trace_file, bool readwrite)
+int open_trace_file(const char *trace_file, bool readwrite)
 {
        char buf[PATH_MAX];
        int ret;
@@ -92,12 +100,12 @@ static int open_probe_events(const char *trace_file, bool readwrite)
 
 static int open_kprobe_events(bool readwrite)
 {
-       return open_probe_events("kprobe_events", readwrite);
+       return open_trace_file("kprobe_events", readwrite);
 }
 
 static int open_uprobe_events(bool readwrite)
 {
-       return open_probe_events("uprobe_events", readwrite);
+       return open_trace_file("uprobe_events", readwrite);
 }
 
 int probe_file__open(int flag)
@@ -687,6 +695,110 @@ static unsigned long long sdt_note__get_addr(struct sdt_note *note)
                 : (unsigned long long)note->addr.a64[0];
 }
 
+/*
+ * uprobe type suffix for each SDT argument size, indexed by size + 8 so
+ * that sizes -8..8 map onto slots 0..16. Negative sizes select the ":sN"
+ * suffixes, positive sizes the ":uN" ones; every other slot holds "".
+ */
+static const char * const type_to_suffix[] = {
+       ":s64", "", "", "", ":s32", "", ":s16", ":s8",
+       "", ":u8", ":u16", "", ":u32", "", "", "", ":u64"
+};
+
+/*
+ * Convert the decimal size string at n_ptr and look up the matching
+ * uprobe type suffix. On success the suffix is stored in *suffix and 0
+ * is returned; a size outside -8..8 yields -1.
+ */
+static int sdt_arg_parse_size(char *n_ptr, const char **suffix)
+{
+       long size = strtol(n_ptr, NULL, 10);
+
+       if (size >= -8 && size <= 8) {
+               *suffix = type_to_suffix[size + 8];
+               return 0;
+       }
+
+       pr_debug4("Failed to get a valid sdt type\n");
+       return -1;
+}
+
+/*
+ * Append one SDT argument to the probe command in buf as
+ * " arg<i+1>=<operand><type-suffix>".
+ *
+ * arg has the form N@OP, where N is the argument size and OP the
+ * assembly operand; N@ may be absent, in which case arg is just OP.
+ * The operand is run through arch_sdt_arg_parse_op(); nothing is
+ * appended unless that returns SDT_ARG_VALID.
+ *
+ * Returns 0 on success (including a skipped argument) and a negative
+ * value on failure.
+ */
+static int synthesize_sdt_probe_arg(struct strbuf *buf, int i, const char *arg)
+{
+       const char *suffix = "";
+       char *new_op = NULL;
+       char *desc, *at, *op;
+       int ret = -1;
+
+       /* Work on a private copy: the '@' is overwritten below. */
+       desc = strdup(arg);
+       if (!desc) {
+               pr_debug4("Allocation error\n");
+               return ret;
+       }
+
+       at = strchr(desc, '@');
+       if (!at) {
+               op = desc;
+       } else {
+               /* Split "N@OP" in place: desc becomes "N", op points at "OP". */
+               *at = '\0';
+               op = at + 1;
+
+               if (sdt_arg_parse_size(desc, &suffix) != 0)
+                       goto out;
+       }
+
+       ret = arch_sdt_arg_parse_op(op, &new_op);
+       if (ret < 0)
+               goto out;
+
+       if (ret == SDT_ARG_VALID) {
+               ret = strbuf_addf(buf, " arg%d=%s%s", i + 1, new_op, suffix);
+               if (ret < 0)
+                       goto out;
+       }
+
+       ret = 0;
+out:
+       free(desc);
+       free(new_op);
+       return ret;
+}
+
+/*
+ * Build the uprobe definition for one SDT note:
+ *   "p:<sdtgrp>/<note name> <pathname>:0x<addr>" followed by one
+ * " argN=..." entry per argument that synthesize_sdt_probe_arg() emits.
+ *
+ * Returns the string detached from the strbuf (the caller frees it), or
+ * NULL if any step fails.
+ */
+static char *synthesize_sdt_probe_command(struct sdt_note *note,
+                                       const char *pathname,
+                                       const char *sdtgrp)
+{
+       struct strbuf buf;
+       char *ret = NULL, **args = NULL;
+       int i, args_count;
+
+       if (strbuf_init(&buf, 32) < 0)
+               return NULL;
+
+       if (strbuf_addf(&buf, "p:%s/%s %s:0x%llx",
+                               sdtgrp, note->name, pathname,
+                               sdt_note__get_addr(note)) < 0)
+               goto error;
+
+       if (note->args) {
+               args = argv_split(note->args, &args_count);
+               /* A failed split must not be indexed in the loop below. */
+               if (args == NULL)
+                       goto error;
+
+               for (i = 0; i < args_count; ++i) {
+                       if (synthesize_sdt_probe_arg(&buf, i, args[i]) < 0)
+                               goto error;
+               }
+       }
+
+       ret = strbuf_detach(&buf, NULL);
+error:
+       /* Reached on success too: release the split argument vector. */
+       if (args)
+               argv_free(args);
+       strbuf_release(&buf);
+       return ret;
+}
+
 int probe_cache__scan_sdt(struct probe_cache *pcache, const char *pathname)
 {
        struct probe_cache_entry *entry = NULL;
@@ -723,11 +835,12 @@ int probe_cache__scan_sdt(struct probe_cache *pcache, const char *pathname)
                        entry->pev.group = strdup(sdtgrp);
                        list_add_tail(&entry->node, &pcache->entries);
                }
-               ret = asprintf(&buf, "p:%s/%s %s:0x%llx",
-                               sdtgrp, note->name, pathname,
-                               sdt_note__get_addr(note));
-               if (ret < 0)
+               buf = synthesize_sdt_probe_command(note, pathname, sdtgrp);
+               if (!buf) {
+                       ret = -ENOMEM;
                        break;
+               }
+
                strlist__add(entry->tevlist, buf);
                free(buf);
                entry = NULL;
@@ -877,59 +990,72 @@ int probe_cache__show_all_caches(struct strfilter *filter)
        return 0;
 }
 
+/*
+ * Features looked up in the tracing README; each value is an index into
+ * ftrace_readme_table[] and FTRACE_README_END is the entry count.
+ */
+enum ftrace_readme {
+       FTRACE_README_PROBE_TYPE_X = 0,
+       FTRACE_README_KRETPROBE_OFFSET,
+       FTRACE_README_END,
+};
+
+/*
+ * One glob pattern per feature. avail starts out false and is set by
+ * scan_ftrace_readme() once a README line matches the pattern.
+ */
 static struct {
        const char *pattern;
-       bool    avail;
-       bool    checked;
-} probe_type_table[] = {
-#define DEFINE_TYPE(idx, pat, def_avail)       \
-       [idx] = {.pattern = pat, .avail = (def_avail)}
-       DEFINE_TYPE(PROBE_TYPE_U, "* u8/16/32/64,*", true),
-       DEFINE_TYPE(PROBE_TYPE_S, "* s8/16/32/64,*", true),
-       DEFINE_TYPE(PROBE_TYPE_X, "* x8/16/32/64,*", false),
-       DEFINE_TYPE(PROBE_TYPE_STRING, "* string,*", true),
-       DEFINE_TYPE(PROBE_TYPE_BITFIELD,
-                   "* b<bit-width>@<bit-offset>/<container-size>", true),
+       bool avail;
+} ftrace_readme_table[] = {
+#define DEFINE_TYPE(idx, pat)                  \
+       [idx] = {.pattern = pat, .avail = false}
+       DEFINE_TYPE(FTRACE_README_PROBE_TYPE_X, "*type: * x8/16/32/64,*"),
+       DEFINE_TYPE(FTRACE_README_KRETPROBE_OFFSET, "*place (kretprobe): *"),
 };
 
-bool probe_type_is_available(enum probe_type type)
+static bool scan_ftrace_readme(enum ftrace_readme type)
 {
+       int fd;
        FILE *fp;
        char *buf = NULL;
        size_t len = 0;
-       bool target_line = false;
-       bool ret = probe_type_table[type].avail;
+       bool ret = false;
+       static bool scanned = false;
 
-       if (type >= PROBE_TYPE_END)
-               return false;
-       /* We don't have to check the type which supported by default */
-       if (ret || probe_type_table[type].checked)
-               return ret;
+       if (scanned)
+               goto result;
 
-       if (asprintf(&buf, "%s/README", tracing_path) < 0)
+       fd = open_trace_file("README", false);
+       if (fd < 0)
                return ret;
 
-       fp = fopen(buf, "r");
-       if (!fp)
-               goto end;
-
-       zfree(&buf);
-       while (getline(&buf, &len, fp) > 0 && !ret) {
-               if (!target_line) {
-                       target_line = !!strstr(buf, " type: ");
-                       if (!target_line)
-                               continue;
-               } else if (strstr(buf, "\t          ") != buf)
-                       break;
-               ret = strglobmatch(buf, probe_type_table[type].pattern);
+       fp = fdopen(fd, "r");
+       if (!fp) {
+               close(fd);
+               return ret;
        }
-       /* Cache the result */
-       probe_type_table[type].checked = true;
-       probe_type_table[type].avail = ret;
+
+       while (getline(&buf, &len, fp) > 0)
+               for (enum ftrace_readme i = 0; i < FTRACE_README_END; i++)
+                       if (!ftrace_readme_table[i].avail)
+                               ftrace_readme_table[i].avail =
+                                       strglobmatch(buf, ftrace_readme_table[i].pattern);
+       scanned = true;
 
        fclose(fp);
-end:
        free(buf);
 
-       return ret;
+result:
+       if (type >= FTRACE_README_END)
+               return false;
+
+       return ftrace_readme_table[type].avail;
+}
+
+/*
+ * Every probe type below PROBE_TYPE_END is reported as available except
+ * PROBE_TYPE_X, whose availability comes from the tracing README scan.
+ */
+bool probe_type_is_available(enum probe_type type)
+{
+       if (type >= PROBE_TYPE_END)
+               return false;
+
+       if (type != PROBE_TYPE_X)
+               return true;
+
+       return scan_ftrace_readme(FTRACE_README_PROBE_TYPE_X);
+}
+
+/* Report the README scan result for FTRACE_README_KRETPROBE_OFFSET. */
+bool kretprobe_offset_is_supported(void)
+{
+       return scan_ftrace_readme(FTRACE_README_KRETPROBE_OFFSET);
 }
index eba44c3..5ecc9d3 100644 (file)
@@ -1,10 +1,11 @@
 #ifndef __PROBE_FILE_H
 #define __PROBE_FILE_H
 
-#include "strlist.h"
-#include "strfilter.h"
 #include "probe-event.h"
 
+struct strlist;
+struct strfilter;
+
 /* Cache of probe definitions */
 struct probe_cache_entry {
        struct list_head        node;
@@ -35,11 +36,13 @@ enum probe_type {
 
 /* probe-file.c depends on libelf */
 #ifdef HAVE_LIBELF_SUPPORT
+int open_trace_file(const char *trace_file, bool readwrite);
 int probe_file__open(int flag);
 int probe_file__open_both(int *kfd, int *ufd, int flag);
 struct strlist *probe_file__get_namelist(int fd);
 struct strlist *probe_file__get_rawlist(int fd);
 int probe_file__add_event(int fd, struct probe_trace_event *tev);
+
 int probe_file__del_events(int fd, struct strfilter *filter);
 int probe_file__get_events(int fd, struct strfilter *filter,
                                  struct strlist *plist);
@@ -64,6 +67,7 @@ struct probe_cache_entry *probe_cache__find_by_name(struct probe_cache *pcache,
                                        const char *group, const char *event);
 int probe_cache__show_all_caches(struct strfilter *filter);
 bool probe_type_is_available(enum probe_type type);
+bool kretprobe_offset_is_supported(void);
 #else  /* ! HAVE_LIBELF_SUPPORT */
 static inline struct probe_cache *probe_cache__new(const char *tgt __maybe_unused)
 {
index 57cd268..a5731de 100644 (file)
@@ -19,6 +19,7 @@
  *
  */
 
+#include <inttypes.h>
 #include <sys/utsname.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include "debug.h"
 #include "intlist.h"
 #include "util.h"
+#include "strlist.h"
 #include "symbol.h"
 #include "probe-finder.h"
 #include "probe-file.h"
+#include "string2.h"
 
 /* Kprobe tracer basic type is up to u64 */
 #define MAX_BASIC_TYPE_BITS    64
index 2956c51..27f0615 100644 (file)
@@ -2,9 +2,9 @@
 #define _PROBE_FINDER_H
 
 #include <stdbool.h>
-#include "util.h"
 #include "intlist.h"
 #include "probe-event.h"
+#include "sane_ctype.h"
 
 #define MAX_PROBE_BUFFER       1024
 #define MAX_PROBES              128
index 0546a43..9f3b0d9 100644 (file)
@@ -21,8 +21,10 @@ util/cgroup.c
 util/parse-branch-options.c
 util/rblist.c
 util/counts.c
+util/print_binary.c
 util/strlist.c
 util/trace-event.c
 ../lib/rbtree.c
 util/string.c
 util/symbol_fprintf.c
+util/units.c
index a5fbc01..c129e99 100644 (file)
@@ -4,12 +4,26 @@
 #include <poll.h>
 #include <linux/err.h>
 #include "evlist.h"
+#include "callchain.h"
 #include "evsel.h"
 #include "event.h"
 #include "cpumap.h"
+#include "print_binary.h"
 #include "thread_map.h"
 
 /*
+ * Provide these two so that we don't have to link against callchain.c and
+ * start dragging hist.c, etc.
+ */
+struct callchain_param callchain_param;
+
+int parse_callchain_record(const char *arg __maybe_unused,
+                          struct callchain_param *param __maybe_unused)
+{
+       /* Stub: both arguments are ignored and 0 is always returned. */
+       return 0;
+}
+
+/*
  * Support debug printing even though util/debug.c is not linked.  That means
  * implementing 'verbose' and 'eprintf'.
  */
index 293534c..1ba8920 100644 (file)
@@ -1,3 +1,4 @@
+#include <errno.h>
 #include <stdlib.h>
 #include "strbuf.h"
 #include "quote.h"
index 98bf584..d91bdf5 100644 (file)
@@ -2,6 +2,7 @@
 #include "evsel.h"
 #include "cpumap.h"
 #include "parse-events.h"
+#include <errno.h>
 #include <api/fs/fs.h>
 #include "util.h"
 #include "cloexec.h"
diff --git a/tools/perf/util/sane_ctype.h b/tools/perf/util/sane_ctype.h
new file mode 100644 (file)
index 0000000..4308c22
--- /dev/null
@@ -0,0 +1,51 @@
+#ifndef _PERF_SANE_CTYPE_H
+#define _PERF_SANE_CTYPE_H
+
+extern const char *graph_line;
+extern const char *graph_dotted_line;
+extern const char *spaces;
+extern const char *dots;
+
+/* Sane ctype - no locale, and works with signed chars */
+#undef isascii
+#undef isspace
+#undef isdigit
+#undef isxdigit
+#undef isalpha
+#undef isprint
+#undef isalnum
+#undef islower
+#undef isupper
+#undef tolower
+#undef toupper
+
+extern unsigned char sane_ctype[256];
+#define GIT_SPACE              0x01
+#define GIT_DIGIT              0x02
+#define GIT_ALPHA              0x04
+#define GIT_GLOB_SPECIAL       0x08
+#define GIT_REGEX_SPECIAL      0x10
+#define GIT_PRINT_EXTRA                0x20
+#define GIT_PRINT              0x3E
+#define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
+#define isascii(x) (((x) & ~0x7f) == 0)
+#define isspace(x) sane_istest(x,GIT_SPACE)
+#define isdigit(x) sane_istest(x,GIT_DIGIT)
+#define isxdigit(x)    \
+       (sane_istest(toupper(x), GIT_ALPHA | GIT_DIGIT) && toupper(x) < 'G')
+#define isalpha(x) sane_istest(x,GIT_ALPHA)
+#define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
+#define isprint(x) sane_istest(x,GIT_PRINT)
+/*
+ * Bit 0x20 is the bit sane_case() sets for tolower() and clears for
+ * toupper(), so it tells the two cases apart for GIT_ALPHA characters.
+ * The argument is parenthesized so that expression arguments such as
+ * islower(a | b) are tested as a whole.
+ */
+#define islower(x) (sane_istest(x,GIT_ALPHA) && ((x) & 0x20))
+#define isupper(x) (sane_istest(x,GIT_ALPHA) && !((x) & 0x20))
+#define tolower(x) sane_case((unsigned char)(x), 0x20)
+#define toupper(x) sane_case((unsigned char)(x), 0)
+
+/*
+ * Case-fold helper behind the tolower()/toupper() macros: for a value
+ * flagged GIT_ALPHA in sane_ctype[], clear bit 0x20 and OR in @high;
+ * any other value is returned untouched.
+ */
+static inline int sane_case(int x, int high)
+{
+       if (!sane_istest(x, GIT_ALPHA))
+               return x;
+
+       return (x & ~0x20) | high;
+}
+
+#endif /* _PERF_SANE_CTYPE_H */
index dff043a..2b12bdb 100644 (file)
@@ -19,6 +19,7 @@
  *
  */
 
+#include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
index 783326c..9d92af7 100644 (file)
@@ -21,6 +21,7 @@
 
 #include <Python.h>
 
+#include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -45,6 +46,7 @@
 #include "../call-path.h"
 #include "thread_map.h"
 #include "cpumap.h"
+#include "print_binary.h"
 #include "stat.h"
 
 PyMODINIT_FUNC initperf_trace_context(void);
index 1dd617d..3041c6b 100644 (file)
@@ -1,5 +1,8 @@
+#include <errno.h>
+#include <inttypes.h>
 #include <linux/kernel.h>
 #include <traceevent/event-parse.h>
+#include <api/fs/fs.h>
 
 #include <byteswap.h>
 #include <unistd.h>
@@ -16,6 +19,7 @@
 #include "perf_regs.h"
 #include "asm/bug.h"
 #include "auxtrace.h"
+#include "thread.h"
 #include "thread-stack.h"
 #include "stat.h"
 
@@ -139,8 +143,14 @@ struct perf_session *perf_session__new(struct perf_data_file *file,
                        if (perf_session__open(session) < 0)
                                goto out_close;
 
-                       perf_session__set_id_hdr_size(session);
-                       perf_session__set_comm_exec(session);
+                       /*
+                        * set session attributes that are present in perf.data
+                        * but not in pipe-mode.
+                        */
+                       if (!file->is_pipe) {
+                               perf_session__set_id_hdr_size(session);
+                               perf_session__set_comm_exec(session);
+                       }
                }
        } else  {
                session->machines.host.env = &perf_env;
@@ -155,7 +165,11 @@ struct perf_session *perf_session__new(struct perf_data_file *file,
                        pr_warning("Cannot read kernel map\n");
        }
 
-       if (tool && tool->ordering_requires_timestamps &&
+       /*
+        * In pipe-mode, evlist is empty until PERF_RECORD_HEADER_ATTR is
+        * processed, so perf_evlist__sample_id_all is not meaningful here.
+        */
+       if ((!file || !file->is_pipe) && tool && tool->ordering_requires_timestamps &&
            tool->ordered_events && !perf_evlist__sample_id_all(session->evlist)) {
                dump_printf("WARNING: No sample_id_all support, falling back to unordered processing\n");
                tool->ordered_events = false;
@@ -1239,6 +1253,8 @@ static int machines__deliver_event(struct machines *machines,
                return tool->mmap2(tool, event, sample, machine);
        case PERF_RECORD_COMM:
                return tool->comm(tool, event, sample, machine);
+       case PERF_RECORD_NAMESPACES:
+               return tool->namespaces(tool, event, sample, machine);
        case PERF_RECORD_FORK:
                return tool->fork(tool, event, sample, machine);
        case PERF_RECORD_EXIT:
@@ -1258,9 +1274,12 @@ static int machines__deliver_event(struct machines *machines,
        case PERF_RECORD_UNTHROTTLE:
                return tool->unthrottle(tool, event, sample, machine);
        case PERF_RECORD_AUX:
-               if (tool->aux == perf_event__process_aux &&
-                   (event->aux.flags & PERF_AUX_FLAG_TRUNCATED))
-                       evlist->stats.total_aux_lost += 1;
+               if (tool->aux == perf_event__process_aux) {
+                       if (event->aux.flags & PERF_AUX_FLAG_TRUNCATED)
+                               evlist->stats.total_aux_lost += 1;
+                       if (event->aux.flags & PERF_AUX_FLAG_PARTIAL)
+                               evlist->stats.total_aux_partial += 1;
+               }
                return tool->aux(tool, event, sample, machine);
        case PERF_RECORD_ITRACE_START:
                return tool->itrace_start(tool, event, sample, machine);
@@ -1494,6 +1513,11 @@ int perf_session__register_idle_thread(struct perf_session *session)
                err = -1;
        }
 
+       if (thread == NULL || thread__set_namespaces(thread, 0, NULL)) {
+               pr_err("problem inserting idle task.\n");
+               err = -1;
+       }
+
        /* machine__findnew_thread() got the thread, so put it */
        thread__put(thread);
        return err;
@@ -1548,6 +1572,23 @@ static void perf_session__warn_about_errors(const struct perf_session *session)
                            stats->nr_events[PERF_RECORD_AUX]);
        }
 
+       if (session->tool->aux == perf_event__process_aux &&
+           stats->total_aux_partial != 0) {
+               bool vmm_exclusive = false;
+
+               (void)sysfs__read_bool("module/kvm_intel/parameters/vmm_exclusive",
+                                      &vmm_exclusive);
+
+               ui__warning("AUX data had gaps in it %" PRIu64 " times out of %u!\n\n"
+                           "Are you running a KVM guest in the background?%s\n\n",
+                           stats->total_aux_partial,
+                           stats->nr_events[PERF_RECORD_AUX],
+                           vmm_exclusive ?
+                           "\nReloading kvm_intel module with vmm_exclusive=0\n"
+                           "will reduce the gaps to only guest's timeslices." :
+                           "");
+       }
+
        if (stats->nr_unknown_events != 0) {
                ui__warning("Found %u unknown events!\n\n"
                            "Is this an older tool processing a perf.data "
@@ -1628,6 +1669,7 @@ static int __perf_session__process_pipe_events(struct perf_session *session)
        buf = malloc(cur_size);
        if (!buf)
                return -errno;
+       ordered_events__set_copy_on_queue(oe, true);
 more:
        event = buf;
        err = readn(fd, event, sizeof(struct perf_event_header));
index 4bd7585..47b5e7d 100644 (file)
@@ -5,14 +5,14 @@
 #include "event.h"
 #include "header.h"
 #include "machine.h"
-#include "symbol.h"
-#include "thread.h"
 #include "data.h"
 #include "ordered-events.h"
+#include <linux/kernel.h>
 #include <linux/rbtree.h>
 #include <linux/perf_event.h>
 
 struct ip_callchain;
+struct symbol;
 struct thread;
 
 struct auxtrace;
index 0ff6222..5762ae4 100644 (file)
@@ -1,12 +1,18 @@
+#include <errno.h>
+#include <inttypes.h>
+#include <regex.h>
 #include <sys/mman.h>
 #include "sort.h"
 #include "hist.h"
 #include "comm.h"
 #include "symbol.h"
+#include "thread.h"
 #include "evsel.h"
 #include "evlist.h"
+#include "strlist.h"
 #include <traceevent/event-parse.h>
 #include "mem-events.h"
+#include <linux/kernel.h>
 
 regex_t                parent_regex;
 const char     default_parent_pattern[] = "^sys_|^do_page_fault";
@@ -323,7 +329,7 @@ char *hist_entry__get_srcline(struct hist_entry *he)
                return SRCLINE_UNKNOWN;
 
        return get_srcline(map->dso, map__rip_2objdump(map, he->ip),
-                          he->ms.sym, true);
+                          he->ms.sym, true, true);
 }
 
 static int64_t
@@ -366,7 +372,8 @@ sort__srcline_from_cmp(struct hist_entry *left, struct hist_entry *right)
                        left->branch_info->srcline_from = get_srcline(map->dso,
                                           map__rip_2objdump(map,
                                                             left->branch_info->from.al_addr),
-                                                        left->branch_info->from.sym, true);
+                                                        left->branch_info->from.sym,
+                                                        true, true);
        }
        if (!right->branch_info->srcline_from) {
                struct map *map = right->branch_info->from.map;
@@ -376,7 +383,8 @@ sort__srcline_from_cmp(struct hist_entry *left, struct hist_entry *right)
                        right->branch_info->srcline_from = get_srcline(map->dso,
                                             map__rip_2objdump(map,
                                                               right->branch_info->from.al_addr),
-                                                    right->branch_info->from.sym, true);
+                                                    right->branch_info->from.sym,
+                                                    true, true);
        }
        return strcmp(right->branch_info->srcline_from, left->branch_info->srcline_from);
 }
@@ -407,7 +415,8 @@ sort__srcline_to_cmp(struct hist_entry *left, struct hist_entry *right)
                        left->branch_info->srcline_to = get_srcline(map->dso,
                                           map__rip_2objdump(map,
                                                             left->branch_info->to.al_addr),
-                                                        left->branch_info->from.sym, true);
+                                                        left->branch_info->from.sym,
+                                                        true, true);
        }
        if (!right->branch_info->srcline_to) {
                struct map *map = right->branch_info->to.map;
@@ -417,7 +426,8 @@ sort__srcline_to_cmp(struct hist_entry *left, struct hist_entry *right)
                        right->branch_info->srcline_to = get_srcline(map->dso,
                                             map__rip_2objdump(map,
                                                               right->branch_info->to.al_addr),
-                                                    right->branch_info->to.sym, true);
+                                                    right->branch_info->to.sym,
+                                                    true, true);
        }
        return strcmp(right->branch_info->srcline_to, left->branch_info->srcline_to);
 }
@@ -448,7 +458,7 @@ static char *hist_entry__get_srcfile(struct hist_entry *e)
                return no_srcfile;
 
        sf = __get_srcline(map->dso, map__rip_2objdump(map, e->ip),
-                        e->ms.sym, false, true);
+                        e->ms.sym, false, true, true);
        if (!strcmp(sf, SRCLINE_UNKNOWN))
                return no_srcfile;
        p = strchr(sf, ':');
@@ -536,6 +546,46 @@ struct sort_entry sort_cpu = {
        .se_width_idx   = HISTC_CPU,
 };
 
+/* --sort cgroup_id */
+
+/*
+ * Three-way compare of two device numbers.
+ *
+ * Returns 1 when @right_dev is larger, -1 when it is smaller and 0 when
+ * both are equal.  Explicit comparisons are used instead of casting the
+ * u64 difference to int64_t, which reports the wrong sign once the two
+ * values are more than INT64_MAX apart.
+ */
+static int64_t _sort__cgroup_dev_cmp(u64 left_dev, u64 right_dev)
+{
+       if (right_dev > left_dev)
+               return 1;
+       if (right_dev < left_dev)
+               return -1;
+       return 0;
+}
+
+/*
+ * Three-way compare of two inode numbers.
+ *
+ * Returns 1 when @right_ino is larger, -1 when it is smaller and 0 when
+ * both are equal.  Explicit comparisons are used instead of casting the
+ * u64 difference to int64_t, which reports the wrong sign once the two
+ * values are more than INT64_MAX apart.
+ */
+static int64_t _sort__cgroup_inode_cmp(u64 left_ino, u64 right_ino)
+{
+       if (right_ino > left_ino)
+               return 1;
+       if (right_ino < left_ino)
+               return -1;
+       return 0;
+}
+
+/*
+ * Order two hist entries by cgroup id: device number first, inode number
+ * as the tie breaker.  Note that @right's fields are handed to the
+ * helpers as their "left" argument and @left's as their "right" one.
+ */
+static int64_t
+sort__cgroup_id_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       int64_t cmp = _sort__cgroup_dev_cmp(right->cgroup_id.dev,
+                                           left->cgroup_id.dev);
+
+       if (cmp == 0)
+               cmp = _sort__cgroup_inode_cmp(right->cgroup_id.ino,
+                                             left->cgroup_id.ino);
+
+       return cmp;
+}
+
+/*
+ * Format the entry's cgroup id as "<dev>/0x<inode>" (decimal device,
+ * hexadecimal inode) into @bf.  @width is not used.
+ *
+ * Both fields are u64, so the <inttypes.h> PRI*64 macros are used; plain
+ * "%lu"/"%lx" only match where long is 64 bits wide.
+ */
+static int hist_entry__cgroup_id_snprintf(struct hist_entry *he,
+                                         char *bf, size_t size,
+                                         unsigned int width __maybe_unused)
+{
+       return repsep_snprintf(bf, size, "%" PRIu64 "/0x%" PRIx64,
+                              he->cgroup_id.dev, he->cgroup_id.ino);
+}
+
+struct sort_entry sort_cgroup_id = {
+       .se_header      = "cgroup id (dev/inode)",
+       .se_cmp         = sort__cgroup_id_cmp,
+       .se_snprintf    = hist_entry__cgroup_id_snprintf,
+       .se_width_idx   = HISTC_CGROUP_ID,
+};
+
 /* --sort socket */
 
 static int64_t
@@ -846,6 +896,9 @@ static int hist_entry__mispredict_snprintf(struct hist_entry *he, char *bf,
 static int64_t
 sort__cycles_cmp(struct hist_entry *left, struct hist_entry *right)
 {
+       if (!left->branch_info || !right->branch_info)
+               return cmp_null(left->branch_info, right->branch_info);
+
        return left->branch_info->flags.cycles -
                right->branch_info->flags.cycles;
 }
@@ -853,6 +906,8 @@ sort__cycles_cmp(struct hist_entry *left, struct hist_entry *right)
 static int hist_entry__cycles_snprintf(struct hist_entry *he, char *bf,
                                    size_t size, unsigned int width)
 {
+       if (!he->branch_info)
+               return scnprintf(bf, size, "%-.*s", width, "N/A");
        if (he->branch_info->flags.cycles == 0)
                return repsep_snprintf(bf, size, "%-*s", width, "-");
        return repsep_snprintf(bf, size, "%-*hd", width,
@@ -1396,6 +1451,46 @@ struct sort_entry sort_transaction = {
        .se_width_idx   = HISTC_TRANSACTION,
 };
 
+/* --sort symbol_size */
+
+/*
+ * Three-way compare of two symbols by symbol__size(); a NULL symbol is
+ * treated as having size 0.  Returns -1, 0 or 1.
+ */
+static int64_t _sort__sym_size_cmp(struct symbol *sym_l, struct symbol *sym_r)
+{
+       int64_t lhs = 0;
+       int64_t rhs = 0;
+
+       if (sym_l != NULL)
+               lhs = symbol__size(sym_l);
+       if (sym_r != NULL)
+               rhs = symbol__size(sym_r);
+
+       if (lhs < rhs)
+               return -1;
+
+       return lhs == rhs ? 0 : 1;
+}
+
+/*
+ * se_cmp callback for the "symbol_size" sort key.  @right's symbol is
+ * passed as the first operand of the size comparison, @left's as the
+ * second.
+ */
+static int64_t
+sort__sym_size_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       struct symbol *first = right->ms.sym;
+       struct symbol *second = left->ms.sym;
+
+       return _sort__sym_size_cmp(first, second);
+}
+
+/*
+ * Print the size of @sym right-aligned in a field of @width characters,
+ * or the string "unknown" when there is no symbol.
+ *
+ * The '*' field width must be an int and the value must match its
+ * conversion, so both arguments are cast explicitly.
+ * NOTE(review): symbol__size() is declared outside this hunk; the cast to
+ * size_t assumes its result fits in a size_t - confirm.
+ */
+static int _hist_entry__sym_size_snprintf(struct symbol *sym, char *bf,
+                                         size_t bf_size, unsigned int width)
+{
+       if (sym)
+               return repsep_snprintf(bf, bf_size, "%*zu", (int)width,
+                                      (size_t)symbol__size(sym));
+
+       return repsep_snprintf(bf, bf_size, "%*s", (int)width, "unknown");
+}
+
+/* se_snprintf callback for "symbol_size": formats the size of he->ms.sym. */
+static int hist_entry__sym_size_snprintf(struct hist_entry *he, char *bf,
+                                        size_t size, unsigned int width)
+{
+       struct symbol *sym = he->ms.sym;
+
+       return _hist_entry__sym_size_snprintf(sym, bf, size, width);
+}
+
+struct sort_entry sort_sym_size = {
+       .se_header      = "Symbol size",
+       .se_cmp         = sort__sym_size_cmp,
+       .se_snprintf    = hist_entry__sym_size_snprintf,
+       .se_width_idx   = HISTC_SYM_SIZE,
+};
+
+
 struct sort_dimension {
        const char              *name;
        struct sort_entry       *entry;
@@ -1418,6 +1513,8 @@ static struct sort_dimension common_sort_dimensions[] = {
        DIM(SORT_GLOBAL_WEIGHT, "weight", sort_global_weight),
        DIM(SORT_TRANSACTION, "transaction", sort_transaction),
        DIM(SORT_TRACE, "trace", sort_trace),
+       DIM(SORT_SYM_SIZE, "symbol_size", sort_sym_size),
+       DIM(SORT_CGROUP_ID, "cgroup_id", sort_cgroup_id),
 };
 
 #undef DIM
index 796c847..b7c7559 100644 (file)
@@ -2,7 +2,7 @@
 #define __PERF_SORT_H
 #include "../builtin.h"
 
-#include "util.h"
+#include <regex.h>
 
 #include "color.h"
 #include <linux/list.h>
@@ -11,7 +11,6 @@
 #include "symbol.h"
 #include "string.h"
 #include "callchain.h"
-#include "strlist.h"
 #include "values.h"
 
 #include "../perf.h"
@@ -21,7 +20,9 @@
 #include <subcmd/parse-options.h>
 #include "parse-events.h"
 #include "hist.h"
-#include "thread.h"
+#include "srcline.h"
+
+struct thread;
 
 extern regex_t parent_regex;
 extern const char *sort_order;
@@ -54,6 +55,11 @@ struct he_stat {
        u32                     nr_events;
 };
 
+/* A (device, inode) pair; struct hist_entry uses it for its cgroup_id. */
+struct namespace_id {
+       u64                     dev;
+       u64                     ino;
+};
+
 struct hist_entry_diff {
        bool    computed;
        union {
@@ -91,6 +97,7 @@ struct hist_entry {
        struct map_symbol       ms;
        struct thread           *thread;
        struct comm             *comm;
+       struct namespace_id     cgroup_id;
        u64                     ip;
        u64                     transaction;
        s32                     socket;
@@ -122,6 +129,7 @@ struct hist_entry {
        };
        char                    *srcline;
        char                    *srcfile;
+       struct inline_node      *inline_node;
        struct symbol           *parent;
        struct branch_info      *branch_info;
        struct hists            *hists;
@@ -211,6 +219,8 @@ enum sort_type {
        SORT_GLOBAL_WEIGHT,
        SORT_TRANSACTION,
        SORT_TRACE,
+       SORT_SYM_SIZE,
+       SORT_CGROUP_ID,
 
        /* branch stack specific sort keys */
        __SORT_BRANCH_STACK,
index b4db3f4..df051a5 100644 (file)
@@ -1,3 +1,4 @@
+#include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -7,11 +8,59 @@
 #include "util/dso.h"
 #include "util/util.h"
 #include "util/debug.h"
+#include "util/callchain.h"
+#include "srcline.h"
 
 #include "symbol.h"
 
 bool srcline_full_filename;
 
+/*
+ * Pick the file name to resolve source lines against: symsrc_filename
+ * when set, long_name otherwise.  Returns NULL for names starting with
+ * '[' and for names starting with "/tmp/perf-".
+ */
+static const char *dso__name(struct dso *dso)
+{
+       const char *name = dso->symsrc_filename ? dso->symsrc_filename
+                                                : dso->long_name;
+
+       if (name[0] == '[' || !strncmp(name, "/tmp/perf-", 10))
+               return NULL;
+
+       return name;
+}
+
+/*
+ * Append one (filename, funcname, line) record to the tail of @node's list.
+ *
+ * @filename is stored as is.  @funcname is stored as is too, unless @dso
+ * is given and dso__demangle_sym() returns a string for it: then the
+ * demangled string is stored and @funcname is freed.  Previously a
+ * @funcname passed together with a NULL @dso was silently dropped.
+ *
+ * Returns 0 on success and -1 when the list entry cannot be allocated; in
+ * that case nothing is linked and neither string is freed here.
+ */
+static int inline_list__append(char *filename, char *funcname, int line_nr,
+                              struct inline_node *node, struct dso *dso)
+{
+       struct inline_list *ilist;
+       char *demangled;
+
+       ilist = zalloc(sizeof(*ilist));
+       if (ilist == NULL)
+               return -1;
+
+       ilist->filename = filename;
+       ilist->line_nr = line_nr;
+       /* Keep the raw name by default so it is never lost. */
+       ilist->funcname = funcname;
+
+       /*
+        * Callers may pass the result of an unchecked strdup(), so do not
+        * hand a NULL name to the demangler.
+        */
+       if (dso != NULL && funcname != NULL) {
+               demangled = dso__demangle_sym(dso, 0, funcname);
+               if (demangled != NULL) {
+                       ilist->funcname = demangled;
+                       free(funcname);
+               }
+       }
+
+       list_add_tail(&ilist->list, &node->val);
+
+       return 0;
+}
+
 #ifdef HAVE_LIBBFD_SUPPORT
 
 /*
@@ -151,9 +200,17 @@ static void addr2line_cleanup(struct a2l_data *a2l)
 
 #define MAX_INLINE_NEST 1024
 
+/*
+ * Reverse the order of @node's inline list in place: walk it from the
+ * tail towards the head and move every visited entry to the tail.
+ */
+static void inline_list__reverse(struct inline_node *node)
+{
+       struct inline_list *entry, *prev;
+
+       list_for_each_entry_safe_reverse(entry, prev, &node->val, list)
+               list_move_tail(&entry->list, &node->val);
+}
+
 static int addr2line(const char *dso_name, u64 addr,
                     char **file, unsigned int *line, struct dso *dso,
-                    bool unwind_inlines)
+                    bool unwind_inlines, struct inline_node *node)
 {
        int ret = 0;
        struct a2l_data *a2l = dso->a2l;
@@ -178,8 +235,21 @@ static int addr2line(const char *dso_name, u64 addr,
 
                while (bfd_find_inliner_info(a2l->abfd, &a2l->filename,
                                             &a2l->funcname, &a2l->line) &&
-                      cnt++ < MAX_INLINE_NEST)
-                       ;
+                      cnt++ < MAX_INLINE_NEST) {
+
+                       if (node != NULL) {
+                               if (inline_list__append(strdup(a2l->filename),
+                                                       strdup(a2l->funcname),
+                                                       a2l->line, node,
+                                                       dso) != 0)
+                                       return 0;
+                       }
+               }
+
+               if ((node != NULL) &&
+                   (callchain_param.order != ORDER_CALLEE)) {
+                       inline_list__reverse(node);
+               }
        }
 
        if (a2l->found && a2l->filename) {
@@ -205,18 +275,68 @@ void dso__free_a2l(struct dso *dso)
        dso->a2l = NULL;
 }
 
+/*
+ * libbfd variant: build the list of inlined frames for @addr in @dso_name.
+ *
+ * Allocates an inline_node and lets addr2line() fill it.  Returns the
+ * node, or NULL when allocation fails, addr2line() returns 0 or no inline
+ * entry was recorded; in the latter two cases the node is deleted again.
+ */
+static struct inline_node *addr2inlines(const char *dso_name, u64 addr,
+       struct dso *dso)
+{
+       struct inline_node *node = zalloc(sizeof(*node));
+       unsigned int line = 0;
+       char *file = NULL;
+
+       if (node == NULL) {
+               perror("not enough memory for the inline node");
+               return NULL;
+       }
+
+       node->addr = addr;
+       INIT_LIST_HEAD(&node->val);
+
+       if (addr2line(dso_name, addr, &file, &line, dso, TRUE, node) &&
+           !list_empty(&node->val))
+               return node;
+
+       inline_node__delete(node);
+       return NULL;
+}
+
 #else /* HAVE_LIBBFD_SUPPORT */
 
+/*
+ * Split one "file:line" string in place.
+ *
+ * The string is cut at the first newline, if any.  "??:0" yields 0.
+ * Otherwise the first ':' is overwritten with a NUL, so @filename holds
+ * just the file part, and the number after it is stored in @line_nr.
+ *
+ * Returns 1 when a ':' was found and @line_nr was set, 0 otherwise.
+ */
+static int filename_split(char *filename, unsigned int *line_nr)
+{
+       char *newline = strchr(filename, '\n');
+       char *colon;
+
+       if (newline != NULL)
+               *newline = '\0';
+
+       if (strcmp(filename, "??:0") == 0)
+               return 0;
+
+       colon = strchr(filename, ':');
+       if (colon == NULL)
+               return 0;
+
+       *colon = '\0';
+       *line_nr = strtoul(colon + 1, NULL, 0);
+       return 1;
+}
+
 static int addr2line(const char *dso_name, u64 addr,
                     char **file, unsigned int *line_nr,
                     struct dso *dso __maybe_unused,
-                    bool unwind_inlines __maybe_unused)
+                    bool unwind_inlines __maybe_unused,
+                    struct inline_node *node __maybe_unused)
 {
        FILE *fp;
        char cmd[PATH_MAX];
        char *filename = NULL;
        size_t len;
-       char *sep;
        int ret = 0;
 
        scnprintf(cmd, sizeof(cmd), "addr2line -e %s %016"PRIx64,
@@ -233,23 +353,14 @@ static int addr2line(const char *dso_name, u64 addr,
                goto out;
        }
 
-       sep = strchr(filename, '\n');
-       if (sep)
-               *sep = '\0';
-
-       if (!strcmp(filename, "??:0")) {
-               pr_debug("no debugging info in %s\n", dso_name);
+       ret = filename_split(filename, line_nr);
+       if (ret != 1) {
                free(filename);
                goto out;
        }
 
-       sep = strchr(filename, ':');
-       if (sep) {
-               *sep++ = '\0';
-               *file = filename;
-               *line_nr = strtoul(sep, NULL, 0);
-               ret = 1;
-       }
+       *file = filename;
+
 out:
        pclose(fp);
        return ret;
@@ -259,6 +370,58 @@ void dso__free_a2l(struct dso *dso __maybe_unused)
 {
 }
 
+/*
+ * Fallback variant without libbfd: run "addr2line -e <dso> -i <addr>" and
+ * turn each "file:line" output line into an entry of a new inline_node.
+ *
+ * Reading stops at the first line filename_split() rejects or when an
+ * entry cannot be appended.  Returns the node, or NULL when popen() or
+ * the node allocation fails or when no entry was collected.
+ */
+static struct inline_node *addr2inlines(const char *dso_name, u64 addr,
+       struct dso *dso __maybe_unused)
+{
+       FILE *fp;
+       char cmd[PATH_MAX];
+       struct inline_node *node;
+       char *filename = NULL;
+       size_t len = 0;
+       unsigned int line_nr = 0;
+
+       scnprintf(cmd, sizeof(cmd), "addr2line -e %s -i %016"PRIx64,
+                 dso_name, addr);
+
+       fp = popen(cmd, "r");
+       if (fp == NULL) {
+               pr_err("popen failed for %s\n", dso_name);
+               return NULL;
+       }
+
+       node = zalloc(sizeof(*node));
+       if (node == NULL) {
+               perror("not enough memory for the inline node");
+               /* Bail out here: the common exit path dereferences node. */
+               pclose(fp);
+               return NULL;
+       }
+
+       INIT_LIST_HEAD(&node->val);
+       node->addr = addr;
+
+       while (getline(&filename, &len, fp) != -1) {
+               if (filename_split(filename, &line_nr) != 1)
+                       break;
+
+               if (inline_list__append(filename, NULL, line_nr, node,
+                                       NULL) != 0)
+                       break;
+
+               /* The list entry owns the buffer now; get a fresh one. */
+               filename = NULL;
+               len = 0;
+       }
+
+       /*
+        * Release whatever buffer was not handed to the list: a rejected
+        * line, a failed append, or the buffer getline() may leave behind
+        * when it hits end of file.
+        */
+       free(filename);
+       pclose(fp);
+
+       if (list_empty(&node->val)) {
+               inline_node__delete(node);
+               return NULL;
+       }
+
+       return node;
+}
+
 #endif /* HAVE_LIBBFD_SUPPORT */
 
 /*
@@ -268,7 +431,7 @@ void dso__free_a2l(struct dso *dso __maybe_unused)
 #define A2L_FAIL_LIMIT 123
 
 char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
-                 bool show_sym, bool unwind_inlines)
+                 bool show_sym, bool show_addr, bool unwind_inlines)
 {
        char *file = NULL;
        unsigned line = 0;
@@ -278,18 +441,11 @@ char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
        if (!dso->has_srcline)
                goto out;
 
-       if (dso->symsrc_filename)
-               dso_name = dso->symsrc_filename;
-       else
-               dso_name = dso->long_name;
-
-       if (dso_name[0] == '[')
-               goto out;
-
-       if (!strncmp(dso_name, "/tmp/perf-", 10))
+       dso_name = dso__name(dso);
+       if (dso_name == NULL)
                goto out;
 
-       if (!addr2line(dso_name, addr, &file, &line, dso, unwind_inlines))
+       if (!addr2line(dso_name, addr, &file, &line, dso, unwind_inlines, NULL))
                goto out;
 
        if (asprintf(&srcline, "%s:%u",
@@ -309,6 +465,11 @@ out:
                dso->has_srcline = 0;
                dso__free_a2l(dso);
        }
+
+       if (!show_addr)
+               return (show_sym && sym) ?
+                           strndup(sym->name, sym->namelen) : NULL;
+
        if (sym) {
                if (asprintf(&srcline, "%s+%" PRIu64, show_sym ? sym->name : "",
                                        addr - sym->start) < 0)
@@ -325,7 +486,32 @@ void free_srcline(char *srcline)
 }
 
 char *get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
-                 bool show_sym)
+                 bool show_sym, bool show_addr)
+{
+       return __get_srcline(dso, addr, sym, show_sym, show_addr, false);
+}
+
+struct inline_node *dso__parse_addr_inlines(struct dso *dso, u64 addr)
 {
-       return __get_srcline(dso, addr, sym, show_sym, false);
+       const char *dso_name;
+
+       dso_name = dso__name(dso);
+       if (dso_name == NULL)
+               return NULL;
+
+       return addr2inlines(dso_name, addr, dso);
+}
+
+/*
+ * Free @node together with every entry on its list, including each
+ * entry's filename and funcname strings.
+ */
+void inline_node__delete(struct inline_node *node)
+{
+       struct inline_list *entry, *next;
+
+       /* _safe walk: each entry is unlinked and freed inside the loop. */
+       list_for_each_entry_safe(entry, next, &node->val, list) {
+               list_del_init(&entry->list);
+               zfree(&entry->filename);
+               zfree(&entry->funcname);
+               free(entry);
+       }
+
+       free(node);
 }
diff --git a/tools/perf/util/srcline.h b/tools/perf/util/srcline.h
new file mode 100644 (file)
index 0000000..7b52ba8
--- /dev/null
@@ -0,0 +1,34 @@
+#ifndef PERF_SRCLINE_H
+#define PERF_SRCLINE_H
+
+#include <linux/list.h>
+#include <linux/types.h>
+
+struct dso;
+struct symbol;
+
+extern bool srcline_full_filename;
+char *get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
+                 bool show_sym, bool show_addr);
+char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
+                 bool show_sym, bool show_addr, bool unwind_inlines);
+void free_srcline(char *srcline);
+
+#define SRCLINE_UNKNOWN  ((char *) "??:0")
+
+/*
+ * One entry of an inline_node's list.  inline_node__delete() frees both
+ * filename and funcname, so they must be heap pointers (or NULL).
+ */
+struct inline_list {
+       char                    *filename;
+       char                    *funcname;
+       unsigned int            line_nr;
+       struct list_head        list;
+};
+
+/* Inline information for one address; val is the head of inline_list entries. */
+struct inline_node {
+       u64                     addr;
+       struct list_head        val;
+};
+
+struct inline_node *dso__parse_addr_inlines(struct dso *dso, u64 addr);
+void inline_node__delete(struct inline_node *node);
+
+#endif /* PERF_SRCLINE_H */
index 8a2bbd2..ac10cc6 100644 (file)
@@ -3,6 +3,9 @@
 #include "stat.h"
 #include "color.h"
 #include "pmu.h"
+#include "rblist.h"
+#include "evlist.h"
+#include "expr.h"
 
 enum {
        CTX_BIT_USER    = 1 << 0,
@@ -41,13 +44,73 @@ static struct stats runtime_topdown_slots_issued[NUM_CTX][MAX_NR_CPUS];
 static struct stats runtime_topdown_slots_retired[NUM_CTX][MAX_NR_CPUS];
 static struct stats runtime_topdown_fetch_bubbles[NUM_CTX][MAX_NR_CPUS];
 static struct stats runtime_topdown_recovery_bubbles[NUM_CTX][MAX_NR_CPUS];
+static struct rblist runtime_saved_values;
 static bool have_frontend_stalled;
 
 struct stats walltime_nsecs_stats;
 
+/*
+ * Node of the runtime_saved_values rblist: statistics kept for one
+ * (evsel, cpu, ctx) combination, which is also the key used by
+ * saved_value_cmp().
+ */
+struct saved_value {
+       struct rb_node rb_node;
+       struct perf_evsel *evsel;
+       int cpu;
+       int ctx;
+       struct stats stats;
+};
+
+/*
+ * rblist comparison callback: order saved values by ctx, then cpu, then
+ * by the address of the evsel they belong to.
+ *
+ * The evsel pointers are compared, not subtracted: the difference of two
+ * unrelated pointers is undefined, is scaled by the object size and would
+ * be truncated to int, so distinct events could compare as equal.
+ */
+static int saved_value_cmp(struct rb_node *rb_node, const void *entry)
+{
+       struct saved_value *a = container_of(rb_node,
+                                            struct saved_value,
+                                            rb_node);
+       const struct saved_value *b = entry;
+
+       if (a->ctx != b->ctx)
+               return a->ctx - b->ctx;
+       if (a->cpu != b->cpu)
+               return a->cpu - b->cpu;
+       if (a->evsel == b->evsel)
+               return 0;
+       return (char *)a->evsel < (char *)b->evsel ? -1 : 1;
+}
+
+/*
+ * rblist node constructor: heap-allocate a saved_value, copy @entry into
+ * it byte for byte and return its embedded rb_node, or NULL when the
+ * allocation fails.
+ */
+static struct rb_node *saved_value_new(struct rblist *rblist __maybe_unused,
+                                    const void *entry)
+{
+       struct saved_value *copy = malloc(sizeof(*copy));
+
+       if (copy == NULL)
+               return NULL;
+
+       memcpy(copy, entry, sizeof(*copy));
+       return &copy->rb_node;
+}
+
+/*
+ * Find the saved value for (@evsel, @cpu, @ctx) in runtime_saved_values.
+ * When it is missing and @create is true, add a node for that key and
+ * look it up again.  Returns NULL when no node is found.
+ */
+static struct saved_value *saved_value_lookup(struct perf_evsel *evsel,
+                                             int cpu, int ctx,
+                                             bool create)
+{
+       struct saved_value key = {
+               .cpu = cpu,
+               .ctx = ctx,
+               .evsel = evsel,
+       };
+       struct rb_node *found = rblist__find(&runtime_saved_values, &key);
+
+       if (found == NULL && create) {
+               rblist__add_node(&runtime_saved_values, &key);
+               found = rblist__find(&runtime_saved_values, &key);
+       }
+
+       if (found == NULL)
+               return NULL;
+
+       return container_of(found, struct saved_value, rb_node);
+}
+
 void perf_stat__init_shadow_stats(void)
 {
        have_frontend_stalled = pmu_have_event("cpu", "stalled-cycles-frontend");
+       rblist__init(&runtime_saved_values);
+       runtime_saved_values.node_cmp = saved_value_cmp;
+       runtime_saved_values.node_new = saved_value_new;
+       /* No delete for now */
 }
 
 static int evsel_context(struct perf_evsel *evsel)
@@ -70,6 +133,8 @@ static int evsel_context(struct perf_evsel *evsel)
 
 void perf_stat__reset_shadow_stats(void)
 {
+       struct rb_node *pos, *next;
+
        memset(runtime_nsecs_stats, 0, sizeof(runtime_nsecs_stats));
        memset(runtime_cycles_stats, 0, sizeof(runtime_cycles_stats));
        memset(runtime_stalled_cycles_front_stats, 0, sizeof(runtime_stalled_cycles_front_stats));
@@ -92,6 +157,15 @@ void perf_stat__reset_shadow_stats(void)
        memset(runtime_topdown_slots_issued, 0, sizeof(runtime_topdown_slots_issued));
        memset(runtime_topdown_fetch_bubbles, 0, sizeof(runtime_topdown_fetch_bubbles));
        memset(runtime_topdown_recovery_bubbles, 0, sizeof(runtime_topdown_recovery_bubbles));
+
+       next = rb_first(&runtime_saved_values.entries);
+       while (next) {
+               pos = next;
+               next = rb_next(pos);
+               memset(&container_of(pos, struct saved_value, rb_node)->stats,
+                      0,
+                      sizeof(struct stats));
+       }
 }
 
 /*
@@ -143,6 +217,12 @@ void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count,
                update_stats(&runtime_dtlb_cache_stats[ctx][cpu], count[0]);
        else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
                update_stats(&runtime_itlb_cache_stats[ctx][cpu], count[0]);
+
+       if (counter->collect_stat) {
+               struct saved_value *v = saved_value_lookup(counter, cpu, ctx,
+                                                          true);
+               update_stats(&v->stats, count[0]);
+       }
 }
 
 /* used for get_ratio_color() */
@@ -172,6 +252,95 @@ static const char *get_ratio_color(enum grc_type type, double ratio)
        return color;
 }
 
+/*
+ * Return the first event in @evsel_list whose name equals @name, compared
+ * without regard to case, or NULL when no event matches.
+ */
+static struct perf_evsel *perf_stat__find_event(struct perf_evlist *evsel_list,
+                                               const char *name)
+{
+       struct perf_evsel *match = NULL;
+       struct perf_evsel *pos;
+
+       evlist__for_each_entry(evsel_list, pos) {
+               if (strcasecmp(pos->name, name) == 0) {
+                       match = pos;
+                       break;
+               }
+       }
+       return match;
+}
+
+/*
+ * Resolve the events referenced by each counter's MetricExpr.
+ *
+ * For every counter in @evsel_list that has a metric_expr, the names used by
+ * the expression are looked up first among the members of the counter's
+ * group and then in the whole list. Each event found is flagged with
+ * collect_stat and stored in the counter's NULL-terminated metric_events
+ * array. If any name cannot be resolved, a hint is printed to stderr and the
+ * counter's metric_events and metric_expr are both cleared.
+ */
+void perf_stat__collect_metric_expr(struct perf_evlist *evsel_list)
+{
+       struct perf_evsel *counter, *leader, **metric_events, *oc;
+       bool found;
+       const char **metric_names;
+       int i;
+       int num_metric_names;
+
+       evlist__for_each_entry(evsel_list, counter) {
+               bool invalid = false;
+
+               leader = counter->leader;
+               if (!counter->metric_expr)
+                       continue;
+               /*
+                * The name list is only produced by expr__find_other() below.
+                * A counter that already carries metric_events has nothing
+                * left to resolve, and walking it would read metric_names and
+                * num_metric_names uninitialized.
+                */
+               if (counter->metric_events)
+                       continue;
+
+               if (expr__find_other(counter->metric_expr, counter->name,
+                                       &metric_names, &num_metric_names) < 0)
+                       continue;
+
+               /* One extra slot for the terminating NULL. */
+               metric_events = calloc(num_metric_names + 1,
+                                      sizeof(struct perf_evsel *));
+               if (!metric_events) {
+                       /* Do not leak the name list on the error path. */
+                       free(metric_names);
+                       return;
+               }
+               counter->metric_events = metric_events;
+
+               for (i = 0; i < num_metric_names; i++) {
+                       found = false;
+                       if (leader) {
+                               /* Search in group */
+                               for_each_group_member (oc, leader) {
+                                       if (!strcasecmp(oc->name, metric_names[i])) {
+                                               found = true;
+                                               break;
+                                       }
+                               }
+                       }
+                       if (!found) {
+                               /* Search ignoring groups */
+                               oc = perf_stat__find_event(evsel_list, metric_names[i]);
+                       }
+                       if (!oc) {
+                               /* Deduping one is good enough to handle duplicated PMUs. */
+                               static char *printed;
+
+                               /*
+                                * Adding events automatically would be difficult, because
+                                * it would risk creating groups that are not schedulable.
+                                * perf stat doesn't understand all the scheduling constraints
+                                * of events. So we ask the user instead to add the missing
+                                * events.
+                                */
+                               if (!printed || strcasecmp(printed, metric_names[i])) {
+                                       fprintf(stderr,
+                                               "Add %s event to groups to get metric expression for %s\n",
+                                               metric_names[i],
+                                               counter->name);
+                                       /* Release the previously remembered name. */
+                                       free(printed);
+                                       printed = strdup(metric_names[i]);
+                               }
+                               invalid = true;
+                               continue;
+                       }
+                       metric_events[i] = oc;
+                       oc->collect_stat = true;
+               }
+               metric_events[i] = NULL;
+               free(metric_names);
+               if (invalid) {
+                       free(metric_events);
+                       counter->metric_events = NULL;
+                       counter->metric_expr = NULL;
+               }
+       }
+}
+
 static void print_stalled_cycles_frontend(int cpu,
                                          struct perf_evsel *evsel, double avg,
                                          struct perf_stat_output_ctx *out)
@@ -614,6 +783,34 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
                                        be_bound * 100.);
                else
                        print_metric(ctxp, NULL, NULL, name, 0);
+       } else if (evsel->metric_expr) {
+               struct parse_ctx pctx;
+               int i;
+
+               expr__ctx_init(&pctx);
+               expr__add_id(&pctx, evsel->name, avg);
+               for (i = 0; evsel->metric_events[i]; i++) {
+                       struct saved_value *v;
+
+                       v = saved_value_lookup(evsel->metric_events[i], cpu, ctx, false);
+                       if (!v)
+                               break;
+                       expr__add_id(&pctx, evsel->metric_events[i]->name,
+                                            avg_stats(&v->stats));
+               }
+               if (!evsel->metric_events[i]) {
+                       const char *p = evsel->metric_expr;
+
+                       if (expr__parse(&ratio, &pctx, &p) == 0)
+                               print_metric(ctxp, NULL, "%8.1f",
+                                       evsel->metric_name ?
+                                       evsel->metric_name :
+                                       out->force_header ?  evsel->name : "",
+                                       ratio);
+                       else
+                               print_metric(ctxp, NULL, NULL, "", 0);
+               } else
+                       print_metric(ctxp, NULL, NULL, "", 0);
        } else if (runtime_nsecs_stats[cpu].n != 0) {
                char unit = 'M';
                char unit_buf[10];
index 0d51334..c581744 100644 (file)
@@ -1,3 +1,5 @@
+#include <errno.h>
+#include <inttypes.h>
 #include <math.h>
 #include "stat.h"
 #include "evlist.h"
index c29bb94..0a65ae2 100644 (file)
@@ -85,11 +85,13 @@ struct perf_stat_output_ctx {
        void *ctx;
        print_metric_t print_metric;
        new_line_t new_line;
+       bool force_header;
 };
 
 void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
                                   double avg, int cpu,
                                   struct perf_stat_output_ctx *out);
+void perf_stat__collect_metric_expr(struct perf_evlist *);
 
 int perf_evlist__alloc_stats(struct perf_evlist *evlist, bool alloc_raw);
 void perf_evlist__free_stats(struct perf_evlist *evlist);
index 8175939..e91b5e8 100644 (file)
@@ -1,6 +1,7 @@
 #include "debug.h"
 #include "util.h"
 #include <linux/kernel.h>
+#include <errno.h>
 
 int prefixcmp(const char *str, const char *prefix)
 {
index efb5377..4dc0af6 100644 (file)
@@ -1,7 +1,10 @@
 #include "util.h"
-#include "string.h"
+#include "string2.h"
 #include "strfilter.h"
 
+#include <errno.h>
+#include "sane_ctype.h"
+
 /* Operators */
 static const char *OP_and      = "&";  /* Logical AND */
 static const char *OP_or       = "|";  /* Logical OR */
index bddca51..cca53b6 100644 (file)
@@ -1,5 +1,9 @@
-#include "util.h"
-#include "linux/string.h"
+#include "string2.h"
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <stdlib.h>
+
+#include "sane_ctype.h"
 
 #define K 1024LL
 /*
@@ -99,8 +103,10 @@ static int count_argc(const char *str)
 void argv_free(char **argv)
 {
        char **p;
-       for (p = argv; *p; p++)
-               zfree(p);
+       for (p = argv; *p; p++) {
+               free(*p);
+               *p = NULL;
+       }
 
        free(argv);
 }
@@ -120,7 +126,7 @@ void argv_free(char **argv)
 char **argv_split(const char *str, int *argcp)
 {
        int argc = count_argc(str);
-       char **argv = zalloc(sizeof(*argv) * (argc+1));
+       char **argv = calloc(argc + 1, sizeof(*argv));
        char **argvp;
 
        if (argv == NULL)
@@ -322,12 +328,8 @@ char *strxfrchar(char *s, char from, char to)
  */
 char *ltrim(char *s)
 {
-       int len = strlen(s);
-
-       while (len && isspace(*s)) {
-               len--;
+       while (isspace(*s))
                s++;
-       }
 
        return s;
 }
@@ -381,7 +383,7 @@ char *asprintf_expr_inout_ints(const char *var, bool in, size_t nints, int *ints
                                goto out_err_overflow;
 
                        if (i > 0)
-                               printed += snprintf(e + printed, size - printed, " %s ", or_and);
+                               printed += scnprintf(e + printed, size - printed, " %s ", or_and);
                        printed += scnprintf(e + printed, size - printed,
                                             "%s %s %d", var, eq_neq, ints[i]);
                }
diff --git a/tools/perf/util/string2.h b/tools/perf/util/string2.h
new file mode 100644 (file)
index 0000000..2f61968
--- /dev/null
@@ -0,0 +1,42 @@
+#ifndef PERF_STRING_H
+#define PERF_STRING_H
+
+#include <linux/types.h>
+#include <stddef.h>
+#include <string.h>
+
+s64 perf_atoll(const char *str);
+char **argv_split(const char *str, int *argcp);
+void argv_free(char **argv);
+bool strglobmatch(const char *str, const char *pat);
+bool strglobmatch_nocase(const char *str, const char *pat);
+bool strlazymatch(const char *str, const char *pat);
+static inline bool strisglob(const char *str)
+{
+       return strpbrk(str, "*?[") != NULL; /* true when str contains '*', '?' or '[' */
+}
+int strtailcmp(const char *s1, const char *s2);
+char *strxfrchar(char *s, char from, char to);
+
+char *ltrim(char *s);
+char *rtrim(char *s);
+
+static inline char *trim(char *s)
+{
+       return ltrim(rtrim(s)); /* rtrim() runs first; ltrim() is applied to its result */
+}
+
+char *asprintf_expr_inout_ints(const char *var, bool in, size_t nints, int *ints);
+
+static inline char *asprintf_expr_in_ints(const char *var, size_t nints, int *ints)
+{
+       return asprintf_expr_inout_ints(var, true, nints, ints); /* in = true */
+}
+
+static inline char *asprintf_expr_not_in_ints(const char *var, size_t nints, int *ints)
+{
+       return asprintf_expr_inout_ints(var, false, nints, ints); /* in = false */
+}
+
+
+#endif /* PERF_STRING_H */
index 0d3dfcb..9de5434 100644 (file)
@@ -10,6 +10,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <unistd.h>
 
 static
 struct rb_node *strlist__node_new(struct rblist *rblist, const void *entry)
index 4e59dde..e7ee47f 100644 (file)
@@ -10,8 +10,9 @@
 #include "demangle-rust.h"
 #include "machine.h"
 #include "vdso.h"
-#include <symbol/kallsyms.h>
 #include "debug.h"
+#include "sane_ctype.h"
+#include <symbol/kallsyms.h>
 
 #ifndef EM_AARCH64
 #define EM_AARCH64     183  /* ARM 64 bit */
@@ -390,6 +391,11 @@ out_elf_end:
        return 0;
 }
 
+char *dso__demangle_sym(struct dso *dso, int kmodule, char *elf_name)
+{
+       return demangle_sym(dso, kmodule, elf_name); /* pass-through: same arguments, result returned as-is */
+}
+
 /*
  * Align offset to 4 bytes as needed for note name and descriptor data.
  */
@@ -1828,7 +1834,7 @@ void kcore_extract__delete(struct kcore_extract *kce)
 static int populate_sdt_note(Elf **elf, const char *data, size_t len,
                             struct list_head *sdt_notes)
 {
-       const char *provider, *name;
+       const char *provider, *name, *args;
        struct sdt_note *tmp = NULL;
        GElf_Ehdr ehdr;
        GElf_Addr base_off = 0;
@@ -1887,6 +1893,25 @@ static int populate_sdt_note(Elf **elf, const char *data, size_t len,
                goto out_free_prov;
        }
 
+       args = memchr(name, '\0', data + len - name);
+
+       /*
+        * There is no argument if:
+        * - We reached the end of the note;
+        * - There is not enough room to hold a potential string;
+        * - The argument string is empty or just contains ':'.
+        */
+       if (args == NULL || data + len - args < 2 ||
+               args[1] == ':' || args[1] == '\0')
+               tmp->args = NULL;
+       else {
+               tmp->args = strdup(++args);
+               if (!tmp->args) {
+                       ret = -ENOMEM;
+                       goto out_free_name;
+               }
+       }
+
        if (gelf_getclass(*elf) == ELFCLASS32) {
                memcpy(&tmp->addr, &buf, 3 * sizeof(Elf32_Addr));
                tmp->bit32 = true;
@@ -1898,7 +1923,7 @@ static int populate_sdt_note(Elf **elf, const char *data, size_t len,
        if (!gelf_getehdr(*elf, &ehdr)) {
                pr_debug("%s : cannot get elf header.\n", __func__);
                ret = -EBADF;
-               goto out_free_name;
+               goto out_free_args;
        }
 
        /* Adjust the prelink effect :
@@ -1923,6 +1948,8 @@ static int populate_sdt_note(Elf **elf, const char *data, size_t len,
        list_add_tail(&tmp->note_list, sdt_notes);
        return 0;
 
+out_free_args:
+       free(tmp->args);
 out_free_name:
        free(tmp->name);
 out_free_prov:
index 11cdde9..40bf5d4 100644 (file)
@@ -1,6 +1,7 @@
 #include "symbol.h"
 #include "util.h"
 
+#include <errno.h>
 #include <stdio.h>
 #include <fcntl.h>
 #include <string.h>
@@ -373,3 +374,10 @@ int kcore_copy(const char *from_dir __maybe_unused,
 void symbol__elf_init(void)
 {
 }
+
+char *dso__demangle_sym(struct dso *dso __maybe_unused,
+                       int kmodule __maybe_unused,
+                       char *elf_name __maybe_unused)
+{
+       return NULL; /* this variant ignores every argument and never produces a name */
+}
index 9b4d8ba..2cb7665 100644 (file)
@@ -3,6 +3,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
+#include <linux/kernel.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/param.h>
@@ -18,6 +19,8 @@
 #include "strlist.h"
 #include "intlist.h"
 #include "header.h"
+#include "path.h"
+#include "sane_ctype.h"
 
 #include <elf.h>
 #include <limits.h>
index 6c358b7..7acd70f 100644 (file)
@@ -13,7 +13,7 @@
 #include <libgen.h>
 #include "build-id.h"
 #include "event.h"
-#include "util.h"
+#include "path.h"
 
 #ifdef HAVE_LIBELF_SUPPORT
 #include <libelf.h>
@@ -118,7 +118,8 @@ struct symbol_conf {
                        show_ref_callgraph,
                        hide_unresolved,
                        raw_trace,
-                       report_hierarchy;
+                       report_hierarchy,
+                       inline_name;
        const char      *vmlinux_name,
                        *kallsyms_name,
                        *source_prefix,
@@ -305,6 +306,8 @@ int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss,
 int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss,
                                struct map *map);
 
+char *dso__demangle_sym(struct dso *dso, int kmodule, char *elf_name);
+
 void __symbols__insert(struct rb_root *symbols, struct symbol *sym, bool kernel);
 void symbols__insert(struct rb_root *symbols, struct symbol *sym);
 void symbols__fixup_duplicate(struct rb_root *symbols);
@@ -351,6 +354,7 @@ int arch__choose_best_symbol(struct symbol *syma, struct symbol *symb);
 struct sdt_note {
        char *name;                     /* name of the note*/
        char *provider;                 /* provider name */
+       char *args;
        bool bit32;                     /* whether the location is 32 bits? */
        union {                         /* location, base and semaphore addrs */
                Elf64_Addr a64[3];
index 90b47d8..8f254a7 100644 (file)
@@ -1,4 +1,8 @@
-#include "util.h"
+#include "term.h"
+#include <stdlib.h>
+#include <termios.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
 
 void get_term_dimensions(struct winsize *ws)
 {
index d330152..dd17d6a 100644 (file)
@@ -15,6 +15,7 @@
 
 #include <linux/rbtree.h>
 #include <linux/list.h>
+#include <errno.h>
 #include "thread.h"
 #include "event.h"
 #include "machine.h"
index f5af87f..378c418 100644 (file)
@@ -1,12 +1,15 @@
 #include "../perf.h"
+#include <errno.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
+#include <linux/kernel.h>
 #include "session.h"
 #include "thread.h"
 #include "thread-stack.h"
 #include "util.h"
 #include "debug.h"
+#include "namespaces.h"
 #include "comm.h"
 #include "unwind.h"
 
@@ -40,6 +43,7 @@ struct thread *thread__new(pid_t pid, pid_t tid)
                thread->tid = tid;
                thread->ppid = -1;
                thread->cpu = -1;
+               INIT_LIST_HEAD(&thread->namespaces_list);
                INIT_LIST_HEAD(&thread->comm_list);
 
                comm_str = malloc(32);
@@ -53,7 +57,7 @@ struct thread *thread__new(pid_t pid, pid_t tid)
                        goto err_thread;
 
                list_add(&comm->list, &thread->comm_list);
-               atomic_set(&thread->refcnt, 1);
+               refcount_set(&thread->refcnt, 1);
                RB_CLEAR_NODE(&thread->rb_node);
        }
 
@@ -66,7 +70,8 @@ err_thread:
 
 void thread__delete(struct thread *thread)
 {
-       struct comm *comm, *tmp;
+       struct namespaces *namespaces, *tmp_namespaces;
+       struct comm *comm, *tmp_comm;
 
        BUG_ON(!RB_EMPTY_NODE(&thread->rb_node));
 
@@ -76,7 +81,12 @@ void thread__delete(struct thread *thread)
                map_groups__put(thread->mg);
                thread->mg = NULL;
        }
-       list_for_each_entry_safe(comm, tmp, &thread->comm_list, list) {
+       list_for_each_entry_safe(namespaces, tmp_namespaces,
+                                &thread->namespaces_list, list) {
+               list_del(&namespaces->list);
+               namespaces__free(namespaces);
+       }
+       list_for_each_entry_safe(comm, tmp_comm, &thread->comm_list, list) {
                list_del(&comm->list);
                comm__free(comm);
        }
@@ -88,13 +98,13 @@ void thread__delete(struct thread *thread)
 struct thread *thread__get(struct thread *thread)
 {
        if (thread)
-               atomic_inc(&thread->refcnt);
+               refcount_inc(&thread->refcnt);
        return thread;
 }
 
 void thread__put(struct thread *thread)
 {
-       if (thread && atomic_dec_and_test(&thread->refcnt)) {
+       if (thread && refcount_dec_and_test(&thread->refcnt)) {
                /*
                 * Remove it from the dead_threads list, as last reference
                 * is gone.
@@ -104,6 +114,38 @@ void thread__put(struct thread *thread)
        }
 }
 
+/*
+ * Return the entry at the head of @thread's namespaces_list, or NULL when
+ * the list is empty.
+ */
+struct namespaces *thread__namespaces(const struct thread *thread)
+{
+       return list_empty(&thread->namespaces_list) ?
+               NULL :
+               list_first_entry(&thread->namespaces_list, struct namespaces, list);
+}
+
+/*
+ * Build a namespaces entry from @event and push it onto the head of
+ * @thread's namespaces_list. Returns 0 on success or -ENOMEM when the entry
+ * cannot be allocated.
+ */
+int thread__set_namespaces(struct thread *thread, u64 timestamp,
+                          struct namespaces_event *event)
+{
+       struct namespaces *prev = thread__namespaces(thread);
+       struct namespaces *fresh = namespaces__new(event);
+
+       if (fresh == NULL)
+               return -ENOMEM;
+
+       list_add(&fresh->list, &thread->namespaces_list);
+
+       /*
+        * A setns syscall must have changed a few or all of the namespaces
+        * of this thread: close the entry that was in use before, which now
+        * sits right behind the one just inserted.
+        */
+       if (timestamp && prev)
+               list_next_entry(fresh, list)->end_time = timestamp;
+
+       return 0;
+}
+
 struct comm *thread__comm(const struct thread *thread)
 {
        if (list_empty(&thread->comm_list))
index 99263cb..4eb849e 100644 (file)
@@ -1,7 +1,7 @@
 #ifndef __PERF_THREAD_H
 #define __PERF_THREAD_H
 
-#include <linux/atomic.h>
+#include <linux/refcount.h>
 #include <linux/rbtree.h>
 #include <linux/list.h>
 #include <unistd.h>
@@ -23,11 +23,12 @@ struct thread {
        pid_t                   tid;
        pid_t                   ppid;
        int                     cpu;
-       atomic_t                refcnt;
+       refcount_t              refcnt;
        char                    shortname[3];
        bool                    comm_set;
        int                     comm_len;
        bool                    dead; /* if set thread has exited */
+       struct list_head        namespaces_list;
        struct list_head        comm_list;
        u64                     db_id;
 
@@ -40,6 +41,7 @@ struct thread {
 };
 
 struct machine;
+struct namespaces;
 struct comm;
 
 struct thread *thread__new(pid_t pid, pid_t tid);
@@ -62,6 +64,10 @@ static inline void thread__exited(struct thread *thread)
        thread->dead = true;
 }
 
+struct namespaces *thread__namespaces(const struct thread *thread);
+int thread__set_namespaces(struct thread *thread, u64 timestamp,
+                          struct namespaces_event *event);
+
 int __thread__set_comm(struct thread *thread, const char *comm, u64 timestamp,
                       bool exec);
 static inline int thread__set_comm(struct thread *thread, const char *comm,
index 7c3fcc5..63ead7b 100644 (file)
@@ -1,4 +1,5 @@
 #include <dirent.h>
+#include <errno.h>
 #include <limits.h>
 #include <stdbool.h>
 #include <stdlib.h>
@@ -6,6 +7,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <unistd.h>
+#include "string2.h"
 #include "strlist.h"
 #include <string.h>
 #include <api/fs/fs.h>
@@ -66,7 +68,7 @@ struct thread_map *thread_map__new_by_pid(pid_t pid)
                for (i = 0; i < items; i++)
                        thread_map__set_pid(threads, i, atoi(namelist[i]->d_name));
                threads->nr = items;
-               atomic_set(&threads->refcnt, 1);
+               refcount_set(&threads->refcnt, 1);
        }
 
        for (i=0; i<items; i++)
@@ -83,7 +85,7 @@ struct thread_map *thread_map__new_by_tid(pid_t tid)
        if (threads != NULL) {
                thread_map__set_pid(threads, 0, tid);
                threads->nr = 1;
-               atomic_set(&threads->refcnt, 1);
+               refcount_set(&threads->refcnt, 1);
        }
 
        return threads;
@@ -105,7 +107,7 @@ struct thread_map *thread_map__new_by_uid(uid_t uid)
                goto out_free_threads;
 
        threads->nr = 0;
-       atomic_set(&threads->refcnt, 1);
+       refcount_set(&threads->refcnt, 1);
 
        while ((dirent = readdir(proc)) != NULL) {
                char *end;
@@ -235,7 +237,7 @@ static struct thread_map *thread_map__new_by_pid_str(const char *pid_str)
 out:
        strlist__delete(slist);
        if (threads)
-               atomic_set(&threads->refcnt, 1);
+               refcount_set(&threads->refcnt, 1);
        return threads;
 
 out_free_namelist:
@@ -255,7 +257,7 @@ struct thread_map *thread_map__new_dummy(void)
        if (threads != NULL) {
                thread_map__set_pid(threads, 0, -1);
                threads->nr = 1;
-               atomic_set(&threads->refcnt, 1);
+               refcount_set(&threads->refcnt, 1);
        }
        return threads;
 }
@@ -300,7 +302,7 @@ struct thread_map *thread_map__new_by_tid_str(const char *tid_str)
        }
 out:
        if (threads)
-               atomic_set(&threads->refcnt, 1);
+               refcount_set(&threads->refcnt, 1);
        return threads;
 
 out_free_threads:
@@ -326,7 +328,7 @@ static void thread_map__delete(struct thread_map *threads)
        if (threads) {
                int i;
 
-               WARN_ONCE(atomic_read(&threads->refcnt) != 0,
+               WARN_ONCE(refcount_read(&threads->refcnt) != 0,
                          "thread map refcnt unbalanced\n");
                for (i = 0; i < threads->nr; i++)
                        free(thread_map__comm(threads, i));
@@ -337,13 +339,13 @@ static void thread_map__delete(struct thread_map *threads)
 struct thread_map *thread_map__get(struct thread_map *map)
 {
        if (map)
-               atomic_inc(&map->refcnt);
+               refcount_inc(&map->refcnt);
        return map;
 }
 
 void thread_map__put(struct thread_map *map)
 {
-       if (map && atomic_dec_and_test(&map->refcnt))
+       if (map && refcount_dec_and_test(&map->refcnt))
                thread_map__delete(map);
 }
 
@@ -423,7 +425,7 @@ static void thread_map__copy_event(struct thread_map *threads,
                threads->map[i].comm = strndup(event->entries[i].comm, 16);
        }
 
-       atomic_set(&threads->refcnt, 1);
+       refcount_set(&threads->refcnt, 1);
 }
 
 struct thread_map *thread_map__new_event(struct thread_map_event *event)
index ea0ef08..bd34d7a 100644 (file)
@@ -3,7 +3,7 @@
 
 #include <sys/types.h>
 #include <stdio.h>
-#include <linux/atomic.h>
+#include <linux/refcount.h>
 
 struct thread_map_data {
        pid_t    pid;
@@ -11,7 +11,7 @@ struct thread_map_data {
 };
 
 struct thread_map {
-       atomic_t refcnt;
+       refcount_t refcnt;
        int nr;
        struct thread_map_data map[];
 };
index d1b21c7..5b5d021 100644 (file)
@@ -117,3 +117,28 @@ bool perf_time__skip_sample(struct perf_time_interval *ptime, u64 timestamp)
 
        return false;
 }
+
+/*
+ * Format the nanosecond @timestamp into @buf (at most @sz bytes) as
+ * "<seconds>.<microseconds>", with the microseconds zero-padded to six
+ * digits. Returns whatever scnprintf() returns.
+ */
+int timestamp__scnprintf_usec(u64 timestamp, char *buf, size_t sz)
+{
+       const u64 whole_sec = timestamp / NSEC_PER_SEC;
+       const u64 frac_usec = (timestamp - whole_sec * NSEC_PER_SEC) / NSEC_PER_USEC;
+
+       return scnprintf(buf, sz, "%"PRIu64".%06"PRIu64, whole_sec, frac_usec);
+}
+
+/*
+ * Write the current local time into @buf (at most @sz bytes) as
+ * YYYYmmddHHMMSS followed by two digits of hundredths of a second.
+ * Returns 0 on success, -1 when the time cannot be read or formatted.
+ */
+int fetch_current_timestamp(char *buf, size_t sz)
+{
+       char stamp[32];
+       struct timeval now;
+       struct tm parts;
+
+       if (gettimeofday(&now, NULL) != 0)
+               return -1;
+       if (localtime_r(&now.tv_sec, &parts) == NULL)
+               return -1;
+       if (strftime(stamp, sizeof(stamp), "%Y%m%d%H%M%S", &parts) == 0)
+               return -1;
+
+       scnprintf(buf, sz, "%s%02u", stamp, (unsigned)now.tv_usec / 10000);
+       return 0;
+}
index c1f197c..8656be0 100644 (file)
@@ -1,6 +1,9 @@
 #ifndef _TIME_UTILS_H_
 #define _TIME_UTILS_H_
 
+#include <stddef.h>
+#include <linux/types.h>
+
 struct perf_time_interval {
        u64 start, end;
 };
@@ -11,4 +14,8 @@ int perf_time__parse_str(struct perf_time_interval *ptime, const char *ostr);
 
 bool perf_time__skip_sample(struct perf_time_interval *ptime, u64 timestamp);
 
+int timestamp__scnprintf_usec(u64 timestamp, char *buf, size_t sz);
+
+int fetch_current_timestamp(char *buf, size_t sz);
+
 #endif
index ac2590a..829471a 100644 (file)
@@ -40,6 +40,7 @@ struct perf_tool {
        event_op        mmap,
                        mmap2,
                        comm,
+                       namespaces,
                        fork,
                        exit,
                        lost,
@@ -66,6 +67,7 @@ struct perf_tool {
        event_op3       auxtrace;
        bool            ordered_events;
        bool            ordering_requires_timestamps;
+       bool            namespace_events;
 };
 
 #endif /* __PERF_TOOL_H */
index b2940c8..9bdfb78 100644 (file)
@@ -5,7 +5,7 @@
 #include <linux/types.h>
 #include <stddef.h>
 #include <stdbool.h>
-#include <termios.h>
+#include <sys/ioctl.h>
 
 struct perf_evlist;
 struct perf_evsel;
index de0078e..746bbee 100644 (file)
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <ctype.h>
 #include <errno.h>
 
 #include "../perf.h"
 #include "util.h"
 #include "trace-event.h"
 
+#include "sane_ctype.h"
+
 static int get_common_field(struct scripting_context *context,
                            int *offset, int *size, const char *type)
 {
index 2742015..8a9a677 100644 (file)
@@ -192,7 +192,7 @@ static int read_ftrace_printk(struct pevent *pevent)
        if (!size)
                return 0;
 
-       buf = malloc(size);
+       buf = malloc(size + 1);
        if (buf == NULL)
                return -1;
 
@@ -201,6 +201,8 @@ static int read_ftrace_printk(struct pevent *pevent)
                return -1;
        }
 
+       buf[size] = '\0';
+
        parse_ftrace_printk(pevent, buf, size);
 
        free(buf);
diff --git a/tools/perf/util/units.c b/tools/perf/util/units.c
new file mode 100644 (file)
index 0000000..f6a2a3d
--- /dev/null
@@ -0,0 +1,39 @@
+#include "units.h"
+#include <inttypes.h>
+#include <linux/kernel.h>
+#include <linux/time64.h>
+
+/*
+ * Scale @value down by 1000 for as long as it exceeds 1000, at most three
+ * times, and store the matching suffix in *@unit: ' ' when no scaling took
+ * place, then 'K', 'M' or 'G'. Returns the scaled value.
+ */
+unsigned long convert_unit(unsigned long value, char *unit)
+{
+       static const char suffixes[] = "KMG";
+       const char *s;
+
+       *unit = ' ';
+       for (s = suffixes; *s && value > 1000; s++) {
+               value /= 1000;
+               *unit = *s;
+       }
+
+       return value;
+}
+
+/*
+ * Print @n into @buf (at most @size bytes) followed by a one-letter unit.
+ * @n is divided by 1024 while the quotient stays above 1, at most three
+ * times, moving the unit from 'B' through 'K' and 'M' to 'G'.
+ * Returns whatever scnprintf() returns.
+ */
+int unit_number__scnprintf(char *buf, size_t size, u64 n)
+{
+       static const char suffix[] = { 'B', 'K', 'M', 'G' };
+       int idx;
+
+       for (idx = 0; idx < 3 && (n / 1024) > 1; idx++)
+               n /= 1024;
+
+       return scnprintf(buf, size, "%" PRIu64 "%c", n, suffix[idx]);
+}
diff --git a/tools/perf/util/units.h b/tools/perf/util/units.h
new file mode 100644 (file)
index 0000000..3ed7774
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef PERF_UNIT_H
+#define PERF_UNIT_H
+
+#include <stddef.h>
+#include <linux/types.h>
+
+unsigned long convert_unit(unsigned long value, char *unit);
+int unit_number__scnprintf(char *buf, size_t size, u64 n);
+
+#endif /* PERF_UNIT_H */
index 783a53f..f90e11a 100644 (file)
@@ -12,6 +12,7 @@
 #include "event.h"
 #include "perf_regs.h"
 #include "callchain.h"
+#include "util.h"
 
 static char *debuginfo_path;
 
index 5832866..4a2b269 100644 (file)
@@ -2,10 +2,12 @@
 #define __PERF_UNWIND_LIBDW_H
 
 #include <elfutils/libdwfl.h>
-#include "event.h"
-#include "thread.h"
 #include "unwind.h"
 
+struct machine;
+struct perf_sample;
+struct thread;
+
 bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg);
 
 struct unwind_info {
index bfb9b79..f8455be 100644 (file)
  */
 
 #include <elf.h>
+#include <errno.h>
 #include <gelf.h>
 #include <fcntl.h>
+#include <inttypes.h>
 #include <string.h>
 #include <unistd.h>
 #include <sys/mman.h>
index 61fb1e9..bfbdcc6 100644 (file)
@@ -1,10 +1,13 @@
 #ifndef __UNWIND_H
 #define __UNWIND_H
 
+#include <linux/compiler.h>
 #include <linux/types.h>
-#include "event.h"
-#include "symbol.h"
-#include "thread.h"
+
+struct map;
+struct perf_sample;
+struct symbol;
+struct thread;
 
 struct unwind_entry {
        struct map      *map;
index d8b45ce..6450c75 100644 (file)
@@ -3,10 +3,11 @@
 #include "debug.h"
 #include <api/fs/fs.h>
 #include <sys/mman.h>
+#include <sys/stat.h>
 #include <sys/utsname.h>
-#ifdef HAVE_BACKTRACE_SUPPORT
-#include <execinfo.h>
-#endif
+#include <dirent.h>
+#include <inttypes.h>
+#include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <linux/log2.h>
 #include <linux/time64.h>
 #include <unistd.h>
-#include "callchain.h"
 #include "strlist.h"
 
-#define CALLCHAIN_PARAM_DEFAULT                        \
-       .mode           = CHAIN_GRAPH_ABS,      \
-       .min_percent    = 0.5,                  \
-       .order          = ORDER_CALLEE,         \
-       .key            = CCKEY_FUNCTION,       \
-       .value          = CCVAL_PERCENT,        \
-
-struct callchain_param callchain_param = {
-       CALLCHAIN_PARAM_DEFAULT
-};
-
-struct callchain_param callchain_param_default = {
-       CALLCHAIN_PARAM_DEFAULT
-};
-
 /*
  * XXX We need to find a better place for these things...
  */
@@ -269,28 +254,6 @@ int copyfile(const char *from, const char *to)
        return copyfile_mode(from, to, 0755);
 }
 
-unsigned long convert_unit(unsigned long value, char *unit)
-{
-       *unit = ' ';
-
-       if (value > 1000) {
-               value /= 1000;
-               *unit = 'K';
-       }
-
-       if (value > 1000) {
-               value /= 1000;
-               *unit = 'M';
-       }
-
-       if (value > 1000) {
-               value /= 1000;
-               *unit = 'G';
-       }
-
-       return value;
-}
-
 static ssize_t ion(bool is_read, int fd, void *buf, size_t n)
 {
        void *buf_start = buf;
@@ -372,42 +335,6 @@ int hex2u64(const char *ptr, u64 *long_val)
        return p - ptr;
 }
 
-/* Obtain a backtrace and print it to stdout. */
-#ifdef HAVE_BACKTRACE_SUPPORT
-void dump_stack(void)
-{
-       void *array[16];
-       size_t size = backtrace(array, ARRAY_SIZE(array));
-       char **strings = backtrace_symbols(array, size);
-       size_t i;
-
-       printf("Obtained %zd stack frames.\n", size);
-
-       for (i = 0; i < size; i++)
-               printf("%s\n", strings[i]);
-
-       free(strings);
-}
-#else
-void dump_stack(void) {}
-#endif
-
-void sighandler_dump_stack(int sig)
-{
-       psignal(sig, "perf");
-       dump_stack();
-       signal(sig, SIG_DFL);
-       raise(sig);
-}
-
-int timestamp__scnprintf_usec(u64 timestamp, char *buf, size_t sz)
-{
-       u64  sec = timestamp / NSEC_PER_SEC;
-       u64 usec = (timestamp % NSEC_PER_SEC) / NSEC_PER_USEC;
-
-       return scnprintf(buf, sz, "%"PRIu64".%06"PRIu64, sec, usec);
-}
-
 unsigned long parse_tag_value(const char *str, struct parse_tag *tags)
 {
        struct parse_tag *i = tags;
@@ -435,108 +362,6 @@ unsigned long parse_tag_value(const char *str, struct parse_tag *tags)
        return (unsigned long) -1;
 }
 
-int get_stack_size(const char *str, unsigned long *_size)
-{
-       char *endptr;
-       unsigned long size;
-       unsigned long max_size = round_down(USHRT_MAX, sizeof(u64));
-
-       size = strtoul(str, &endptr, 0);
-
-       do {
-               if (*endptr)
-                       break;
-
-               size = round_up(size, sizeof(u64));
-               if (!size || size > max_size)
-                       break;
-
-               *_size = size;
-               return 0;
-
-       } while (0);
-
-       pr_err("callchain: Incorrect stack dump size (max %ld): %s\n",
-              max_size, str);
-       return -1;
-}
-
-int parse_callchain_record(const char *arg, struct callchain_param *param)
-{
-       char *tok, *name, *saveptr = NULL;
-       char *buf;
-       int ret = -1;
-
-       /* We need buffer that we know we can write to. */
-       buf = malloc(strlen(arg) + 1);
-       if (!buf)
-               return -ENOMEM;
-
-       strcpy(buf, arg);
-
-       tok = strtok_r((char *)buf, ",", &saveptr);
-       name = tok ? : (char *)buf;
-
-       do {
-               /* Framepointer style */
-               if (!strncmp(name, "fp", sizeof("fp"))) {
-                       if (!strtok_r(NULL, ",", &saveptr)) {
-                               param->record_mode = CALLCHAIN_FP;
-                               ret = 0;
-                       } else
-                               pr_err("callchain: No more arguments "
-                                      "needed for --call-graph fp\n");
-                       break;
-
-               /* Dwarf style */
-               } else if (!strncmp(name, "dwarf", sizeof("dwarf"))) {
-                       const unsigned long default_stack_dump_size = 8192;
-
-                       ret = 0;
-                       param->record_mode = CALLCHAIN_DWARF;
-                       param->dump_size = default_stack_dump_size;
-
-                       tok = strtok_r(NULL, ",", &saveptr);
-                       if (tok) {
-                               unsigned long size = 0;
-
-                               ret = get_stack_size(tok, &size);
-                               param->dump_size = size;
-                       }
-               } else if (!strncmp(name, "lbr", sizeof("lbr"))) {
-                       if (!strtok_r(NULL, ",", &saveptr)) {
-                               param->record_mode = CALLCHAIN_LBR;
-                               ret = 0;
-                       } else
-                               pr_err("callchain: No more arguments "
-                                       "needed for --call-graph lbr\n");
-                       break;
-               } else {
-                       pr_err("callchain: Unknown --call-graph option "
-                              "value: %s\n", arg);
-                       break;
-               }
-
-       } while (0);
-
-       free(buf);
-       return ret;
-}
-
-const char *get_filename_for_perf_kvm(void)
-{
-       const char *filename;
-
-       if (perf_host && !perf_guest)
-               filename = strdup("perf.data.host");
-       else if (!perf_host && perf_guest)
-               filename = strdup("perf.data.guest");
-       else
-               filename = strdup("perf.data.kvm");
-
-       return filename;
-}
-
 int perf_event_paranoid(void)
 {
        int value;
@@ -696,7 +521,8 @@ const char *perf_tip(const char *dirpath)
 
        tips = strlist__new("tips.txt", &conf);
        if (tips == NULL)
-               return errno == ENOENT ? NULL : "Tip: get more memory! ;-p";
+               return errno == ENOENT ? NULL :
+                       "Tip: check path of tips.txt or get more memory! ;-p";
 
        if (strlist__nr_entries(tips) == 0)
                goto out;
@@ -710,95 +536,3 @@ out:
 
        return tip;
 }
-
-bool is_regular_file(const char *file)
-{
-       struct stat st;
-
-       if (stat(file, &st))
-               return false;
-
-       return S_ISREG(st.st_mode);
-}
-
-int fetch_current_timestamp(char *buf, size_t sz)
-{
-       struct timeval tv;
-       struct tm tm;
-       char dt[32];
-
-       if (gettimeofday(&tv, NULL) || !localtime_r(&tv.tv_sec, &tm))
-               return -1;
-
-       if (!strftime(dt, sizeof(dt), "%Y%m%d%H%M%S", &tm))
-               return -1;
-
-       scnprintf(buf, sz, "%s%02u", dt, (unsigned)tv.tv_usec / 10000);
-
-       return 0;
-}
-
-void print_binary(unsigned char *data, size_t len,
-                 size_t bytes_per_line, print_binary_t printer,
-                 void *extra)
-{
-       size_t i, j, mask;
-
-       if (!printer)
-               return;
-
-       bytes_per_line = roundup_pow_of_two(bytes_per_line);
-       mask = bytes_per_line - 1;
-
-       printer(BINARY_PRINT_DATA_BEGIN, 0, extra);
-       for (i = 0; i < len; i++) {
-               if ((i & mask) == 0) {
-                       printer(BINARY_PRINT_LINE_BEGIN, -1, extra);
-                       printer(BINARY_PRINT_ADDR, i, extra);
-               }
-
-               printer(BINARY_PRINT_NUM_DATA, data[i], extra);
-
-               if (((i & mask) == mask) || i == len - 1) {
-                       for (j = 0; j < mask-(i & mask); j++)
-                               printer(BINARY_PRINT_NUM_PAD, -1, extra);
-
-                       printer(BINARY_PRINT_SEP, i, extra);
-                       for (j = i & ~mask; j <= i; j++)
-                               printer(BINARY_PRINT_CHAR_DATA, data[j], extra);
-                       for (j = 0; j < mask-(i & mask); j++)
-                               printer(BINARY_PRINT_CHAR_PAD, i, extra);
-                       printer(BINARY_PRINT_LINE_END, -1, extra);
-               }
-       }
-       printer(BINARY_PRINT_DATA_END, -1, extra);
-}
-
-int is_printable_array(char *p, unsigned int len)
-{
-       unsigned int i;
-
-       if (!p || !len || p[len - 1] != 0)
-               return 0;
-
-       len--;
-
-       for (i = 0; i < len; i++) {
-               if (!isprint(p[i]) && !isspace(p[i]))
-                       return 0;
-       }
-       return 1;
-}
-
-int unit_number__scnprintf(char *buf, size_t size, u64 n)
-{
-       char unit[4] = "BKMG";
-       int i = 0;
-
-       while (((n / 1024) > 1) && (i < 3)) {
-               n /= 1024;
-               i++;
-       }
-
-       return scnprintf(buf, size, "%" PRIu64 "%c", n, unit[i]);
-}
index c74708d..3852b6d 100644 (file)
 #ifndef GIT_COMPAT_UTIL_H
 #define GIT_COMPAT_UTIL_H
 
-#ifndef FLEX_ARRAY
-/*
- * See if our compiler is known to support flexible array members.
- */
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
-# define FLEX_ARRAY /* empty */
-#elif defined(__GNUC__)
-# if (__GNUC__ >= 3)
-#  define FLEX_ARRAY /* empty */
-# else
-#  define FLEX_ARRAY 0 /* older GNU extension */
-# endif
-#endif
-
-/*
- * Otherwise, default to safer but a bit wasteful traditional style
- */
-#ifndef FLEX_ARRAY
-# define FLEX_ARRAY 1
-#endif
-#endif
-
-#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
-
-#ifdef __GNUC__
-#define TYPEOF(x) (__typeof__(x))
-#else
-#define TYPEOF(x)
-#endif
-
-#define MSB(x, bits) ((x) & TYPEOF(x)(~0ULL << (sizeof(x) * 8 - (bits))))
-#define HAS_MULTI_BITS(i)  ((i) & ((i) - 1))  /* checks if an integer has more than 1 bit set */
-
-/* Approximation of the length of the decimal representation of this type. */
-#define decimal_length(x)      ((int)(sizeof(x) * 2.56 + 0.5) + 1)
-
 #define _ALL_SOURCE 1
 #define _BSD_SOURCE 1
 /* glibc 2.20 deprecates _BSD_SOURCE in favour of _DEFAULT_SOURCE */
 #define _DEFAULT_SOURCE 1
 #define HAS_BOOL
 
-#include <unistd.h>
-#include <stdio.h>
-#include <sys/stat.h>
-#include <sys/statfs.h>
 #include <fcntl.h>
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdlib.h>
 #include <stdarg.h>
-#include <string.h>
-#include <term.h>
-#include <errno.h>
-#include <limits.h>
-#include <sys/param.h>
-#include <sys/types.h>
-#include <dirent.h>
-#include <sys/time.h>
-#include <time.h>
-#include <signal.h>
-#include <fnmatch.h>
-#include <assert.h>
-#include <regex.h>
-#include <utime.h>
-#include <sys/wait.h>
-#include <poll.h>
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <inttypes.h>
-#include <linux/kernel.h>
 #include <linux/types.h>
-#include <sys/ttydefaults.h>
-#include <api/fs/tracing_path.h>
-#include <termios.h>
-#include <linux/bitops.h>
-#include <termios.h>
-#include "strlist.h"
 
-extern const char *graph_line;
-extern const char *graph_dotted_line;
-extern const char *spaces;
-extern const char *dots;
 extern char buildid_dir[];
 
-/* On most systems <limits.h> would have given us this, but
- * not on some systems (e.g. GNU/Hurd).
- */
-#ifndef PATH_MAX
-#define PATH_MAX 4096
-#endif
-
-#ifndef PRIuMAX
-#define PRIuMAX "llu"
-#endif
-
-#ifndef PRIu32
-#define PRIu32 "u"
-#endif
-
-#ifndef PRIx32
-#define PRIx32 "x"
-#endif
-
-#ifndef PATH_SEP
-#define PATH_SEP ':'
-#endif
-
-#ifndef STRIP_EXTENSION
-#define STRIP_EXTENSION ""
-#endif
-
-#ifndef has_dos_drive_prefix
-#define has_dos_drive_prefix(path) 0
-#endif
-
-#ifndef is_dir_sep
-#define is_dir_sep(c) ((c) == '/')
-#endif
-
 #ifdef __GNUC__
 #define NORETURN __attribute__((__noreturn__))
 #else
@@ -143,22 +38,6 @@ void set_warning_routine(void (*routine)(const char *err, va_list params));
 int prefixcmp(const char *str, const char *prefix);
 void set_buildid_dir(const char *dir);
 
-#ifdef __GLIBC_PREREQ
-#if __GLIBC_PREREQ(2, 1)
-#define HAVE_STRCHRNUL
-#endif
-#endif
-
-#ifndef HAVE_STRCHRNUL
-#define strchrnul gitstrchrnul
-static inline char *gitstrchrnul(const char *s, int c)
-{
-       while (*s && *s != c)
-               s++;
-       return (char *)s;
-}
-#endif
-
 static inline void *zalloc(size_t size)
 {
        return calloc(1, size);
@@ -166,47 +45,8 @@ static inline void *zalloc(size_t size)
 
 #define zfree(ptr) ({ free(*ptr); *ptr = NULL; })
 
-/* Sane ctype - no locale, and works with signed chars */
-#undef isascii
-#undef isspace
-#undef isdigit
-#undef isxdigit
-#undef isalpha
-#undef isprint
-#undef isalnum
-#undef islower
-#undef isupper
-#undef tolower
-#undef toupper
-
-extern unsigned char sane_ctype[256];
-#define GIT_SPACE              0x01
-#define GIT_DIGIT              0x02
-#define GIT_ALPHA              0x04
-#define GIT_GLOB_SPECIAL       0x08
-#define GIT_REGEX_SPECIAL      0x10
-#define GIT_PRINT_EXTRA                0x20
-#define GIT_PRINT              0x3E
-#define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
-#define isascii(x) (((x) & ~0x7f) == 0)
-#define isspace(x) sane_istest(x,GIT_SPACE)
-#define isdigit(x) sane_istest(x,GIT_DIGIT)
-#define isxdigit(x)    \
-       (sane_istest(toupper(x), GIT_ALPHA | GIT_DIGIT) && toupper(x) < 'G')
-#define isalpha(x) sane_istest(x,GIT_ALPHA)
-#define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
-#define isprint(x) sane_istest(x,GIT_PRINT)
-#define islower(x) (sane_istest(x,GIT_ALPHA) && (x & 0x20))
-#define isupper(x) (sane_istest(x,GIT_ALPHA) && !(x & 0x20))
-#define tolower(x) sane_case((unsigned char)(x), 0x20)
-#define toupper(x) sane_case((unsigned char)(x), 0)
-
-static inline int sane_case(int x, int high)
-{
-       if (sane_istest(x, GIT_ALPHA))
-               x = (x & ~0x20) | high;
-       return x;
-}
+struct dirent;
+struct strlist;
 
 int mkdir_p(char *path, mode_t mode);
 int rm_rf(const char *path);
@@ -216,19 +56,6 @@ int copyfile(const char *from, const char *to);
 int copyfile_mode(const char *from, const char *to, mode_t mode);
 int copyfile_offset(int fromfd, loff_t from_ofs, int tofd, loff_t to_ofs, u64 size);
 
-s64 perf_atoll(const char *str);
-char **argv_split(const char *str, int *argcp);
-void argv_free(char **argv);
-bool strglobmatch(const char *str, const char *pat);
-bool strglobmatch_nocase(const char *str, const char *pat);
-bool strlazymatch(const char *str, const char *pat);
-static inline bool strisglob(const char *str)
-{
-       return strpbrk(str, "*?[") != NULL;
-}
-int strtailcmp(const char *s1, const char *s2);
-char *strxfrchar(char *s, char from, char to);
-unsigned long convert_unit(unsigned long value, char *unit);
 ssize_t readn(int fd, void *buf, size_t n);
 ssize_t writen(int fd, void *buf, size_t n);
 
@@ -236,23 +63,9 @@ struct perf_event_attr;
 
 void event_attr_init(struct perf_event_attr *attr);
 
-#define _STR(x) #x
-#define STR(x) _STR(x)
-
 size_t hex_width(u64 v);
 int hex2u64(const char *ptr, u64 *val);
 
-char *ltrim(char *s);
-char *rtrim(char *s);
-
-static inline char *trim(char *s)
-{
-       return ltrim(rtrim(s));
-}
-
-void dump_stack(void);
-void sighandler_dump_stack(int sig);
-
 extern unsigned int page_size;
 extern int cacheline_size;
 extern int sysctl_perf_event_max_stack;
@@ -265,63 +78,13 @@ struct parse_tag {
 
 unsigned long parse_tag_value(const char *str, struct parse_tag *tags);
 
-#define SRCLINE_UNKNOWN  ((char *) "??:0")
-
-static inline int path__join(char *bf, size_t size,
-                            const char *path1, const char *path2)
-{
-       return scnprintf(bf, size, "%s%s%s", path1, path1[0] ? "/" : "", path2);
-}
-
-static inline int path__join3(char *bf, size_t size,
-                             const char *path1, const char *path2,
-                             const char *path3)
-{
-       return scnprintf(bf, size, "%s%s%s%s%s",
-                        path1, path1[0] ? "/" : "",
-                        path2, path2[0] ? "/" : "", path3);
-}
-
-struct dso;
-struct symbol;
-
-extern bool srcline_full_filename;
-char *get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
-                 bool show_sym);
-char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
-                 bool show_sym, bool unwind_inlines);
-void free_srcline(char *srcline);
-
 int perf_event_paranoid(void);
 
 void mem_bswap_64(void *src, int byte_size);
 void mem_bswap_32(void *src, int byte_size);
 
-const char *get_filename_for_perf_kvm(void);
 bool find_process(const char *name);
 
-#ifdef HAVE_ZLIB_SUPPORT
-int gzip_decompress_to_file(const char *input, int output_fd);
-#endif
-
-#ifdef HAVE_LZMA_SUPPORT
-int lzma_decompress_to_file(const char *input, int output_fd);
-#endif
-
-char *asprintf_expr_inout_ints(const char *var, bool in, size_t nints, int *ints);
-
-static inline char *asprintf_expr_in_ints(const char *var, size_t nints, int *ints)
-{
-       return asprintf_expr_inout_ints(var, true, nints, ints);
-}
-
-static inline char *asprintf_expr_not_in_ints(const char *var, size_t nints, int *ints)
-{
-       return asprintf_expr_inout_ints(var, false, nints, ints);
-}
-
-int get_stack_size(const char *str, unsigned long *_size);
-
 int fetch_kernel_version(unsigned int *puint,
                         char *str, size_t str_sz);
 #define KVER_VERSION(x)                (((x) >> 16) & 0xff)
@@ -331,37 +94,9 @@ int fetch_kernel_version(unsigned int *puint,
 #define KVER_PARAM(x)  KVER_VERSION(x), KVER_PATCHLEVEL(x), KVER_SUBLEVEL(x)
 
 const char *perf_tip(const char *dirpath);
-bool is_regular_file(const char *file);
-int fetch_current_timestamp(char *buf, size_t sz);
-
-enum binary_printer_ops {
-       BINARY_PRINT_DATA_BEGIN,
-       BINARY_PRINT_LINE_BEGIN,
-       BINARY_PRINT_ADDR,
-       BINARY_PRINT_NUM_DATA,
-       BINARY_PRINT_NUM_PAD,
-       BINARY_PRINT_SEP,
-       BINARY_PRINT_CHAR_DATA,
-       BINARY_PRINT_CHAR_PAD,
-       BINARY_PRINT_LINE_END,
-       BINARY_PRINT_DATA_END,
-};
 
-typedef void (*print_binary_t)(enum binary_printer_ops,
-                              unsigned int val,
-                              void *extra);
-
-void print_binary(unsigned char *data, size_t len,
-                 size_t bytes_per_line, print_binary_t printer,
-                 void *extra);
-
-#if !defined(__GLIBC__) && !defined(__ANDROID__)
-extern int sched_getcpu(void);
+#ifndef HAVE_SCHED_GETCPU_SUPPORT
+int sched_getcpu(void);
 #endif
 
-int is_printable_array(char *p, unsigned int len);
-
-int timestamp__scnprintf_usec(u64 timestamp, char *buf, size_t sz);
-
-int unit_number__scnprintf(char *buf, size_t size, u64 n);
 #endif /* GIT_COMPAT_UTIL_H */
index 5074be4..5de2e15 100644 (file)
@@ -1,4 +1,7 @@
+#include <inttypes.h>
+#include <stdio.h>
 #include <stdlib.h>
+#include <errno.h>
 
 #include "util.h"
 #include "values.h"
@@ -108,24 +111,45 @@ static int perf_read_values__findnew_thread(struct perf_read_values *values,
        return i;
 }
 
-static void perf_read_values__enlarge_counters(struct perf_read_values *values)
+/*
+ * Double the capacity of the counterrawid, countername and per-thread
+ * value arrays of @values.
+ *
+ * Returns 0 on success, -ENOMEM if any realloc() fails.
+ *
+ * Each array that was grown successfully is stored back into @values right
+ * away: realloc() has already released the old block, so keeping the old
+ * pointer (or freeing the new one) would leave @values pointing at freed
+ * memory.  counters_max is only raised once every array has been grown, so
+ * after a failure @values still describes the old capacity.
+ */
+static int perf_read_values__enlarge_counters(struct perf_read_values *values)
 {
-       int i;
+       char **countername;
+       int i, counters_max = values->counters_max * 2;
+       u64 *counterrawid = realloc(values->counterrawid, counters_max * sizeof(*values->counterrawid));
+
+       if (!counterrawid) {
+               pr_debug("failed to enlarge read_values rawid array\n");
+               return -ENOMEM;
+       }
+       values->counterrawid = counterrawid;
 
-       values->counters_max *= 2;
-       values->counterrawid = realloc(values->counterrawid,
-                                      values->counters_max * sizeof(*values->counterrawid));
-       values->countername = realloc(values->countername,
-                                     values->counters_max * sizeof(*values->countername));
-       if (!values->counterrawid || !values->countername)
-               die("failed to enlarge read_values counters arrays");
+       countername = realloc(values->countername, counters_max * sizeof(*values->countername));
+       if (!countername) {
+               pr_debug("failed to enlarge read_values name array\n");
+               return -ENOMEM;
+       }
+       values->countername = countername;
 
        for (i = 0; i < values->threads; i++) {
-               values->value[i] = realloc(values->value[i],
-                                          values->counters_max * sizeof(**values->value));
-               if (!values->value[i])
-                       die("failed to enlarge read_values counters arrays");
+               u64 *value = realloc(values->value[i], counters_max * sizeof(**values->value));
+
+               /* NULL means failure; values->value[i] is still valid then */
+               if (!value) {
+                       pr_debug("failed to enlarge read_values ->values array\n");
+                       return -ENOMEM;
+               }
+
+               values->value[i] = value;
        }
+
+       values->counters_max = counters_max;
+
+       return 0;
 }
 
 static int perf_read_values__findnew_counter(struct perf_read_values *values,
@@ -137,8 +161,11 @@ static int perf_read_values__findnew_counter(struct perf_read_values *values,
                if (values->counterrawid[i] == rawid)
                        return i;
 
-       if (values->counters == values->counters_max)
-               perf_read_values__enlarge_counters(values);
+       if (values->counters == values->counters_max) {
+               i = perf_read_values__enlarge_counters(values);
+               if (i)
+                       return i;
+       }
 
        i = values->counters++;
        values->counterrawid[i] = rawid;
@@ -172,8 +199,10 @@ static void perf_read_values__display_pretty(FILE *fp,
        int *counterwidth;
 
        counterwidth = malloc(values->counters * sizeof(*counterwidth));
-       if (!counterwidth)
-               die("failed to allocate counterwidth array");
+       if (!counterwidth) {
+               fprintf(fp, "INTERNAL ERROR: Failed to allocate counterwidth array\n");
+               return;
+       }
        tidwidth = 3;
        pidwidth = 3;
        for (j = 0; j < values->counters; j++)
index 7bdcad4..d3c39ee 100644 (file)
@@ -1,4 +1,4 @@
-
+#include <errno.h>
 #include <unistd.h>
 #include <stdio.h>
 #include <string.h>
index c10ba41..7251fdb 100644 (file)
@@ -1,5 +1,7 @@
 #include "xyarray.h"
 #include "util.h"
+#include <stdlib.h>
+#include <string.h>
 
 struct xyarray *xyarray__new(int xlen, int ylen, size_t entry_size)
 {
index 495a449..1329d84 100644 (file)
@@ -4,6 +4,7 @@
 #include <sys/mman.h>
 #include <zlib.h>
 
+#include "util/compress.h"
 #include "util/util.h"
 #include "util/debug.h"
 
index 93b0aa7..39c2c7d 100644 (file)
@@ -156,6 +156,7 @@ out:
                                         */
                        case 0x2C:      /* Westmere EP - Gulftown */
                                cpu_info->caps |= CPUPOWER_CAP_HAS_TURBO_RATIO;
+                               break;
                        case 0x2A:      /* SNB */
                        case 0x2D:      /* SNB Xeon */
                        case 0x3A:      /* IVB */
diff --git a/tools/power/pm-graph/Makefile b/tools/power/pm-graph/Makefile
new file mode 100644 (file)
index 0000000..4d0ccc8
--- /dev/null
@@ -0,0 +1,28 @@
+PREFIX         ?= /usr
+DESTDIR                ?=
+
+# None of these targets produce a file of the same name.
+.PHONY: all install uninstall
+
+all:
+       @echo "Nothing to build"
+
+# The bin/ links are relative so that they stay valid when DESTDIR is a
+# staging root, and -f lets "make install" be re-run over an existing
+# installation.  Man pages are plain data, so they are installed 0644.
+install :
+       install -d  $(DESTDIR)$(PREFIX)/lib/pm-graph
+       install analyze_suspend.py $(DESTDIR)$(PREFIX)/lib/pm-graph
+       install analyze_boot.py $(DESTDIR)$(PREFIX)/lib/pm-graph
+
+       install -d  $(DESTDIR)$(PREFIX)/bin
+       ln -sf ../lib/pm-graph/analyze_boot.py $(DESTDIR)$(PREFIX)/bin/bootgraph
+       ln -sf ../lib/pm-graph/analyze_suspend.py $(DESTDIR)$(PREFIX)/bin/sleepgraph
+
+       install -d  $(DESTDIR)$(PREFIX)/share/man/man8
+       install -m 644 bootgraph.8 $(DESTDIR)$(PREFIX)/share/man/man8
+       install -m 644 sleepgraph.8 $(DESTDIR)$(PREFIX)/share/man/man8
+
+# rm -f and the directory test keep uninstall usable after a partial or
+# missing installation.
+uninstall :
+       rm -f $(DESTDIR)$(PREFIX)/share/man/man8/bootgraph.8
+       rm -f $(DESTDIR)$(PREFIX)/share/man/man8/sleepgraph.8
+
+       rm -f $(DESTDIR)$(PREFIX)/bin/bootgraph
+       rm -f $(DESTDIR)$(PREFIX)/bin/sleepgraph
+
+       rm -f $(DESTDIR)$(PREFIX)/lib/pm-graph/analyze_boot.py
+       rm -f $(DESTDIR)$(PREFIX)/lib/pm-graph/analyze_suspend.py
+       if [ -d $(DESTDIR)$(PREFIX)/lib/pm-graph ]; then rmdir $(DESTDIR)$(PREFIX)/lib/pm-graph; fi
diff --git a/tools/power/pm-graph/analyze_boot.py b/tools/power/pm-graph/analyze_boot.py
new file mode 100755 (executable)
index 0000000..3e1dcbb
--- /dev/null
@@ -0,0 +1,824 @@
+#!/usr/bin/python
+#
+# Tool for analyzing boot timing
+# Copyright (c) 2013, Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms and conditions of the GNU General Public License,
+# version 2, as published by the Free Software Foundation.
+#
+# This program is distributed in the hope it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+# more details.
+#
+# Authors:
+#       Todd Brandt <todd.e.brandt@linux.intel.com>
+#
+# Description:
+#       This tool is designed to assist kernel and OS developers in optimizing
+#       their linux stack's boot time. It creates an html representation of
+#       the kernel boot timeline up to the start of the init process.
+#
+
+# ----------------- LIBRARIES --------------------
+
+import sys
+import time
+import os
+import string
+import re
+import platform
+import shutil
+from datetime import datetime, timedelta
+from subprocess import call, Popen, PIPE
+import analyze_suspend as aslib
+
+# ----------------- CLASSES --------------------
+
+# Class: SystemValues
+# Description:
+#       A global, single-instance container used to
+#       store system values and test parameters
+class SystemValues(aslib.SystemValues):
+       # Class-level defaults. __init__ below always overrides hostname,
+       # testtime and kernel, and overrides phoronix/addlogs/outfile/htmlfile
+       # when the LOG_FILE environment is detected.
+       title = 'BootGraph'
+       version = 2.0
+       hostname = 'localhost'
+       testtime = ''
+       kernel = ''
+       dmesgfile = ''
+       ftracefile = ''
+       htmlfile = 'bootgraph.html'
+       outfile = ''
+       phoronix = False
+       addlogs = False
+       useftrace = False
+       usedevsrc = True
+       suspendmode = 'boot'
+       max_graph_depth = 2
+       graph_filter = 'do_one_initcall'
+       reboot = False
+       manual = False
+       iscronjob = False
+       timeformat = '%.6f'
+       # Record the host name, a timestamp for this run and the running
+       # kernel's version string.
+       def __init__(self):
+               # Both variables must be set. The flag name suggests they come from
+               # the Phoronix Test Suite (assumption - not shown in this file).
+               # In that mode the output file and the html file are both LOG_FILE.
+               if('LOG_FILE' in os.environ and 'TEST_RESULTS_IDENTIFIER' in os.environ):
+                       self.phoronix = True
+                       self.addlogs = True
+                       self.outfile = os.environ['LOG_FILE']
+                       self.htmlfile = os.environ['LOG_FILE']
+               self.hostname = platform.node()
+               self.testtime = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
+               # kernel falls back to 'unknown' when /proc/version is absent
+               if os.path.exists('/proc/version'):
+                       fp = open('/proc/version', 'r')
+                       val = fp.read().strip()
+                       fp.close()
+                       self.kernel = self.kernelVersion(val)
+               else:
+                       self.kernel = 'unknown'
+       # Return the third whitespace-separated token of msg, e.g. the X in
+       # a 'Linux version X ...' string. Raises IndexError on shorter input.
+       def kernelVersion(self, msg):
+               return msg.split()[2]
+       # Build the kernel command line string that manualRebootRequired
+       # tells the user to boot with. The ftrace settings are appended only
+       # when useftrace is set, using max_graph_depth and graph_filter.
+       def kernelParams(self):
+               cmdline = 'initcall_debug log_buf_len=32M'
+               if self.useftrace:
+                       cmdline += ' trace_buf_size=128M trace_clock=global '\
+                       'trace_options=nooverwrite,funcgraph-abstime,funcgraph-cpu,'\
+                       'funcgraph-duration,funcgraph-proc,funcgraph-tail,'\
+                       'nofuncgraph-overhead,context-info,graph-time '\
+                       'ftrace=function_graph '\
+                       'ftrace_graph_max_depth=%d '\
+                       'ftrace_graph_filter=%s' % \
+                               (self.max_graph_depth, self.graph_filter)
+               return cmdline
+       # Check every comma-separated function name in val against the lines
+       # of <tpath>available_filter_functions, then store val unchanged as
+       # graph_filter. Unknown names are reported through doError.
+       # NOTE(review): self.tpath and doError are not defined in this class;
+       # presumably tpath comes from aslib.SystemValues and doError is defined
+       # later in this file - confirm.
+       def setGraphFilter(self, val):
+               fp = open(self.tpath+'available_filter_functions')
+               master = fp.read().split('\n')
+               fp.close()
+               for i in val.split(','):
+                       func = i.strip()
+                       if func not in master:
+                               doError('function "%s" not available for ftrace' % func)
+               self.graph_filter = val
+       # Rebuild this script's command line with -cronjob prepended.
+       # -h, -v, -cronjob and -reboot are dropped; -o, -dmesg, -ftrace and
+       # -filter are dropped together with the argument that follows them.
+       # -filter (only when it differs from the default) and -o (as an
+       # absolute path to htmlfile) are then re-added at the end.
+       def cronjobCmdString(self):
+               cmdline = '%s -cronjob' % os.path.abspath(sys.argv[0])
+               args = iter(sys.argv[1:])
+               for arg in args:
+                       if arg in ['-h', '-v', '-cronjob', '-reboot']:
+                               continue
+                       elif arg in ['-o', '-dmesg', '-ftrace', '-filter']:
+                               # also consume the option's value (Python 2 iterator API)
+                               args.next()
+                               continue
+                       cmdline += ' '+arg
+               if self.graph_filter != 'do_one_initcall':
+                       cmdline += ' -filter "%s"' % self.graph_filter
+               cmdline += ' -o "%s"' % os.path.abspath(self.htmlfile)
+               return cmdline
+       # Print the manual steps and the CMDLINE string from kernelParams(),
+       # then exit the program. Never returns.
+       def manualRebootRequired(self):
+               cmdline = self.kernelParams()
+               print 'To generate a new timeline manually, follow these steps:\n'
+               print '1. Add the CMDLINE string to your kernel command line.'
+               print '2. Reboot the system.'
+               print '3. After reboot, re-run this tool with the same arguments but no command (w/o -reboot or -manual).\n'
+               print 'CMDLINE="%s"' % cmdline
+               sys.exit()
+
+sysvals = SystemValues()
+
+# Class: Data
+# Description:
+#       The primary container for test data.
+class Data(aslib.Data):
+       # Class-level defaults. dmesg and dmesgtext are mutable, so __init__
+       # rebinds both to fresh objects on every instance.
+       dmesg = {}  # root data structure
+       start = 0.0 # test start
+       end = 0.0   # test end
+       dmesgtext = []   # dmesg text file in memory
+       testnumber = 0
+       idstr = ''
+       html_device_id = 0
+       valid = False
+       initstart = 0.0
+       boottime = ''
+       phases = ['boot']
+       do_one_initcall = False
+       # num is stored as testnumber. The only phase is 'boot'; its start
+       # and end are -1.0 until a caller fills them in.
+       def __init__(self, num):
+               self.testnumber = num
+               self.idstr = 'a'
+               self.dmesgtext = []
+               self.dmesg = {
+                       'boot': {'list': dict(), 'start': -1.0, 'end': -1.0, 'row': 0, 'color': '#dddddd'}
+               }
+       # Always returns an empty string.
+       # NOTE(review): presumably overrides a aslib.Data method that has no
+       # meaning for boot data - confirm against analyze_suspend.py.
+       def deviceTopology(self):
+               return ''
+       # Add an entry to the device list of the given phase and return the
+       # name it was stored under.
+       #   phase      - key into self.dmesg (only 'boot' is created here)
+       #   name       - requested entry name
+       #   start, end - timestamps; length is end - start when both are >= 0,
+       #                otherwise -1.0
+       #   ret, ulen  - stored unchanged under 'ret' and 'ulen'
+       def newAction(self, phase, name, start, end, ret, ulen):
+               # new device callback for a specific phase
+               self.html_device_id += 1
+               devid = '%s%d' % (self.idstr, self.html_device_id)
+               list = self.dmesg[phase]['list']
+               length = -1.0
+               if(start >= 0 and end >= 0):
+                       length = end - start
+               # make the name unique: a repeated name becomes name[2], name[3], ...
+               i = 2
+               origname = name
+               while(name in list):
+                       name = '%s[%d]' % (origname, i)
+                       i += 1
+               list[name] = {'name': name, 'start': start, 'end': end,
+                       'pid': 0, 'length': length, 'row': 0, 'id': devid,
+                       'ret': ret, 'ulen': ulen }
+               return name
+       # Attach cg to the first 'boot' device whose time span matches it and
+       # return True; return False when no device matches. Only cg.name,
+       # cg.start and cg.end are read.
+       # NOTE(review): cg is presumably an ftrace callgraph object from aslib
+       # - confirm.
+       def deviceMatch(self, cg):
+               # a zero-length cg is reported as matched without being attached
+               if cg.end - cg.start == 0:
+                       return True
+               list = self.dmesg['boot']['list']
+               for devname in list:
+                       dev = list[devname]
+                       if cg.name == 'do_one_initcall':
+                               # cg must enclose the device (bounds inclusive) and the
+                               # device must have a positive length
+                               if(cg.start <= dev['start'] and cg.end >= dev['end'] and dev['length'] > 0):
+                                       dev['ftrace'] = cg
+                                       self.do_one_initcall = True
+                                       return True
+                       else:
+                               # any other cg must lie strictly inside the device's span
+                               if(cg.start > dev['start'] and cg.end < dev['end']):
+                                       if 'ftraces' not in dev:
+                                               dev['ftraces'] = []
+                                       dev['ftraces'].append(cg)
+                                       return True
+               return False
+
+# ----------------- FUNCTIONS --------------------
+
+# Function: loadKernelLog
+# Description:
+#       Load a raw kernel log from dmesg (or from sysvals.dmesgfile when one
+#       is set) and build the boot timeline from its initcall messages
+# Output:
+#       The Data object that was filled in; data.valid is set to True once
+#       an "initcall ... returned" line has been seen
+def loadKernelLog():
+       data = Data(0)
+       data.dmesg['boot']['start'] = data.start = ktime = 0.0
+       sysvals.stamp = {
+               'time': datetime.now().strftime('%B %d %Y, %I:%M:%S %p'),
+               'host': sysvals.hostname,
+               'mode': 'boot', 'kernel': ''}
+
+       # function name -> ktime of its "calling" line, kept until it returns
+       devtemp = dict()
+       if(sysvals.dmesgfile):
+               lf = open(sysvals.dmesgfile, 'r')
+       else:
+               lf = Popen('dmesg', stdout=PIPE).stdout
+       try:
+               for line in lf:
+                       line = line.replace('\r\n', '')
+                       # drop anything in front of the [timestamp]
+                       idx = line.find('[')
+                       if idx > 1:
+                               line = line[idx:]
+                       m = re.match('[ \t]*(\[ *)(?P<ktime>[0-9\.]*)(\]) (?P<msg>.*)', line)
+                       if(not m):
+                               continue
+                       try:
+                               ktime = float(m.group('ktime'))
+                       except ValueError:
+                               # the pattern also admits '' and strings like '1.2.3';
+                               # skip such a line instead of aborting the whole parse
+                               continue
+                       # ignore everything logged later than 120 seconds
+                       if(ktime > 120):
+                               break
+                       msg = m.group('msg')
+                       data.end = data.initstart = ktime
+                       data.dmesgtext.append(line)
+                       if(ktime == 0.0 and re.match('^Linux version .*', msg)):
+                               if(not sysvals.stamp['kernel']):
+                                       sysvals.stamp['kernel'] = sysvals.kernelVersion(msg)
+                               continue
+                       m = re.match('.* setting system clock to (?P<t>.*) UTC.*', msg)
+                       if(m):
+                               # boot time = logged clock time minus the seconds since boot
+                               bt = datetime.strptime(m.group('t'), '%Y-%m-%d %H:%M:%S')
+                               bt = bt - timedelta(seconds=int(ktime))
+                               data.boottime = bt.strftime('%Y-%m-%d_%H:%M:%S')
+                               sysvals.stamp['time'] = bt.strftime('%B %d %Y, %I:%M:%S %p')
+                               continue
+                       m = re.match('^calling *(?P<f>.*)\+.*', msg)
+                       if(m):
+                               devtemp[m.group('f')] = ktime
+                               continue
+                       m = re.match('^initcall *(?P<f>.*)\+.* returned (?P<r>.*) after (?P<t>.*) usecs', msg)
+                       if(m):
+                               data.valid = True
+                               f, r, t = m.group('f', 'r', 't')
+                               # only record initcalls whose "calling" line was seen
+                               if(f in devtemp):
+                                       data.newAction('boot', f, devtemp[f], ktime, int(r), int(t))
+                                       data.end = ktime
+                                       del devtemp[f]
+                               continue
+                       if(re.match('^Freeing unused kernel memory.*', msg)):
+                               break
+       finally:
+               # close the log even if parsing raised
+               lf.close()
+
+       data.dmesg['boot']['end'] = data.end
+       return data
+
+# Function: loadTraceLog
+# Description:
+#       Parse a function_graph trace into callgraphs and match them to the
+#       boot devices in data. If no ftrace file was given, the contents of
+#       the tracing "trace" file are first copied to a temp file in /tmp.
+# Arguments:
+#       data: the Data object the callgraphs are matched against
+def loadTraceLog(data):
+       # no file given: capture the trace into a temp file
+       if not sysvals.ftracefile:
+               lib = aslib.sysvals
+               aslib.rootCheck(True)
+               if not lib.verifyFtrace():
+                       doError('ftrace not available')
+               tracer = lib.fgetVal('current_tracer').strip()
+               if tracer != 'function_graph':
+                       doError('ftrace not configured for a boot callgraph')
+               sysvals.ftracefile = '/tmp/boot_ftrace.%s.txt' % os.getpid()
+               call('cat '+lib.tpath+'trace > '+sysvals.ftracefile, shell=True)
+       if not sysvals.ftracefile:
+               doError('No trace data available')
+
+       # read the trace, building a chain of callgraphs per (proc, pid)
+       graphs = dict()
+       props = aslib.TestProps()
+       props.setTracerType('function_graph')
+       tf = open(sysvals.ftracefile, 'r')
+       for raw in tf:
+               if raw[0] == '#':
+                       continue
+               m = re.match(props.ftrace_line_fmt, raw.strip())
+               if(not m):
+                       continue
+               m_time, m_proc, m_pid, m_msg, m_dur = \
+                       m.group('time', 'proc', 'pid', 'msg', 'dur')
+               # stop at the first line stamped after the end of the timeline
+               if float(m_time) > data.end:
+                       break
+               if not (m_time and m_pid and m_msg):
+                       continue
+               t = aslib.FTraceLine(m_time, m_msg, m_dur)
+               pid = int(m_pid)
+               if t.fevent or t.fkprobe:
+                       continue
+               key = (m_proc, pid)
+               if key not in graphs:
+                       graphs[key] = [aslib.FTraceCallGraph(pid)]
+               chain = graphs[key]
+               # a true result from addLine starts a fresh callgraph for this key
+               if chain[-1].addLine(t):
+                       chain.append(aslib.FTraceCallGraph(pid))
+       tf.close()
+
+       # hand every usable callgraph to the device list
+       for (proc, pid), chain in graphs.items():
+               for cg in chain:
+                       if len(cg.list) < 1 or cg.invalid:
+                               continue
+                       if(not cg.postProcess()):
+                               print('Sanity check failed for %s-%d' % (proc, pid))
+                               continue
+                       if data.deviceMatch(cg):
+                               continue
+                       print(' BAD: %s %s-%d [%f - %f]' % (cg.name, proc, pid, cg.start, cg.end))
+
+# Function: colorForName
+# Description:
+#       Pick a (css class, hex color) pair for a name; the same name always
+#       yields the same pair
+# Arguments:
+#       name: the string to pick a color for
+# Output:
+#       One of the ten (class, color) tuples of the palette
+def colorForName(name):
+       palette = (
+               ('c1', '#ec9999'),
+               ('c2', '#ffc1a6'),
+               ('c3', '#fff0a6'),
+               ('c4', '#adf199'),
+               ('c5', '#9fadea'),
+               ('c6', '#a699c1'),
+               ('c7', '#ad99b4'),
+               ('c8', '#eaffea'),
+               ('c9', '#dcecfb'),
+               ('c10', '#ffffea'),
+       )
+       # index the palette by the sum of the character codes of the name
+       checksum = sum(ord(ch) for ch in name)
+       return palette[checksum % len(palette)]
+
+# Function: cgOverview
+# Description:
+#       Summarize the depth 1 calls of a callgraph
+# Arguments:
+#       cg: callgraph whose list of trace lines is scanned
+#       minlen: minimum length for a call to be returned in the first result
+# Output:
+#       (large, stats): large is the list of depth 1 call lines that are at
+#       least minlen long; stats maps each depth 1 function name to
+#       [summed length * 1000, number of calls]
+def cgOverview(cg, minlen):
+       big = []
+       totals = dict()
+       for entry in cg.list:
+               if not (entry.fcall and entry.depth == 1):
+                       continue
+               if entry.length >= minlen:
+                       big.append(entry)
+               tally = totals.setdefault(entry.name, [0, 0.0])
+               tally[0] += (entry.length * 1000.0)
+               tally[1] += 1
+       return (big, totals)
+
+# Function: createBootGraph
+# Description:
+#       Create the output html file from the resident test data
+# Arguments:
+#       data: the Data object holding the boot timeline (and any callgraphs)
+#       embedded: if True, the css and the closing script/footer are left
+#               out and a hidden "bounds" div is written instead
+# Output:
+#       True if the html file was created, false if it failed
+def createBootGraph(data, embedded):
+       # html function templates
+       html_srccall = '<div id={6} title="{5}" class="srccall" style="left:{1}%;top:{2}px;height:{3}px;width:{4}%;line-height:{3}px;">{0}</div>\n'
+       html_timetotal = '<table class="time1">\n<tr>'\
+               '<td class="blue">Time from Kernel Boot to start of User Mode: <b>{0} ms</b></td>'\
+               '</tr>\n</table>\n'
+
+       # device timeline
+       devtl = aslib.Timeline(100, 20)
+
+       # write the test title and general info header
+       devtl.createHeader(sysvals, 'noftrace')
+
+       # Generate the header for this timeline
+       t0 = data.start
+       tMax = data.end
+       tTotal = tMax - t0
+       if(tTotal == 0):
+               print('ERROR: No timeline data')
+               return False
+       boot_time = '%.0f'%(tTotal*1000)
+       devtl.html += html_timetotal.format(boot_time)
+
+       # determine the maximum number of rows we need to draw
+       phase = 'boot'
+       list = data.dmesg[phase]['list']
+       devlist = []
+       for devname in list:
+               d = aslib.DevItem(0, phase, list[devname])
+               devlist.append(d)
+       devtl.getPhaseRows(devlist)
+       devtl.calcTotalRows()
+
+       # draw the timeline background
+       devtl.createZoomBox()
+       boot = data.dmesg[phase]
+       length = boot['end']-boot['start']
+       left = '%.3f' % (((boot['start']-t0)*100.0)/tTotal)
+       width = '%.3f' % ((length*100.0)/tTotal)
+       devtl.html += devtl.html_tblock.format(phase, left, width, devtl.scaleH)
+       devtl.html += devtl.html_phase.format('0', '100', \
+               '%.3f'%devtl.scaleH, '%.3f'%devtl.bodyH, \
+               'white', '')
+
+       # draw the device timeline
+       num = 0
+       devstats = dict()
+       for devname in sorted(list):
+               cls, color = colorForName(devname)
+               dev = list[devname]
+               info = '@|%.3f|%.3f|%.3f|%d' % (dev['start']*1000.0, dev['end']*1000.0,
+                       dev['ulen']/1000.0, dev['ret'])
+               devstats[dev['id']] = {'info':info}
+               dev['color'] = color
+               height = devtl.phaseRowHeight(0, phase, dev['row'])
+               top = '%.6f' % ((dev['row']*height) + devtl.scaleH)
+               left = '%.6f' % (((dev['start']-t0)*100)/tTotal)
+               width = '%.6f' % (((dev['end']-dev['start'])*100)/tTotal)
+               length = ' (%0.3f ms) ' % ((dev['end']-dev['start'])*1000)
+               devtl.html += devtl.html_device.format(dev['id'],
+                       devname+length+'kernel_mode', left, top, '%.3f'%height,
+                       width, devname, ' '+cls, '')
+               # the source call boxes are placed in the lower half of the device row
+               rowtop = devtl.phaseRowTop(0, phase, dev['row'])
+               height = '%.6f' % (devtl.rowH / 2)
+               top = '%.6f' % (rowtop + devtl.scaleH + (devtl.rowH / 2))
+               if data.do_one_initcall:
+                       # one callgraph per device: draw its depth 1 calls
+                       if('ftrace' not in dev):
+                               continue
+                       cg = dev['ftrace']
+                       large, stats = cgOverview(cg, 0.001)
+                       devstats[dev['id']]['fstat'] = stats
+                       for l in large:
+                               left = '%f' % (((l.time-t0)*100)/tTotal)
+                               width = '%f' % (l.length*100/tTotal)
+                               title = '%s (%0.3fms)' % (l.name, l.length * 1000.0)
+                               devtl.html += html_srccall.format(l.name, left,
+                                       top, height, width, title, 'x%d'%num)
+                               num += 1
+                       continue
+               if('ftraces' not in dev):
+                       continue
+               # several callgraphs per device: draw one box per callgraph
+               for cg in dev['ftraces']:
+                       left = '%f' % (((cg.start-t0)*100)/tTotal)
+                       width = '%f' % ((cg.end-cg.start)*100/tTotal)
+                       cglen = (cg.end - cg.start) * 1000.0
+                       title = '%s (%0.3fms)' % (cg.name, cglen)
+                       cg.id = 'x%d' % num
+                       devtl.html += html_srccall.format(cg.name, left,
+                               top, height, width, title, dev['id']+cg.id)
+                       num += 1
+
+       # draw the time scale, try to make the number of labels readable
+       devtl.createTimeScale(t0, tMax, tTotal, phase)
+       devtl.html += '</div>\n'
+
+       # timeline is finished
+       devtl.html += '</div>\n</div>\n'
+
+       # append when the html file is also sysvals.outfile, else start fresh
+       if(sysvals.outfile == sysvals.htmlfile):
+               hf = open(sysvals.htmlfile, 'a')
+       else:
+               hf = open(sysvals.htmlfile, 'w')
+
+       # add the css if this is not an embedded run
+       extra = '\
+               .c1 {background:rgba(209,0,0,0.4);}\n\
+               .c2 {background:rgba(255,102,34,0.4);}\n\
+               .c3 {background:rgba(255,218,33,0.4);}\n\
+               .c4 {background:rgba(51,221,0,0.4);}\n\
+               .c5 {background:rgba(17,51,204,0.4);}\n\
+               .c6 {background:rgba(34,0,102,0.4);}\n\
+               .c7 {background:rgba(51,0,68,0.4);}\n\
+               .c8 {background:rgba(204,255,204,0.4);}\n\
+               .c9 {background:rgba(169,208,245,0.4);}\n\
+               .c10 {background:rgba(255,255,204,0.4);}\n\
+               .vt {transform:rotate(-60deg);transform-origin:0 0;}\n\
+               table.fstat {table-layout:fixed;padding:150px 15px 0 0;font-size:10px;column-width:30px;}\n\
+               .fstat th {width:55px;}\n\
+               .fstat td {text-align:left;width:35px;}\n\
+               .srccall {position:absolute;font-size:10px;z-index:7;overflow:hidden;color:black;text-align:center;white-space:nowrap;border-radius:5px;border:1px solid black;background:linear-gradient(to bottom right,#CCC,#969696);}\n\
+               .srccall:hover {color:white;font-weight:bold;border:1px solid white;}\n'
+       if(not embedded):
+               aslib.addCSS(hf, sysvals, 1, False, extra)
+
+       # write the device timeline
+       hf.write(devtl.html)
+
+       # add boot specific html
+       statinfo = 'var devstats = {\n'
+       for n in sorted(devstats):
+               statinfo += '\t"%s": [\n\t\t"%s",\n' % (n, devstats[n]['info'])
+               if 'fstat' in devstats[n]:
+                       funcs = devstats[n]['fstat']
+                       # largest first; once there are more than 10 functions,
+                       # stop at the first one below 0.01
+                       for f in sorted(funcs, key=funcs.get, reverse=True):
+                               if funcs[f][0] < 0.01 and len(funcs) > 10:
+                                       break
+                               statinfo += '\t\t"%f|%s|%d",\n' % (funcs[f][0], f, funcs[f][1])
+               statinfo += '\t],\n'
+       statinfo += '};\n'
+       html = \
+               '<div id="devicedetailtitle"></div>\n'\
+               '<div id="devicedetail" style="display:none;">\n'\
+               '<div id="devicedetail0">\n'\
+               '<div id="kernel_mode" class="phaselet" style="left:0%;width:100%;background:#DDDDDD"></div>\n'\
+               '</div>\n</div>\n'\
+               '<script type="text/javascript">\n'+statinfo+\
+               '</script>\n'
+       hf.write(html)
+
+       # add the callgraph html
+       if(sysvals.usecallgraph):
+               aslib.addCallgraphs(sysvals, hf, data)
+
+       # add the dmesg log as a hidden div
+       if sysvals.addlogs:
+               hf.write('<div id="dmesglog" style="display:none;">\n')
+               for line in data.dmesgtext:
+                       # escape '&' first so the entities added for < and > survive,
+                       # and terminate every entity with its semicolon
+                       line = line.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
+                       hf.write(line)
+               hf.write('</div>\n')
+
+       if(not embedded):
+               # write the footer and close
+               aslib.addScriptCode(hf, [data])
+               hf.write('</body>\n</html>\n')
+       else:
+               # embedded out will be loaded in a page, skip the js
+               hf.write('<div id=bounds style=display:none>%f,%f</div>' % \
+                       (data.start*1000, data.initstart*1000))
+       hf.close()
+       return True
+
+# Function: updateCron
+# Description:
+#    (restore=False) Set the tool to run automatically on reboot
+#    (restore=True) Restore the original crontab
+# Arguments:
+#       restore: False installs the @reboot entry, True moves the backup
+#               crontab back into place
+def updateCron(restore=False):
+       if not restore:
+               sysvals.rootUser(True)
+       crondir = '/var/spool/cron/crontabs/'
+       cronfile = crondir+'root'
+       backfile = crondir+'root-analyze_boot-backup'
+       if not os.path.exists(crondir):
+               doError('%s not found' % crondir)
+       out = Popen(['which', 'crontab'], stdout=PIPE).stdout.read()
+       if not out:
+               doError('crontab not found')
+       # on restore: move the backup cron back into place
+       if restore:
+               if os.path.exists(backfile):
+                       shutil.move(backfile, cronfile)
+               return
+       # backup current cron and install new one with reboot
+       if os.path.exists(cronfile):
+               shutil.move(cronfile, backfile)
+       else:
+               # no crontab yet: start from an empty backup
+               fp = open(backfile, 'w')
+               fp.close()
+       res = -1
+       try:
+               # the with blocks close both files even when the copy fails, so no
+               # handle is still open when the backup is moved back below
+               with open(backfile, 'r') as fp:
+                       with open(cronfile, 'w') as op:
+                               # keep every line except existing @reboot entries
+                               for line in fp:
+                                       if '@reboot' not in line:
+                                               op.write(line)
+                               op.write('@reboot python %s\n' % sysvals.cronjobCmdString())
+               res = call('crontab %s' % cronfile, shell=True)
+       except Exception, e:
+               print 'Exception: %s' % str(e)
+               shutil.move(backfile, cronfile)
+               res = -1
+       if res != 0:
+               doError('crontab failed')
+
+# Function: updateGrub
+# Description:
+#       update grub.cfg for all kernels with our parameters
+# Arguments:
+#       restore: True only re-runs update-grub; False first rewrites
+#               /etc/default/grub with sysvals.kernelParams() appended to
+#               GRUB_CMDLINE_LINUX_DEFAULT, runs update-grub, then puts the
+#               original /etc/default/grub back
+def updateGrub(restore=False):
+       # call update-grub on restore
+       if restore:
+               try:
+                       call(['update-grub'], stderr=PIPE, stdout=PIPE,
+                               env={'PATH': '.:/sbin:/usr/sbin:/usr/bin:/sbin:/bin'})
+               except Exception, e:
+                       print 'Exception: %s\n' % str(e)
+               return
+       # verify we can do this
+       sysvals.rootUser(True)
+       grubfile = '/etc/default/grub'
+       if not os.path.exists(grubfile):
+               print 'ERROR: Unable to set the kernel parameters via grub.\n'
+               sysvals.manualRebootRequired()
+       out = Popen(['which', 'update-grub'], stdout=PIPE).stdout.read()
+       if not out:
+               print 'ERROR: Unable to set the kernel parameters via grub.\n'
+               sysvals.manualRebootRequired()
+
+       # extract the option and create a grub config without it
+       tgtopt = 'GRUB_CMDLINE_LINUX_DEFAULT'
+       cmdline = ''
+       tempfile = '/etc/default/grub.analyze_boot'
+       shutil.move(grubfile, tempfile)
+       res = -1
+       try:
+               # the with blocks close both files even if something below raises
+               with open(tempfile, 'r') as fp:
+                       with open(grubfile, 'w') as op:
+                               # cont is True while inside a backslash-continued value
+                               cont = False
+                               for line in fp:
+                                       line = line.strip()
+                                       if len(line) == 0 or line[0] == '#':
+                                               continue
+                                       opt = line.split('=')[0].strip()
+                                       if opt == tgtopt:
+                                               cmdline = line.split('=', 1)[1].strip('\\')
+                                               if line[-1] == '\\':
+                                                       cont = True
+                                       elif cont:
+                                               cmdline += line.strip('\\')
+                                               if line[-1] != '\\':
+                                                       cont = False
+                                       else:
+                                               op.write('%s\n' % line)
+                               # if the target option value is in quotes, strip them
+                               sp = '"'
+                               val = cmdline.strip()
+                               # val is empty when the option is missing or has no value;
+                               # test it before indexing so that case is not an error
+                               if val and (val[0] == '\'' or val[0] == '"'):
+                                       sp = val[0]
+                                       val = val.strip(sp)
+                               cmdline = val
+                               # append our cmd line options
+                               if len(cmdline) > 0:
+                                       cmdline += ' '
+                               cmdline += sysvals.kernelParams()
+                               # write out the updated target option
+                               op.write('\n%s=%s%s%s\n' % (tgtopt, sp, cmdline, sp))
+               res = call('update-grub')
+               os.remove(grubfile)
+       except Exception, e:
+               print 'Exception: %s' % str(e)
+               res = -1
+       # cleanup
+       shutil.move(tempfile, grubfile)
+       if res != 0:
+               doError('update-grub failed')
+
+# Function: doError
+# Description:
+#       generic error function for catastrophic failures; prints the message
+#       and terminates the script with a nonzero exit status
+# Arguments:
+#       msg: the error message to print
+#       help: True if printHelp should be called before the message is
+#               printed, False otherwise
+def doError(msg, help=False):
+       if help == True:
+               printHelp()
+       print('ERROR: %s\n' % msg)
+       # exit nonzero so shells and wrapper scripts can tell the run failed
+       sys.exit(1)
+
+# Function: printHelp
+# Description:
+#       print the tool title/version followed by the usage text
+# Output:
+#       Always True
+def printHelp():
+       print('')
+       print('%s v%.1f' % (sysvals.title, sysvals.version))
+       # the fixed part of the help text, one entry per output line
+       usage = (
+               'Usage: bootgraph <options> <command>',
+               '',
+               'Description:',
+               '  This tool reads in a dmesg log of linux kernel boot and',
+               '  creates an html representation of the boot timeline up to',
+               '  the start of the init process.',
+               '',
+               '  If no specific command is given the tool reads the current dmesg',
+               '  and/or ftrace log and outputs bootgraph.html',
+               '',
+               'Options:',
+               '  -h            Print this help text',
+               '  -v            Print the current tool version',
+               '  -addlogs      Add the dmesg log to the html output',
+               '  -o file       Html timeline name (default: bootgraph.html)',
+               ' [advanced]',
+               '  -f            Use ftrace to add function detail (default: disabled)',
+               '  -callgraph    Add callgraph detail, can be very large (default: disabled)',
+               '  -maxdepth N   limit the callgraph data to N call levels (default: 2)',
+               '  -mincg ms     Discard all callgraphs shorter than ms milliseconds (e.g. 0.001 for us)',
+               '  -timeprec N   Number of significant digits in timestamps (0:S, 3:ms, [6:us])',
+               '  -expandcg     pre-expand the callgraph data in the html output (default: disabled)',
+               '  -filter list  Limit ftrace to comma-delimited list of functions (default: do_one_initcall)',
+               ' [commands]',
+               '  -reboot       Reboot the machine automatically and generate a new timeline',
+               '  -manual       Show the requirements to generate a new timeline manually',
+               '  -dmesg file   Load a stored dmesg file (used with -ftrace)',
+               '  -ftrace file  Load a stored ftrace file (used with -dmesg)',
+               '  -flistall     Print all functions capable of being captured in ftrace',
+               '',
+       )
+       for text in usage:
+               print(text)
+       return True
+
+# ----------------- MAIN --------------------
+# exec start (skipped if script is loaded as library)
+if __name__ == '__main__':
+       # Entry point: parse the command line into sysvals, optionally set up
+       # a reboot test, then load the logs and write the html timeline.
+       # loop through the command line arguments
+       cmd = ''
+       # options that select a standalone command, run after parsing
+       simplecmds = ['-updategrub', '-flistall']
+       # an iterator, so options that take a value can pull it with args.next()
+       args = iter(sys.argv[1:])
+       for arg in args:
+               if(arg == '-h'):
+                       printHelp()
+                       sys.exit()
+               elif(arg == '-v'):
+                       print("Version %.1f" % sysvals.version)
+                       sys.exit()
+               elif(arg in simplecmds):
+                       # store the command name without its leading dash
+                       cmd = arg[1:]
+               elif(arg == '-f'):
+                       sysvals.useftrace = True
+               elif(arg == '-callgraph'):
+                       # callgraph output also turns on ftrace
+                       sysvals.useftrace = True
+                       sysvals.usecallgraph = True
+               elif(arg == '-mincg'):
+                       sysvals.mincglen = aslib.getArgFloat('-mincg', args, 0.0, 10000.0)
+               elif(arg == '-timeprec'):
+                       sysvals.setPrecision(aslib.getArgInt('-timeprec', args, 0, 6))
+               elif(arg == '-maxdepth'):
+                       sysvals.max_graph_depth = aslib.getArgInt('-maxdepth', args, 0, 1000)
+               elif(arg == '-filter'):
+                       try:
+                               val = args.next()
+                       except:
+                               doError('No filter functions supplied', True)
+                       aslib.rootCheck(True)
+                       sysvals.setGraphFilter(val)
+               elif(arg == '-ftrace'):
+                       try:
+                               val = args.next()
+                       except:
+                               doError('No ftrace file supplied', True)
+                       if(os.path.exists(val) == False):
+                               doError('%s does not exist' % val)
+                       sysvals.ftracefile = val
+               elif(arg == '-addlogs'):
+                       sysvals.addlogs = True
+               elif(arg == '-expandcg'):
+                       sysvals.cgexp = True
+               elif(arg == '-dmesg'):
+                       try:
+                               val = args.next()
+                       except:
+                               doError('No dmesg file supplied', True)
+                       if(os.path.exists(val) == False):
+                               doError('%s does not exist' % val)
+                       # the input log must not be one of the output files
+                       if(sysvals.htmlfile == val or sysvals.outfile == val):
+                               doError('Output filename collision')
+                       sysvals.dmesgfile = val
+               elif(arg == '-o'):
+                       try:
+                               val = args.next()
+                       except:
+                               doError('No HTML filename supplied', True)
+                       # the html output must not be one of the input files
+                       if(sysvals.dmesgfile == val or sysvals.ftracefile == val):
+                               doError('Output filename collision')
+                       sysvals.htmlfile = val
+               elif(arg == '-reboot'):
+                       if sysvals.iscronjob:
+                               doError('-reboot and -cronjob are incompatible')
+                       sysvals.reboot = True
+               elif(arg == '-manual'):
+                       # handled in the reboot branch below, without rebooting
+                       sysvals.reboot = True
+                       sysvals.manual = True
+               # remaining options are only for cron job use
+               elif(arg == '-cronjob'):
+                       sysvals.iscronjob = True
+                       if sysvals.reboot:
+                               doError('-reboot and -cronjob are incompatible')
+               else:
+                       doError('Invalid argument: '+arg, True)
+
+       # run a standalone command, then exit
+       if cmd != '':
+               if cmd == 'updategrub':
+                       updateGrub()
+               elif cmd == 'flistall':
+                       sysvals.getFtraceFilterFunctions(False)
+               sys.exit()
+
+       # update grub, setup a cronjob, and reboot
+       if sysvals.reboot:
+               if not sysvals.manual:
+                       updateGrub()
+                       updateCron()
+                       call('reboot')
+               else:
+                       sysvals.manualRebootRequired()
+               sys.exit()
+
+       # disable the cronjob
+       if sysvals.iscronjob:
+               updateCron(True)
+               updateGrub(True)
+
+       data = loadKernelLog()
+       if sysvals.useftrace:
+               loadTraceLog(data)
+               if sysvals.iscronjob:
+                       # best effort: any failure writing tracing_on is ignored
+                       try:
+                               sysvals.fsetVal('0', 'tracing_on')
+                       except:
+                               pass
+
+       # write a one line result summary to sysvals.outfile
+       if(sysvals.outfile and sysvals.phoronix):
+               fp = open(sysvals.outfile, 'w')
+               fp.write('pass %s initstart %.3f end %.3f boot %s\n' %
+                       (data.valid, data.initstart*1000, data.end*1000, data.boottime))
+               fp.close()
+       if(not data.valid):
+               if sysvals.dmesgfile:
+                       doError('No initcall data found in %s' % sysvals.dmesgfile)
+               else:
+                       doError('No initcall data found, is initcall_debug enabled?')
+
+       print('          Host: %s' % sysvals.hostname)
+       print('     Test time: %s' % sysvals.testtime)
+       print('     Boot time: %s' % data.boottime)
+       print('Kernel Version: %s' % sysvals.kernel)
+       print('  Kernel start: %.3f' % (data.start * 1000))
+       print('    init start: %.3f' % (data.initstart * 1000))
+
+       # sysvals.phoronix is passed as the embedded flag
+       createBootGraph(data, sysvals.phoronix)
similarity index 91%
rename from scripts/analyze_suspend.py
rename to tools/power/pm-graph/analyze_suspend.py
index 20cdb2b..a9206e6 100755 (executable)
 # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 # more details.
 #
-# You should have received a copy of the GNU General Public License along with
-# this program; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-#
 # Authors:
 #       Todd Brandt <todd.e.brandt@linux.intel.com>
 #
@@ -23,7 +19,7 @@
 #       Home Page
 #         https://01.org/suspendresume
 #       Source repo
-#         https://github.com/01org/suspendresume
+#         https://github.com/01org/pm-graph
 #
 # Description:
 #       This tool is designed to assist kernel and OS developers in optimizing
@@ -71,14 +67,16 @@ from subprocess import call, Popen, PIPE
 #       A global, single-instance container used to
 #       store system values and test parameters
 class SystemValues:
+       title = 'SleepGraph'
+       version = '4.6'
        ansi = False
-       version = '4.5'
        verbose = False
        addlogs = False
        mindevlen = 0.0
        mincglen = 0.0
        cgphase = ''
        cgtest = -1
+       max_graph_depth = 0
        callloopmaxgap = 0.0001
        callloopmaxlen = 0.005
        srgap = 0
@@ -106,8 +104,8 @@ class SystemValues:
        ftracefile = ''
        htmlfile = ''
        embedded = False
-       rtcwake = False
-       rtcwaketime = 10
+       rtcwake = True
+       rtcwaketime = 15
        rtcpath = ''
        devicefilter = []
        stamp = 0
@@ -235,6 +233,12 @@ class SystemValues:
                        self.rtcpath = rtc
                if (hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()):
                        self.ansi = True
+       def rootUser(self, fatal=False):
+               # True if the USER environment variable is 'root'; with fatal set, a non-root caller goes to doError
+               isroot = (os.environ.get('USER') == 'root')
+               if fatal and not isroot:
+                       doError('This command must be run as root')
+               return isroot
        def setPrecision(self, num):
                if num < 0 or num > 6:
                        return
@@ -564,7 +568,7 @@ class SystemValues:
                self.fsetVal('global', 'trace_clock')
                # set trace buffer to a huge value
                self.fsetVal('nop', 'current_tracer')
-               self.fsetVal('100000', 'buffer_size_kb')
+               self.fsetVal('131073', 'buffer_size_kb')
                # go no further if this is just a status check
                if testing:
                        return
@@ -583,7 +587,7 @@ class SystemValues:
                        self.fsetVal('nofuncgraph-overhead', 'trace_options')
                        self.fsetVal('context-info', 'trace_options')
                        self.fsetVal('graph-time', 'trace_options')
-                       self.fsetVal('0', 'max_graph_depth')
+                       self.fsetVal('%d' % self.max_graph_depth, 'max_graph_depth')
                        cf = ['dpm_run_callback']
                        if(self.usetraceeventsonly):
                                cf += ['dpm_prepare', 'dpm_complete']
@@ -639,6 +643,12 @@ class SystemValues:
                return '\x1B[%d;40m%s\x1B[m' % (color, str)
 
 sysvals = SystemValues()
+suspendmodename = {
+       'freeze': 'Freeze (S0)',
+       'standby': 'Standby (S1)',
+       'mem': 'Suspend (S3)',
+       'disk': 'Hibernate (S4)'
+}
 
 # Class: DevProps
 # Description:
@@ -1013,6 +1023,8 @@ class Data:
                tmp = dict()
                for devname in list:
                        dev = list[devname]
+                       if dev['length'] == 0:
+                               continue
                        tmp[dev['start']] = devname
                for t in sorted(tmp):
                        slist.append(tmp[t])
@@ -1477,12 +1489,14 @@ class FTraceLine:
 #       Each instance is tied to a single device in a single phase, and is
 #       comprised of an ordered list of FTraceLine objects
 class FTraceCallGraph:
+       id = ''
        start = -1.0
        end = -1.0
        list = []
        invalid = False
        depth = 0
        pid = 0
+       name = ''
        def __init__(self, pid):
                self.start = -1.0
                self.end = -1.0
@@ -1631,9 +1645,17 @@ class FTraceCallGraph:
                                return True
                return False
        def postProcess(self, debug=False):
+               if len(self.list) > 0:
+                       self.name = self.list[0].name
                stack = dict()
                cnt = 0
+               last = 0
                for l in self.list:
+                       # ftrace bug: reported duration is not reliable
+                       # check each leaf and clip it at max possible length
+                       if(last and last.freturn and last.fcall):
+                               if last.length > l.time - last.time:
+                                       last.length = l.time - last.time
                        if(l.fcall and not l.freturn):
                                stack[l.depth] = l
                                cnt += 1
@@ -1643,11 +1665,12 @@ class FTraceCallGraph:
                                                print 'Post Process Error: Depth missing'
                                                l.debugPrint()
                                        return False
-                               # transfer total time from return line to call line
-                               stack[l.depth].length = l.length
+                               # calculate call length from call/return lines
+                               stack[l.depth].length = l.time - stack[l.depth].time
                                stack.pop(l.depth)
                                l.length = 0
                                cnt -= 1
+                       last = l
                if(cnt == 0):
                        # trace caught the whole call tree
                        return True
@@ -1664,8 +1687,8 @@ class FTraceCallGraph:
                        'dpm_prepare': 'suspend_prepare',
                        'dpm_complete': 'resume_complete'
                }
-               if(self.list[0].name in borderphase):
-                       p = borderphase[self.list[0].name]
+               if(self.name in borderphase):
+                       p = borderphase[self.name]
                        list = data.dmesg[p]['list']
                        for devname in list:
                                dev = list[devname]
@@ -1690,7 +1713,7 @@ class FTraceCallGraph:
                                break
                return found
        def newActionFromFunction(self, data):
-               name = self.list[0].name
+               name = self.name
                if name in ['dpm_run_callback', 'dpm_prepare', 'dpm_complete']:
                        return
                fs = self.start
@@ -1710,7 +1733,7 @@ class FTraceCallGraph:
                        phase, myname = out
                        data.dmesg[phase]['list'][myname]['ftrace'] = self
        def debugPrint(self):
-               print('[%f - %f] %s (%d)') % (self.start, self.end, self.list[0].name, self.pid)
+               print('[%f - %f] %s (%d)') % (self.start, self.end, self.name, self.pid)
                for l in self.list:
                        if(l.freturn and l.fcall):
                                print('%f (%02d): %s(); (%.3f us)' % (l.time, \
@@ -1738,7 +1761,7 @@ class DevItem:
 #       A container for a device timeline which calculates
 #       all the html properties to display it correctly
 class Timeline:
-       html = {}
+       html = ''
        height = 0      # total timeline height
        scaleH = 20     # timescale (top) row height
        rowH = 30       # device row height
@@ -1746,14 +1769,28 @@ class Timeline:
        rows = 0        # total timeline rows
        rowlines = dict()
        rowheight = dict()
+       html_tblock = '<div id="block{0}" class="tblock" style="left:{1}%;width:{2}%;"><div class="tback" style="height:{3}px"></div>\n'
+       html_device = '<div id="{0}" title="{1}" class="thread{7}" style="left:{2}%;top:{3}px;height:{4}px;width:{5}%;{8}">{6}</div>\n'
+       html_phase = '<div class="phase" style="left:{0}%;width:{1}%;top:{2}px;height:{3}px;background:{4}">{5}</div>\n'
+       html_phaselet = '<div id="{0}" class="phaselet" style="left:{1}%;width:{2}%;background:{3}"></div>\n'
        def __init__(self, rowheight, scaleheight):
                self.rowH = rowheight
                self.scaleH = scaleheight
-               self.html = {
-                       'header': '',
-                       'timeline': '',
-                       'legend': '',
-               }
+               self.html = ''
+       def createHeader(self, sv, suppress=''):
+               if(not sv.stamp['time']):
+                       return
+               self.html += '<div class="version"><a href="https://01.org/suspendresume">%s v%s</a></div>' \
+                       % (sv.title, sv.version)
+               if sv.logmsg and 'log' not in suppress:
+                       self.html += '<button id="showtest" class="logbtn">log</button>'
+               if sv.addlogs and 'dmesg' not in suppress:
+                       self.html += '<button id="showdmesg" class="logbtn">dmesg</button>'
+               if sv.addlogs and sv.ftracefile and 'ftrace' not in suppress:
+                       self.html += '<button id="showftrace" class="logbtn">ftrace</button>'
+               headline_stamp = '<div class="stamp">{0} {1} {2} {3}</div>\n'
+               self.html += headline_stamp.format(sv.stamp['host'], sv.stamp['kernel'],
+                       sv.stamp['mode'], sv.stamp['time'])
        # Function: getDeviceRows
        # Description:
        #    determine how may rows the device funcs will take
@@ -1880,10 +1917,8 @@ class Timeline:
                                break
                        top += self.rowheight[test][phase][i]
                return top
-       # Function: calcTotalRows
-       # Description:
-       #        Calculate the heights and offsets for the header and rows
        def calcTotalRows(self):
+               # Calculate the heights and offsets for the header and rows
                maxrows = 0
                standardphases = []
                for t in self.rowlines:
@@ -1901,6 +1936,20 @@ class Timeline:
                for t, p in standardphases:
                        for i in sorted(self.rowheight[t][p]):
                                self.rowheight[t][p][i] = self.bodyH/len(self.rowlines[t][p])
+       def createZoomBox(self, mode='command', testcount=1):
+               # Create bounding box, add buttons
+               html_zoombox = '<center><button id="zoomin">ZOOM IN +</button><button id="zoomout">ZOOM OUT -</button><button id="zoomdef">ZOOM 1:1</button></center>\n'
+               html_timeline = '<div id="dmesgzoombox" class="zoombox">\n<div id="{0}" class="timeline" style="height:{1}px">\n'
+               html_devlist1 = '<button id="devlist1" class="devlist" style="float:left;">Device Detail{0}</button>'
+               html_devlist2 = '<button id="devlist2" class="devlist" style="float:right;">Device Detail2</button>\n'
+               if mode != 'command':
+                       if testcount > 1:
+                               self.html += html_devlist2
+                               self.html += html_devlist1.format('1')
+                       else:
+                               self.html += html_devlist1.format('')
+               self.html += html_zoombox
+               self.html += html_timeline.format('dmesg', self.height)
        # Function: createTimeScale
        # Description:
        #        Create the timescale for a timeline block
@@ -1913,7 +1962,7 @@ class Timeline:
        #        The html code needed to display the time scale
        def createTimeScale(self, m0, mMax, tTotal, mode):
                timescale = '<div class="t" style="right:{0}%">{1}</div>\n'
-               rline = '<div class="t" style="left:0;border-left:1px solid black;border-right:0;">Resume</div>\n'
+               rline = '<div class="t" style="left:0;border-left:1px solid black;border-right:0;">{0}</div>\n'
                output = '<div class="timescale">\n'
                # set scale for timeline
                mTotal = mMax - m0
@@ -1926,21 +1975,20 @@ class Timeline:
                divEdge = (mTotal - tS*(divTotal-1))*100/mTotal
                for i in range(divTotal):
                        htmlline = ''
-                       if(mode == 'resume'):
-                               pos = '%0.3f' % (100 - ((float(i)*tS*100)/mTotal))
-                               val = '%0.fms' % (float(i)*tS*1000)
-                               htmlline = timescale.format(pos, val)
-                               if(i == 0):
-                                       htmlline = rline
-                       else:
+                       if(mode == 'suspend'):
                                pos = '%0.3f' % (100 - ((float(i)*tS*100)/mTotal) - divEdge)
                                val = '%0.fms' % (float(i-divTotal+1)*tS*1000)
                                if(i == divTotal - 1):
-                                       val = 'Suspend'
+                                       val = mode
+                               htmlline = timescale.format(pos, val)
+                       else:
+                               pos = '%0.3f' % (100 - ((float(i)*tS*100)/mTotal))
+                               val = '%0.fms' % (float(i)*tS*1000)
                                htmlline = timescale.format(pos, val)
+                               if(i == 0):
+                                       htmlline = rline.format(mode)
                        output += htmlline
-               output += '</div>\n'
-               return output
+               self.html += output+'</div>\n'
 
 # Class: TestProps
 # Description:
@@ -2009,7 +2057,7 @@ class ProcessMonitor:
                                val['kern'] = kern
                        if ujiff > 0 or kjiff > 0:
                                running[pid] = ujiff + kjiff
-               result = process.wait()
+               process.wait()
                out = ''
                for pid in running:
                        jiffies = running[pid]
@@ -2071,26 +2119,6 @@ def parseStamp(line, data):
        if not sysvals.stamp:
                sysvals.stamp = data.stamp
 
-# Function: diffStamp
-# Description:
-#      compare the host, kernel, and mode fields in 3 stamps
-# Arguments:
-#       stamp1: string array with mode, kernel, and host
-#       stamp2: string array with mode, kernel, and host
-# Return:
-#      True if stamps differ, False if they're the same
-def diffStamp(stamp1, stamp2):
-       if 'host' in stamp1 and 'host' in stamp2:
-               if stamp1['host'] != stamp2['host']:
-                       return True
-       if 'kernel' in stamp1 and 'kernel' in stamp2:
-               if stamp1['kernel'] != stamp2['kernel']:
-                       return True
-       if 'mode' in stamp1 and 'mode' in stamp2:
-               if stamp1['mode'] != stamp2['mode']:
-                       return True
-       return False
-
 # Function: doesTraceLogHaveTraceEvents
 # Description:
 #       Quickly determine if the ftrace log has some or all of the trace events
@@ -2722,7 +2750,7 @@ def parseTraceLog():
                        # create blocks for orphan cg data
                        for sortkey in sorted(sortlist):
                                cg = sortlist[sortkey]
-                               name = cg.list[0].name
+                               name = cg.name
                                if sysvals.isCallgraphFunc(name):
                                        vprint('Callgraph found for task %d: %.3fms, %s' % (cg.pid, (cg.end - cg.start)*1000, name))
                                        cg.newActionFromFunction(data)
@@ -3100,149 +3128,154 @@ def parseKernelLog(data):
        data.fixupInitcallsThatDidntReturn()
        return True
 
+def callgraphHTML(sv, hf, num, cg, title, color, devid):
+       html_func_top = '<article id="{0}" class="atop" style="background:{1}">\n<input type="checkbox" class="pf" id="f{2}" checked/><label for="f{2}">{3} {4}</label>\n'
+       html_func_start = '<article>\n<input type="checkbox" class="pf" id="f{0}" checked/><label for="f{0}">{1} {2}</label>\n'
+       html_func_end = '</article>\n'
+       html_func_leaf = '<article>{0} {1}</article>\n'
+
+       cgid = devid
+       if cg.id:
+               cgid += cg.id
+       cglen = (cg.end - cg.start) * 1000
+       if cglen < sv.mincglen:
+               return num
+
+       fmt = '<r>(%.3f ms @ '+sv.timeformat+' to '+sv.timeformat+')</r>'
+       flen = fmt % (cglen, cg.start, cg.end)
+       hf.write(html_func_top.format(cgid, color, num, title, flen))
+       num += 1
+       for line in cg.list:
+               if(line.length < 0.000000001):
+                       flen = ''
+               else:
+                       fmt = '<n>(%.3f ms @ '+sv.timeformat+')</n>'
+                       flen = fmt % (line.length*1000, line.time)
+               if(line.freturn and line.fcall):
+                       hf.write(html_func_leaf.format(line.name, flen))
+               elif(line.freturn):
+                       hf.write(html_func_end)
+               else:
+                       hf.write(html_func_start.format(num, line.name, flen))
+                       num += 1
+       hf.write(html_func_end)
+       return num
+
+def addCallgraphs(sv, hf, data):
+       hf.write('<section id="callgraphs" class="callgraph">\n')
+       # write out the ftrace data converted to html
+       num = 0
+       for p in data.phases:
+               if sv.cgphase and p != sv.cgphase:
+                       continue
+               list = data.dmesg[p]['list']
+               for devname in data.sortedDevices(p):
+                       dev = list[devname]
+                       color = 'white'
+                       if 'color' in data.dmesg[p]:
+                               color = data.dmesg[p]['color']
+                       if 'color' in dev:
+                               color = dev['color']
+                       name = devname
+                       if(devname in sv.devprops):
+                               name = sv.devprops[devname].altName(devname)
+                       if sv.suspendmode in suspendmodename:
+                               name += ' '+p
+                       if('ftrace' in dev):
+                               cg = dev['ftrace']
+                               num = callgraphHTML(sv, hf, num, cg,
+                                       name, color, dev['id'])
+                       if('ftraces' in dev):
+                               for cg in dev['ftraces']:
+                                       num = callgraphHTML(sv, hf, num, cg,
+                                               name+' &rarr; '+cg.name, color, dev['id'])
+
+       hf.write('\n\n    </section>\n')
+
 # Function: createHTMLSummarySimple
 # Description:
 #       Create summary html file for a series of tests
 # Arguments:
 #       testruns: array of Data objects from parseTraceLog
-def createHTMLSummarySimple(testruns, htmlfile):
-       # print out the basic summary of all the tests
-       hf = open(htmlfile, 'w')
-
+def createHTMLSummarySimple(testruns, htmlfile, folder):
        # write the html header first (html head, css code, up to body start)
        html = '<!DOCTYPE html>\n<html>\n<head>\n\
        <meta http-equiv="content-type" content="text/html; charset=UTF-8">\n\
-       <title>AnalyzeSuspend Summary</title>\n\
+       <title>SleepGraph Summary</title>\n\
        <style type=\'text/css\'>\n\
-               body {overflow-y: scroll;}\n\
-               .stamp {width: 100%;text-align:center;background-color:#495E09;line-height:30px;color:white;font: 25px Arial;}\n\
+               .stamp {width: 100%;text-align:center;background:#888;line-height:30px;color:white;font: 25px Arial;}\n\
                table {width:100%;border-collapse: collapse;}\n\
-               .summary {font: 22px Arial;border:1px solid;}\n\
-               th {border: 1px solid black;background-color:#A7C942;color:white;}\n\
-               td {text-align: center;}\n\
-               tr.alt td {background-color:#EAF2D3;}\n\
-               tr.avg td {background-color:#BDE34C;}\n\
-               a:link {color: #90B521;}\n\
-               a:visited {color: #495E09;}\n\
-               a:hover {color: #B1DF28;}\n\
-               a:active {color: #FFFFFF;}\n\
+               .summary {border:1px solid;}\n\
+               th {border: 1px solid black;background:#222;color:white;}\n\
+               td {font: 16px "Times New Roman";text-align: center;}\n\
+               tr.alt td {background:#ddd;}\n\
+               tr.avg td {background:#aaa;}\n\
        </style>\n</head>\n<body>\n'
 
        # group test header
-       count = len(testruns)
-       headline_stamp = '<div class="stamp">{0} {1} {2} {3} ({4} tests)</div>\n'
-       html += headline_stamp.format(sysvals.stamp['host'],
-               sysvals.stamp['kernel'], sysvals.stamp['mode'],
-               sysvals.stamp['time'], count)
-
-       # check to see if all the tests have the same value
-       stampcolumns = False
-       for data in testruns:
-               if diffStamp(sysvals.stamp, data.stamp):
-                       stampcolumns = True
-                       break
-
+       html += '<div class="stamp">%s (%d tests)</div>\n' % (folder, len(testruns))
        th = '\t<th>{0}</th>\n'
        td = '\t<td>{0}</td>\n'
-       tdlink = '\t<td><a href="{0}">Click Here</a></td>\n'
+       tdlink = '\t<td><a href="{0}">html</a></td>\n'
 
        # table header
-       html += '<table class="summary">\n<tr>\n'
-       html += th.format("Test #")
-       if stampcolumns:
-               html += th.format("Hostname")
-               html += th.format("Kernel Version")
-               html += th.format("Suspend Mode")
-       html += th.format("Test Time")
-       html += th.format("Suspend Time")
-       html += th.format("Resume Time")
-       html += th.format("Detail")
-       html += '</tr>\n'
+       html += '<table class="summary">\n<tr>\n' + th.format('#') +\
+               th.format('Mode') + th.format('Host') + th.format('Kernel') +\
+               th.format('Test Time') + th.format('Suspend') + th.format('Resume') +\
+               th.format('Detail') + '</tr>\n'
 
        # test data, 1 row per test
-       sTimeAvg = 0.0
-       rTimeAvg = 0.0
-       num = 1
-       for data in testruns:
-               # data.end is the end of post_resume
-               resumeEnd = data.dmesg['resume_complete']['end']
+       avg = '<tr class="avg"><td></td><td></td><td></td><td></td>'+\
+               '<td>Average of {0} {1} tests</td><td>{2}</td><td>{3}</td><td></td></tr>\n'
+       sTimeAvg = rTimeAvg = 0.0
+       mode = ''
+       num = 0
+       for data in sorted(testruns, key=lambda v:(v['mode'], v['host'], v['kernel'])):
+               if mode != data['mode']:
+                       # test average line
+                       if(num > 0):
+                               sTimeAvg /= (num - 1)
+                               rTimeAvg /= (num - 1)
+                               html += avg.format('%d' % (num - 1), mode,
+                                       '%3.3f ms' % sTimeAvg, '%3.3f ms' % rTimeAvg)
+                       sTimeAvg = rTimeAvg = 0.0
+                       mode = data['mode']
+                       num = 1
+               # alternate row color
                if num % 2 == 1:
                        html += '<tr class="alt">\n'
                else:
                        html += '<tr>\n'
-
-               # test num
-               html += td.format("test %d" % num)
+               html += td.format("%d" % num)
                num += 1
-               if stampcolumns:
-                       # host name
-                       val = "unknown"
-                       if('host' in data.stamp):
-                               val = data.stamp['host']
-                       html += td.format(val)
-                       # host kernel
+               # basic info
+               for item in ['mode', 'host', 'kernel', 'time']:
                        val = "unknown"
-                       if('kernel' in data.stamp):
-                               val = data.stamp['kernel']
+                       if(item in data):
+                               val = data[item]
                        html += td.format(val)
-                       # suspend mode
-                       val = "unknown"
-                       if('mode' in data.stamp):
-                               val = data.stamp['mode']
-                       html += td.format(val)
-               # test time
-               val = "unknown"
-               if('time' in data.stamp):
-                       val = data.stamp['time']
-               html += td.format(val)
                # suspend time
-               sTime = (data.tSuspended - data.start)*1000
+               sTime = float(data['suspend'])
                sTimeAvg += sTime
-               html += td.format("%3.3f ms" % sTime)
+               html += td.format('%.3f ms' % sTime)
                # resume time
-               rTime = (resumeEnd - data.tResumed)*1000
+               rTime = float(data['resume'])
                rTimeAvg += rTime
-               html += td.format("%3.3f ms" % rTime)
+               html += td.format('%.3f ms' % rTime)
                # link to the output html
-               html += tdlink.format(data.outfile)
-
-               html += '</tr>\n'
-
-       # last line: test average
-       if(count > 0):
-               sTimeAvg /= count
-               rTimeAvg /= count
-       html += '<tr class="avg">\n'
-       html += td.format('Average')    # name
-       if stampcolumns:
-               html += td.format('')                   # host
-               html += td.format('')                   # kernel
-               html += td.format('')                   # mode
-       html += td.format('')                   # time
-       html += td.format("%3.3f ms" % sTimeAvg)        # suspend time
-       html += td.format("%3.3f ms" % rTimeAvg)        # resume time
-       html += td.format('')                   # output link
-       html += '</tr>\n'
+               html += tdlink.format(data['url']) + '</tr>\n'
+       # last test average line
+       if(num > 0):
+               sTimeAvg /= (num - 1)
+               rTimeAvg /= (num - 1)
+               html += avg.format('%d' % (num - 1), mode,
+                       '%3.3f ms' % sTimeAvg, '%3.3f ms' % rTimeAvg)
 
        # flush the data to file
-       hf.write(html+'</table>\n')
-       hf.write('</body>\n</html>\n')
+       hf = open(htmlfile, 'w')
+       hf.write(html+'</table>\n</body>\n</html>\n')
        hf.close()
 
-def htmlTitle():
-       modename = {
-               'freeze': 'Freeze (S0)',
-               'standby': 'Standby (S1)',
-               'mem': 'Suspend (S3)',
-               'disk': 'Hibernate (S4)'
-       }
-       kernel = sysvals.stamp['kernel']
-       host = sysvals.hostname[0].upper()+sysvals.hostname[1:]
-       mode = sysvals.suspendmode
-       if sysvals.suspendmode in modename:
-               mode = modename[sysvals.suspendmode]
-       return host+' '+mode+' '+kernel
-
 def ordinal(value):
        suffix = 'th'
        if value < 10 or value > 19:
@@ -3272,24 +3305,11 @@ def createHTML(testruns):
                        kerror = True
                data.normalizeTime(testruns[-1].tSuspended)
 
-       x2changes = ['', 'absolute']
-       if len(testruns) > 1:
-               x2changes = ['1', 'relative']
        # html function templates
-       headline_version = '<div class="version"><a href="https://01.org/suspendresume">AnalyzeSuspend v%s</a></div>' % sysvals.version
-       headline_stamp = '<div class="stamp">{0} {1} {2} {3}</div>\n'
-       html_devlist1 = '<button id="devlist1" class="devlist" style="float:left;">Device Detail%s</button>' % x2changes[0]
-       html_zoombox = '<center><button id="zoomin">ZOOM IN +</button><button id="zoomout">ZOOM OUT -</button><button id="zoomdef">ZOOM 1:1</button></center>\n'
-       html_devlist2 = '<button id="devlist2" class="devlist" style="float:right;">Device Detail2</button>\n'
-       html_timeline = '<div id="dmesgzoombox" class="zoombox">\n<div id="{0}" class="timeline" style="height:{1}px">\n'
-       html_tblock = '<div id="block{0}" class="tblock" style="left:{1}%;width:{2}%;"><div class="tback" style="height:{3}px"></div>\n'
-       html_device = '<div id="{0}" title="{1}" class="thread{7}" style="left:{2}%;top:{3}px;height:{4}px;width:{5}%;{8}">{6}</div>\n'
        html_error = '<div id="{1}" title="kernel error/warning" class="err" style="right:{0}%">ERROR&rarr;</div>\n'
        html_traceevent = '<div title="{0}" class="traceevent{6}" style="left:{1}%;top:{2}px;height:{3}px;width:{4}%;line-height:{3}px;{7}">{5}</div>\n'
        html_cpuexec = '<div class="jiffie" style="left:{0}%;top:{1}px;height:{2}px;width:{3}%;background:{4};"></div>\n'
-       html_phase = '<div class="phase" style="left:{0}%;width:{1}%;top:{2}px;height:{3}px;background-color:{4}">{5}</div>\n'
-       html_phaselet = '<div id="{0}" class="phaselet" style="left:{1}%;width:{2}%;background:{3}"></div>\n'
-       html_legend = '<div id="p{3}" class="square" style="left:{0}%;background-color:{1}">&nbsp;{2}</div>\n'
+       html_legend = '<div id="p{3}" class="square" style="left:{0}%;background:{1}">&nbsp;{2}</div>\n'
        html_timetotal = '<table class="time1">\n<tr>'\
                '<td class="green" title="{3}">{2} Suspend Time: <b>{0} ms</b></td>'\
                '<td class="yellow" title="{4}">{2} Resume Time: <b>{1} ms</b></td>'\
@@ -3311,20 +3331,18 @@ def createHTML(testruns):
                '</tr>\n</table>\n'
 
        # html format variables
-       hoverZ = 'z-index:8;'
-       if sysvals.usedevsrc:
-               hoverZ = ''
        scaleH = 20
-       scaleTH = 20
        if kerror:
                scaleH = 40
-               scaleTH = 60
 
        # device timeline
        vprint('Creating Device Timeline...')
 
        devtl = Timeline(30, scaleH)
 
+       # write the test title and general info header
+       devtl.createHeader(sysvals)
+
        # Generate the header for this timeline
        for data in testruns:
                tTotal = data.end - data.start
@@ -3346,7 +3364,7 @@ def createHTML(testruns):
                        if(len(testruns) > 1):
                                testdesc = ordinal(data.testnumber+1)+' '+testdesc
                        thtml = html_timetotal3.format(run_time, testdesc)
-                       devtl.html['header'] += thtml
+                       devtl.html += thtml
                elif data.fwValid:
                        suspend_time = '%.0f'%(sktime + (data.fwSuspend/1000000.0))
                        resume_time = '%.0f'%(rktime + (data.fwResume/1000000.0))
@@ -3363,10 +3381,10 @@ def createHTML(testruns):
                        else:
                                thtml = html_timetotal2.format(suspend_time, low_time, \
                                        resume_time, testdesc1, stitle, rtitle)
-                       devtl.html['header'] += thtml
+                       devtl.html += thtml
                        sftime = '%.3f'%(data.fwSuspend / 1000000.0)
                        rftime = '%.3f'%(data.fwResume / 1000000.0)
-                       devtl.html['header'] += html_timegroups.format('%.3f'%sktime, \
+                       devtl.html += html_timegroups.format('%.3f'%sktime, \
                                sftime, rftime, '%.3f'%rktime, testdesc2, sysvals.suspendmode)
                else:
                        suspend_time = '%.3f' % sktime
@@ -3382,7 +3400,7 @@ def createHTML(testruns):
                        else:
                                thtml = html_timetotal2.format(suspend_time, low_time, \
                                        resume_time, testdesc, stitle, rtitle)
-                       devtl.html['header'] += thtml
+                       devtl.html += thtml
 
        # time scale for potentially multiple datasets
        t0 = testruns[0].start
@@ -3429,15 +3447,8 @@ def createHTML(testruns):
                        devtl.getPhaseRows(threadlist, devtl.rows)
        devtl.calcTotalRows()
 
-       # create bounding box, add buttons
-       if sysvals.suspendmode != 'command':
-               devtl.html['timeline'] += html_devlist1
-               if len(testruns) > 1:
-                       devtl.html['timeline'] += html_devlist2
-       devtl.html['timeline'] += html_zoombox
-       devtl.html['timeline'] += html_timeline.format('dmesg', devtl.height)
-
        # draw the full timeline
+       devtl.createZoomBox(sysvals.suspendmode, len(testruns))
        phases = {'suspend':[],'resume':[]}
        for phase in data.dmesg:
                if 'resume' in phase:
@@ -3452,37 +3463,36 @@ def createHTML(testruns):
                        # draw suspend and resume blocks separately
                        bname = '%s%d' % (dir[0], data.testnumber)
                        if dir == 'suspend':
-                               m0 = testruns[data.testnumber].start
-                               mMax = testruns[data.testnumber].tSuspended
-                               mTotal = mMax - m0
+                               m0 = data.start
+                               mMax = data.tSuspended
                                left = '%f' % (((m0-t0)*100.0)/tTotal)
                        else:
-                               m0 = testruns[data.testnumber].tSuspended
-                               mMax = testruns[data.testnumber].end
+                               m0 = data.tSuspended
+                               mMax = data.end
                                # in an x2 run, remove any gap between blocks
                                if len(testruns) > 1 and data.testnumber == 0:
                                        mMax = testruns[1].start
-                               mTotal = mMax - m0
                                left = '%f' % ((((m0-t0)*100.0)+sysvals.srgap/2)/tTotal)
+                       mTotal = mMax - m0
                        # if a timeline block is 0 length, skip altogether
                        if mTotal == 0:
                                continue
                        width = '%f' % (((mTotal*100.0)-sysvals.srgap/2)/tTotal)
-                       devtl.html['timeline'] += html_tblock.format(bname, left, width, devtl.scaleH)
+                       devtl.html += devtl.html_tblock.format(bname, left, width, devtl.scaleH)
                        for b in sorted(phases[dir]):
                                # draw the phase color background
                                phase = data.dmesg[b]
                                length = phase['end']-phase['start']
                                left = '%f' % (((phase['start']-m0)*100.0)/mTotal)
                                width = '%f' % ((length*100.0)/mTotal)
-                               devtl.html['timeline'] += html_phase.format(left, width, \
+                               devtl.html += devtl.html_phase.format(left, width, \
                                        '%.3f'%devtl.scaleH, '%.3f'%devtl.bodyH, \
                                        data.dmesg[b]['color'], '')
                        for e in data.errorinfo[dir]:
                                # draw red lines for any kernel errors found
                                t, err = e
                                right = '%f' % (((mMax-t)*100.0)/mTotal)
-                               devtl.html['timeline'] += html_error.format(right, err)
+                               devtl.html += html_error.format(right, err)
                        for b in sorted(phases[dir]):
                                # draw the devices for this phase
                                phaselist = data.dmesg[b]['list']
@@ -3496,7 +3506,7 @@ def createHTML(testruns):
                                        if 'htmlclass' in dev:
                                                xtraclass = dev['htmlclass']
                                        if 'color' in dev:
-                                               xtrastyle = 'background-color:%s;' % dev['color']
+                                               xtrastyle = 'background:%s;' % dev['color']
                                        if(d in sysvals.devprops):
                                                name = sysvals.devprops[d].altName(d)
                                                xtraclass = sysvals.devprops[d].xtraClass()
@@ -3521,7 +3531,7 @@ def createHTML(testruns):
                                                        title += 'post_resume_process'
                                        else:
                                                title += b
-                                       devtl.html['timeline'] += html_device.format(dev['id'], \
+                                       devtl.html += devtl.html_device.format(dev['id'], \
                                                title, left, top, '%.3f'%rowheight, width, \
                                                d+drv, xtraclass, xtrastyle)
                                        if('cpuexec' in dev):
@@ -3535,7 +3545,7 @@ def createHTML(testruns):
                                                        left = '%f' % (((start-m0)*100)/mTotal)
                                                        width = '%f' % ((end-start)*100/mTotal)
                                                        color = 'rgba(255, 0, 0, %f)' % j
-                                                       devtl.html['timeline'] += \
+                                                       devtl.html += \
                                                                html_cpuexec.format(left, top, height, width, color)
                                        if('src' not in dev):
                                                continue
@@ -3548,20 +3558,20 @@ def createHTML(testruns):
                                                xtrastyle = ''
                                                if e.color:
                                                        xtrastyle = 'background:%s;' % e.color
-                                               devtl.html['timeline'] += \
+                                               devtl.html += \
                                                        html_traceevent.format(e.title(), \
                                                                left, top, height, width, e.text(), '', xtrastyle)
                        # draw the time scale, try to make the number of labels readable
-                       devtl.html['timeline'] += devtl.createTimeScale(m0, mMax, tTotal, dir)
-                       devtl.html['timeline'] += '</div>\n'
+                       devtl.createTimeScale(m0, mMax, tTotal, dir)
+                       devtl.html += '</div>\n'
 
        # timeline is finished
-       devtl.html['timeline'] += '</div>\n</div>\n'
+       devtl.html += '</div>\n</div>\n'
 
        # draw a legend which describes the phases by color
        if sysvals.suspendmode != 'command':
                data = testruns[-1]
-               devtl.html['legend'] = '<div class="legend">\n'
+               devtl.html += '<div class="legend">\n'
                pdelta = 100.0/len(data.phases)
                pmargin = pdelta / 4.0
                for phase in data.phases:
@@ -3571,127 +3581,41 @@ def createHTML(testruns):
                                id += tmp[1][0]
                        order = '%.2f' % ((data.dmesg[phase]['order'] * pdelta) + pmargin)
                        name = string.replace(phase, '_', ' &nbsp;')
-                       devtl.html['legend'] += html_legend.format(order, \
+                       devtl.html += html_legend.format(order, \
                                data.dmesg[phase]['color'], name, id)
-               devtl.html['legend'] += '</div>\n'
+               devtl.html += '</div>\n'
 
        hf = open(sysvals.htmlfile, 'w')
 
-       if not sysvals.cgexp:
-               cgchk = 'checked'
-               cgnchk = 'not(:checked)'
-       else:
-               cgchk = 'not(:checked)'
-               cgnchk = 'checked'
-
-       # write the html header first (html head, css code, up to body start)
-       html_header = '<!DOCTYPE html>\n<html>\n<head>\n\
-       <meta http-equiv="content-type" content="text/html; charset=UTF-8">\n\
-       <title>'+htmlTitle()+'</title>\n\
-       <style type=\'text/css\'>\n\
-               body {overflow-y:scroll;}\n\
-               .stamp {width:100%;text-align:center;background-color:gray;line-height:30px;color:white;font:25px Arial;}\n\
-               .callgraph {margin-top:30px;box-shadow:5px 5px 20px black;}\n\
-               .callgraph article * {padding-left:28px;}\n\
-               h1 {color:black;font:bold 30px Times;}\n\
-               t0 {color:black;font:bold 30px Times;}\n\
-               t1 {color:black;font:30px Times;}\n\
-               t2 {color:black;font:25px Times;}\n\
-               t3 {color:black;font:20px Times;white-space:nowrap;}\n\
-               t4 {color:black;font:bold 30px Times;line-height:60px;white-space:nowrap;}\n\
-               cS {font:bold 13px Times;}\n\
-               table {width:100%;}\n\
-               .gray {background-color:rgba(80,80,80,0.1);}\n\
-               .green {background-color:rgba(204,255,204,0.4);}\n\
-               .purple {background-color:rgba(128,0,128,0.2);}\n\
-               .yellow {background-color:rgba(255,255,204,0.4);}\n\
-               .time1 {font:22px Arial;border:1px solid;}\n\
-               .time2 {font:15px Arial;border-bottom:1px solid;border-left:1px solid;border-right:1px solid;}\n\
-               td {text-align:center;}\n\
-               r {color:#500000;font:15px Tahoma;}\n\
-               n {color:#505050;font:15px Tahoma;}\n\
-               .tdhl {color:red;}\n\
-               .hide {display:none;}\n\
-               .pf {display:none;}\n\
-               .pf:'+cgchk+' + label {background:url(\'data:image/svg+xml;utf,<?xml version="1.0" standalone="no"?><svg xmlns="http://www.w3.org/2000/svg" height="18" width="18" version="1.1"><circle cx="9" cy="9" r="8" stroke="black" stroke-width="1" fill="white"/><rect x="4" y="8" width="10" height="2" style="fill:black;stroke-width:0"/><rect x="8" y="4" width="2" height="10" style="fill:black;stroke-width:0"/></svg>\') no-repeat left center;}\n\
-               .pf:'+cgnchk+' ~ label {background:url(\'data:image/svg+xml;utf,<?xml version="1.0" standalone="no"?><svg xmlns="http://www.w3.org/2000/svg" height="18" width="18" version="1.1"><circle cx="9" cy="9" r="8" stroke="black" stroke-width="1" fill="white"/><rect x="4" y="8" width="10" height="2" style="fill:black;stroke-width:0"/></svg>\') no-repeat left center;}\n\
-               .pf:'+cgchk+' ~ *:not(:nth-child(2)) {display:none;}\n\
-               .zoombox {position:relative;width:100%;overflow-x:scroll;-webkit-user-select:none;-moz-user-select:none;user-select:none;}\n\
-               .timeline {position:relative;font-size:14px;cursor:pointer;width:100%; overflow:hidden;background:linear-gradient(#cccccc, white);}\n\
-               .thread {position:absolute;height:0%;overflow:hidden;z-index:7;line-height:30px;font-size:14px;border:1px solid;text-align:center;white-space:nowrap;}\n\
-               .thread.ps {border-radius:3px;background:linear-gradient(to top, #ccc, #eee);}\n\
-               .thread:hover {background-color:white;border:1px solid red;'+hoverZ+'}\n\
-               .thread.sec,.thread.sec:hover {background-color:black;border:0;color:white;line-height:15px;font-size:10px;}\n\
-               .hover {background-color:white;border:1px solid red;'+hoverZ+'}\n\
-               .hover.sync {background-color:white;}\n\
-               .hover.bg,.hover.kth,.hover.sync,.hover.ps {background-color:white;}\n\
-               .jiffie {position:absolute;pointer-events: none;z-index:8;}\n\
-               .traceevent {position:absolute;font-size:10px;z-index:7;overflow:hidden;color:black;text-align:center;white-space:nowrap;border-radius:5px;border:1px solid black;background:linear-gradient(to bottom right,#CCC,#969696);}\n\
-               .traceevent:hover {color:white;font-weight:bold;border:1px solid white;}\n\
-               .phase {position:absolute;overflow:hidden;border:0px;text-align:center;}\n\
-               .phaselet {position:absolute;overflow:hidden;border:0px;text-align:center;height:100px;font-size:24px;}\n\
-               .t {position:absolute;line-height:'+('%d'%scaleTH)+'px;pointer-events:none;top:0;height:100%;border-right:1px solid black;z-index:6;}\n\
-               .err {position:absolute;top:0%;height:100%;border-right:3px solid red;color:red;font:bold 14px Times;line-height:18px;}\n\
-               .legend {position:relative; width:100%; height:40px; text-align:center;margin-bottom:20px}\n\
-               .legend .square {position:absolute;cursor:pointer;top:10px; width:0px;height:20px;border:1px solid;padding-left:20px;}\n\
-               button {height:40px;width:200px;margin-bottom:20px;margin-top:20px;font-size:24px;}\n\
-               .logbtn {position:relative;float:right;height:25px;width:50px;margin-top:3px;margin-bottom:0;font-size:10px;text-align:center;}\n\
-               .devlist {position:'+x2changes[1]+';width:190px;}\n\
-               a:link {color:white;text-decoration:none;}\n\
-               a:visited {color:white;}\n\
-               a:hover {color:white;}\n\
-               a:active {color:white;}\n\
-               .version {position:relative;float:left;color:white;font-size:10px;line-height:30px;margin-left:10px;}\n\
-               #devicedetail {height:100px;box-shadow:5px 5px 20px black;}\n\
-               .tblock {position:absolute;height:100%;background-color:#ddd;}\n\
-               .tback {position:absolute;width:100%;background:linear-gradient(#ccc, #ddd);}\n\
-               .bg {z-index:1;}\n\
-       </style>\n</head>\n<body>\n'
-
        # no header or css if its embedded
        if(sysvals.embedded):
                hf.write('pass True tSus %.3f tRes %.3f tLow %.3f fwvalid %s tSus %.3f tRes %.3f\n' %
                        (data.tSuspended-data.start, data.end-data.tSuspended, data.tLow, data.fwValid, \
                                data.fwSuspend/1000000, data.fwResume/1000000))
        else:
-               hf.write(html_header)
-
-       # write the test title and general info header
-       if(sysvals.stamp['time'] != ""):
-               hf.write(headline_version)
-               if sysvals.logmsg:
-                       hf.write('<button id="showtest" class="logbtn">log</button>')
-               if sysvals.addlogs and sysvals.dmesgfile:
-                       hf.write('<button id="showdmesg" class="logbtn">dmesg</button>')
-               if sysvals.addlogs and sysvals.ftracefile:
-                       hf.write('<button id="showftrace" class="logbtn">ftrace</button>')
-               hf.write(headline_stamp.format(sysvals.stamp['host'],
-                       sysvals.stamp['kernel'], sysvals.stamp['mode'], \
-                               sysvals.stamp['time']))
+               addCSS(hf, sysvals, len(testruns), kerror)
 
        # write the device timeline
-       hf.write(devtl.html['header'])
-       hf.write(devtl.html['timeline'])
-       hf.write(devtl.html['legend'])
+       hf.write(devtl.html)
        hf.write('<div id="devicedetailtitle"></div>\n')
        hf.write('<div id="devicedetail" style="display:none;">\n')
        # draw the colored boxes for the device detail section
        for data in testruns:
                hf.write('<div id="devicedetail%d">\n' % data.testnumber)
                pscolor = 'linear-gradient(to top left, #ccc, #eee)'
-               hf.write(html_phaselet.format('pre_suspend_process', \
+               hf.write(devtl.html_phaselet.format('pre_suspend_process', \
                        '0', '0', pscolor))
                for b in data.phases:
                        phase = data.dmesg[b]
                        length = phase['end']-phase['start']
                        left = '%.3f' % (((phase['start']-t0)*100.0)/tTotal)
                        width = '%.3f' % ((length*100.0)/tTotal)
-                       hf.write(html_phaselet.format(b, left, width, \
+                       hf.write(devtl.html_phaselet.format(b, left, width, \
                                data.dmesg[b]['color']))
-               hf.write(html_phaselet.format('post_resume_process', \
+               hf.write(devtl.html_phaselet.format('post_resume_process', \
                        '0', '0', pscolor))
                if sysvals.suspendmode == 'command':
-                       hf.write(html_phaselet.format('cmdexec', '0', '0', pscolor))
+                       hf.write(devtl.html_phaselet.format('cmdexec', '0', '0', pscolor))
                hf.write('</div>\n')
        hf.write('</div>\n')
 
@@ -3701,52 +3625,7 @@ def createHTML(testruns):
        else:
                data = testruns[-1]
        if(sysvals.usecallgraph and not sysvals.embedded):
-               hf.write('<section id="callgraphs" class="callgraph">\n')
-               # write out the ftrace data converted to html
-               html_func_top = '<article id="{0}" class="atop" style="background-color:{1}">\n<input type="checkbox" class="pf" id="f{2}" checked/><label for="f{2}">{3} {4}</label>\n'
-               html_func_start = '<article>\n<input type="checkbox" class="pf" id="f{0}" checked/><label for="f{0}">{1} {2}</label>\n'
-               html_func_end = '</article>\n'
-               html_func_leaf = '<article>{0} {1}</article>\n'
-               num = 0
-               for p in data.phases:
-                       if sysvals.cgphase and p != sysvals.cgphase:
-                               continue
-                       list = data.dmesg[p]['list']
-                       for devname in data.sortedDevices(p):
-                               if('ftrace' not in list[devname]):
-                                       continue
-                               devid = list[devname]['id']
-                               cg = list[devname]['ftrace']
-                               clen = (cg.end - cg.start) * 1000
-                               if clen < sysvals.mincglen:
-                                       continue
-                               fmt = '<r>(%.3f ms @ '+sysvals.timeformat+' to '+sysvals.timeformat+')</r>'
-                               flen = fmt % (clen, cg.start, cg.end)
-                               name = devname
-                               if(devname in sysvals.devprops):
-                                       name = sysvals.devprops[devname].altName(devname)
-                               if sysvals.suspendmode == 'command':
-                                       ftitle = name
-                               else:
-                                       ftitle = name+' '+p
-                               hf.write(html_func_top.format(devid, data.dmesg[p]['color'], \
-                                       num, ftitle, flen))
-                               num += 1
-                               for line in cg.list:
-                                       if(line.length < 0.000000001):
-                                               flen = ''
-                                       else:
-                                               fmt = '<n>(%.3f ms @ '+sysvals.timeformat+')</n>'
-                                               flen = fmt % (line.length*1000, line.time)
-                                       if(line.freturn and line.fcall):
-                                               hf.write(html_func_leaf.format(line.name, flen))
-                                       elif(line.freturn):
-                                               hf.write(html_func_end)
-                                       else:
-                                               hf.write(html_func_start.format(num, line.name, flen))
-                                               num += 1
-                               hf.write(html_func_end)
-               hf.write('\n\n    </section>\n')
+               addCallgraphs(sysvals, hf, data)
 
        # add the test log as a hidden div
        if sysvals.logmsg:
@@ -3788,6 +3667,100 @@ def createHTML(testruns):
        hf.close()
        return True
 
+def addCSS(hf, sv, testcount=1, kerror=False, extra=''):
+       kernel = sv.stamp['kernel']
+       host = sv.hostname[0].upper()+sv.hostname[1:]
+       mode = sv.suspendmode
+       if sv.suspendmode in suspendmodename:
+               mode = suspendmodename[sv.suspendmode]
+       title = host+' '+mode+' '+kernel
+
+       # various format changes by flags
+       cgchk = 'checked'
+       cgnchk = 'not(:checked)'
+       if sv.cgexp:
+               cgchk = 'not(:checked)'
+               cgnchk = 'checked'
+
+       hoverZ = 'z-index:8;'
+       if sv.usedevsrc:
+               hoverZ = ''
+
+       devlistpos = 'absolute'
+       if testcount > 1:
+               devlistpos = 'relative'
+
+       scaleTH = 20
+       if kerror:
+               scaleTH = 60
+
+       # write the html header first (html head, css code, up to body start)
+       html_header = '<!DOCTYPE html>\n<html>\n<head>\n\
+       <meta http-equiv="content-type" content="text/html; charset=UTF-8">\n\
+       <title>'+title+'</title>\n\
+       <style type=\'text/css\'>\n\
+               body {overflow-y:scroll;}\n\
+               .stamp {width:100%;text-align:center;background:gray;line-height:30px;color:white;font:25px Arial;}\n\
+               .callgraph {margin-top:30px;box-shadow:5px 5px 20px black;}\n\
+               .callgraph article * {padding-left:28px;}\n\
+               h1 {color:black;font:bold 30px Times;}\n\
+               t0 {color:black;font:bold 30px Times;}\n\
+               t1 {color:black;font:30px Times;}\n\
+               t2 {color:black;font:25px Times;}\n\
+               t3 {color:black;font:20px Times;white-space:nowrap;}\n\
+               t4 {color:black;font:bold 30px Times;line-height:60px;white-space:nowrap;}\n\
+               cS {font:bold 13px Times;}\n\
+               table {width:100%;}\n\
+               .gray {background:rgba(80,80,80,0.1);}\n\
+               .green {background:rgba(204,255,204,0.4);}\n\
+               .purple {background:rgba(128,0,128,0.2);}\n\
+               .yellow {background:rgba(255,255,204,0.4);}\n\
+               .blue {background:rgba(169,208,245,0.4);}\n\
+               .time1 {font:22px Arial;border:1px solid;}\n\
+               .time2 {font:15px Arial;border-bottom:1px solid;border-left:1px solid;border-right:1px solid;}\n\
+               td {text-align:center;}\n\
+               r {color:#500000;font:15px Tahoma;}\n\
+               n {color:#505050;font:15px Tahoma;}\n\
+               .tdhl {color:red;}\n\
+               .hide {display:none;}\n\
+               .pf {display:none;}\n\
+               .pf:'+cgchk+' + label {background:url(\'data:image/svg+xml;utf,<?xml version="1.0" standalone="no"?><svg xmlns="http://www.w3.org/2000/svg" height="18" width="18" version="1.1"><circle cx="9" cy="9" r="8" stroke="black" stroke-width="1" fill="white"/><rect x="4" y="8" width="10" height="2" style="fill:black;stroke-width:0"/><rect x="8" y="4" width="2" height="10" style="fill:black;stroke-width:0"/></svg>\') no-repeat left center;}\n\
+               .pf:'+cgnchk+' ~ label {background:url(\'data:image/svg+xml;utf,<?xml version="1.0" standalone="no"?><svg xmlns="http://www.w3.org/2000/svg" height="18" width="18" version="1.1"><circle cx="9" cy="9" r="8" stroke="black" stroke-width="1" fill="white"/><rect x="4" y="8" width="10" height="2" style="fill:black;stroke-width:0"/></svg>\') no-repeat left center;}\n\
+               .pf:'+cgchk+' ~ *:not(:nth-child(2)) {display:none;}\n\
+               .zoombox {position:relative;width:100%;overflow-x:scroll;-webkit-user-select:none;-moz-user-select:none;user-select:none;}\n\
+               .timeline {position:relative;font-size:14px;cursor:pointer;width:100%; overflow:hidden;background:linear-gradient(#cccccc, white);}\n\
+               .thread {position:absolute;height:0%;overflow:hidden;z-index:7;line-height:30px;font-size:14px;border:1px solid;text-align:center;white-space:nowrap;}\n\
+               .thread.ps {border-radius:3px;background:linear-gradient(to top, #ccc, #eee);}\n\
+               .thread:hover {background:white;border:1px solid red;'+hoverZ+'}\n\
+               .thread.sec,.thread.sec:hover {background:black;border:0;color:white;line-height:15px;font-size:10px;}\n\
+               .hover {background:white;border:1px solid red;'+hoverZ+'}\n\
+               .hover.sync {background:white;}\n\
+               .hover.bg,.hover.kth,.hover.sync,.hover.ps {background:white;}\n\
+               .jiffie {position:absolute;pointer-events: none;z-index:8;}\n\
+               .traceevent {position:absolute;font-size:10px;z-index:7;overflow:hidden;color:black;text-align:center;white-space:nowrap;border-radius:5px;border:1px solid black;background:linear-gradient(to bottom right,#CCC,#969696);}\n\
+               .traceevent:hover {color:white;font-weight:bold;border:1px solid white;}\n\
+               .phase {position:absolute;overflow:hidden;border:0px;text-align:center;}\n\
+               .phaselet {float:left;overflow:hidden;border:0px;text-align:center;min-height:100px;font-size:24px;}\n\
+               .t {position:absolute;line-height:'+('%d'%scaleTH)+'px;pointer-events:none;top:0;height:100%;border-right:1px solid black;z-index:6;}\n\
+               .err {position:absolute;top:0%;height:100%;border-right:3px solid red;color:red;font:bold 14px Times;line-height:18px;}\n\
+               .legend {position:relative; width:100%; height:40px; text-align:center;margin-bottom:20px}\n\
+               .legend .square {position:absolute;cursor:pointer;top:10px; width:0px;height:20px;border:1px solid;padding-left:20px;}\n\
+               button {height:40px;width:200px;margin-bottom:20px;margin-top:20px;font-size:24px;}\n\
+               .logbtn {position:relative;float:right;height:25px;width:50px;margin-top:3px;margin-bottom:0;font-size:10px;text-align:center;}\n\
+               .devlist {position:'+devlistpos+';width:190px;}\n\
+               a:link {color:white;text-decoration:none;}\n\
+               a:visited {color:white;}\n\
+               a:hover {color:white;}\n\
+               a:active {color:white;}\n\
+               .version {position:relative;float:left;color:white;font-size:10px;line-height:30px;margin-left:10px;}\n\
+               #devicedetail {min-height:100px;box-shadow:5px 5px 20px black;}\n\
+               .tblock {position:absolute;height:100%;background:#ddd;}\n\
+               .tback {position:absolute;width:100%;background:linear-gradient(#ccc, #ddd);}\n\
+               .bg {z-index:1;}\n\
+'+extra+'\
+       </style>\n</head>\n<body>\n'
+       hf.write(html_header)
+
 # Function: addScriptCode
 # Description:
 #       Adds the javascript code to the output html
@@ -3809,7 +3782,7 @@ def addScriptCode(hf, testruns):
        '       var resolution = -1;\n'\
        '       var dragval = [0, 0];\n'\
        '       function redrawTimescale(t0, tMax, tS) {\n'\
-       '               var rline = \'<div class="t" style="left:0;border-left:1px solid black;border-right:0;"><cS>&larr;R</cS></div>\';\n'\
+       '               var rline = \'<div class="t" style="left:0;border-left:1px solid black;border-right:0;">\';\n'\
        '               var tTotal = tMax - t0;\n'\
        '               var list = document.getElementsByClassName("tblock");\n'\
        '               for (var i = 0; i < list.length; i++) {\n'\
@@ -3824,19 +3797,23 @@ def addScriptCode(hf, testruns):
        '                       var pos = 0.0, val = 0.0;\n'\
        '                       for (var j = 0; j < divTotal; j++) {\n'\
        '                               var htmlline = "";\n'\
-       '                               if(list[i].id[5] == "r") {\n'\
-       '                                       pos = 100 - (((j)*tS*100)/mTotal);\n'\
-       '                                       val = (j)*tS;\n'\
-       '                                       htmlline = \'<div class="t" style="right:\'+pos+\'%">\'+val+\'ms</div>\';\n'\
-       '                                       if(j == 0)\n'\
-       '                                               htmlline = rline;\n'\
-       '                               } else {\n'\
+       '                               var mode = list[i].id[5];\n'\
+       '                               if(mode == "s") {\n'\
        '                                       pos = 100 - (((j)*tS*100)/mTotal) - divEdge;\n'\
        '                                       val = (j-divTotal+1)*tS;\n'\
        '                                       if(j == divTotal - 1)\n'\
        '                                               htmlline = \'<div class="t" style="right:\'+pos+\'%"><cS>S&rarr;</cS></div>\';\n'\
        '                                       else\n'\
        '                                               htmlline = \'<div class="t" style="right:\'+pos+\'%">\'+val+\'ms</div>\';\n'\
+       '                               } else {\n'\
+       '                                       pos = 100 - (((j)*tS*100)/mTotal);\n'\
+       '                                       val = (j)*tS;\n'\
+       '                                       htmlline = \'<div class="t" style="right:\'+pos+\'%">\'+val+\'ms</div>\';\n'\
+       '                                       if(j == 0)\n'\
+       '                                               if(mode == "r")\n'\
+       '                                                       htmlline = rline+"<cS>&larr;R</cS></div>";\n'\
+       '                                               else\n'\
+       '                                                       htmlline = rline+"<cS>0ms</div>";\n'\
        '                               }\n'\
        '                               html += htmlline;\n'\
        '                       }\n'\
@@ -4002,12 +3979,80 @@ def addScriptCode(hf, testruns):
        '                               }\n'\
        '                       }\n'\
        '               }\n'\
+       '               if(typeof devstats !== \'undefined\')\n'\
+       '                       callDetail(this.id, this.title);\n'\
        '               var cglist = document.getElementById("callgraphs");\n'\
        '               if(!cglist) return;\n'\
        '               var cg = cglist.getElementsByClassName("atop");\n'\
        '               if(cg.length < 10) return;\n'\
        '               for (var i = 0; i < cg.length; i++) {\n'\
-       '                       if(idlist.indexOf(cg[i].id) >= 0) {\n'\
+       '                       cgid = cg[i].id.split("x")[0]\n'\
+       '                       if(idlist.indexOf(cgid) >= 0) {\n'\
+       '                               cg[i].style.display = "block";\n'\
+       '                       } else {\n'\
+       '                               cg[i].style.display = "none";\n'\
+       '                       }\n'\
+       '               }\n'\
+       '       }\n'\
+       '       function callDetail(devid, devtitle) {\n'\
+       '               if(!(devid in devstats) || devstats[devid].length < 1)\n'\
+       '                       return;\n'\
+       '               var list = devstats[devid];\n'\
+       '               var tmp = devtitle.split(" ");\n'\
+       '               var name = tmp[0], phase = tmp[tmp.length-1];\n'\
+       '               var dd = document.getElementById(phase);\n'\
+       '               var total = parseFloat(tmp[1].slice(1));\n'\
+       '               var mlist = [];\n'\
+       '               var maxlen = 0;\n'\
+       '               var info = []\n'\
+       '               for(var i in list) {\n'\
+       '                       if(list[i][0] == "@") {\n'\
+       '                               info = list[i].split("|");\n'\
+       '                               continue;\n'\
+       '                       }\n'\
+       '                       var tmp = list[i].split("|");\n'\
+       '                       var t = parseFloat(tmp[0]), f = tmp[1], c = parseInt(tmp[2]);\n'\
+       '                       var p = (t*100.0/total).toFixed(2);\n'\
+       '                       mlist[mlist.length] = [f, c, t.toFixed(2), p+"%"];\n'\
+       '                       if(f.length > maxlen)\n'\
+       '                               maxlen = f.length;\n'\
+       '               }\n'\
+       '               var pad = 5;\n'\
+       '               if(mlist.length == 0) pad = 30;\n'\
+       '               var html = \'<div style="padding-top:\'+pad+\'px"><t3> <b>\'+name+\':</b>\';\n'\
+       '               if(info.length > 2)\n'\
+       '                       html += " start=<b>"+info[1]+"</b>, end=<b>"+info[2]+"</b>";\n'\
+       '               if(info.length > 3)\n'\
+       '                       html += ", length<i>(w/o overhead)</i>=<b>"+info[3]+" ms</b>";\n'\
+       '               if(info.length > 4)\n'\
+       '                       html += ", return=<b>"+info[4]+"</b>";\n'\
+       '               html += "</t3></div>";\n'\
+       '               if(mlist.length > 0) {\n'\
+       '                       html += \'<table class=fstat style="padding-top:\'+(maxlen*5)+\'px;"><tr><th>Function</th>\';\n'\
+       '                       for(var i in mlist)\n'\
+       '                               html += "<td class=vt>"+mlist[i][0]+"</td>";\n'\
+       '                       html += "</tr><tr><th>Calls</th>";\n'\
+       '                       for(var i in mlist)\n'\
+       '                               html += "<td>"+mlist[i][1]+"</td>";\n'\
+       '                       html += "</tr><tr><th>Time(ms)</th>";\n'\
+       '                       for(var i in mlist)\n'\
+       '                               html += "<td>"+mlist[i][2]+"</td>";\n'\
+       '                       html += "</tr><tr><th>Percent</th>";\n'\
+       '                       for(var i in mlist)\n'\
+       '                               html += "<td>"+mlist[i][3]+"</td>";\n'\
+       '                       html += "</tr></table>";\n'\
+       '               }\n'\
+       '               dd.innerHTML = html;\n'\
+       '               var height = (maxlen*5)+100;\n'\
+       '               dd.style.height = height+"px";\n'\
+       '               document.getElementById("devicedetail").style.height = height+"px";\n'\
+       '       }\n'\
+       '       function callSelect() {\n'\
+       '               var cglist = document.getElementById("callgraphs");\n'\
+       '               if(!cglist) return;\n'\
+       '               var cg = cglist.getElementsByClassName("atop");\n'\
+       '               for (var i = 0; i < cg.length; i++) {\n'\
+       '                       if(this.id == cg[i].id) {\n'\
        '                               cg[i].style.display = "block";\n'\
        '                       } else {\n'\
        '                               cg[i].style.display = "none";\n'\
@@ -4093,6 +4138,9 @@ def addScriptCode(hf, testruns):
        '                       dev[i].onmouseover = deviceHover;\n'\
        '                       dev[i].onmouseout = deviceUnhover;\n'\
        '               }\n'\
+       '               var dev = dmesg.getElementsByClassName("srccall");\n'\
+       '               for (var i = 0; i < dev.length; i++)\n'\
+       '                       dev[i].onclick = callSelect;\n'\
        '               zoomTimeline();\n'\
        '       });\n'\
        '</script>\n'
@@ -4675,7 +4723,7 @@ def rootCheck(fatal):
        if(os.access(sysvals.powerfile, os.W_OK)):
                return True
        if fatal:
-               doError('This command must be run as root')
+               doError('This command requires sysfs mount and root access')
        return False
 
 # Function: getArgInt
@@ -4767,51 +4815,62 @@ def runTest(subdir, testpath=''):
                cmd = 'chown -R {0}:{0} {1} > /dev/null 2>&1'
                call(cmd.format(os.environ['SUDO_USER'], sysvals.testdir), shell=True)
 
+def find_in_html(html, strs, div=False):
+       for str in strs:
+               l = len(str)
+               i = html.find(str)
+               if i >= 0:
+                       break
+       if i < 0:
+               return ''
+       if not div:
+               return re.search(r'[-+]?\d*\.\d+|\d+', html[i+l:i+l+50]).group()
+       n = html[i+l:].find('</div>')
+       if n < 0:
+               return ''
+       return html[i+l:i+l+n]
+
 # Function: runSummary
 # Description:
 #       create a summary of tests in a sub-directory
-def runSummary(subdir, output):
-       # get a list of ftrace output files
-       files = []
+def runSummary(subdir, local=True):
+       inpath = os.path.abspath(subdir)
+       outpath = inpath
+       if local:
+               outpath = os.path.abspath('.')
+       print('Generating a summary of folder "%s"' % inpath)
+       testruns = []
        for dirname, dirnames, filenames in os.walk(subdir):
                for filename in filenames:
-                       if(re.match('.*_ftrace.txt', filename)):
-                               files.append("%s/%s" % (dirname, filename))
-
-       # process the files in order and get an array of data objects
-       testruns = []
-       for file in sorted(files):
-               if output:
-                       print("Test found in %s" % os.path.dirname(file))
-               sysvals.ftracefile = file
-               sysvals.dmesgfile = file.replace('_ftrace.txt', '_dmesg.txt')
-               doesTraceLogHaveTraceEvents()
-               sysvals.usecallgraph = False
-               if not sysvals.usetraceeventsonly:
-                       if(not os.path.exists(sysvals.dmesgfile)):
-                               print("Skipping %s: not a valid test input" % file)
+                       if(not re.match('.*.html', filename)):
                                continue
-                       else:
-                               if output:
-                                       f = os.path.basename(sysvals.ftracefile)
-                                       d = os.path.basename(sysvals.dmesgfile)
-                                       print("\tInput files: %s and %s" % (f, d))
-                               testdata = loadKernelLog()
-                               data = testdata[0]
-                               parseKernelLog(data)
-                               testdata = [data]
-                               appendIncompleteTraceLog(testdata)
-               else:
-                       if output:
-                               print("\tInput file: %s" % os.path.basename(sysvals.ftracefile))
-                       testdata = parseTraceLog()
-                       data = testdata[0]
-               data.normalizeTime(data.tSuspended)
-               link = file.replace(subdir+'/', '').replace('_ftrace.txt', '.html')
-               data.outfile = link
-               testruns.append(data)
-
-       createHTMLSummarySimple(testruns, subdir+'/summary.html')
+                       file = os.path.join(dirname, filename)
+                       html = open(file, 'r').read(10000)
+                       suspend = find_in_html(html,
+                               ['Kernel Suspend: ', 'Kernel Suspend Time: '])
+                       resume = find_in_html(html,
+                               ['Kernel Resume: ', 'Kernel Resume Time: '])
+                       line = find_in_html(html, ['<div class="stamp">'], True)
+                       stmp = line.split()
+                       if not suspend or not resume or len(stmp) < 4:
+                               continue
+                       data = {
+                               'host': stmp[0],
+                               'kernel': stmp[1],
+                               'mode': stmp[2],
+                               'time': string.join(stmp[3:], ' '),
+                               'suspend': suspend,
+                               'resume': resume,
+                               'url': os.path.relpath(file, outpath),
+                       }
+                       if len(stmp) == 7:
+                               data['kernel'] = 'unknown'
+                               data['mode'] = stmp[1]
+                               data['time'] = string.join(stmp[2:], ' ')
+                       testruns.append(data)
+       outfile = os.path.join(outpath, 'summary.html')
+       print('Summary file: %s' % outfile)
+       createHTMLSummarySimple(testruns, outfile, inpath)
 
 # Function: checkArgBool
 # Description:
@@ -4869,9 +4928,14 @@ def configFromFile(file):
                                sysvals.predelay = getArgInt('-predelay', value, 0, 60000, False)
                        elif(opt.lower() == 'postdelay'):
                                sysvals.postdelay = getArgInt('-postdelay', value, 0, 60000, False)
+                       elif(opt.lower() == 'maxdepth'):
+                               sysvals.max_graph_depth = getArgInt('-maxdepth', value, 0, 1000, False)
                        elif(opt.lower() == 'rtcwake'):
-                               sysvals.rtcwake = True
-                               sysvals.rtcwaketime = getArgInt('-rtcwake', value, 0, 3600, False)
+                               if value.lower() == 'off':
+                                       sysvals.rtcwake = False
+                               else:
+                                       sysvals.rtcwake = True
+                                       sysvals.rtcwaketime = getArgInt('-rtcwake', value, 0, 3600, False)
                        elif(opt.lower() == 'timeprec'):
                                sysvals.setPrecision(getArgInt('-timeprec', value, 0, 6, False))
                        elif(opt.lower() == 'mindev'):
@@ -4969,8 +5033,8 @@ def printHelp():
        modes = getModes()
 
        print('')
-       print('AnalyzeSuspend v%s' % sysvals.version)
-       print('Usage: sudo analyze_suspend.py <options>')
+       print('%s v%s' % (sysvals.title, sysvals.version))
+       print('Usage: sudo sleepgraph <options> <commands>')
        print('')
        print('Description:')
        print('  This tool is designed to assist kernel and OS developers in optimizing')
@@ -4981,22 +5045,22 @@ def printHelp():
        print('  a detailed view of which devices/subsystems are taking the most')
        print('  time in suspend/resume.')
        print('')
+       print('  If no specific command is given, the default behavior is to initiate')
+       print('  a suspend/resume and capture the dmesg/ftrace output as an html timeline.')
+       print('')
        print('  Generates output files in subdirectory: suspend-mmddyy-HHMMSS')
        print('   HTML output:                    <hostname>_<mode>.html')
        print('   raw dmesg output:               <hostname>_<mode>_dmesg.txt')
        print('   raw ftrace output:              <hostname>_<mode>_ftrace.txt')
        print('')
        print('Options:')
-       print('  [general]')
        print('   -h           Print this help text')
        print('   -v           Print the current tool version')
        print('   -config fn   Pull arguments and config options from file fn')
        print('   -verbose     Print extra information during execution and analysis')
-       print('   -status      Test to see if the system is enabled to run this tool')
-       print('   -modes       List available suspend modes')
        print('   -m mode      Mode to initiate for suspend %s (default: %s)') % (modes, sysvals.suspendmode)
        print('   -o subdir    Override the output subdirectory')
-       print('   -rtcwake t   Use rtcwake to autoresume after <t> seconds (default: disabled)')
+       print('   -rtcwake t   Wakeup t seconds after suspend, set t to "off" to disable (default: 15)')
        print('   -addlogs     Add the dmesg and ftrace logs to the html output')
        print('   -srgap       Add a visible gap in the timeline between sus/res (default: disabled)')
        print('  [advanced]')
@@ -5012,23 +5076,25 @@ def printHelp():
        print('                be created in a new subdirectory with a summary page.')
        print('  [debug]')
        print('   -f           Use ftrace to create device callgraphs (default: disabled)')
+       print('   -maxdepth N  limit the callgraph data to N call levels (default: 0=all)')
        print('   -expandcg    pre-expand the callgraph data in the html output (default: disabled)')
-       print('   -flist       Print the list of functions currently being captured in ftrace')
-       print('   -flistall    Print all functions capable of being captured in ftrace')
        print('   -fadd file   Add functions to be graphed in the timeline from a list in a text file')
        print('   -filter "d1,d2,..." Filter out all but this comma-delimited list of device names')
        print('   -mincg  ms   Discard all callgraphs shorter than ms milliseconds (e.g. 0.001 for us)')
        print('   -cgphase P   Only show callgraph data for phase P (e.g. suspend_late)')
        print('   -cgtest N    Only show callgraph data for test N (e.g. 0 or 1 in an x2 run)')
        print('   -timeprec N  Number of significant digits in timestamps (0:S, [3:ms], 6:us)')
-       print('  [utilities]')
+       print('  [commands]')
+       print('   -ftrace ftracefile  Create HTML output using ftrace input (used with -dmesg)')
+       print('   -dmesg dmesgfile    Create HTML output using dmesg (used with -ftrace)')
+       print('   -summary directory  Create a summary of all test in this dir')
+       print('   -modes       List available suspend modes')
+       print('   -status      Test to see if the system is enabled to run this tool')
        print('   -fpdt        Print out the contents of the ACPI Firmware Performance Data Table')
        print('   -usbtopo     Print out the current USB topology with power info')
        print('   -usbauto     Enable autosuspend for all connected USB devices')
-       print('  [re-analyze data from previous runs]')
-       print('   -ftrace ftracefile  Create HTML output using ftrace input')
-       print('   -dmesg dmesgfile    Create HTML output using dmesg (not needed for kernel >= 3.15)')
-       print('   -summary directory  Create a summary of all test in this dir')
+       print('   -flist       Print the list of functions currently being captured in ftrace')
+       print('   -flistall    Print all functions capable of being captured in ftrace')
        print('')
        return True
 
@@ -5076,9 +5142,18 @@ if __name__ == '__main__':
                        sysvals.useprocmon = True
                elif(arg == '-dev'):
                        sysvals.usedevsrc = True
+               elif(arg == '-maxdepth'):
+                       sysvals.max_graph_depth = getArgInt('-maxdepth', args, 0, 1000)
                elif(arg == '-rtcwake'):
-                       sysvals.rtcwake = True
-                       sysvals.rtcwaketime = getArgInt('-rtcwake', args, 0, 3600)
+                       try:
+                               val = args.next()
+                       except:
+                               doError('No rtcwake time supplied', True)
+                       if val.lower() == 'off':
+                               sysvals.rtcwake = False
+                       else:
+                               sysvals.rtcwake = True
+                               sysvals.rtcwaketime = getArgInt('-rtcwake', val, 0, 3600, False)
                elif(arg == '-timeprec'):
                        sysvals.setPrecision(getArgInt('-timeprec', args, 0, 6))
                elif(arg == '-mindev'):
@@ -5201,7 +5276,6 @@ if __name__ == '__main__':
                elif(cmd == 'usbauto'):
                        setUSBDevicesAuto()
                elif(cmd == 'summary'):
-                       print("Generating a summary of folder \"%s\"" % cmdarg)
                        runSummary(cmdarg, True)
                sys.exit()
 
diff --git a/tools/power/pm-graph/bootgraph.8 b/tools/power/pm-graph/bootgraph.8
new file mode 100644 (file)
index 0000000..55272a6
--- /dev/null
@@ -0,0 +1,132 @@
+.TH BOOTGRAPH 8
+.SH NAME
+bootgraph \- Kernel boot timing analysis
+.SH SYNOPSIS
+.ft B
+.B bootgraph
+.RB [ OPTIONS ]
+.RB [ COMMAND ]
+.SH DESCRIPTION
+\fBbootgraph \fP reads the dmesg log from kernel boot and
+creates an html representation of the initcall timeline up to the start
+of the init process.
+.PP
+If no specific command is given, the tool reads the current dmesg log and
+outputs bootgraph.html.
+.PP
+The tool can also augment the timeline with ftrace data on custom target
+functions as well as full trace callgraphs.
+.SH OPTIONS
+.TP
+\fB-h\fR
+Print this help text
+.TP
+\fB-v\fR
+Print the current tool version
+.TP
+\fB-addlogs\fR
+Add the dmesg log to the html output. It will be viewable by
+clicking a button in the timeline.
+.TP
+\fB-o \fIfile\fR
+Override the HTML output filename (default: bootgraph.html)
+.SS "Ftrace Debug"
+.TP
+\fB-f\fR
+Use ftrace to add function detail (default: disabled)
+.TP
+\fB-callgraph\fR
+Use ftrace to create initcall callgraphs (default: disabled). If -filter
+is not used there will be one callgraph per initcall. This can produce
+very large outputs, i.e. 10MB - 100MB.
+.TP
+\fB-maxdepth \fIlevel\fR
+limit the callgraph trace depth to \fIlevel\fR (default: 2). This is
+the best way to limit the output size when using -callgraph.
+.TP
+\fB-mincg \fIt\fR
+Discard all callgraphs shorter than \fIt\fR milliseconds (default: 0=all).
+This reduces the html file size as there can be many tiny callgraphs
+which are barely visible in the timeline.
+The value is a float: e.g. 0.001 represents 1 us.
+.TP
+\fB-timeprec \fIn\fR
+Number of significant digits in timestamps (0:S, 3:ms, [6:us])
+.TP
+\fB-expandcg\fR
+pre-expand the callgraph data in the html output (default: disabled)
+.TP
+\fB-filter \fI"func1,func2,..."\fR
+Instead of tracing each initcall, trace a custom list of functions (default: do_one_initcall)
+
+.SH COMMANDS
+.TP
+\fB-reboot\fR
+Reboot the machine and generate a new timeline automatically. Works in 4 steps.
+  1. updates grub with the required kernel parameters
+  2. installs a cron job which re-runs the tool after reboot
+  3. reboots the system
+  4. after startup, extracts the data and generates the timeline
+.TP
+\fB-manual\fR
+Show the requirements to generate a new timeline manually. Requires 3 steps.
+  1. append the string to the kernel command line via your native boot manager.
+  2. reboot the system
+  3. after startup, re-run the tool with the same arguments and no command
+.TP
+\fB-dmesg \fIfile\fR
+Create HTML output from an existing dmesg file.
+.TP
+\fB-ftrace \fIfile\fR
+Create HTML output from an existing ftrace file (used with -dmesg).
+.TP
+\fB-flistall\fR
+Print all ftrace functions capable of being captured. These are all the
+possible values you can add to trace via the -filter argument.
+
+.SH EXAMPLES
+Create a timeline using the current dmesg log.
+.IP
+\f(CW$ bootgraph\fR
+.PP
+Create a timeline using the current dmesg and ftrace log.
+.IP
+\f(CW$ bootgraph -callgraph\fR
+.PP
+Create a timeline using the current dmesg, add the log to the html and change the name.
+.IP
+\f(CW$ bootgraph -addlogs -o myboot.html\fR
+.PP
+Capture a new boot timeline by automatically rebooting the machine.
+.IP
+\f(CW$ sudo bootgraph -reboot -addlogs -o latestboot.html\fR
+.PP
+Capture a new boot timeline with function trace data.
+.IP
+\f(CW$ sudo bootgraph -reboot -f\fR
+.PP
+Capture a new boot timeline with trace & callgraph data. Skip callgraphs smaller than 5ms.
+.IP
+\f(CW$ sudo bootgraph -reboot -callgraph -mincg 5\fR
+.PP
+Capture a new boot timeline with callgraph data over custom functions.
+.IP
+\f(CW$ sudo bootgraph -reboot -callgraph -filter "acpi_ps_parse_aml,msleep"\fR
+.PP
+Capture a brand new boot timeline with manual reboot.
+.IP
+\f(CW$ sudo bootgraph -callgraph -manual\fR
+.IP
+\f(CW$ vi /etc/default/grub      # add the CMDLINE string to your kernel params\fR
+.IP
+\f(CW$ sudo reboot               # reboot the machine\fR
+.IP
+\f(CW$ sudo bootgraph -callgraph # re-run the tool after restart\fR
+.PP
+
+.SH "SEE ALSO"
+dmesg(1), update-grub(8), crontab(1), reboot(8)
+.PP
+.SH AUTHOR
+.nf
+Written by Todd Brandt <todd.e.brandt@linux.intel.com>
diff --git a/tools/power/pm-graph/sleepgraph.8 b/tools/power/pm-graph/sleepgraph.8
new file mode 100644 (file)
index 0000000..610e72e
--- /dev/null
@@ -0,0 +1,243 @@
+.TH SLEEPGRAPH 8
+.SH NAME
+sleepgraph \- Suspend/Resume timing analysis
+.SH SYNOPSIS
+.ft B
+.B sleepgraph
+.RB [ OPTIONS ]
+.RB [ COMMAND ]
+.SH DESCRIPTION
+\fBsleepgraph \fP is designed to assist kernel and OS developers
+in optimizing their linux stack's suspend/resume time. Using a kernel
+image built with a few extra options enabled, the tool will execute a
+suspend and capture dmesg and ftrace data until resume is complete.
+This data is transformed into a device timeline and an optional
+callgraph to give a detailed view of which devices/subsystems are
+taking the most time in suspend/resume.
+.PP
+If no specific command is given, the default behavior is to initiate
+a suspend/resume.
+.PP
+Generates output files in subdirectory: suspend-yymmdd-HHMMSS
+   html timeline   :     <hostname>_<mode>.html
+   raw dmesg file  :     <hostname>_<mode>_dmesg.txt
+   raw ftrace file :     <hostname>_<mode>_ftrace.txt
+.SH OPTIONS
+.TP
+\fB-h\fR
+Print the help text.
+.TP
+\fB-v\fR
+Print the current tool version.
+.TP
+\fB-verbose\fR
+Print extra information during execution and analysis.
+.TP
+\fB-config \fIfile\fR
+Pull arguments and config options from a file.
+.TP
+\fB-m \fImode\fR
+Mode to initiate for suspend e.g. standby, freeze, mem (default: mem).
+.TP
+\fB-o \fIsubdir\fR
+Override the output subdirectory. Use {date}, {time}, {hostname} for current values.
+.sp
+e.g. suspend-{hostname}-{date}-{time}
+.TP
+\fB-rtcwake \fIt\fR | off
+Use rtcwake to autoresume after \fIt\fR seconds (default: 15). Set t to "off" to
+disable rtcwake and require a user keypress to resume.
+.TP
+\fB-addlogs\fR
+Add the dmesg and ftrace logs to the html output. They will be viewable by
+clicking buttons in the timeline.
+
+.SS "Advanced"
+.TP
+\fB-cmd \fIstr\fR
+Run the timeline over a custom suspend command, e.g. pm-suspend. By default
+the tool forces suspend via /sys/power/state so this allows testing over
+an OS's official suspend method. The output file will change to
+hostname_command.html and will autodetect which suspend mode was triggered.
+.TP
+\fB-filter \fI"d1,d2,..."\fR
+Filter out all but these device callbacks. These strings can be device names
+or module names. e.g. 0000:00:02.0, ata5, i915, usb, etc.
+.TP
+\fB-mindev \fIt\fR
+Discard all device callbacks shorter than \fIt\fR milliseconds (default: 0.0).
+This reduces the html file size as there can be many tiny callbacks which are barely
+visible. The value is a float: e.g. 0.001 represents 1 us.
+.TP
+\fB-proc\fR
+Add usermode process info into the timeline (default: disabled).
+.TP
+\fB-dev\fR
+Add kernel source calls and threads to the timeline (default: disabled).
+.TP
+\fB-x2\fR
+Run two suspend/resumes back to back (default: disabled).
+.TP
+\fB-x2delay \fIt\fR
+Include \fIt\fR ms delay between multiple test runs (default: 0 ms).
+.TP
+\fB-predelay \fIt\fR
+Include \fIt\fR ms delay before 1st suspend (default: 0 ms).
+.TP
+\fB-postdelay \fIt\fR
+Include \fIt\fR ms delay after last resume (default: 0 ms).
+.TP
+\fB-multi \fIn d\fR
+Execute \fIn\fR consecutive tests at \fId\fR seconds intervals. The outputs will
+be created in a new subdirectory with a summary page: suspend-xN-{date}-{time}.
+
+.SS "Ftrace Debug"
+.TP
+\fB-f\fR
+Use ftrace to create device callgraphs (default: disabled). This can produce
+very large outputs, i.e. 10MB - 100MB.
+.TP
+\fB-maxdepth \fIlevel\fR
+limit the callgraph trace depth to \fIlevel\fR (default: 0=all). This is
+the best way to limit the output size when using callgraphs via -f.
+.TP
+\fB-expandcg\fR
+pre-expand the callgraph data in the html output (default: disabled)
+.TP
+\fB-fadd \fIfile\fR
+Add functions to be graphed in the timeline from a list in a text file
+.TP
+\fB-mincg \fIt\fR
+Discard all callgraphs shorter than \fIt\fR milliseconds (default: 0.0).
+This reduces the html file size as there can be many tiny callgraphs
+which are barely visible in the timeline.
+The value is a float: e.g. 0.001 represents 1 us.
+.TP
+\fB-cgphase \fIp\fR
+Only show callgraph data for phase \fIp\fR (e.g. suspend_late).
+.TP
+\fB-cgtest \fIn\fR
+In an x2 run, only show callgraph data for test \fIn\fR (e.g. 0 or 1).
+.TP
+\fB-timeprec \fIn\fR
+Number of significant digits in timestamps (0:S, [3:ms], 6:us).
+
+.SH COMMANDS
+.TP
+\fB-ftrace \fIfile\fR
+Create HTML output from an existing ftrace file.
+.TP
+\fB-dmesg \fIfile\fR
+Create HTML output from an existing dmesg file.
+.TP
+\fB-summary \fIindir\fR
+Create a summary page of all tests in \fIindir\fR. Creates summary.html
+in the current folder. The output page is a table of tests with
+suspend and resume values sorted by suspend mode, host, and kernel.
+Includes test averages by mode and links to the test html files.
+.TP
+\fB-modes\fR
+List available suspend modes.
+.TP
+\fB-status\fR
+Test to see if the system is able to run this tool. Use this along
+with any options you intend to use to see if they will work.
+.TP
+\fB-fpdt\fR
+Print out the contents of the ACPI Firmware Performance Data Table.
+.TP
+\fB-usbtopo\fR
+Print out the current USB topology with power info.
+.TP
+\fB-usbauto\fR
+Enable autosuspend for all connected USB devices.
+.TP
+\fB-flist\fR
+Print the list of ftrace functions currently being captured. Functions
+that are not available as symbols in the current kernel are shown in red.
+By default, the tool traces a list of important suspend/resume functions
+in order to better fill out the timeline. If the user has added their own
+with -fadd they will also be checked.
+.TP
+\fB-flistall\fR
+Print all ftrace functions capable of being captured. These are all the
+possible values you can add to trace via the -fadd argument.
+
+.SH EXAMPLES
+.SS "Simple Commands"
+Check which suspend modes are currently supported.
+.IP
+\f(CW$ sleepgraph -modes\fR
+.PP
+Read the Firmware Performance Data Table (FPDT)
+.IP
+\f(CW$ sudo sleepgraph -fpdt\fR
+.PP
+Print out the current USB power topology
+.IP
+\f(CW$ sleepgraph -usbtopo\fR
+.PP
+Verify that you can run a command with a set of arguments
+.IP
+\f(CW$ sudo sleepgraph -f -rtcwake 30 -status\fR
+.PP
+Generate a summary of all timelines in a particular folder.
+.IP
+\f(CW$ sleepgraph -summary ~/workspace/myresults/\fR
+.PP
+Re-generate the html output from a previous run's dmesg and ftrace log.
+.IP
+\f(CW$ sleepgraph -dmesg myhost_mem_dmesg.txt -ftrace myhost_mem_ftrace.txt\fR
+.PP
+
+.SS "Capturing Simple Timelines"
+Execute a mem suspend with a 15 second wakeup. Include the logs in the html.
+.IP
+\f(CW$ sudo sleepgraph -rtcwake 15 -addlogs\fR
+.PP
+Execute a standby with a 15 second wakeup. Change the output folder name.
+.IP
+\f(CW$ sudo sleepgraph -m standby -rtcwake 15 -o "standby-{hostname}-{date}-{time}"\fR
+.PP
+Execute a freeze with no wakeup (require keypress). Change output folder name.
+.IP
+\f(CW$ sudo sleepgraph -m freeze -rtcwake off -o "freeze-{hostname}-{date}-{time}"\fR
+.PP
+
+.SS "Capturing Advanced Timelines"
+Execute a suspend & include dev mode source calls, limit callbacks to 5ms or larger.
+.IP
+\f(CW$ sudo sleepgraph -m mem -rtcwake 15 -dev -mindev 5\fR
+.PP
+Run two suspends back to back, include a 500ms delay before, after, and in between runs.
+.IP
+\f(CW$ sudo sleepgraph -m mem -rtcwake 15 -x2 -predelay 500 -x2delay 500 -postdelay 500\fR
+.PP
+Do a batch run of 10 freezes with 30 seconds delay between runs.
+.IP
+\f(CW$ sudo sleepgraph -m freeze -rtcwake 15 -multi 10 30\fR
+.PP
+Execute a suspend using a custom command.
+.IP
+\f(CW$ sudo sleepgraph -cmd "echo mem > /sys/power/state" -rtcwake 15\fR
+.PP
+
+
+.SS "Capturing Timelines with Callgraph Data"
+Add device callgraphs. Limit the trace depth and only show callgraphs 10ms or larger.
+.IP
+\f(CW$ sudo sleepgraph -m mem -rtcwake 15 -f -maxdepth 5 -mincg 10\fR
+.PP
+Capture a full callgraph across the entire suspend/resume, then filter the html by a single phase.
+.IP
+\f(CW$ sudo sleepgraph -m mem -rtcwake 15 -f\fR
+.IP
+\f(CW$ sleepgraph -dmesg host_mem_dmesg.txt -ftrace host_mem_ftrace.txt -f -cgphase resume\fR
+.PP
+
+.SH "SEE ALSO"
+dmesg(1)
+.PP
+.SH AUTHOR
+.nf
+Written by Todd Brandt <todd.e.brandt@linux.intel.com>
index fd706ac..0b24dd9 100755 (executable)
@@ -353,6 +353,14 @@ def split_csv():
                 os.system('grep -m 1 common_cpu cpu.csv > cpu{:0>3}.csv'.format(index))
                 os.system('grep CPU_{:0>3} cpu.csv >> cpu{:0>3}.csv'.format(index, index))
 
+def fix_ownership(path):
+    """Change the owner of the file to SUDO_UID, if required"""
+
+    uid = os.environ.get('SUDO_UID')
+    gid = os.environ.get('SUDO_GID')
+    if uid is not None:
+        os.chown(path, int(uid), int(gid))
+
 def cleanup_data_files():
     """ clean up existing data files """
 
@@ -518,12 +526,16 @@ else:
 
 if not os.path.exists('results'):
     os.mkdir('results')
+    # The regular user needs to own the directory, not root.
+    fix_ownership('results')
 
 os.chdir('results')
 if os.path.exists(testname):
     print('The test name directory already exists. Please provide a unique test name. Test re-run not supported, yet.')
     sys.exit()
 os.mkdir(testname)
+# The regular user needs to own the directory, not root.
+fix_ownership(testname)
 os.chdir(testname)
 
 # Temporary (or perhaps not)
@@ -566,4 +578,9 @@ plot_scaled_cpu()
 plot_boost_cpu()
 plot_ghz_cpu()
 
+# It is preferable, but not necessary, that the regular user owns the files, not root.
+for root, dirs, files in os.walk('.'):
+    for f in files:
+        fix_ownership(f)
+
 os.chdir('../../')
index fedca32..ccf2a69 100644 (file)
@@ -100,6 +100,8 @@ The system configuration dump (if --quiet is not used) is followed by statistics
 \fBCPU%c1, CPU%c3, CPU%c6, CPU%c7\fP show the percentage residency in hardware core idle states.  These numbers are from hardware residency counters.
 \fBCoreTmp\fP Degrees Celsius reported by the per-core Digital Thermal Sensor.
 \fBPkgTtmp\fP Degrees Celsius reported by the per-package Package Thermal Monitor.
+\fBGFX%rc6\fP The percentage of time the GPU is in the "render C6" state, rc6, during the measurement interval. From /sys/class/drm/card0/power/rc6_residency_ms.
+\fBGFXMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz.
 \fBPkg%pc2, Pkg%pc3, Pkg%pc6, Pkg%pc7\fP percentage residency in hardware package idle states.  These numbers are from hardware residency counters.
 \fBPkgWatt\fP Watts consumed by the whole package.
 \fBCorWatt\fP Watts consumed by the core part of the package.
index 828dccd..b112947 100644 (file)
@@ -1142,7 +1142,7 @@ delta_thread(struct thread_data *new, struct thread_data *old,
                 * it is possible for mperf's non-halted cycles + idle states
                 * to exceed TSC's all cycles: show c1 = 0% in that case.
                 */
-               if ((old->mperf + core_delta->c3 + core_delta->c6 + core_delta->c7) > old->tsc)
+               if ((old->mperf + core_delta->c3 + core_delta->c6 + core_delta->c7) > (old->tsc * tsc_tweak))
                        old->c1 = 0;
                else {
                        /* normal case, derive c1 */
@@ -2485,8 +2485,10 @@ int snapshot_gfx_mhz(void)
 
        if (fp == NULL)
                fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r");
-       else
+       else {
                rewind(fp);
+               fflush(fp);
+       }
 
        retval = fscanf(fp, "%d", &gfx_cur_mhz);
        if (retval != 1)
@@ -3111,7 +3113,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
                return 0;
 
        fprintf(outf, "cpu%d: MSR_HWP_CAPABILITIES: 0x%08llx "
-                       "(high 0x%x guar 0x%x eff 0x%x low 0x%x)\n",
+                       "(high %d guar %d eff %d low %d)\n",
                        cpu, msr,
                        (unsigned int)HWP_HIGHEST_PERF(msr),
                        (unsigned int)HWP_GUARANTEED_PERF(msr),
@@ -3122,7 +3124,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
                return 0;
 
        fprintf(outf, "cpu%d: MSR_HWP_REQUEST: 0x%08llx "
-                       "(min 0x%x max 0x%x des 0x%x epp 0x%x window 0x%x pkg 0x%x)\n",
+                       "(min %d max %d des %d epp 0x%x window 0x%x pkg 0x%x)\n",
                        cpu, msr,
                        (unsigned int)(((msr) >> 0) & 0xff),
                        (unsigned int)(((msr) >> 8) & 0xff),
@@ -3136,7 +3138,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
                        return 0;
 
                fprintf(outf, "cpu%d: MSR_HWP_REQUEST_PKG: 0x%08llx "
-                       "(min 0x%x max 0x%x des 0x%x epp 0x%x window 0x%x)\n",
+                       "(min %d max %d des %d epp 0x%x window 0x%x)\n",
                        cpu, msr,
                        (unsigned int)(((msr) >> 0) & 0xff),
                        (unsigned int)(((msr) >> 8) & 0xff),
@@ -3353,17 +3355,19 @@ void rapl_probe(unsigned int family, unsigned int model)
        case INTEL_FAM6_SKYLAKE_DESKTOP:        /* SKL */
        case INTEL_FAM6_KABYLAKE_MOBILE:        /* KBL */
        case INTEL_FAM6_KABYLAKE_DESKTOP:       /* KBL */
-               do_rapl = RAPL_PKG | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO;
+               do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_GFX | RAPL_PKG_POWER_INFO;
                BIC_PRESENT(BIC_PKG__);
                BIC_PRESENT(BIC_RAM__);
                if (rapl_joules) {
                        BIC_PRESENT(BIC_Pkg_J);
                        BIC_PRESENT(BIC_Cor_J);
                        BIC_PRESENT(BIC_RAM_J);
+                       BIC_PRESENT(BIC_GFX_J);
                } else {
                        BIC_PRESENT(BIC_PkgWatt);
                        BIC_PRESENT(BIC_CorWatt);
                        BIC_PRESENT(BIC_RAMWatt);
+                       BIC_PRESENT(BIC_GFXWatt);
                }
                break;
        case INTEL_FAM6_HASWELL_X:      /* HSX */
@@ -3478,7 +3482,7 @@ void perf_limit_reasons_probe(unsigned int family, unsigned int model)
 int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 {
        unsigned long long msr;
-       unsigned int dts;
+       unsigned int dts, dts2;
        int cpu;
 
        if (!(do_dts || do_ptm))
@@ -3503,7 +3507,6 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p
                fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n",
                        cpu, msr, tcc_activation_temp - dts);
 
-#ifdef THERM_DEBUG
                if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr))
                        return 0;
 
@@ -3511,11 +3514,10 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p
                dts2 = (msr >> 8) & 0x7F;
                fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n",
                        cpu, msr, tcc_activation_temp - dts, tcc_activation_temp - dts2);
-#endif
        }
 
 
-       if (do_dts) {
+       if (do_dts && debug) {
                unsigned int resolution;
 
                if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr))
@@ -3526,7 +3528,6 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p
                fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n",
                        cpu, msr, tcc_activation_temp - dts, resolution);
 
-#ifdef THERM_DEBUG
                if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr))
                        return 0;
 
@@ -3534,7 +3535,6 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p
                dts2 = (msr >> 8) & 0x7F;
                fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n",
                        cpu, msr, tcc_activation_temp - dts, tcc_activation_temp - dts2);
-#endif
        }
 
        return 0;
@@ -4578,7 +4578,7 @@ int get_and_dump_counters(void)
 }
 
 void print_version() {
-       fprintf(outf, "turbostat version 17.02.24"
+       fprintf(outf, "turbostat version 17.04.12"
                " - Len Brown <lenb@kernel.org>\n");
 }
 
index 621578a..fc74db6 100644 (file)
@@ -43,6 +43,15 @@ ifneq ($(CC), clang)
 EXTRA_WARNINGS += -Wstrict-aliasing=3
 endif
 
+# Hack to avoid type-punned warnings on old systems such as RHEL5:
+# We should be changing CFLAGS and checking gcc version, but this
+# will do for now and keep the above -Wstrict-aliasing=3 in place
+# in newer systems.
+# Needed for the __raw_cmpxchg in tools/arch/x86/include/asm/cmpxchg.h
+ifneq ($(filter 3.%,$(MAKE_VERSION)),)  # make-3
+EXTRA_WARNINGS += -fno-strict-aliasing
+endif
+
 ifneq ($(findstring $(MAKEFLAGS), w),w)
 PRINT_DIR = --no-print-directory
 else
index 6a1ad58..9af09e8 100644 (file)
@@ -1,7 +1,14 @@
 LIBDIR := ../../../lib
 BPFDIR := $(LIBDIR)/bpf
+APIDIR := ../../../include/uapi
+GENDIR := ../../../../include/generated
+GENHDR := $(GENDIR)/autoconf.h
 
-CFLAGS += -Wall -O2 -I../../../include/uapi -I$(LIBDIR)
+ifneq ($(wildcard $(GENHDR)),)
+  GENFLAGS := -DHAVE_GENHDR
+endif
+
+CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS)
 LDLIBS += -lcap
 
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map
index a0aa200..20f1871 100644 (file)
@@ -282,7 +282,7 @@ static void test_arraymap_percpu(int task, void *data)
 {
        unsigned int nr_cpus = bpf_num_possible_cpus();
        int key, next_key, fd, i;
-       long values[nr_cpus];
+       long long values[nr_cpus];
 
        fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_ARRAY, sizeof(key),
                            sizeof(values[0]), 2, 0);
@@ -340,7 +340,7 @@ static void test_arraymap_percpu_many_keys(void)
         * allocator more than anything else
         */
        unsigned int nr_keys = 2000;
-       long values[nr_cpus];
+       long long values[nr_cpus];
        int key, fd, i;
 
        fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_ARRAY, sizeof(key),
index d1555e4..c848e90 100644 (file)
 
 #include <bpf/bpf.h>
 
+#ifdef HAVE_GENHDR
+# include "autoconf.h"
+#else
+# if defined(__i386) || defined(__x86_64) || defined(__s390x__) || defined(__aarch64__)
+#  define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 1
+# endif
+#endif
+
 #include "../../../include/linux/filter.h"
 
 #ifndef ARRAY_SIZE
@@ -39,6 +47,8 @@
 #define MAX_INSNS      512
 #define MAX_FIXUPS     8
 
+#define F_NEEDS_EFFICIENT_UNALIGNED_ACCESS     (1 << 0)
+
 struct bpf_test {
        const char *descr;
        struct bpf_insn insns[MAX_INSNS];
@@ -53,6 +63,7 @@ struct bpf_test {
                REJECT
        } result, result_unpriv;
        enum bpf_prog_type prog_type;
+       uint8_t flags;
 };
 
 /* Note we want this to be 64 bit aligned so that the end of our array is
@@ -2432,6 +2443,30 @@ static struct bpf_test tests[] = {
                .prog_type = BPF_PROG_TYPE_SCHED_CLS,
        },
        {
+               "direct packet access: test15 (spill with xadd)",
+               .insns = {
+                       BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+                                   offsetof(struct __sk_buff, data)),
+                       BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+                                   offsetof(struct __sk_buff, data_end)),
+                       BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+                       BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 8),
+                       BPF_MOV64_IMM(BPF_REG_5, 4096),
+                       BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+                       BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0),
+                       BPF_STX_XADD(BPF_DW, BPF_REG_4, BPF_REG_5, 0),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_4, 0),
+                       BPF_STX_MEM(BPF_W, BPF_REG_2, BPF_REG_5, 0),
+                       BPF_MOV64_IMM(BPF_REG_0, 0),
+                       BPF_EXIT_INSN(),
+               },
+               .errstr = "R2 invalid mem access 'inv'",
+               .result = REJECT,
+               .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+       },
+       {
                "helper access to packet: test1, valid packet_ptr range",
                .insns = {
                        BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
@@ -2934,6 +2969,7 @@ static struct bpf_test tests[] = {
                .errstr_unpriv = "R0 pointer arithmetic prohibited",
                .result_unpriv = REJECT,
                .result = ACCEPT,
+               .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
        },
        {
                "valid map access into an array with a variable",
@@ -2957,6 +2993,7 @@ static struct bpf_test tests[] = {
                .errstr_unpriv = "R0 pointer arithmetic prohibited",
                .result_unpriv = REJECT,
                .result = ACCEPT,
+               .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
        },
        {
                "valid map access into an array with a signed variable",
@@ -2984,6 +3021,7 @@ static struct bpf_test tests[] = {
                .errstr_unpriv = "R0 pointer arithmetic prohibited",
                .result_unpriv = REJECT,
                .result = ACCEPT,
+               .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
        },
        {
                "invalid map access into an array with a constant",
@@ -3025,6 +3063,7 @@ static struct bpf_test tests[] = {
                .errstr = "R0 min value is outside of the array range",
                .result_unpriv = REJECT,
                .result = REJECT,
+               .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
        },
        {
                "invalid map access into an array with a variable",
@@ -3048,6 +3087,7 @@ static struct bpf_test tests[] = {
                .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.",
                .result_unpriv = REJECT,
                .result = REJECT,
+               .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
        },
        {
                "invalid map access into an array with no floor check",
@@ -3074,6 +3114,7 @@ static struct bpf_test tests[] = {
                .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.",
                .result_unpriv = REJECT,
                .result = REJECT,
+               .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
        },
        {
                "invalid map access into an array with a invalid max check",
@@ -3100,6 +3141,7 @@ static struct bpf_test tests[] = {
                .errstr = "invalid access to map value, value_size=48 off=44 size=8",
                .result_unpriv = REJECT,
                .result = REJECT,
+               .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
        },
        {
                "invalid map access into an array with a invalid max check",
@@ -3129,6 +3171,7 @@ static struct bpf_test tests[] = {
                .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.",
                .result_unpriv = REJECT,
                .result = REJECT,
+               .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
        },
        {
                "multiple registers share map_lookup_elem result",
@@ -3252,6 +3295,7 @@ static struct bpf_test tests[] = {
                .result = REJECT,
                .errstr_unpriv = "R0 pointer arithmetic prohibited",
                .result_unpriv = REJECT,
+               .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
        },
        {
                "constant register |= constant should keep constant type",
@@ -3418,6 +3462,26 @@ static struct bpf_test tests[] = {
                .prog_type = BPF_PROG_TYPE_LWT_XMIT,
        },
        {
+               "overlapping checks for direct packet access",
+               .insns = {
+                       BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+                                   offsetof(struct __sk_buff, data)),
+                       BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+                                   offsetof(struct __sk_buff, data_end)),
+                       BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+                       BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 4),
+                       BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6),
+                       BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1),
+                       BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_2, 6),
+                       BPF_MOV64_IMM(BPF_REG_0, 0),
+                       BPF_EXIT_INSN(),
+               },
+               .result = ACCEPT,
+               .prog_type = BPF_PROG_TYPE_LWT_XMIT,
+       },
+       {
                "invalid access of tc_classid for LWT_IN",
                .insns = {
                        BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
@@ -3961,7 +4025,208 @@ static struct bpf_test tests[] = {
                .result_unpriv = REJECT,
        },
        {
-               "map element value (adjusted) is preserved across register spilling",
+               "map element value or null is marked on register spilling",
+               .insns = {
+                       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+                       BPF_LD_MAP_FD(BPF_REG_1, 0),
+                       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+                       BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -152),
+                       BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0),
+                       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, 0),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_3, 0, 42),
+                       BPF_EXIT_INSN(),
+               },
+               .fixup_map2 = { 3 },
+               .errstr_unpriv = "R0 leaks addr",
+               .result = ACCEPT,
+               .result_unpriv = REJECT,
+       },
+       {
+               "map element value store of cleared call register",
+               .insns = {
+                       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+                       BPF_LD_MAP_FD(BPF_REG_1, 0),
+                       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+                       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+                       BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0),
+                       BPF_EXIT_INSN(),
+               },
+               .fixup_map2 = { 3 },
+               .errstr_unpriv = "R1 !read_ok",
+               .errstr = "R1 !read_ok",
+               .result = REJECT,
+               .result_unpriv = REJECT,
+       },
+       {
+               "map element value with unaligned store",
+               .insns = {
+                       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+                       BPF_LD_MAP_FD(BPF_REG_1, 0),
+                       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+                       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 17),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 3),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 42),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_0, 2, 43),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_0, -2, 44),
+                       BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_8, 0, 32),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_8, 2, 33),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_8, -2, 34),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_8, 5),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_8, 0, 22),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_8, 4, 23),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_8, -7, 24),
+                       BPF_MOV64_REG(BPF_REG_7, BPF_REG_8),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 3),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_7, 0, 22),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_7, 4, 23),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_7, -4, 24),
+                       BPF_EXIT_INSN(),
+               },
+               .fixup_map2 = { 3 },
+               .errstr_unpriv = "R0 pointer arithmetic prohibited",
+               .result = ACCEPT,
+               .result_unpriv = REJECT,
+               .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+       },
+       {
+               "map element value with unaligned load",
+               .insns = {
+                       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+                       BPF_LD_MAP_FD(BPF_REG_1, 0),
+                       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+                       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11),
+                       BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+                       BPF_JMP_IMM(BPF_JGE, BPF_REG_1, MAX_ENTRIES, 9),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 3),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 2),
+                       BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 2),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 5),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 4),
+                       BPF_EXIT_INSN(),
+               },
+               .fixup_map2 = { 3 },
+               .errstr_unpriv = "R0 pointer arithmetic prohibited",
+               .result = ACCEPT,
+               .result_unpriv = REJECT,
+               .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+       },
+       {
+               "map element value illegal alu op, 1",
+               .insns = {
+                       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+                       BPF_LD_MAP_FD(BPF_REG_1, 0),
+                       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+                       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+                       BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 8),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22),
+                       BPF_EXIT_INSN(),
+               },
+               .fixup_map2 = { 3 },
+               .errstr_unpriv = "R0 pointer arithmetic prohibited",
+               .errstr = "invalid mem access 'inv'",
+               .result = REJECT,
+               .result_unpriv = REJECT,
+       },
+       {
+               "map element value illegal alu op, 2",
+               .insns = {
+                       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+                       BPF_LD_MAP_FD(BPF_REG_1, 0),
+                       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+                       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+                       BPF_ALU32_IMM(BPF_ADD, BPF_REG_0, 0),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22),
+                       BPF_EXIT_INSN(),
+               },
+               .fixup_map2 = { 3 },
+               .errstr_unpriv = "R0 pointer arithmetic prohibited",
+               .errstr = "invalid mem access 'inv'",
+               .result = REJECT,
+               .result_unpriv = REJECT,
+       },
+       {
+               "map element value illegal alu op, 3",
+               .insns = {
+                       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+                       BPF_LD_MAP_FD(BPF_REG_1, 0),
+                       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+                       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+                       BPF_ALU64_IMM(BPF_DIV, BPF_REG_0, 42),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22),
+                       BPF_EXIT_INSN(),
+               },
+               .fixup_map2 = { 3 },
+               .errstr_unpriv = "R0 pointer arithmetic prohibited",
+               .errstr = "invalid mem access 'inv'",
+               .result = REJECT,
+               .result_unpriv = REJECT,
+       },
+       {
+               "map element value illegal alu op, 4",
+               .insns = {
+                       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+                       BPF_LD_MAP_FD(BPF_REG_1, 0),
+                       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+                       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+                       BPF_ENDIAN(BPF_FROM_BE, BPF_REG_0, 64),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22),
+                       BPF_EXIT_INSN(),
+               },
+               .fixup_map2 = { 3 },
+               .errstr_unpriv = "R0 pointer arithmetic prohibited",
+               .errstr = "invalid mem access 'inv'",
+               .result = REJECT,
+               .result_unpriv = REJECT,
+       },
+       {
+               "map element value illegal alu op, 5",
+               .insns = {
+                       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+                       BPF_LD_MAP_FD(BPF_REG_1, 0),
+                       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+                       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7),
+                       BPF_MOV64_IMM(BPF_REG_3, 4096),
+                       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+                       BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),
+                       BPF_STX_XADD(BPF_DW, BPF_REG_2, BPF_REG_3, 0),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0),
+                       BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22),
+                       BPF_EXIT_INSN(),
+               },
+               .fixup_map2 = { 3 },
+               .errstr_unpriv = "R0 invalid mem access 'inv'",
+               .errstr = "R0 invalid mem access 'inv'",
+               .result = REJECT,
+               .result_unpriv = REJECT,
+       },
+       {
+               "map element value is preserved across register spilling",
                .insns = {
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
@@ -3983,6 +4248,7 @@ static struct bpf_test tests[] = {
                .errstr_unpriv = "R0 pointer arithmetic prohibited",
                .result = ACCEPT,
                .result_unpriv = REJECT,
+               .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
        },
        {
                "helper access to variable memory: stack, bitwise AND + JMP, correct bounds",
@@ -4421,6 +4687,7 @@ static struct bpf_test tests[] = {
                .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.",
                .result = REJECT,
                .result_unpriv = REJECT,
+               .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
        },
        {
                "invalid range check",
@@ -4452,6 +4719,7 @@ static struct bpf_test tests[] = {
                .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.",
                .result = REJECT,
                .result_unpriv = REJECT,
+               .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
        }
 };
 
@@ -4530,11 +4798,11 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
 static void do_test_single(struct bpf_test *test, bool unpriv,
                           int *passes, int *errors)
 {
+       int fd_prog, expected_ret, reject_from_alignment;
        struct bpf_insn *prog = test->insns;
        int prog_len = probe_filter_length(prog);
        int prog_type = test->prog_type;
        int fd_f1 = -1, fd_f2 = -1, fd_f3 = -1;
-       int fd_prog, expected_ret;
        const char *expected_err;
 
        do_test_fixup(test, prog, &fd_f1, &fd_f2, &fd_f3);
@@ -4547,8 +4815,19 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
                       test->result_unpriv : test->result;
        expected_err = unpriv && test->errstr_unpriv ?
                       test->errstr_unpriv : test->errstr;
+
+       reject_from_alignment = fd_prog < 0 &&
+                               (test->flags & F_NEEDS_EFFICIENT_UNALIGNED_ACCESS) &&
+                               strstr(bpf_vlog, "Unknown alignment.");
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+       if (reject_from_alignment) {
+               printf("FAIL\nFailed due to alignment despite having efficient unaligned access: '%s'!\n",
+                      strerror(errno));
+               goto fail_log;
+       }
+#endif
        if (expected_ret == ACCEPT) {
-               if (fd_prog < 0) {
+               if (fd_prog < 0 && !reject_from_alignment) {
                        printf("FAIL\nFailed to load prog '%s'!\n",
                               strerror(errno));
                        goto fail_log;
@@ -4558,14 +4837,15 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
                        printf("FAIL\nUnexpected success to load!\n");
                        goto fail_log;
                }
-               if (!strstr(bpf_vlog, expected_err)) {
+               if (!strstr(bpf_vlog, expected_err) && !reject_from_alignment) {
                        printf("FAIL\nUnexpected error message!\n");
                        goto fail_log;
                }
        }
 
        (*passes)++;
-       printf("OK\n");
+       printf("OK%s\n", reject_from_alignment ?
+              " (NOTE: reject due to unknown alignment)" : "");
 close_fds:
        close(fd_prog);
        close(fd_f1);
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc
new file mode 100644 (file)
index 0000000..bab5ff7
--- /dev/null
@@ -0,0 +1,117 @@
+#!/bin/sh
+# description: ftrace - function pid filters
+
+# Make sure that function pid matching filter works.
+# Also test it on an instance directory
+
+if ! grep -q function available_tracers; then
+    echo "no function tracer configured"
+    exit_unsupported
+fi
+
+if [ ! -f set_ftrace_pid ]; then
+    echo "set_ftrace_pid not found? Is function tracer not set?"
+    exit_unsupported
+fi
+
+if [ ! -f set_ftrace_filter ]; then
+    echo "set_ftrace_filter not found? Is function tracer not set?"
+    exit_unsupported
+fi
+
+do_function_fork=1
+
+if [ ! -f options/function-fork ]; then
+    do_function_fork=0
+    echo "no option for function-fork found. Option will not be tested."
+fi
+
+read PID _ < /proc/self/stat
+
+if [ $do_function_fork -eq 1 ]; then
+    # default value of function-fork option
+    orig_value=`grep function-fork trace_options`
+fi
+
+do_reset() {
+    reset_tracer
+    clear_trace
+    enable_tracing
+    echo > set_ftrace_filter
+    echo > set_ftrace_pid
+
+    if [ $do_function_fork -eq 0 ]; then
+       return
+    fi
+
+    echo $orig_value > trace_options
+}
+
+fail() { # msg
+    do_reset
+    echo $1
+    exit $FAIL
+}
+
+yield() {
+    ping localhost -c 1 || sleep .001 || usleep 1 || sleep 1
+}
+
+do_test() {
+    disable_tracing
+
+    echo do_execve* > set_ftrace_filter
+    echo *do_fork >> set_ftrace_filter
+
+    echo $PID > set_ftrace_pid
+    echo function > current_tracer
+
+    if [ $do_function_fork -eq 1 ]; then
+       # don't allow children to be traced
+       echo nofunction-fork > trace_options
+    fi
+
+    enable_tracing
+    yield
+
+    count_pid=`cat trace | grep -v ^# | grep $PID | wc -l`
+    count_other=`cat trace | grep -v ^# | grep -v $PID | wc -l`
+
+    # count_other should be 0
+    if [ $count_pid -eq 0 -o $count_other -ne 0 ]; then
+       fail "PID filtering not working?"
+    fi
+
+    disable_tracing
+    clear_trace
+
+    if [ $do_function_fork -eq 0 ]; then
+       return
+    fi
+
+    # allow children to be traced
+    echo function-fork > trace_options
+
+    enable_tracing
+    yield
+
+    count_pid=`cat trace | grep -v ^# | grep $PID | wc -l`
+    count_other=`cat trace | grep -v ^# | grep -v $PID | wc -l`
+
+    # count_other should NOT be 0
+    if [ $count_pid -eq 0 -o $count_other -eq 0 ]; then
+       fail "PID filtering not following fork?"
+    fi
+}
+
+do_test
+
+mkdir instances/foo
+cd instances/foo
+do_test
+cd ../../
+rmdir instances/foo
+
+do_reset
+
+exit 0
index 4124593..e62bb35 100644 (file)
@@ -75,7 +75,7 @@ static int sock_fanout_open(uint16_t typeflags, int num_packets)
 {
        int fd, val;
 
-       fd = socket(PF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
+       fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_IP));
        if (fd < 0) {
                perror("socket packet");
                exit(1);
@@ -95,6 +95,24 @@ static int sock_fanout_open(uint16_t typeflags, int num_packets)
        return fd;
 }
 
+static void sock_fanout_set_cbpf(int fd)
+{
+       struct sock_filter bpf_filter[] = {
+               BPF_STMT(BPF_LD+BPF_B+BPF_ABS, 80),           /* ldb [80] */
+               BPF_STMT(BPF_RET+BPF_A, 0),                   /* ret A */
+       };
+       struct sock_fprog bpf_prog;
+
+       bpf_prog.filter = bpf_filter;
+       bpf_prog.len = sizeof(bpf_filter) / sizeof(struct sock_filter);
+
+       if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT_DATA, &bpf_prog,
+                      sizeof(bpf_prog))) {
+               perror("fanout data cbpf");
+               exit(1);
+       }
+}
+
 static void sock_fanout_set_ebpf(int fd)
 {
        const int len_off = __builtin_offsetof(struct __sk_buff, len);
@@ -270,7 +288,7 @@ static int test_datapath(uint16_t typeflags, int port_off,
                exit(1);
        }
        if (type == PACKET_FANOUT_CBPF)
-               sock_setfilter(fds[0], SOL_PACKET, PACKET_FANOUT_DATA);
+               sock_fanout_set_cbpf(fds[0]);
        else if (type == PACKET_FANOUT_EBPF)
                sock_fanout_set_ebpf(fds[0]);
 
index a77da88..7d990d6 100644 (file)
@@ -38,7 +38,7 @@
 # define __maybe_unused                __attribute__ ((__unused__))
 #endif
 
-static __maybe_unused void sock_setfilter(int fd, int lvl, int optnum)
+static __maybe_unused void pair_udp_setfilter(int fd)
 {
        /* the filter below checks for all of the following conditions that
         * are based on the contents of create_payload()
@@ -76,23 +76,16 @@ static __maybe_unused void sock_setfilter(int fd, int lvl, int optnum)
        };
        struct sock_fprog bpf_prog;
 
-       if (lvl == SOL_PACKET && optnum == PACKET_FANOUT_DATA)
-               bpf_filter[5].code = 0x16;   /* RET A                         */
-
        bpf_prog.filter = bpf_filter;
        bpf_prog.len = sizeof(bpf_filter) / sizeof(struct sock_filter);
-       if (setsockopt(fd, lvl, optnum, &bpf_prog,
+
+       if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &bpf_prog,
                       sizeof(bpf_prog))) {
                perror("setsockopt SO_ATTACH_FILTER");
                exit(1);
        }
 }
 
-static __maybe_unused void pair_udp_setfilter(int fd)
-{
-       sock_setfilter(fd, SOL_SOCKET, SO_ATTACH_FILTER);
-}
-
 static __maybe_unused void pair_udp_open(int fds[], uint16_t port)
 {
        struct sockaddr_in saddr, daddr;
index 1c5d057..bf13fc2 100644 (file)
@@ -34,34 +34,34 @@ endif
 all: $(SUB_DIRS)
 
 $(SUB_DIRS):
-       BUILD_TARGET=$$OUTPUT/$@; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $@ all
+       BUILD_TARGET=$(OUTPUT)/$@; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $@ all
 
 include ../lib.mk
 
 override define RUN_TESTS
        @for TARGET in $(SUB_DIRS); do \
-               BUILD_TARGET=$$OUTPUT/$$TARGET; \
+               BUILD_TARGET=$(OUTPUT)/$$TARGET;        \
                $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests;\
        done;
 endef
 
 override define INSTALL_RULE
        @for TARGET in $(SUB_DIRS); do \
-               BUILD_TARGET=$$OUTPUT/$$TARGET; \
+               BUILD_TARGET=$(OUTPUT)/$$TARGET;        \
                $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install;\
        done;
 endef
 
 override define EMIT_TESTS
        @for TARGET in $(SUB_DIRS); do \
-               BUILD_TARGET=$$OUTPUT/$$TARGET; \
+               BUILD_TARGET=$(OUTPUT)/$$TARGET;        \
                $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests;\
        done;
 endef
 
 clean:
        @for TARGET in $(SUB_DIRS); do \
-               BUILD_TARGET=$$OUTPUT/$$TARGET; \
+               BUILD_TARGET=$(OUTPUT)/$$TARGET;        \
                $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean; \
        done;
        rm -f tags
index 276139a..702f810 100644 (file)
@@ -392,6 +392,25 @@ static irqreturn_t vgic_maintenance_handler(int irq, void *data)
 }
 
 /**
+ * kvm_vgic_init_cpu_hardware - initialize the GIC VE hardware
+ *
+ * For a specific CPU, initialize the GIC VE hardware.
+ */
+void kvm_vgic_init_cpu_hardware(void)
+{
+       BUG_ON(preemptible());
+
+       /*
+        * We want to make sure the list registers start out clear so that we
+        * only have to program the used registers.
+        */
+       if (kvm_vgic_global_state.type == VGIC_V2)
+               vgic_v2_init_lrs();
+       else
+               kvm_call_hyp(__vgic_v3_init_lrs);
+}
+
+/**
  * kvm_vgic_hyp_init: populates the kvm_vgic_global_state variable
  * according to the host GIC model. Accordingly calls either
  * vgic_v2/v3_probe which registers the KVM_DEVICE that can be
index a3ad7ff..0a4283e 100644 (file)
@@ -229,7 +229,15 @@ static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu,
                val = vmcr.ctlr;
                break;
        case GIC_CPU_PRIMASK:
-               val = vmcr.pmr;
+               /*
+                * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
+                * PMR field as GICH_VMCR.VMPriMask rather than
+                * GICC_PMR.Priority, so we expose the upper five bits of
+                * priority mask to userspace using the lower bits in the
+                * unsigned long.
+                */
+               val = (vmcr.pmr & GICV_PMR_PRIORITY_MASK) >>
+                       GICV_PMR_PRIORITY_SHIFT;
                break;
        case GIC_CPU_BINPOINT:
                val = vmcr.bpr;
@@ -262,7 +270,15 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu,
                vmcr.ctlr = val;
                break;
        case GIC_CPU_PRIMASK:
-               vmcr.pmr = val;
+               /*
+                * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
+                * PMR field as GICH_VMCR.VMPriMask rather than
+                * GICC_PMR.Priority, so we expose the upper five bits of
+                * priority mask to userspace using the lower bits in the
+                * unsigned long.
+                */
+               vmcr.pmr = (val << GICV_PMR_PRIORITY_SHIFT) &
+                       GICV_PMR_PRIORITY_MASK;
                break;
        case GIC_CPU_BINPOINT:
                vmcr.bpr = val;
index b834ecd..b637d9c 100644 (file)
@@ -36,6 +36,21 @@ static unsigned long *u64_to_bitmask(u64 *val)
        return (unsigned long *)val;
 }
 
+static inline void vgic_v2_write_lr(int lr, u32 val)
+{
+       void __iomem *base = kvm_vgic_global_state.vctrl_base;
+
+       writel_relaxed(val, base + GICH_LR0 + (lr * 4));
+}
+
+void vgic_v2_init_lrs(void)
+{
+       int i;
+
+       for (i = 0; i < kvm_vgic_global_state.nr_lr; i++)
+               vgic_v2_write_lr(i, 0);
+}
+
 void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu)
 {
        struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
@@ -191,8 +206,8 @@ void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
                GICH_VMCR_ALIAS_BINPOINT_MASK;
        vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) &
                GICH_VMCR_BINPOINT_MASK;
-       vmcr |= (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) &
-               GICH_VMCR_PRIMASK_MASK;
+       vmcr |= ((vmcrp->pmr >> GICV_PMR_PRIORITY_SHIFT) <<
+                GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK;
 
        vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr;
 }
@@ -207,8 +222,8 @@ void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
                        GICH_VMCR_ALIAS_BINPOINT_SHIFT;
        vmcrp->bpr  = (vmcr & GICH_VMCR_BINPOINT_MASK) >>
                        GICH_VMCR_BINPOINT_SHIFT;
-       vmcrp->pmr  = (vmcr & GICH_VMCR_PRIMASK_MASK) >>
-                       GICH_VMCR_PRIMASK_SHIFT;
+       vmcrp->pmr  = ((vmcr & GICH_VMCR_PRIMASK_MASK) >>
+                       GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT;
 }
 
 void vgic_v2_enable(struct kvm_vcpu *vcpu)
index db28f7c..6cf557e 100644 (file)
@@ -81,11 +81,18 @@ static inline bool irq_is_pending(struct vgic_irq *irq)
                return irq->pending_latch || irq->line_level;
 }
 
+/*
+ * This struct provides an intermediate representation of the fields contained
+ * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC
+ * state to userspace can generate either GICv2 or GICv3 CPU interface
+ * registers regardless of the hardware backed GIC used.
+ */
 struct vgic_vmcr {
        u32     ctlr;
        u32     abpr;
        u32     bpr;
-       u32     pmr;
+       u32     pmr;  /* Priority mask field in the GICC_PMR and
+                      * ICC_PMR_EL1 priority field format */
        /* Below member variable are valid only for GICv3 */
        u32     grpen0;
        u32     grpen1;
@@ -130,6 +137,8 @@ int vgic_v2_map_resources(struct kvm *kvm);
 int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
                             enum vgic_type);
 
+void vgic_v2_init_lrs(void);
+
 static inline void vgic_get_irq_kref(struct vgic_irq *irq)
 {
        if (irq->intid < VGIC_MIN_LPI)