cpu_capacity: capacity of cpuX.
What: /sys/devices/system/cpu/vulnerabilities
+ /sys/devices/system/cpu/vulnerabilities/gather_data_sampling
+ /sys/devices/system/cpu/vulnerabilities/itlb_multihit
+ /sys/devices/system/cpu/vulnerabilities/l1tf
+ /sys/devices/system/cpu/vulnerabilities/mds
/sys/devices/system/cpu/vulnerabilities/meltdown
+ /sys/devices/system/cpu/vulnerabilities/mmio_stale_data
+ /sys/devices/system/cpu/vulnerabilities/retbleed
+ /sys/devices/system/cpu/vulnerabilities/spec_store_bypass
/sys/devices/system/cpu/vulnerabilities/spectre_v1
/sys/devices/system/cpu/vulnerabilities/spectre_v2
- /sys/devices/system/cpu/vulnerabilities/spec_store_bypass
- /sys/devices/system/cpu/vulnerabilities/l1tf
- /sys/devices/system/cpu/vulnerabilities/mds
/sys/devices/system/cpu/vulnerabilities/srbds
/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
- /sys/devices/system/cpu/vulnerabilities/itlb_multihit
- /sys/devices/system/cpu/vulnerabilities/mmio_stale_data
- /sys/devices/system/cpu/vulnerabilities/retbleed
Date: January 2018
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Description: Information about CPU vulnerabilities
/sys/devices/platform/QCOM8061:*/chid
Date: Dec 2015
KernelVersion: 4.4
-Contact: "Sinan Kaya <okaya@codeaurora.org>"
+Contact: "Sinan Kaya <okaya@kernel.org>"
Description:
Contains the ID of the channel within the HIDMA instance.
It is used to associate a given HIDMA channel with the
/sys/devices/platform/QCOM8060:*/chanops/chan*/priority
Date: Nov 2015
KernelVersion: 4.4
-Contact: "Sinan Kaya <okaya@codeaurora.org>"
+Contact: "Sinan Kaya <okaya@kernel.org>"
Description:
Contains either 0 or 1 and indicates if the DMA channel is a
low priority (0) or high priority (1) channel.
/sys/devices/platform/QCOM8060:*/chanops/chan*/weight
Date: Nov 2015
KernelVersion: 4.4
-Contact: "Sinan Kaya <okaya@codeaurora.org>"
+Contact: "Sinan Kaya <okaya@kernel.org>"
Description:
Contains 0..15 and indicates the weight of the channel among
equal priority channels during round robin scheduling.
/sys/devices/platform/QCOM8060:*/chreset_timeout_cycles
Date: Nov 2015
KernelVersion: 4.4
-Contact: "Sinan Kaya <okaya@codeaurora.org>"
+Contact: "Sinan Kaya <okaya@kernel.org>"
Description:
Contains the platform specific cycle value to wait after a
reset command is issued. If the value is chosen too short,
/sys/devices/platform/QCOM8060:*/dma_channels
Date: Nov 2015
KernelVersion: 4.4
-Contact: "Sinan Kaya <okaya@codeaurora.org>"
+Contact: "Sinan Kaya <okaya@kernel.org>"
Description:
Contains the number of dma channels supported by one instance
of HIDMA hardware. The value may change from chip to chip.
/sys/devices/platform/QCOM8060:*/hw_version_major
Date: Nov 2015
KernelVersion: 4.4
-Contact: "Sinan Kaya <okaya@codeaurora.org>"
+Contact: "Sinan Kaya <okaya@kernel.org>"
Description:
Version number major for the hardware.
/sys/devices/platform/QCOM8060:*/hw_version_minor
Date: Nov 2015
KernelVersion: 4.4
-Contact: "Sinan Kaya <okaya@codeaurora.org>"
+Contact: "Sinan Kaya <okaya@kernel.org>"
Description:
Version number minor for the hardware.
/sys/devices/platform/QCOM8060:*/max_rd_xactions
Date: Nov 2015
KernelVersion: 4.4
-Contact: "Sinan Kaya <okaya@codeaurora.org>"
+Contact: "Sinan Kaya <okaya@kernel.org>"
Description:
Contains a value between 0 and 31. Maximum number of
read transactions that can be issued back to back.
/sys/devices/platform/QCOM8060:*/max_read_request
Date: Nov 2015
KernelVersion: 4.4
-Contact: "Sinan Kaya <okaya@codeaurora.org>"
+Contact: "Sinan Kaya <okaya@kernel.org>"
Description:
Size of each read request. The value needs to be a power
of two and can be between 128 and 1024.
/sys/devices/platform/QCOM8060:*/max_wr_xactions
Date: Nov 2015
KernelVersion: 4.4
-Contact: "Sinan Kaya <okaya@codeaurora.org>"
+Contact: "Sinan Kaya <okaya@kernel.org>"
Description:
Contains a value between 0 and 31. Maximum number of
write transactions that can be issued back to back.
/sys/devices/platform/QCOM8060:*/max_write_request
Date: Nov 2015
KernelVersion: 4.4
-Contact: "Sinan Kaya <okaya@codeaurora.org>"
+Contact: "Sinan Kaya <okaya@kernel.org>"
Description:
Size of each write request. The value needs to be a power
of two and can be between 128 and 1024.
--- /dev/null
+.. SPDX-License-Identifier: GPL-2.0
+
+GDS - Gather Data Sampling
+==========================
+
+Gather Data Sampling is a hardware vulnerability which allows unprivileged
+speculative access to data which was previously stored in vector registers.
+
+Problem
+-------
+When a gather instruction performs loads from memory, different data elements
+are merged into the destination vector register. However, when a gather
+instruction that is transiently executed encounters a fault, stale data from
+architectural or internal vector registers may get transiently forwarded to the
+destination vector register instead. This will allow a malicious attacker to
+infer stale data using typical side channel techniques like cache timing
+attacks. GDS is a purely sampling-based attack.
+
+The attacker uses gather instructions to infer the stale vector register data.
+The victim does not need to do anything special other than use the vector
+registers. The victim does not need to use gather instructions to be
+vulnerable.
+
+Because the buffers are shared between Hyper-Threads cross Hyper-Thread attacks
+are possible.
+
+Attack scenarios
+----------------
+Without mitigation, GDS can infer stale data across virtually all
+permission boundaries:
+
+ Non-enclaves can infer SGX enclave data
+ Userspace can infer kernel data
+ Guests can infer data from hosts
+ Guest can infer guest from other guests
+ Users can infer data from other users
+
+Because of this, it is important to ensure that the mitigation stays enabled in
+lower-privilege contexts like guests and when running outside SGX enclaves.
+
+The hardware enforces the mitigation for SGX. Likewise, VMMs should ensure
+that guests are not allowed to disable the GDS mitigation. If a host erred and
+allowed this, a guest could theoretically disable GDS mitigation, mount an
+attack, and re-enable it.
+
+Mitigation mechanism
+--------------------
+This issue is mitigated in microcode. The microcode defines the following new
+bits:
+
+ ================================ === ============================
+ IA32_ARCH_CAPABILITIES[GDS_CTRL] R/O Enumerates GDS vulnerability
+ and mitigation support.
+ IA32_ARCH_CAPABILITIES[GDS_NO] R/O Processor is not vulnerable.
+ IA32_MCU_OPT_CTRL[GDS_MITG_DIS] R/W Disables the mitigation
+ 0 by default.
+ IA32_MCU_OPT_CTRL[GDS_MITG_LOCK] R/W Locks GDS_MITG_DIS=0. Writes
+ to GDS_MITG_DIS are ignored
+ Can't be cleared once set.
+ ================================ === ============================
+
+GDS can also be mitigated on systems that don't have updated microcode by
+disabling AVX. This can be done by setting gather_data_sampling="force" or
+"clearcpuid=avx" on the kernel command-line.
+
+If used, these options will disable AVX use by turning off XSAVE YMM support.
+However, the processor will still enumerate AVX support. Userspace that
+does not follow proper AVX enumeration to check both AVX *and* XSAVE YMM
+support will break.
+
+Mitigation control on the kernel command line
+---------------------------------------------
+The mitigation can be disabled by setting "gather_data_sampling=off" or
+"mitigations=off" on the kernel command line. Not specifying either will default
+to the mitigation being enabled. Specifying "gather_data_sampling=force" will
+use the microcode mitigation when available or disable AVX on affected systems
+where the microcode hasn't been updated to include the mitigation.
+
+GDS System Information
+------------------------
+The kernel provides vulnerability status information through sysfs. For
+GDS this can be accessed by the following sysfs file:
+
+/sys/devices/system/cpu/vulnerabilities/gather_data_sampling
+
+The possible values contained in this file are:
+
+ ============================== =============================================
+ Not affected Processor not vulnerable.
+ Vulnerable Processor vulnerable and mitigation disabled.
+ Vulnerable: No microcode Processor vulnerable and microcode is missing
+ mitigation.
+ Mitigation: AVX disabled,
+ no microcode Processor is vulnerable and microcode is missing
+ mitigation. AVX disabled as mitigation.
+ Mitigation: Microcode Processor is vulnerable and mitigation is in
+ effect.
+ Mitigation: Microcode (locked) Processor is vulnerable and mitigation is in
+ effect and cannot be disabled.
+ Unknown: Dependent on
+ hypervisor status Running on a virtual guest processor that is
+ affected but with no way to know if host
+ processor is mitigated or vulnerable.
+ ============================== =============================================
+
+GDS Default mitigation
+----------------------
+The updated microcode will enable the mitigation by default. The kernel's
+default action is to leave the mitigation enabled.
l1d_flush.rst
processor_mmio_stale_data.rst
cross-thread-rsb.rst
+ srso
+ gather_data_sampling.rst
--- /dev/null
+.. SPDX-License-Identifier: GPL-2.0
+
+Speculative Return Stack Overflow (SRSO)
+========================================
+
+This is a mitigation for the speculative return stack overflow (SRSO)
+vulnerability found on AMD processors. The mechanism is by now the well
+known scenario of poisoning CPU functional units - the Branch Target
+Buffer (BTB) and Return Address Predictor (RAP) in this case - and then
+tricking the elevated privilege domain (the kernel) into leaking
+sensitive data.
+
+AMD CPUs predict RET instructions using a Return Address Predictor (aka
+Return Address Stack/Return Stack Buffer). In some cases, a non-architectural
+CALL instruction (i.e., an instruction predicted to be a CALL but is
+not actually a CALL) can create an entry in the RAP which may be used
+to predict the target of a subsequent RET instruction.
+
+The specific circumstances that lead to this varies by microarchitecture
+but the concern is that an attacker can mis-train the CPU BTB to predict
+non-architectural CALL instructions in kernel space and use this to
+control the speculative target of a subsequent kernel RET, potentially
+leading to information disclosure via a speculative side-channel.
+
+The issue is tracked under CVE-2023-20569.
+
+Affected processors
+-------------------
+
+AMD Zen, generations 1-4. That is, all families 0x17 and 0x19. Older
+processors have not been investigated.
+
+System information and options
+------------------------------
+
+First of all, it is required that the latest microcode be loaded for
+mitigations to be effective.
+
+The sysfs file showing SRSO mitigation status is:
+
+ /sys/devices/system/cpu/vulnerabilities/spec_rstack_overflow
+
+The possible values in this file are:
+
+ - 'Not affected' The processor is not vulnerable
+
+ - 'Vulnerable: no microcode' The processor is vulnerable, no
+ microcode extending IBPB functionality
+ to address the vulnerability has been
+ applied.
+
+ - 'Mitigation: microcode' Extended IBPB functionality microcode
+ patch has been applied. It does not
+ address User->Kernel and Guest->Host
+ transitions protection but it does
+ address User->User and VM->VM attack
+ vectors.
+
+ (spec_rstack_overflow=microcode)
+
+ - 'Mitigation: safe RET' Software-only mitigation. It complements
+ the extended IBPB microcode patch
+ functionality by addressing User->Kernel
+ and Guest->Host transitions protection.
+
+ Selected by default or by
+ spec_rstack_overflow=safe-ret
+
+ - 'Mitigation: IBPB' Similar protection as "safe RET" above
+ but employs an IBPB barrier on privilege
+ domain crossings (User->Kernel,
+ Guest->Host).
+
+ (spec_rstack_overflow=ibpb)
+
+ - 'Mitigation: IBPB on VMEXIT' Mitigation addressing the cloud provider
+ scenario - the Guest->Host transitions
+ only.
+
+ (spec_rstack_overflow=ibpb-vmexit)
+
+In order to exploit vulnerability, an attacker needs to:
+
+ - gain local access on the machine
+
+ - break kASLR
+
+ - find gadgets in the running kernel in order to use them in the exploit
+
+ - potentially create and pin an additional workload on the sibling
+ thread, depending on the microarchitecture (not necessary on fam 0x19)
+
+ - run the exploit
+
+Considering the performance implications of each mitigation type, the
+default one is 'Mitigation: safe RET' which should take care of most
+attack vectors, including the local User->Kernel one.
+
+As always, the user is advised to keep her/his system up-to-date by
+applying software updates regularly.
+
+The default setting will be reevaluated when needed and especially when
+new attack vectors appear.
+
+As one can surmise, 'Mitigation: safe RET' does come at the cost of some
+performance depending on the workload. If one trusts her/his userspace
+and does not want to suffer the performance impact, one can always
+disable the mitigation with spec_rstack_overflow=off.
+
+Similarly, 'Mitigation: IBPB' is another full mitigation type employing
+an indrect branch prediction barrier after having applied the required
+microcode patch for one's system. This mitigation comes also at
+a performance cost.
+
+Mitigation: safe RET
+--------------------
+
+The mitigation works by ensuring all RET instructions speculate to
+a controlled location, similar to how speculation is controlled in the
+retpoline sequence. To accomplish this, the __x86_return_thunk forces
+the CPU to mispredict every function return using a 'safe return'
+sequence.
+
+To ensure the safety of this mitigation, the kernel must ensure that the
+safe return sequence is itself free from attacker interference. In Zen3
+and Zen4, this is accomplished by creating a BTB alias between the
+untraining function srso_untrain_ret_alias() and the safe return
+function srso_safe_ret_alias() which results in evicting a potentially
+poisoned BTB entry and using that safe one for all function returns.
+
+In older Zen1 and Zen2, this is accomplished using a reinterpretation
+technique similar to Retbleed one: srso_untrain_ret() and
+srso_safe_ret().
Format: off | on
default: on
+ gather_data_sampling=
+ [X86,INTEL] Control the Gather Data Sampling (GDS)
+ mitigation.
+
+ Gather Data Sampling is a hardware vulnerability which
+ allows unprivileged speculative access to data which was
+ previously stored in vector registers.
+
+ This issue is mitigated by default in updated microcode.
+ The mitigation may have a performance impact but can be
+ disabled. On systems without the microcode mitigation
+ disabling AVX serves as a mitigation.
+
+ force: Disable AVX to mitigate systems without
+ microcode mitigation. No effect if the microcode
+ mitigation is present. Known to cause crashes in
+ userspace with buggy AVX enumeration.
+
+ off: Disable GDS mitigation.
+
gcov_persist= [GCOV] When non-zero (default), profiling data for
kernel modules is saved and remains accessible via
debugfs, even when the module is unloaded/reloaded.
Disable all optional CPU mitigations. This
improves system performance, but it may also
expose users to several CPU vulnerabilities.
- Equivalent to: nopti [X86,PPC]
- if nokaslr then kpti=0 [ARM64]
- nospectre_v1 [X86,PPC]
- nobp=0 [S390]
- nospectre_v2 [X86,PPC,S390,ARM64]
- spectre_v2_user=off [X86]
- spec_store_bypass_disable=off [X86,PPC]
- ssbd=force-off [ARM64]
- nospectre_bhb [ARM64]
+ Equivalent to: if nokaslr then kpti=0 [ARM64]
+ gather_data_sampling=off [X86]
+ kvm.nx_huge_pages=off [X86]
l1tf=off [X86]
mds=off [X86]
- tsx_async_abort=off [X86]
- kvm.nx_huge_pages=off [X86]
- srbds=off [X86,INTEL]
+ mmio_stale_data=off [X86]
no_entry_flush [PPC]
no_uaccess_flush [PPC]
- mmio_stale_data=off [X86]
+ nobp=0 [S390]
+ nopti [X86,PPC]
+ nospectre_bhb [ARM64]
+ nospectre_v1 [X86,PPC]
+ nospectre_v2 [X86,PPC,S390,ARM64]
retbleed=off [X86]
+ spec_store_bypass_disable=off [X86,PPC]
+ spectre_v2_user=off [X86]
+ srbds=off [X86,INTEL]
+ ssbd=force-off [ARM64]
+ tsx_async_abort=off [X86]
Exceptions:
This does not have any effect on
Not specifying this option is equivalent to
spectre_v2_user=auto.
+ spec_rstack_overflow=
+ [X86] Control RAS overflow mitigation on AMD Zen CPUs
+
+ off - Disable mitigation
+ microcode - Enable microcode mitigation only
+ safe-ret - Enable sw-only safe RET mitigation (default)
+ ibpb - Enable mitigation by issuing IBPB on
+ kernel entry
+ ibpb-vmexit - Issue IBPB only on VMEXIT
+ (cloud-specific mitigation)
+
spec_store_bypass_disable=
[HW] Control Speculative Store Bypass (SSB) Disable mitigation
(Speculative Store Bypass vulnerability)
ARM/MICROCHIP (ARM64) SoC support
M: Conor Dooley <conor@kernel.org>
M: Nicolas Ferre <nicolas.ferre@microchip.com>
-M: Claudiu Beznea <claudiu.beznea@microchip.com>
+M: Claudiu Beznea <claudiu.beznea@tuxon.dev>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Supported
T: git https://git.kernel.org/pub/scm/linux/kernel/git/at91/linux.git
ARM/Microchip (AT91) SoC support
M: Nicolas Ferre <nicolas.ferre@microchip.com>
M: Alexandre Belloni <alexandre.belloni@bootlin.com>
-M: Claudiu Beznea <claudiu.beznea@microchip.com>
+M: Claudiu Beznea <claudiu.beznea@tuxon.dev>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Supported
W: http://www.linux4sam.org
ATMEL MACB ETHERNET DRIVER
M: Nicolas Ferre <nicolas.ferre@microchip.com>
-M: Claudiu Beznea <claudiu.beznea@microchip.com>
+M: Claudiu Beznea <claudiu.beznea@tuxon.dev>
S: Supported
F: drivers/net/ethernet/cadence/
MAPLE TREE
M: Liam R. Howlett <Liam.Howlett@oracle.com>
+L: maple-tree@lists.infradead.org
L: linux-mm@kvack.org
S: Supported
F: Documentation/core-api/maple_tree.rst
F: drivers/spi/spi-at91-usart.c
MICROCHIP AUDIO ASOC DRIVERS
-M: Claudiu Beznea <claudiu.beznea@microchip.com>
+M: Claudiu Beznea <claudiu.beznea@tuxon.dev>
L: alsa-devel@alsa-project.org (moderated for non-subscribers)
S: Supported
F: Documentation/devicetree/bindings/sound/atmel*
F: drivers/crypto/atmel-ecc.*
MICROCHIP EIC DRIVER
-M: Claudiu Beznea <claudiu.beznea@microchip.com>
+M: Claudiu Beznea <claudiu.beznea@tuxon.dev>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Supported
F: Documentation/devicetree/bindings/interrupt-controller/microchip,sama7g5-eic.yaml
F: include/video/atmel_lcdc.h
MICROCHIP MCP16502 PMIC DRIVER
-M: Claudiu Beznea <claudiu.beznea@microchip.com>
+M: Claudiu Beznea <claudiu.beznea@tuxon.dev>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Supported
F: Documentation/devicetree/bindings/regulator/mcp16502-regulator.txt
F: drivers/mtd/nand/raw/atmel/*
MICROCHIP OTPC DRIVER
-M: Claudiu Beznea <claudiu.beznea@microchip.com>
+M: Claudiu Beznea <claudiu.beznea@tuxon.dev>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Supported
F: Documentation/devicetree/bindings/nvmem/microchip,sama7g5-otpc.yaml
F: drivers/fpga/microchip-spi.c
MICROCHIP PWM DRIVER
-M: Claudiu Beznea <claudiu.beznea@microchip.com>
+M: Claudiu Beznea <claudiu.beznea@tuxon.dev>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
L: linux-pwm@vger.kernel.org
S: Supported
F: include/dt-bindings/iio/adc/at91-sama5d2_adc.h
MICROCHIP SAMA5D2-COMPATIBLE SHUTDOWN CONTROLLER
-M: Claudiu Beznea <claudiu.beznea@microchip.com>
+M: Claudiu Beznea <claudiu.beznea@tuxon.dev>
S: Supported
F: Documentation/devicetree/bindings/power/reset/atmel,sama5d2-shdwc.yaml
F: drivers/power/reset/at91-sama5d2_shdwc.c
F: drivers/spi/spi-atmel.*
MICROCHIP SSC DRIVER
-M: Claudiu Beznea <claudiu.beznea@microchip.com>
+M: Claudiu Beznea <claudiu.beznea@tuxon.dev>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Supported
F: Documentation/devicetree/bindings/misc/atmel-ssc.txt
MICROCHIP WILC1000 WIFI DRIVER
M: Ajay Singh <ajay.kathat@microchip.com>
-M: Claudiu Beznea <claudiu.beznea@microchip.com>
+M: Claudiu Beznea <claudiu.beznea@tuxon.dev>
L: linux-wireless@vger.kernel.org
S: Supported
F: drivers/net/wireless/microchip/wilc1000/
PCI DRIVER FOR SYNOPSYS DESIGNWARE
M: Jingoo Han <jingoohan1@gmail.com>
M: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
+M: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
L: linux-pci@vger.kernel.org
S: Maintained
F: Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml
L: linux-wireless@vger.kernel.org
S: Orphan
W: https://wireless.wiki.kernel.org/
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-testing.git
F: drivers/net/wireless/realtek/rtl818x/rtl8180/
RTL8187 WIRELESS DRIVER
-M: Herton Ronaldo Krzesinski <herton@canonical.com>
-M: Hin-Tak Leung <htl10@users.sourceforge.net>
+M: Hin-Tak Leung <hintak.leung@gmail.com>
M: Larry Finger <Larry.Finger@lwfinger.net>
L: linux-wireless@vger.kernel.org
S: Maintained
W: https://wireless.wiki.kernel.org/
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-testing.git
F: drivers/net/wireless/realtek/rtl818x/rtl8187/
RTL8XXXU WIRELESS DRIVER (rtl8xxxu)
.Lskip_hcrx_\@:
.endm
+/* Check if running in host at EL2 mode, i.e., (h)VHE. Jump to fail if not. */
+.macro __check_hvhe fail, tmp
+ mrs \tmp, hcr_el2
+ and \tmp, \tmp, #HCR_E2H
+ cbz \tmp, \fail
+.endm
+
/*
* Allow Non-secure EL1 and EL0 to access physical timer and counter.
* This is not necessary for VHE, since the host kernel runs in EL2,
*/
.macro __init_el2_timers
mov x0, #3 // Enable EL1 physical timers
- mrs x1, hcr_el2
- and x1, x1, #HCR_E2H
- cbz x1, .LnVHE_\@
+ __check_hvhe .LnVHE_\@, x1
lsl x0, x0, #10
.LnVHE_\@:
msr cnthctl_el2, x0
/* Coprocessor traps */
.macro __init_el2_cptr
- mrs x1, hcr_el2
- and x1, x1, #HCR_E2H
- cbz x1, .LnVHE_\@
+ __check_hvhe .LnVHE_\@, x1
mov x0, #(CPACR_EL1_FPEN_EL1EN | CPACR_EL1_FPEN_EL0EN)
- b .Lset_cptr_\@
+ msr cpacr_el1, x0
+ b .Lskip_set_cptr_\@
.LnVHE_\@:
mov x0, #0x33ff
-.Lset_cptr_\@:
msr cptr_el2, x0 // Disable copro. traps to EL2
+.Lskip_set_cptr_\@:
.endm
/* Disable any fine grained traps */
check_override id_aa64pfr0, ID_AA64PFR0_EL1_SVE_SHIFT, .Linit_sve_\@, .Lskip_sve_\@, x1, x2
.Linit_sve_\@: /* SVE register access */
- mrs x0, cptr_el2 // Disable SVE traps
- mrs x1, hcr_el2
- and x1, x1, #HCR_E2H
- cbz x1, .Lcptr_nvhe_\@
+ __check_hvhe .Lcptr_nvhe_\@, x1
- // VHE case
+ // (h)VHE case
+ mrs x0, cpacr_el1 // Disable SVE traps
orr x0, x0, #(CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN)
- b .Lset_cptr_\@
+ msr cpacr_el1, x0
+ b .Lskip_set_cptr_\@
.Lcptr_nvhe_\@: // nVHE case
+ mrs x0, cptr_el2 // Disable SVE traps
bic x0, x0, #CPTR_EL2_TZ
-.Lset_cptr_\@:
msr cptr_el2, x0
+.Lskip_set_cptr_\@:
isb
mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector
msr_s SYS_ZCR_EL2, x1 // length for EL1.
check_override id_aa64pfr1, ID_AA64PFR1_EL1_SME_SHIFT, .Linit_sme_\@, .Lskip_sme_\@, x1, x2
.Linit_sme_\@: /* SME register access and priority mapping */
+ __check_hvhe .Lcptr_nvhe_sme_\@, x1
+
+ // (h)VHE case
+ mrs x0, cpacr_el1 // Disable SME traps
+ orr x0, x0, #(CPACR_EL1_SMEN_EL0EN | CPACR_EL1_SMEN_EL1EN)
+ msr cpacr_el1, x0
+ b .Lskip_set_cptr_sme_\@
+
+.Lcptr_nvhe_sme_\@: // nVHE case
mrs x0, cptr_el2 // Disable SME traps
bic x0, x0, #CPTR_EL2_TSM
msr cptr_el2, x0
+.Lskip_set_cptr_sme_\@:
isb
mrs x1, sctlr_el2
asmlinkage void kvm_unexpected_el2_exception(void);
struct kvm_cpu_context;
void handle_trap(struct kvm_cpu_context *host_ctxt);
-asmlinkage void __noreturn kvm_host_psci_cpu_entry(bool is_cpu_on);
+asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on);
void __noreturn __pkvm_init_finalise(void);
void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc);
void kvm_patch_vector_branch(struct alt_instr *alt,
return test_bit(feature, vcpu->arch.features);
}
+static __always_inline void kvm_write_cptr_el2(u64 val)
+{
+ if (has_vhe() || has_hvhe())
+ write_sysreg(val, cpacr_el1);
+ else
+ write_sysreg(val, cptr_el2);
+}
+
static __always_inline u64 kvm_get_reset_cptr_el2(struct kvm_vcpu *vcpu)
{
u64 val;
if (has_vhe()) {
val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN |
CPACR_EL1_ZEN_EL1EN);
+ if (cpus_have_final_cap(ARM64_SME))
+ val |= CPACR_EL1_SMEN_EL1EN;
} else if (has_hvhe()) {
val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN);
+
+ if (!vcpu_has_sve(vcpu) ||
+ (vcpu->arch.fp_state != FP_STATE_GUEST_OWNED))
+ val |= CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN;
+ if (cpus_have_final_cap(ARM64_SME))
+ val |= CPACR_EL1_SMEN_EL1EN | CPACR_EL1_SMEN_EL0EN;
} else {
val = CPTR_NVHE_EL2_RES1;
{
u64 val = kvm_get_reset_cptr_el2(vcpu);
- if (has_vhe() || has_hvhe())
- write_sysreg(val, cpacr_el1);
- else
- write_sysreg(val, cptr_el2);
+ kvm_write_cptr_el2(val);
}
#endif /* __ARM64_KVM_EMULATE_H__ */
static bool vgic_present, kvm_arm_initialised;
-static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
+static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized);
DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
bool is_kvm_arm_initialised(void)
cpu_hyp_init_features();
}
-static void _kvm_arch_hardware_enable(void *discard)
+static void cpu_hyp_init(void *discard)
{
- if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
+ if (!__this_cpu_read(kvm_hyp_initialized)) {
cpu_hyp_reinit();
- __this_cpu_write(kvm_arm_hardware_enabled, 1);
+ __this_cpu_write(kvm_hyp_initialized, 1);
}
}
-int kvm_arch_hardware_enable(void)
+static void cpu_hyp_uninit(void *discard)
{
- int was_enabled;
+ if (__this_cpu_read(kvm_hyp_initialized)) {
+ cpu_hyp_reset();
+ __this_cpu_write(kvm_hyp_initialized, 0);
+ }
+}
+int kvm_arch_hardware_enable(void)
+{
/*
* Most calls to this function are made with migration
* disabled, but not with preemption disabled. The former is
*/
preempt_disable();
- was_enabled = __this_cpu_read(kvm_arm_hardware_enabled);
- _kvm_arch_hardware_enable(NULL);
+ cpu_hyp_init(NULL);
- if (!was_enabled) {
- kvm_vgic_cpu_up();
- kvm_timer_cpu_up();
- }
+ kvm_vgic_cpu_up();
+ kvm_timer_cpu_up();
preempt_enable();
return 0;
}
-static void _kvm_arch_hardware_disable(void *discard)
-{
- if (__this_cpu_read(kvm_arm_hardware_enabled)) {
- cpu_hyp_reset();
- __this_cpu_write(kvm_arm_hardware_enabled, 0);
- }
-}
-
void kvm_arch_hardware_disable(void)
{
- if (__this_cpu_read(kvm_arm_hardware_enabled)) {
- kvm_timer_cpu_down();
- kvm_vgic_cpu_down();
- }
+ kvm_timer_cpu_down();
+ kvm_vgic_cpu_down();
if (!is_protected_kvm_enabled())
- _kvm_arch_hardware_disable(NULL);
+ cpu_hyp_uninit(NULL);
}
#ifdef CONFIG_CPU_PM
void *v)
{
/*
- * kvm_arm_hardware_enabled is left with its old value over
+ * kvm_hyp_initialized is left with its old value over
* PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
* re-enable hyp.
*/
switch (cmd) {
case CPU_PM_ENTER:
- if (__this_cpu_read(kvm_arm_hardware_enabled))
+ if (__this_cpu_read(kvm_hyp_initialized))
/*
- * don't update kvm_arm_hardware_enabled here
- * so that the hardware will be re-enabled
+ * don't update kvm_hyp_initialized here
+ * so that the hyp will be re-enabled
* when we resume. See below.
*/
cpu_hyp_reset();
return NOTIFY_OK;
case CPU_PM_ENTER_FAILED:
case CPU_PM_EXIT:
- if (__this_cpu_read(kvm_arm_hardware_enabled))
- /* The hardware was enabled before suspend. */
+ if (__this_cpu_read(kvm_hyp_initialized))
+ /* The hyp was enabled before suspend. */
cpu_hyp_reinit();
return NOTIFY_OK;
/*
* Enable hardware so that subsystem initialisation can access EL2.
*/
- on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);
+ on_each_cpu(cpu_hyp_init, NULL, 1);
/*
* Register CPU lower-power notifier
hyp_cpu_pm_exit();
if (err || !is_protected_kvm_enabled())
- on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
+ on_each_cpu(cpu_hyp_uninit, NULL, 1);
return err;
}
* The stub hypercalls are now disabled, so set our local flag to
* prevent a later re-init attempt in kvm_arch_hardware_enable().
*/
- __this_cpu_write(kvm_arm_hardware_enabled, 1);
+ __this_cpu_write(kvm_hyp_initialized, 1);
preempt_enable();
return ret;
*/
val &= ~(TCR_HD | TCR_HA);
write_sysreg_el1(val, SYS_TCR);
+ __kvm_skip_instr(vcpu);
return true;
}
if (res.a0 == FFA_RET_NOT_SUPPORTED)
return 0;
- if (res.a0 != FFA_VERSION_1_0)
+ /*
+ * Firmware returns the maximum supported version of the FF-A
+ * implementation. Check that the returned version is
+ * backwards-compatible with the hyp according to the rules in DEN0077A
+ * v1.1 REL0 13.2.1.
+ *
+ * Of course, things are never simple when dealing with firmware. v1.1
+ * broke ABI with v1.0 on several structures, which is itself
+ * incompatible with the aforementioned versioning scheme. The
+ * expectation is that v1.x implementations that do not support the v1.0
+ * ABI return NOT_SUPPORTED rather than a version number, according to
+ * DEN0077A v1.1 REL0 18.6.4.
+ */
+ if (FFA_MAJOR_VERSION(res.a0) != 1)
return -EOPNOTSUPP;
arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res);
__activate_traps_fpsimd32(vcpu);
}
- write_sysreg(val, cptr_el2);
+ kvm_write_cptr_el2(val);
write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2);
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
#
config LIGHTWEIGHT_SPINLOCK_CHECK
bool "Enable lightweight spinlock checks"
- depends on SMP && !DEBUG_SPINLOCK
+ depends on DEBUG_KERNEL && SMP && !DEBUG_SPINLOCK
default y
help
Add checks with low performance impact to the spinlock functions
return NULL;
}
-int puts(const char *s)
+static int puts(const char *s)
{
const char *nuline = s;
return 0;
}
-int printf(const char *fmt, ...)
+static int printf(const char *fmt, ...)
{
va_list args;
int i = 0;
}
#undef malloc
-void *malloc(size_t size)
+static void *malloc(size_t size)
{
return malloc_gzip(size);
}
#undef free
-void free(void *ptr)
+static void free(void *ptr)
{
return free_gzip(ptr);
}
free(phdrs);
}
-unsigned long decompress_kernel(unsigned int started_wide,
+asmlinkage unsigned long __visible decompress_kernel(unsigned int started_wide,
unsigned int command_line,
const unsigned int rd_start,
const unsigned int rd_end)
#define dma_outb outb
#define dma_inb inb
+extern unsigned long pcxl_dma_start;
+
/*
** DMA_CHUNK_SIZE is used by the SCSI mid-layer to break up
** (or rather not merge) DMAs into manageable chunks.
extern unsigned long sys_call_table[];
extern unsigned long return_address(unsigned int);
+struct ftrace_regs;
+extern void ftrace_function_trampoline(unsigned long parent,
+ unsigned long self_addr, unsigned long org_sp_gr3,
+ struct ftrace_regs *fregs);
#ifdef CONFIG_DYNAMIC_FTRACE
extern void ftrace_caller(void);
#include <asm/processor.h>
#include <asm/spinlock_types.h>
-#define SPINLOCK_BREAK_INSN 0x0000c006 /* break 6,6 */
-
static inline void arch_spin_val_check(int lock_val)
{
if (IS_ENABLED(CONFIG_LIGHTWEIGHT_SPINLOCK_CHECK))
#define __ARCH_SPIN_LOCK_UNLOCKED_VAL 0x1a46
+#define SPINLOCK_BREAK_INSN 0x0000c006 /* break 6,6 */
+
+#ifndef __ASSEMBLY__
+
typedef struct {
#ifdef CONFIG_PA20
volatile unsigned int slock;
volatile unsigned int counter;
} arch_rwlock_t;
+#endif /* __ASSEMBLY__ */
+
#define __ARCH_RW_LOCK_UNLOCKED__ 0x01000000
#define __ARCH_RW_LOCK_UNLOCKED { .lock_mutex = __ARCH_SPIN_LOCK_UNLOCKED, \
.counter = __ARCH_RW_LOCK_UNLOCKED__ }
static DEFINE_SPINLOCK(pdc_lock);
#endif
-unsigned long pdc_result[NUM_PDC_RESULT] __aligned(8);
-unsigned long pdc_result2[NUM_PDC_RESULT] __aligned(8);
+static unsigned long pdc_result[NUM_PDC_RESULT] __aligned(8);
+static unsigned long pdc_result2[NUM_PDC_RESULT] __aligned(8);
#ifdef CONFIG_64BIT
#define WIDE_FIRMWARE 0x1
/**
* pdc_cpu_rendezvous_lock - Lock PDC while transitioning to rendezvous state
*/
-void pdc_cpu_rendezvous_lock(void)
+void pdc_cpu_rendezvous_lock(void) __acquires(&pdc_lock)
{
spin_lock(&pdc_lock);
}
/**
* pdc_cpu_rendezvous_unlock - Unlock PDC after reaching rendezvous state
*/
-void pdc_cpu_rendezvous_unlock(void)
+void pdc_cpu_rendezvous_unlock(void) __releases(&pdc_lock)
{
spin_unlock(&pdc_lock);
}
static ftrace_func_t ftrace_func;
-void notrace __hot ftrace_function_trampoline(unsigned long parent,
+asmlinkage void notrace __hot ftrace_function_trampoline(unsigned long parent,
unsigned long self_addr,
unsigned long org_sp_gr3,
struct ftrace_regs *fregs)
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/syscalls.h>
+#include <linux/libgcc.h>
#include <linux/string.h>
EXPORT_SYMBOL(memset);
EXPORT_SYMBOL($$divI_14);
EXPORT_SYMBOL($$divI_15);
-extern void __ashrdi3(void);
-extern void __ashldi3(void);
-extern void __lshrdi3(void);
-extern void __muldi3(void);
-extern void __ucmpdi2(void);
-
EXPORT_SYMBOL(__ashrdi3);
EXPORT_SYMBOL(__ashldi3);
EXPORT_SYMBOL(__lshrdi3);
static unsigned long pcxl_used_bytes __read_mostly;
static unsigned long pcxl_used_pages __read_mostly;
-extern unsigned long pcxl_dma_start; /* Start of pcxl dma mapping area */
+unsigned long pcxl_dma_start __ro_after_init; /* pcxl dma mapping area start */
static DEFINE_SPINLOCK(pcxl_res_lock);
static char *pcxl_res_map;
static int pcxl_res_hint;
pcxl_res_map = (char *)__get_free_pages(GFP_KERNEL,
get_order(pcxl_res_size));
memset(pcxl_res_map, 0, pcxl_res_size);
- proc_gsc_root = proc_mkdir("gsc", NULL);
+ proc_gsc_root = proc_mkdir("bus/gsc", NULL);
if (!proc_gsc_root)
printk(KERN_WARNING
"pcxl_dma_init: Unable to create gsc /proc dir entry\n");
return -ENODEV;
kpdtd_task = kthread_run(pdt_mainloop, NULL, "kpdtd");
- if (IS_ERR(kpdtd_task))
- return PTR_ERR(kpdtd_task);
- return 0;
+ return PTR_ERR_OR_ZERO(kpdtd_task);
}
late_initcall(pdt_initcall);
static int perf_processor_interface __read_mostly = UNKNOWN_INTF;
static int perf_enabled __read_mostly;
static DEFINE_SPINLOCK(perf_lock);
-struct parisc_device *cpu_device __read_mostly;
+static struct parisc_device *cpu_device __read_mostly;
/* RDRs to write for PCX-W */
static const int perf_rdrs_W[] =
#include <asm/processor.h>
#include <asm/page.h>
#include <asm/pdc.h>
+#include <asm/smp.h>
#include <asm/pdcpat.h>
#include <asm/irq.h> /* for struct irq_region */
#include <asm/parisc-device.h>
static char __initdata command_line[COMMAND_LINE_SIZE];
-/* Intended for ccio/sba/cpu statistics under /proc/bus/{runway|gsc} */
-struct proc_dir_entry * proc_runway_root __read_mostly = NULL;
-struct proc_dir_entry * proc_gsc_root __read_mostly = NULL;
-struct proc_dir_entry * proc_mckinley_root __read_mostly = NULL;
-
static void __init setup_cmdline(char **cmdline_p)
{
extern unsigned int boot_args[];
.show = show_cpuinfo
};
-static void __init parisc_proc_mkdir(void)
-{
- /*
- ** Can't call proc_mkdir() until after proc_root_init() has been
- ** called by start_kernel(). In other words, this code can't
- ** live in arch/.../setup.c because start_parisc() calls
- ** start_kernel().
- */
- switch (boot_cpu_data.cpu_type) {
- case pcxl:
- case pcxl2:
- if (NULL == proc_gsc_root)
- {
- proc_gsc_root = proc_mkdir("bus/gsc", NULL);
- }
- break;
- case pcxt_:
- case pcxu:
- case pcxu_:
- case pcxw:
- case pcxw_:
- case pcxw2:
- if (NULL == proc_runway_root)
- {
- proc_runway_root = proc_mkdir("bus/runway", NULL);
- }
- break;
- case mako:
- case mako2:
- if (NULL == proc_mckinley_root)
- {
- proc_mckinley_root = proc_mkdir("bus/mckinley", NULL);
- }
- break;
- default:
- /* FIXME: this was added to prevent the compiler
- * complaining about missing pcx, pcxs and pcxt
- * I'm assuming they have neither gsc nor runway */
- break;
- }
-}
-
static struct resource central_bus = {
.name = "Central Bus",
.start = F_EXTEND(0xfff80000),
{
u32 osid = (OS_ID_LINUX << 16);
- parisc_proc_mkdir();
parisc_init_resources();
do_device_inventory(); /* probe for hardware */
regs->gr[31] -= 8; /* delayed branching */
/* Get assembler opcode of code in delay branch */
- uaddr = (unsigned int *) ((regs->gr[31] & ~3) + 4);
+ uaddr = (u32 __user *) ((regs->gr[31] & ~3) + 4);
err = get_user(opcode, uaddr);
if (err)
return;
#include <linux/elf-randomize.h>
/*
- * Construct an artificial page offset for the mapping based on the virtual
+ * Construct an artificial page offset for the mapping based on the physical
* address of the kernel file mapping variable.
- * If filp is zero the calculated pgoff value aliases the memory of the given
- * address. This is useful for io_uring where the mapping shall alias a kernel
- * address and a userspace adress where both the kernel and the userspace
- * access the same memory region.
*/
-#define GET_FILP_PGOFF(filp, addr) \
- ((filp ? (((unsigned long) filp->f_mapping) >> 8) \
- & ((SHM_COLOUR-1) >> PAGE_SHIFT) : 0UL) \
- + (addr >> PAGE_SHIFT))
+#define GET_FILP_PGOFF(filp) \
+ (filp ? (((unsigned long) filp->f_mapping) >> 8) \
+ & ((SHM_COLOUR-1) >> PAGE_SHIFT) : 0UL)
static unsigned long shared_align_offset(unsigned long filp_pgoff,
unsigned long pgoff)
do_color_align = 0;
if (filp || (flags & MAP_SHARED))
do_color_align = 1;
- filp_pgoff = GET_FILP_PGOFF(filp, addr);
+ filp_pgoff = GET_FILP_PGOFF(filp);
if (flags & MAP_FIXED) {
/* Even MAP_FIXED mappings must reside within TASK_SIZE */
#include <asm/assembly.h>
#include <asm/processor.h>
#include <asm/cache.h>
+#include <asm/spinlock_types.h>
#include <linux/linkage.h>
stw \reg1, 0(%sr2,\reg2)
.endm
+ /* raise exception if spinlock content is not zero or
+ * __ARCH_SPIN_LOCK_UNLOCKED_VAL */
+ .macro spinlock_check spin_val,tmpreg
+#ifdef CONFIG_LIGHTWEIGHT_SPINLOCK_CHECK
+ ldi __ARCH_SPIN_LOCK_UNLOCKED_VAL, \tmpreg
+ andcm,= \spin_val, \tmpreg, %r0
+ .word SPINLOCK_BREAK_INSN
+#endif
+ .endm
+
.text
.import syscall_exit,code
lws_exit_noerror:
lws_pagefault_enable %r1,%r21
- stw,ma %r20, 0(%sr2,%r20)
+ ldi __ARCH_SPIN_LOCK_UNLOCKED_VAL, %r21
+ stw,ma %r21, 0(%sr2,%r20)
ssm PSW_SM_I, %r0
b lws_exit
copy %r0, %r21
lws_pagefault:
lws_pagefault_enable %r1,%r21
- stw,ma %r20, 0(%sr2,%r20)
+ ldi __ARCH_SPIN_LOCK_UNLOCKED_VAL, %r21
+ stw,ma %r21, 0(%sr2,%r20)
ssm PSW_SM_I, %r0
ldo 3(%r0),%r28
b lws_exit
/* Try to acquire the lock */
LDCW 0(%sr2,%r20), %r28
+ spinlock_check %r28, %r21
comclr,<> %r0, %r28, %r0
b,n lws_wouldblock
/* Try to acquire the lock */
LDCW 0(%sr2,%r20), %r28
+ spinlock_check %r28, %r21
comclr,<> %r0, %r28, %r0
b,n lws_wouldblock
/* Try to acquire the lock */
LDCW 0(%sr2,%r20), %r28
+ spinlock_check %r28, %r21
comclr,<> %r0, %r28, %r0
b,n lws_wouldblock
/* Try to acquire the lock */
LDCW 0(%sr2,%r20), %r28
+ spinlock_check %r28, %r21
comclr,<> %r0, %r28, %r0
b,n lws_wouldblock
/* lws locks */
.rept 256
/* Keep locks aligned at 16-bytes */
- .word 1
+ .word __ARCH_SPIN_LOCK_UNLOCKED_VAL
.word 0
.word 0
.word 0
#include <linux/signal.h>
#include <linux/ratelimit.h>
#include <linux/uaccess.h>
+#include <linux/sysctl.h>
#include <asm/unaligned.h>
#include <asm/hardirq.h>
#include <asm/traps.h>
// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
+#include <linux/libgcc.h>
union ull_union {
unsigned long long ull;
} ui;
};
-int __ucmpdi2(unsigned long long a, unsigned long long b)
+word_type __ucmpdi2(unsigned long long a, unsigned long long b)
{
union ull_union au = {.ull = a};
union ull_union bu = {.ull = b};
* For implementation see handle_interruption() in traps.c
*/
static const char * const trap_description[] = {
- [1] "High-priority machine check (HPMC)",
- [2] "Power failure interrupt",
- [3] "Recovery counter trap",
- [5] "Low-priority machine check",
- [6] "Instruction TLB miss fault",
- [7] "Instruction access rights / protection trap",
- [8] "Illegal instruction trap",
- [9] "Break instruction trap",
- [10] "Privileged operation trap",
- [11] "Privileged register trap",
- [12] "Overflow trap",
- [13] "Conditional trap",
- [14] "FP Assist Exception trap",
- [15] "Data TLB miss fault",
- [16] "Non-access ITLB miss fault",
- [17] "Non-access DTLB miss fault",
- [18] "Data memory protection/unaligned access trap",
- [19] "Data memory break trap",
- [20] "TLB dirty bit trap",
- [21] "Page reference trap",
- [22] "Assist emulation trap",
- [25] "Taken branch trap",
- [26] "Data memory access rights trap",
- [27] "Data memory protection ID trap",
- [28] "Unaligned data reference trap",
+ [1] = "High-priority machine check (HPMC)",
+ [2] = "Power failure interrupt",
+ [3] = "Recovery counter trap",
+ [5] = "Low-priority machine check",
+ [6] = "Instruction TLB miss fault",
+ [7] = "Instruction access rights / protection trap",
+ [8] = "Illegal instruction trap",
+ [9] = "Break instruction trap",
+ [10] = "Privileged operation trap",
+ [11] = "Privileged register trap",
+ [12] = "Overflow trap",
+ [13] = "Conditional trap",
+ [14] = "FP Assist Exception trap",
+ [15] = "Data TLB miss fault",
+ [16] = "Non-access ITLB miss fault",
+ [17] = "Non-access DTLB miss fault",
+ [18] = "Data memory protection/unaligned access trap",
+ [19] = "Data memory break trap",
+ [20] = "TLB dirty bit trap",
+ [21] = "Page reference trap",
+ [22] = "Assist emulation trap",
+ [25] = "Taken branch trap",
+ [26] = "Data memory access rights trap",
+ [27] = "Data memory protection ID trap",
+ [28] = "Unaligned data reference trap",
};
const char *trap_name(unsigned long code)
void *parisc_vmalloc_start __ro_after_init;
EXPORT_SYMBOL(parisc_vmalloc_start);
-#ifdef CONFIG_PA11
-unsigned long pcxl_dma_start __ro_after_init;
-#endif
-
void __init mem_init(void)
{
/* Do sanity checks on IPC (compat) structures */
*/
void __iomem *ioremap(unsigned long phys_addr, unsigned long size)
{
- void __iomem *addr;
+ uintptr_t addr;
struct vm_struct *area;
unsigned long offset, last_addr;
pgprot_t pgprot;
if (!area)
return NULL;
- addr = (void __iomem *) area->addr;
- if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
- phys_addr, pgprot)) {
- vunmap(addr);
+ addr = (uintptr_t) area->addr;
+ if (ioremap_page_range(addr, addr + size, phys_addr, pgprot)) {
+ vunmap(area->addr);
return NULL;
}
#define flush_icache_user_page(vma, pg, addr, len) \
flush_icache_mm(vma->vm_mm, 0)
+#ifdef CONFIG_64BIT
+#define flush_cache_vmap(start, end) flush_tlb_kernel_range(start, end)
+#endif
+
#ifndef CONFIG_SMP
#define flush_icache_all() local_flush_icache_all()
* Relaxed I/O memory access primitives. These follow the Device memory
* ordering rules but do not guarantee any ordering relative to Normal memory
* accesses. These are defined to order the indicated access (either a read or
- * write) with all other I/O memory accesses. Since the platform specification
- * defines that all I/O regions are strongly ordered on channel 2, no explicit
- * fences are required to enforce this ordering.
+ * write) with all other I/O memory accesses to the same peripheral. Since the
+ * platform specification defines that all I/O regions are strongly ordered on
+ * channel 0, no explicit fences are required to enforce this ordering.
*/
/* FIXME: These are now the same as asm-generic */
#define __io_rbr() do {} while (0)
#endif
/*
- * I/O memory access primitives. Reads are ordered relative to any
- * following Normal memory access. Writes are ordered relative to any prior
- * Normal memory access. The memory barriers here are necessary as RISC-V
+ * I/O memory access primitives. Reads are ordered relative to any following
+ * Normal memory read and delay() loop. Writes are ordered relative to any
+ * prior Normal memory write. The memory barriers here are necessary as RISC-V
* doesn't define any ordering between the memory space and the I/O space.
*/
#define __io_br() do {} while (0)
-#define __io_ar(v) __asm__ __volatile__ ("fence i,r" : : : "memory")
-#define __io_bw() __asm__ __volatile__ ("fence w,o" : : : "memory")
+#define __io_ar(v) ({ __asm__ __volatile__ ("fence i,ir" : : : "memory"); })
+#define __io_bw() ({ __asm__ __volatile__ ("fence w,o" : : : "memory"); })
#define __io_aw() mmiowb_set_pending()
#define readb(c) ({ u8 __v; __io_br(); __v = readb_cpu(c); __io_ar(__v); __v; })
#define PAGE_KERNEL_IO __pgprot(_PAGE_IOREMAP)
extern pgd_t swapper_pg_dir[];
+extern pgd_t trampoline_pg_dir[];
+extern pgd_t early_pg_dir[];
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline int pmd_present(pmd_t pmd)
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+extern bool pgtable_l4_enabled, pgtable_l5_enabled;
+
#define IOREMAP_MAX_ORDER (PUD_SHIFT)
#define arch_vmap_pud_supported arch_vmap_pud_supported
static inline bool arch_vmap_pud_supported(pgprot_t prot)
{
- return true;
+ return pgtable_l4_enabled || pgtable_l5_enabled;
}
#define arch_vmap_pmd_supported arch_vmap_pmd_supported
#include <asm/smp.h>
#include <asm/pgtable.h>
+bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
+{
+ return phys_id == cpuid_to_hartid_map(cpu);
+}
+
/*
* Returns the hart ID of the given device tree node, or -ENODEV if the node
* isn't an enabled and valid RISC-V hart node.
kbuf.buffer = initrd;
kbuf.bufsz = kbuf.memsz = initrd_len;
kbuf.buf_align = PAGE_SIZE;
- kbuf.top_down = false;
+ kbuf.top_down = true;
kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
ret = kexec_add_buffer(&kbuf);
if (ret)
* sym, instead of searching the whole relsec.
*/
case R_RISCV_PCREL_HI20:
+ case R_RISCV_CALL_PLT:
case R_RISCV_CALL:
*(u64 *)loc = CLEAN_IMM(UITYPE, *(u64 *)loc) |
ENCODE_UJTYPE_IMM(val - addr);
return -ENOENT;
}
-bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
-{
- return phys_id == cpuid_to_hartid_map(cpu);
-}
-
static void ipi_stop(void)
{
set_cpu_online(smp_processor_id(), false);
#include <linux/kfence.h>
#include <asm/fixmap.h>
-#include <asm/tlbflush.h>
-#include <asm/sections.h>
-#include <asm/soc.h>
#include <asm/io.h>
-#include <asm/ptdump.h>
#include <asm/numa.h>
+#include <asm/pgtable.h>
+#include <asm/ptdump.h>
+#include <asm/sections.h>
+#include <asm/soc.h>
+#include <asm/tlbflush.h>
#include "../kernel/head.h"
memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start);
phys_ram_end = memblock_end_of_DRAM();
+
+ /*
+ * Make sure we align the start of the memory on a PMD boundary so that
+ * at worst, we map the linear mapping with PMD mappings.
+ */
if (!IS_ENABLED(CONFIG_XIP_KERNEL))
- phys_ram_base = memblock_start_of_DRAM();
+ phys_ram_base = memblock_start_of_DRAM() & PMD_MASK;
/*
* In 64-bit, any use of __va/__pa before this point is wrong as we
* region is not and then we have to go down to the PUD level.
*/
-extern pgd_t early_pg_dir[PTRS_PER_PGD];
pgd_t tmp_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
p4d_t tmp_p4d[PTRS_PER_P4D] __page_aligned_bss;
pud_t tmp_pud[PTRS_PER_PUD] __page_aligned_bss;
This mitigates both spectre_v2 and retbleed at great cost to
performance.
+config CPU_SRSO
+ bool "Mitigate speculative RAS overflow on AMD"
+ depends on CPU_SUP_AMD && X86_64 && RETHUNK
+ default y
+ help
+ Enable the SRSO mitigation needed on AMD Zen1-4 machines.
+
config SLS
bool "Mitigate Straight-Line-Speculation"
depends on CC_HAS_SLS && X86_64
against straight line speculation. The kernel image might be slightly
larger.
+config GDS_FORCE_MITIGATION
+ bool "Force GDS Mitigation"
+ depends on CPU_SUP_INTEL
+ default n
+ help
+ Gather Data Sampling (GDS) is a hardware vulnerability which allows
+ unprivileged speculative access to data which was previously stored in
+ vector registers.
+
+ This option is equivalent to setting gather_data_sampling=force on the
+ command line. The microcode mitigation is used if present, otherwise
+ AVX is disabled as a mitigation. On affected systems that are missing
+ the microcode any userspace code that unconditionally uses AVX will
+ break with this option set.
+
+ Setting this option on systems not vulnerable to GDS has no effect.
+
+ If in doubt, say N.
+
endif
config ARCH_HAS_ADD_PAGES
#include <asm/mpspec.h>
#include <asm/x86_init.h>
#include <asm/cpufeature.h>
+#include <asm/irq_vectors.h>
#ifdef CONFIG_ACPI_APEI
# include <asm/pgtable_types.h>
extern int acpi_use_timer_override;
extern int acpi_fix_pin2_polarity;
extern int acpi_disable_cmcff;
+extern bool acpi_int_src_ovr[NR_IRQS_LEGACY];
extern u8 acpi_sci_flags;
extern u32 acpi_sci_override_gsi;
* Defines x86 CPU feature bits
*/
#define NCAPINTS 21 /* N 32-bit words worth of info */
-#define NBUGINTS 1 /* N 32-bit bug flags */
+#define NBUGINTS 2 /* N 32-bit bug flags */
/*
* Note: If the comment begins with a quoted string, that string is used
#define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */
#define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */
+#define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */
+#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */
+#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */
+
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */
#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */
#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* "" SMM_CTL MSR is not present */
+#define X86_FEATURE_SBPB (20*32+27) /* "" Selective Branch Prediction Barrier */
+#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* "" MSR_PRED_CMD[IBPB] flushes all branch type predictions */
+#define X86_FEATURE_SRSO_NO (20*32+29) /* "" CPU is not affected by SRSO */
+
/*
* BUG word(s)
*/
#define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */
#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */
#define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */
+#define X86_BUG_GDS X86_BUG(30) /* CPU is affected by Gather Data Sampling */
+/* BUG word 2 */
+#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */
+#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */
#endif /* _ASM_X86_CPUFEATURES_H */
#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */
+#define PRED_CMD_SBPB BIT(7) /* Selective Branch Prediction Barrier */
#define MSR_PPIN_CTL 0x0000004e
#define MSR_PPIN 0x0000004f
* Not susceptible to Post-Barrier
* Return Stack Buffer Predictions.
*/
+#define ARCH_CAP_GDS_CTRL BIT(25) /*
+ * CPU is vulnerable to Gather
+ * Data Sampling (GDS) and
+ * has controls for mitigation.
+ */
+#define ARCH_CAP_GDS_NO BIT(26) /*
+ * CPU is not vulnerable to Gather
+ * Data Sampling (GDS).
+ */
#define ARCH_CAP_XAPIC_DISABLE BIT(21) /*
* IA32_XAPIC_DISABLE_STATUS MSR
#define RNGDS_MITG_DIS BIT(0) /* SRBDS support */
#define RTM_ALLOW BIT(1) /* TSX development mode */
#define FB_CLEAR_DIS BIT(3) /* CPU Fill buffer clear disable */
+#define GDS_MITG_DIS BIT(4) /* Disable GDS mitigation */
+#define GDS_MITG_LOCKED BIT(5) /* GDS mitigation locked */
#define MSR_IA32_SYSENTER_CS 0x00000174
#define MSR_IA32_SYSENTER_ESP 0x00000175
* eventually turn into it's own annotation.
*/
.macro VALIDATE_UNRET_END
-#if defined(CONFIG_NOINSTR_VALIDATION) && defined(CONFIG_CPU_UNRET_ENTRY)
+#if defined(CONFIG_NOINSTR_VALIDATION) && \
+ (defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO))
ANNOTATE_RETPOLINE_SAFE
nop
#endif
*/
.macro UNTRAIN_RET
#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
- defined(CONFIG_CALL_DEPTH_TRACKING)
+ defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO)
VALIDATE_UNRET_END
ALTERNATIVE_3 "", \
CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \
"call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \
__stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
#endif
+
+#ifdef CONFIG_CPU_SRSO
+ ALTERNATIVE_2 "", "call srso_untrain_ret", X86_FEATURE_SRSO, \
+ "call srso_untrain_ret_alias", X86_FEATURE_SRSO_ALIAS
+#endif
.endm
.macro UNTRAIN_RET_FROM_CALL
"call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \
__stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH
#endif
+
+#ifdef CONFIG_CPU_SRSO
+ ALTERNATIVE_2 "", "call srso_untrain_ret", X86_FEATURE_SRSO, \
+ "call srso_untrain_ret_alias", X86_FEATURE_SRSO_ALIAS
+#endif
.endm
extern void __x86_return_thunk(void);
extern void zen_untrain_ret(void);
+extern void srso_untrain_ret(void);
+extern void srso_untrain_ret_alias(void);
extern void entry_ibpb(void);
#ifdef CONFIG_CALL_THUNKS
: "memory");
}
+extern u64 x86_pred_cmd;
+
static inline void indirect_branch_prediction_barrier(void)
{
- u64 val = PRED_CMD_IBPB;
-
- alternative_msr_write(MSR_IA32_PRED_CMD, val, X86_FEATURE_USE_IBPB);
+ alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_USE_IBPB);
}
/* The Intel SPEC CTRL MSR base value cache */
#ifdef CONFIG_CPU_SUP_AMD
extern u32 amd_get_nodes_per_socket(void);
extern u32 amd_get_highest_perf(void);
+extern bool cpu_has_ibpb_brtype_microcode(void);
+extern void amd_clear_divider(void);
#else
static inline u32 amd_get_nodes_per_socket(void) { return 0; }
static inline u32 amd_get_highest_perf(void) { return 0; }
+static inline bool cpu_has_ibpb_brtype_microcode(void) { return false; }
+static inline void amd_clear_divider(void) { }
#endif
extern unsigned long arch_align_stack(unsigned long sp);
int acpi_ioapic;
int acpi_strict;
int acpi_disable_cmcff;
+bool acpi_int_src_ovr[NR_IRQS_LEGACY];
/* ACPI SCI override configuration */
u8 acpi_sci_flags __initdata;
acpi_table_print_madt_entry(&header->common);
+ if (intsrc->source_irq < NR_IRQS_LEGACY)
+ acpi_int_src_ovr[intsrc->source_irq] = true;
+
if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
acpi_sci_ioapic_setup(intsrc->source_irq,
intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf),
AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf));
+static const int amd_div0[] =
+ AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf),
+ AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf));
+
static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
{
int osvw_id = *erratum++;
WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS));
zenbleed_check(c);
+
+ if (cpu_has_amd_erratum(c, amd_div0)) {
+ pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n");
+ setup_force_cpu_bug(X86_BUG_DIV0);
+ }
}
#ifdef CONFIG_X86_32
{
on_each_cpu(zenbleed_check_cpu, NULL, 1);
}
+
+bool cpu_has_ibpb_brtype_microcode(void)
+{
+ switch (boot_cpu_data.x86) {
+ /* Zen1/2 IBPB flushes branch type predictions too. */
+ case 0x17:
+ return boot_cpu_has(X86_FEATURE_AMD_IBPB);
+ case 0x19:
+ /* Poke the MSR bit on Zen3/4 to check its presence. */
+ if (!wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) {
+ setup_force_cpu_cap(X86_FEATURE_SBPB);
+ return true;
+ } else {
+ return false;
+ }
+ default:
+ return false;
+ }
+}
+
+/*
+ * Issue a DIV 0/1 insn to clear any division data from previous DIV
+ * operations.
+ */
+void noinstr amd_clear_divider(void)
+{
+ asm volatile(ALTERNATIVE("", "div %2\n\t", X86_BUG_DIV0)
+ :: "a" (0), "d" (0), "r" (1));
+}
static void __init mmio_select_mitigation(void);
static void __init srbds_select_mitigation(void);
static void __init l1d_flush_select_mitigation(void);
+static void __init srso_select_mitigation(void);
+static void __init gds_select_mitigation(void);
/* The base value of the SPEC_CTRL MSR without task-specific bits set */
u64 x86_spec_ctrl_base;
DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
EXPORT_SYMBOL_GPL(x86_spec_ctrl_current);
+u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB;
+EXPORT_SYMBOL_GPL(x86_pred_cmd);
+
static DEFINE_MUTEX(spec_ctrl_mutex);
/* Update SPEC_CTRL MSR and its cached copy unconditionally */
md_clear_select_mitigation();
srbds_select_mitigation();
l1d_flush_select_mitigation();
+ srso_select_mitigation();
+ gds_select_mitigation();
}
/*
early_param("l1d_flush", l1d_flush_parse_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "GDS: " fmt
+
+enum gds_mitigations {
+ GDS_MITIGATION_OFF,
+ GDS_MITIGATION_UCODE_NEEDED,
+ GDS_MITIGATION_FORCE,
+ GDS_MITIGATION_FULL,
+ GDS_MITIGATION_FULL_LOCKED,
+ GDS_MITIGATION_HYPERVISOR,
+};
+
+#if IS_ENABLED(CONFIG_GDS_FORCE_MITIGATION)
+static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE;
+#else
+static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL;
+#endif
+
+static const char * const gds_strings[] = {
+ [GDS_MITIGATION_OFF] = "Vulnerable",
+ [GDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [GDS_MITIGATION_FORCE] = "Mitigation: AVX disabled, no microcode",
+ [GDS_MITIGATION_FULL] = "Mitigation: Microcode",
+ [GDS_MITIGATION_FULL_LOCKED] = "Mitigation: Microcode (locked)",
+ [GDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status",
+};
+
+bool gds_ucode_mitigated(void)
+{
+ return (gds_mitigation == GDS_MITIGATION_FULL ||
+ gds_mitigation == GDS_MITIGATION_FULL_LOCKED);
+}
+EXPORT_SYMBOL_GPL(gds_ucode_mitigated);
+
+void update_gds_msr(void)
+{
+ u64 mcu_ctrl_after;
+ u64 mcu_ctrl;
+
+ switch (gds_mitigation) {
+ case GDS_MITIGATION_OFF:
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ mcu_ctrl |= GDS_MITG_DIS;
+ break;
+ case GDS_MITIGATION_FULL_LOCKED:
+ /*
+ * The LOCKED state comes from the boot CPU. APs might not have
+ * the same state. Make sure the mitigation is enabled on all
+ * CPUs.
+ */
+ case GDS_MITIGATION_FULL:
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ mcu_ctrl &= ~GDS_MITG_DIS;
+ break;
+ case GDS_MITIGATION_FORCE:
+ case GDS_MITIGATION_UCODE_NEEDED:
+ case GDS_MITIGATION_HYPERVISOR:
+ return;
+ };
+
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+
+ /*
+ * Check to make sure that the WRMSR value was not ignored. Writes to
+ * GDS_MITG_DIS will be ignored if this processor is locked but the boot
+ * processor was not.
+ */
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after);
+ WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after);
+}
+
+static void __init gds_select_mitigation(void)
+{
+ u64 mcu_ctrl;
+
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ gds_mitigation = GDS_MITIGATION_HYPERVISOR;
+ goto out;
+ }
+
+ if (cpu_mitigations_off())
+ gds_mitigation = GDS_MITIGATION_OFF;
+ /* Will verify below that mitigation _can_ be disabled */
+
+ /* No microcode */
+ if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) {
+ if (gds_mitigation == GDS_MITIGATION_FORCE) {
+ /*
+ * This only needs to be done on the boot CPU so do it
+ * here rather than in update_gds_msr()
+ */
+ setup_clear_cpu_cap(X86_FEATURE_AVX);
+ pr_warn("Microcode update needed! Disabling AVX as mitigation.\n");
+ } else {
+ gds_mitigation = GDS_MITIGATION_UCODE_NEEDED;
+ }
+ goto out;
+ }
+
+ /* Microcode has mitigation, use it */
+ if (gds_mitigation == GDS_MITIGATION_FORCE)
+ gds_mitigation = GDS_MITIGATION_FULL;
+
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ if (mcu_ctrl & GDS_MITG_LOCKED) {
+ if (gds_mitigation == GDS_MITIGATION_OFF)
+ pr_warn("Mitigation locked. Disable failed.\n");
+
+ /*
+ * The mitigation is selected from the boot CPU. All other CPUs
+ * _should_ have the same state. If the boot CPU isn't locked
+ * but others are then update_gds_msr() will WARN() of the state
+ * mismatch. If the boot CPU is locked update_gds_msr() will
+ * ensure the other CPUs have the mitigation enabled.
+ */
+ gds_mitigation = GDS_MITIGATION_FULL_LOCKED;
+ }
+
+ update_gds_msr();
+out:
+ pr_info("%s\n", gds_strings[gds_mitigation]);
+}
+
+static int __init gds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return 0;
+
+ if (!strcmp(str, "off"))
+ gds_mitigation = GDS_MITIGATION_OFF;
+ else if (!strcmp(str, "force"))
+ gds_mitigation = GDS_MITIGATION_FORCE;
+
+ return 0;
+}
+early_param("gather_data_sampling", gds_parse_cmdline);
+
+#undef pr_fmt
#define pr_fmt(fmt) "Spectre V1 : " fmt
enum spectre_v1_mitigation {
early_param("l1tf", l1tf_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "Speculative Return Stack Overflow: " fmt
+
+enum srso_mitigation {
+ SRSO_MITIGATION_NONE,
+ SRSO_MITIGATION_MICROCODE,
+ SRSO_MITIGATION_SAFE_RET,
+ SRSO_MITIGATION_IBPB,
+ SRSO_MITIGATION_IBPB_ON_VMEXIT,
+};
+
+enum srso_mitigation_cmd {
+ SRSO_CMD_OFF,
+ SRSO_CMD_MICROCODE,
+ SRSO_CMD_SAFE_RET,
+ SRSO_CMD_IBPB,
+ SRSO_CMD_IBPB_ON_VMEXIT,
+};
+
+static const char * const srso_strings[] = {
+ [SRSO_MITIGATION_NONE] = "Vulnerable",
+ [SRSO_MITIGATION_MICROCODE] = "Mitigation: microcode",
+ [SRSO_MITIGATION_SAFE_RET] = "Mitigation: safe RET",
+ [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB",
+ [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only"
+};
+
+static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE;
+static enum srso_mitigation_cmd srso_cmd __ro_after_init = SRSO_CMD_SAFE_RET;
+
+static int __init srso_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ srso_cmd = SRSO_CMD_OFF;
+ else if (!strcmp(str, "microcode"))
+ srso_cmd = SRSO_CMD_MICROCODE;
+ else if (!strcmp(str, "safe-ret"))
+ srso_cmd = SRSO_CMD_SAFE_RET;
+ else if (!strcmp(str, "ibpb"))
+ srso_cmd = SRSO_CMD_IBPB;
+ else if (!strcmp(str, "ibpb-vmexit"))
+ srso_cmd = SRSO_CMD_IBPB_ON_VMEXIT;
+ else
+ pr_err("Ignoring unknown SRSO option (%s).", str);
+
+ return 0;
+}
+early_param("spec_rstack_overflow", srso_parse_cmdline);
+
+#define SRSO_NOTICE "WARNING: See https://kernel.org/doc/html/latest/admin-guide/hw-vuln/srso.html for mitigation options."
+
+static void __init srso_select_mitigation(void)
+{
+ bool has_microcode;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off())
+ goto pred_cmd;
+
+ /*
+ * The first check is for the kernel running as a guest in order
+ * for guests to verify whether IBPB is a viable mitigation.
+ */
+ has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) || cpu_has_ibpb_brtype_microcode();
+ if (!has_microcode) {
+ pr_warn("IBPB-extending microcode not applied!\n");
+ pr_warn(SRSO_NOTICE);
+ } else {
+ /*
+ * Enable the synthetic (even if in a real CPUID leaf)
+ * flags for guests.
+ */
+ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+
+ /*
+ * Zen1/2 with SMT off aren't vulnerable after the right
+ * IBPB microcode has been applied.
+ */
+ if ((boot_cpu_data.x86 < 0x19) &&
+ (!cpu_smt_possible() || (cpu_smt_control == CPU_SMT_DISABLED)))
+ setup_force_cpu_cap(X86_FEATURE_SRSO_NO);
+ }
+
+ if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
+ if (has_microcode) {
+ pr_err("Retbleed IBPB mitigation enabled, using same for SRSO\n");
+ srso_mitigation = SRSO_MITIGATION_IBPB;
+ goto pred_cmd;
+ }
+ }
+
+ switch (srso_cmd) {
+ case SRSO_CMD_OFF:
+ return;
+
+ case SRSO_CMD_MICROCODE:
+ if (has_microcode) {
+ srso_mitigation = SRSO_MITIGATION_MICROCODE;
+ pr_warn(SRSO_NOTICE);
+ }
+ break;
+
+ case SRSO_CMD_SAFE_RET:
+ if (IS_ENABLED(CONFIG_CPU_SRSO)) {
+ /*
+ * Enable the return thunk for generated code
+ * like ftrace, static_call, etc.
+ */
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+
+ if (boot_cpu_data.x86 == 0x19)
+ setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS);
+ else
+ setup_force_cpu_cap(X86_FEATURE_SRSO);
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+ } else {
+ pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
+ goto pred_cmd;
+ }
+ break;
+
+ case SRSO_CMD_IBPB:
+ if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) {
+ if (has_microcode) {
+ setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+ srso_mitigation = SRSO_MITIGATION_IBPB;
+ }
+ } else {
+ pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n");
+ goto pred_cmd;
+ }
+ break;
+
+ case SRSO_CMD_IBPB_ON_VMEXIT:
+ if (IS_ENABLED(CONFIG_CPU_SRSO)) {
+ if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) {
+ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
+ srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
+ }
+ } else {
+ pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
+ goto pred_cmd;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ pr_info("%s%s\n", srso_strings[srso_mitigation], (has_microcode ? "" : ", no microcode"));
+
+pred_cmd:
+ if ((boot_cpu_has(X86_FEATURE_SRSO_NO) || srso_cmd == SRSO_CMD_OFF) &&
+ boot_cpu_has(X86_FEATURE_SBPB))
+ x86_pred_cmd = PRED_CMD_SBPB;
+}
+
+#undef pr_fmt
#define pr_fmt(fmt) fmt
#ifdef CONFIG_SYSFS
return sysfs_emit(buf, "%s\n", retbleed_strings[retbleed_mitigation]);
}
+static ssize_t srso_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s%s\n",
+ srso_strings[srso_mitigation],
+ (cpu_has_ibpb_brtype_microcode() ? "" : ", no microcode"));
+}
+
+static ssize_t gds_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]);
+}
+
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
case X86_BUG_RETBLEED:
return retbleed_show_state(buf);
+ case X86_BUG_SRSO:
+ return srso_show_state(buf);
+
+ case X86_BUG_GDS:
+ return gds_show_state(buf);
+
default:
break;
}
{
return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED);
}
+
+ssize_t cpu_show_spec_rstack_overflow(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SRSO);
+}
+
+ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_GDS);
+}
#endif
#define RETBLEED BIT(3)
/* CPU is affected by SMT (cross-thread) return predictions */
#define SMT_RSB BIT(4)
+/* CPU is affected by SRSO */
+#define SRSO BIT(5)
+/* CPU is affected by GDS */
+#define GDS BIT(6)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
- VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
- VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS),
+ VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS),
VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
- VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
- VULNBL_AMD(0x17, RETBLEED | SMT_RSB),
+ VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO),
VULNBL_HYGON(0x18, RETBLEED | SMT_RSB),
+ VULNBL_AMD(0x19, SRSO),
{}
};
if (cpu_matches(cpu_vuln_blacklist, SMT_RSB))
setup_force_cpu_bug(X86_BUG_SMT_RSB);
+ if (!cpu_has(c, X86_FEATURE_SRSO_NO)) {
+ if (cpu_matches(cpu_vuln_blacklist, SRSO))
+ setup_force_cpu_bug(X86_BUG_SRSO);
+ }
+
+ /*
+ * Check if CPU is vulnerable to GDS. If running in a virtual machine on
+ * an affected processor, the VMM may have disabled the use of GATHER by
+ * disabling AVX2. The only way to do this in HW is to clear XCR0[2],
+ * which means that AVX will be disabled.
+ */
+ if (cpu_matches(cpu_vuln_blacklist, GDS) && !(ia32_cap & ARCH_CAP_GDS_NO) &&
+ boot_cpu_has(X86_FEATURE_AVX))
+ setup_force_cpu_bug(X86_BUG_GDS);
+
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
validate_apic_and_package_id(c);
x86_spec_ctrl_setup_ap();
update_srbds_msr();
+ if (boot_cpu_has_bug(X86_BUG_GDS))
+ update_gds_msr();
tsx_ap_init();
}
extern void x86_spec_ctrl_setup_ap(void);
extern void update_srbds_msr(void);
+extern void update_gds_msr(void);
extern enum spectre_v2_mitigation spectre_v2_enabled;
{
do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE,
FPE_INTDIV, error_get_trap_addr(regs));
+
+ amd_clear_divider();
}
DEFINE_IDTENTRY(exc_overflow)
SOFTIRQENTRY_TEXT
#ifdef CONFIG_RETPOLINE
__indirect_thunk_start = .;
- *(.text.__x86.*)
+ *(.text.__x86.indirect_thunk)
+ *(.text.__x86.return_thunk)
__indirect_thunk_end = .;
#endif
STATIC_CALL_TEXT
ALIGN_ENTRY_TEXT_BEGIN
+#ifdef CONFIG_CPU_SRSO
+ *(.text.__x86.rethunk_untrain)
+#endif
+
ENTRY_TEXT
+
+#ifdef CONFIG_CPU_SRSO
+ /*
+ * See the comment above srso_untrain_ret_alias()'s
+ * definition.
+ */
+ . = srso_untrain_ret_alias | (1 << 2) | (1 << 8) | (1 << 14) | (1 << 20);
+ *(.text.__x86.rethunk_safe)
+#endif
ALIGN_ENTRY_TEXT_END
*(.gnu.warning)
#endif
#ifdef CONFIG_RETHUNK
-. = ASSERT((__x86_return_thunk & 0x3f) == 0, "__x86_return_thunk not cacheline-aligned");
+. = ASSERT((__ret & 0x3f) == 0, "__ret not cacheline-aligned");
+. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned");
+#endif
+
+#ifdef CONFIG_CPU_SRSO
+/*
+ * GNU ld cannot do XOR so do: (A | B) - (A & B) in order to compute the XOR
+ * of the two function addresses:
+ */
+. = ASSERT(((srso_untrain_ret_alias | srso_safe_ret_alias) -
+ (srso_untrain_ret_alias & srso_safe_ret_alias)) == ((1 << 2) | (1 << 8) | (1 << 14) | (1 << 20)),
+ "SRSO function pair won't alias");
#endif
#endif /* CONFIG_X86_64 */
F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */
);
+ if (cpu_feature_enabled(X86_FEATURE_SRSO_NO))
+ kvm_cpu_cap_set(X86_FEATURE_SRSO_NO);
+
kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX,
F(PERFMON_V2)
);
*/
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
- vcpu->arch.regs[VCPU_REGS_RAX] = ghcb_get_rax_if_valid(ghcb);
- vcpu->arch.regs[VCPU_REGS_RBX] = ghcb_get_rbx_if_valid(ghcb);
- vcpu->arch.regs[VCPU_REGS_RCX] = ghcb_get_rcx_if_valid(ghcb);
- vcpu->arch.regs[VCPU_REGS_RDX] = ghcb_get_rdx_if_valid(ghcb);
- vcpu->arch.regs[VCPU_REGS_RSI] = ghcb_get_rsi_if_valid(ghcb);
+ BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap));
+ memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap));
- svm->vmcb->save.cpl = ghcb_get_cpl_if_valid(ghcb);
+ vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm, ghcb);
+ vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm, ghcb);
+ vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm, ghcb);
+ vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm, ghcb);
+ vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm, ghcb);
- if (ghcb_xcr0_is_valid(ghcb)) {
+ svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm, ghcb);
+
+ if (kvm_ghcb_xcr0_is_valid(svm)) {
vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb);
kvm_update_cpuid_runtime(vcpu);
}
control->exit_code_hi = upper_32_bits(exit_code);
control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb);
control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb);
+ svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm, ghcb);
/* Clear the valid entries fields */
memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
}
+static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control)
+{
+ return (((u64)control->exit_code_hi) << 32) | control->exit_code;
+}
+
static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
{
- struct kvm_vcpu *vcpu;
- struct ghcb *ghcb;
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
u64 exit_code;
u64 reason;
- ghcb = svm->sev_es.ghcb;
-
/*
* Retrieve the exit code now even though it may not be marked valid
* as it could help with debugging.
*/
- exit_code = ghcb_get_sw_exit_code(ghcb);
+ exit_code = kvm_ghcb_get_sw_exit_code(control);
/* Only GHCB Usage code 0 is supported */
- if (ghcb->ghcb_usage) {
+ if (svm->sev_es.ghcb->ghcb_usage) {
reason = GHCB_ERR_INVALID_USAGE;
goto vmgexit_err;
}
reason = GHCB_ERR_MISSING_INPUT;
- if (!ghcb_sw_exit_code_is_valid(ghcb) ||
- !ghcb_sw_exit_info_1_is_valid(ghcb) ||
- !ghcb_sw_exit_info_2_is_valid(ghcb))
+ if (!kvm_ghcb_sw_exit_code_is_valid(svm) ||
+ !kvm_ghcb_sw_exit_info_1_is_valid(svm) ||
+ !kvm_ghcb_sw_exit_info_2_is_valid(svm))
goto vmgexit_err;
- switch (ghcb_get_sw_exit_code(ghcb)) {
+ switch (exit_code) {
case SVM_EXIT_READ_DR7:
break;
case SVM_EXIT_WRITE_DR7:
- if (!ghcb_rax_is_valid(ghcb))
+ if (!kvm_ghcb_rax_is_valid(svm))
goto vmgexit_err;
break;
case SVM_EXIT_RDTSC:
break;
case SVM_EXIT_RDPMC:
- if (!ghcb_rcx_is_valid(ghcb))
+ if (!kvm_ghcb_rcx_is_valid(svm))
goto vmgexit_err;
break;
case SVM_EXIT_CPUID:
- if (!ghcb_rax_is_valid(ghcb) ||
- !ghcb_rcx_is_valid(ghcb))
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rcx_is_valid(svm))
goto vmgexit_err;
- if (ghcb_get_rax(ghcb) == 0xd)
- if (!ghcb_xcr0_is_valid(ghcb))
+ if (vcpu->arch.regs[VCPU_REGS_RAX] == 0xd)
+ if (!kvm_ghcb_xcr0_is_valid(svm))
goto vmgexit_err;
break;
case SVM_EXIT_INVD:
break;
case SVM_EXIT_IOIO:
- if (ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_STR_MASK) {
- if (!ghcb_sw_scratch_is_valid(ghcb))
+ if (control->exit_info_1 & SVM_IOIO_STR_MASK) {
+ if (!kvm_ghcb_sw_scratch_is_valid(svm))
goto vmgexit_err;
} else {
- if (!(ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_TYPE_MASK))
- if (!ghcb_rax_is_valid(ghcb))
+ if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK))
+ if (!kvm_ghcb_rax_is_valid(svm))
goto vmgexit_err;
}
break;
case SVM_EXIT_MSR:
- if (!ghcb_rcx_is_valid(ghcb))
+ if (!kvm_ghcb_rcx_is_valid(svm))
goto vmgexit_err;
- if (ghcb_get_sw_exit_info_1(ghcb)) {
- if (!ghcb_rax_is_valid(ghcb) ||
- !ghcb_rdx_is_valid(ghcb))
+ if (control->exit_info_1) {
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rdx_is_valid(svm))
goto vmgexit_err;
}
break;
case SVM_EXIT_VMMCALL:
- if (!ghcb_rax_is_valid(ghcb) ||
- !ghcb_cpl_is_valid(ghcb))
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_cpl_is_valid(svm))
goto vmgexit_err;
break;
case SVM_EXIT_RDTSCP:
case SVM_EXIT_WBINVD:
break;
case SVM_EXIT_MONITOR:
- if (!ghcb_rax_is_valid(ghcb) ||
- !ghcb_rcx_is_valid(ghcb) ||
- !ghcb_rdx_is_valid(ghcb))
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rcx_is_valid(svm) ||
+ !kvm_ghcb_rdx_is_valid(svm))
goto vmgexit_err;
break;
case SVM_EXIT_MWAIT:
- if (!ghcb_rax_is_valid(ghcb) ||
- !ghcb_rcx_is_valid(ghcb))
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rcx_is_valid(svm))
goto vmgexit_err;
break;
case SVM_VMGEXIT_MMIO_READ:
case SVM_VMGEXIT_MMIO_WRITE:
- if (!ghcb_sw_scratch_is_valid(ghcb))
+ if (!kvm_ghcb_sw_scratch_is_valid(svm))
goto vmgexit_err;
break;
case SVM_VMGEXIT_NMI_COMPLETE:
return 0;
vmgexit_err:
- vcpu = &svm->vcpu;
-
if (reason == GHCB_ERR_INVALID_USAGE) {
vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
- ghcb->ghcb_usage);
+ svm->sev_es.ghcb->ghcb_usage);
} else if (reason == GHCB_ERR_INVALID_EVENT) {
vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
exit_code);
dump_ghcb(svm);
}
- /* Clear the valid entries fields */
- memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
-
- ghcb_set_sw_exit_info_1(ghcb, 2);
- ghcb_set_sw_exit_info_2(ghcb, reason);
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, reason);
/* Resume the guest to "return" the error code. */
return 1;
*/
if (svm->sev_es.ghcb_sa_sync) {
kvm_write_guest(svm->vcpu.kvm,
- ghcb_get_sw_scratch(svm->sev_es.ghcb),
+ svm->sev_es.sw_scratch,
svm->sev_es.ghcb_sa,
svm->sev_es.ghcb_sa_len);
svm->sev_es.ghcb_sa_sync = false;
static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
{
struct vmcb_control_area *control = &svm->vmcb->control;
- struct ghcb *ghcb = svm->sev_es.ghcb;
u64 ghcb_scratch_beg, ghcb_scratch_end;
u64 scratch_gpa_beg, scratch_gpa_end;
void *scratch_va;
- scratch_gpa_beg = ghcb_get_sw_scratch(ghcb);
+ scratch_gpa_beg = svm->sev_es.sw_scratch;
if (!scratch_gpa_beg) {
pr_err("vmgexit: scratch gpa not provided\n");
goto e_scratch;
return 0;
e_scratch:
- ghcb_set_sw_exit_info_1(ghcb, 2);
- ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA);
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_SCRATCH_AREA);
return 1;
}
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control;
u64 ghcb_gpa, exit_code;
- struct ghcb *ghcb;
int ret;
/* Validate the GHCB */
}
svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva;
- ghcb = svm->sev_es.ghcb_map.hva;
- trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb);
-
- exit_code = ghcb_get_sw_exit_code(ghcb);
+ trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb);
+ sev_es_sync_from_ghcb(svm);
ret = sev_es_validate_vmgexit(svm);
if (ret)
return ret;
- sev_es_sync_from_ghcb(svm);
- ghcb_set_sw_exit_info_1(ghcb, 0);
- ghcb_set_sw_exit_info_2(ghcb, 0);
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 0);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 0);
+ exit_code = kvm_ghcb_get_sw_exit_code(control);
switch (exit_code) {
case SVM_VMGEXIT_MMIO_READ:
ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
break;
case 1:
/* Get AP jump table address */
- ghcb_set_sw_exit_info_2(ghcb, sev->ap_jump_table);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, sev->ap_jump_table);
break;
default:
pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
control->exit_info_1);
- ghcb_set_sw_exit_info_1(ghcb, 2);
- ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT);
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT);
}
ret = 1;
if (sd->current_vmcb != svm->vmcb) {
sd->current_vmcb = svm->vmcb;
- indirect_branch_prediction_barrier();
+
+ if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT))
+ indirect_branch_prediction_barrier();
}
if (kvm_vcpu_apicv_active(vcpu))
avic_vcpu_load(vcpu, cpu);
/* SEV-ES support */
struct sev_es_save_area *vmsa;
struct ghcb *ghcb;
+ u8 valid_bitmap[16];
struct kvm_host_map ghcb_map;
bool received_first_sipi;
/* SEV-ES scratch area support */
+ u64 sw_scratch;
void *ghcb_sa;
u32 ghcb_sa_len;
bool ghcb_sa_sync;
void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
+#define DEFINE_KVM_GHCB_ACCESSORS(field) \
+ static __always_inline bool kvm_ghcb_##field##_is_valid(const struct vcpu_svm *svm) \
+ { \
+ return test_bit(GHCB_BITMAP_IDX(field), \
+ (unsigned long *)&svm->sev_es.valid_bitmap); \
+ } \
+ \
+ static __always_inline u64 kvm_ghcb_get_##field##_if_valid(struct vcpu_svm *svm, struct ghcb *ghcb) \
+ { \
+ return kvm_ghcb_##field##_is_valid(svm) ? ghcb->save.field : 0; \
+ } \
+
+DEFINE_KVM_GHCB_ACCESSORS(cpl)
+DEFINE_KVM_GHCB_ACCESSORS(rax)
+DEFINE_KVM_GHCB_ACCESSORS(rcx)
+DEFINE_KVM_GHCB_ACCESSORS(rdx)
+DEFINE_KVM_GHCB_ACCESSORS(rbx)
+DEFINE_KVM_GHCB_ACCESSORS(rsi)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_code)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_1)
+DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_2)
+DEFINE_KVM_GHCB_ACCESSORS(sw_scratch)
+DEFINE_KVM_GHCB_ACCESSORS(xcr0)
+
#endif
*/
UNTRAIN_RET
+ /* SRSO */
+ ALTERNATIVE "", "call entry_ibpb", X86_FEATURE_IBPB_ON_VMEXIT
+
/*
* Clear all general purpose registers except RSP and RAX to prevent
* speculative use of the guest's values, even those that are reloaded
static struct kmem_cache *x86_emulator_cache;
+extern bool gds_ucode_mitigated(void);
+
/*
* When called, it means the previous get/set msr reached an invalid msr.
* Return true if we want to ignore/silent this failed msr access.
ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
- ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO)
+ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO)
static u64 kvm_get_arch_capabilities(void)
{
*/
}
+ if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated())
+ data |= ARCH_CAP_GDS_NO;
+
return data;
}
#include <asm/unwind_hints.h>
#include <asm/percpu.h>
#include <asm/frame.h>
+#include <asm/nops.h>
.section .text.__x86.indirect_thunk
*/
#ifdef CONFIG_RETHUNK
+/*
+ * srso_untrain_ret_alias() and srso_safe_ret_alias() are placed at
+ * special addresses:
+ *
+ * - srso_untrain_ret_alias() is 2M aligned
+ * - srso_safe_ret_alias() is also in the same 2M page but bits 2, 8, 14
+ * and 20 in its virtual address are set (while those bits in the
+ * srso_untrain_ret_alias() function are cleared).
+ *
+ * This guarantees that those two addresses will alias in the branch
+ * target buffer of Zen3/4 generations, leading to any potential
+ * poisoned entries at that BTB slot to get evicted.
+ *
+ * As a result, srso_safe_ret_alias() becomes a safe return.
+ */
+#ifdef CONFIG_CPU_SRSO
+ .section .text.__x86.rethunk_untrain
+
+SYM_START(srso_untrain_ret_alias, SYM_L_GLOBAL, SYM_A_NONE)
+ ANNOTATE_NOENDBR
+ ASM_NOP2
+ lfence
+ jmp __x86_return_thunk
+SYM_FUNC_END(srso_untrain_ret_alias)
+__EXPORT_THUNK(srso_untrain_ret_alias)
+
+ .section .text.__x86.rethunk_safe
+#endif
+
+/* Needs a definition for the __x86_return_thunk alternative below. */
+SYM_START(srso_safe_ret_alias, SYM_L_GLOBAL, SYM_A_NONE)
+#ifdef CONFIG_CPU_SRSO
+ add $8, %_ASM_SP
+ UNWIND_HINT_FUNC
+#endif
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+SYM_FUNC_END(srso_safe_ret_alias)
+
.section .text.__x86.return_thunk
/*
* from re-poisioning the BTB prediction.
*/
.align 64
- .skip 64 - (__x86_return_thunk - zen_untrain_ret), 0xcc
+ .skip 64 - (__ret - zen_untrain_ret), 0xcc
SYM_START(zen_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
ANNOTATE_NOENDBR
/*
* evicted, __x86_return_thunk will suffer Straight Line Speculation
* which will be contained safely by the INT3.
*/
-SYM_INNER_LABEL(__x86_return_thunk, SYM_L_GLOBAL)
+SYM_INNER_LABEL(__ret, SYM_L_GLOBAL)
ret
int3
-SYM_CODE_END(__x86_return_thunk)
+SYM_CODE_END(__ret)
/*
* Ensure the TEST decoding / BTB invalidation is complete.
* Jump back and execute the RET in the middle of the TEST instruction.
* INT3 is for SLS protection.
*/
- jmp __x86_return_thunk
+ jmp __ret
int3
SYM_FUNC_END(zen_untrain_ret)
__EXPORT_THUNK(zen_untrain_ret)
+/*
+ * SRSO untraining sequence for Zen1/2, similar to zen_untrain_ret()
+ * above. On kernel entry, srso_untrain_ret() is executed which is a
+ *
+ * movabs $0xccccccc308c48348,%rax
+ *
+ * and when the return thunk executes the inner label srso_safe_ret()
+ * later, it is a stack manipulation and a RET which is mispredicted and
+ * thus a "safe" one to use.
+ */
+ .align 64
+ .skip 64 - (srso_safe_ret - srso_untrain_ret), 0xcc
+SYM_START(srso_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
+ ANNOTATE_NOENDBR
+ .byte 0x48, 0xb8
+
+SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL)
+ add $8, %_ASM_SP
+ ret
+ int3
+ int3
+ int3
+ lfence
+ call srso_safe_ret
+ int3
+SYM_CODE_END(srso_safe_ret)
+SYM_FUNC_END(srso_untrain_ret)
+__EXPORT_THUNK(srso_untrain_ret)
+
+SYM_FUNC_START(__x86_return_thunk)
+ ALTERNATIVE_2 "jmp __ret", "call srso_safe_ret", X86_FEATURE_SRSO, \
+ "call srso_safe_ret_alias", X86_FEATURE_SRSO_ALIAS
+ int3
+SYM_CODE_END(__x86_return_thunk)
EXPORT_SYMBOL(__x86_return_thunk)
#endif /* CONFIG_RETHUNK */
struct block_device *bdev = bio->bi_bdev;
struct request_queue *q = bdev_get_queue(bdev);
blk_status_t status = BLK_STS_IOERR;
- struct blk_plug *plug;
might_sleep();
- plug = blk_mq_plug(bio);
- if (plug && plug->nowait)
- bio->bi_opf |= REQ_NOWAIT;
-
/*
* For a REQ_NOWAIT based request, return -EOPNOTSUPP
* if queue does not support NOWAIT.
plug->rq_count = 0;
plug->multiple_queues = false;
plug->has_elevator = false;
- plug->nowait = false;
INIT_LIST_HEAD(&plug->cb_list);
/*
if (qos[QOS_MIN] > qos[QOS_MAX])
goto einval;
- if (enable) {
+ if (enable && !ioc->enabled) {
blk_stat_enable_accounting(disk->queue);
blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
ioc->enabled = true;
- } else {
+ } else if (!enable && ioc->enabled) {
+ blk_stat_disable_accounting(disk->queue);
blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
ioc->enabled = false;
}
task_io_account_write(bio->bi_iter.bi_size);
}
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ bio->bi_opf |= REQ_NOWAIT;
+
if (iocb->ki_flags & IOCB_HIPRI) {
- bio->bi_opf |= REQ_POLLED | REQ_NOWAIT;
+ bio->bi_opf |= REQ_POLLED;
submit_bio(bio);
WRITE_ONCE(iocb->private, bio);
} else {
- if (iocb->ki_flags & IOCB_NOWAIT)
- bio->bi_opf |= REQ_NOWAIT;
submit_bio(bio);
}
return -EIOCBQUEUED;
{
unsigned int i, npages = bo->base.size >> PAGE_SHIFT;
+ if (ivpu_bo_cache_mode(bo) != DRM_IVPU_BO_CACHED)
+ set_pages_array_wb(bo->pages, bo->base.size >> PAGE_SHIFT);
+
for (i = 0; i < npages; i++)
put_page(bo->pages[i]);
if (ivpu_bo_cache_mode(bo) != DRM_IVPU_BO_CACHED)
drm_clflush_pages(bo->pages, bo->base.size >> PAGE_SHIFT);
+ if (bo->flags & DRM_IVPU_BO_WC)
+ set_pages_array_wc(bo->pages, bo->base.size >> PAGE_SHIFT);
+ else if (bo->flags & DRM_IVPU_BO_UNCACHED)
+ set_pages_array_uc(bo->pages, bo->base.size >> PAGE_SHIFT);
+
prot = ivpu_bo_pgprot(bo, PAGE_KERNEL);
bo->kvaddr = vmap(bo->pages, bo->base.size >> PAGE_SHIFT, VM_MAP, prot);
if (!bo->kvaddr) {
{ }
};
+static const struct dmi_system_id tongfang_gm_rg[] = {
+ {
+ .ident = "TongFang GMxRGxx/XMG CORE 15 (M22)/TUXEDO Stellaris 15 Gen4 AMD",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_NAME, "GMxRGxx"),
+ },
+ },
+ { }
+};
+
+static const struct dmi_system_id maingear_laptop[] = {
+ {
+ .ident = "MAINGEAR Vector Pro 2 15",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Micro Electronics Inc"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MG-VCP2-15A3070T"),
+ }
+ },
+ {
+ .ident = "MAINGEAR Vector Pro 2 17",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Micro Electronics Inc"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MG-VCP2-17A3070T"),
+ },
+ },
+ { }
+};
+
+static const struct dmi_system_id pcspecialist_laptop[] = {
+ {
+ .ident = "PCSpecialist Elimina Pro 16 M",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "PCSpecialist"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Elimina Pro 16 M"),
+ },
+ },
+ { }
+};
+
static const struct dmi_system_id lg_laptop[] = {
{
.ident = "LG Electronics 17U70P",
static const struct irq_override_cmp override_table[] = {
{ medion_laptop, 1, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0, false },
{ asus_laptop, 1, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0, false },
+ { tongfang_gm_rg, 1, ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_LOW, 1, true },
+ { maingear_laptop, 1, ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_LOW, 1, true },
+ { pcspecialist_laptop, 1, ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_LOW, 1, true },
{ lg_laptop, 1, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0, false },
};
return entry->override;
}
+#ifdef CONFIG_X86
+ /*
+ * Always use the MADT override info, except for the i8042 PS/2 ctrl
+ * IRQs (1 and 12). For these the DSDT IRQ settings should sometimes
+ * be used otherwise PS/2 keyboards / mice will not work.
+ */
+ if (gsi != 1 && gsi != 12)
+ return true;
+
+ /* If the override comes from an INT_SRC_OVR MADT entry, honor it. */
+ if (acpi_int_src_ovr[gsi])
+ return true;
+
+ /*
+ * IRQ override isn't needed on modern AMD Zen systems and
+ * this override breaks active low IRQs on AMD Ryzen 6000 and
+ * newer systems. Skip it.
+ */
+ if (boot_cpu_has(X86_FEATURE_ZEN))
+ return false;
+#endif
+
return true;
}
return sysfs_emit(buf, "Not affected\n");
}
+ssize_t __weak cpu_show_spec_rstack_overflow(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "Not affected\n");
+}
+
+ssize_t __weak cpu_show_gds(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "Not affected\n");
+}
+
static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
static DEVICE_ATTR(srbds, 0444, cpu_show_srbds, NULL);
static DEVICE_ATTR(mmio_stale_data, 0444, cpu_show_mmio_stale_data, NULL);
static DEVICE_ATTR(retbleed, 0444, cpu_show_retbleed, NULL);
+static DEVICE_ATTR(spec_rstack_overflow, 0444, cpu_show_spec_rstack_overflow, NULL);
+static DEVICE_ATTR(gather_data_sampling, 0444, cpu_show_gds, NULL);
static struct attribute *cpu_root_vulnerabilities_attrs[] = {
&dev_attr_meltdown.attr,
&dev_attr_srbds.attr,
&dev_attr_mmio_stale_data.attr,
&dev_attr_retbleed.attr,
+ &dev_attr_spec_rstack_overflow.attr,
+ &dev_attr_gather_data_sampling.attr,
NULL
};
static void zram_bio_read(struct zram *zram, struct bio *bio)
{
- struct bvec_iter iter;
- struct bio_vec bv;
- unsigned long start_time;
+ unsigned long start_time = bio_start_io_acct(bio);
+ struct bvec_iter iter = bio->bi_iter;
- start_time = bio_start_io_acct(bio);
- bio_for_each_segment(bv, bio, iter) {
+ do {
u32 index = iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
u32 offset = (iter.bi_sector & (SECTORS_PER_PAGE - 1)) <<
SECTOR_SHIFT;
+ struct bio_vec bv = bio_iter_iovec(bio, iter);
+
+ bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
if (zram_bvec_read(zram, &bv, index, offset, bio) < 0) {
atomic64_inc(&zram->stats.failed_reads);
zram_slot_lock(zram, index);
zram_accessed(zram, index);
zram_slot_unlock(zram, index);
- }
+
+ bio_advance_iter_single(bio, &iter, bv.bv_len);
+ } while (iter.bi_size);
+
bio_end_io_acct(bio, start_time);
bio_endio(bio);
}
static void zram_bio_write(struct zram *zram, struct bio *bio)
{
- struct bvec_iter iter;
- struct bio_vec bv;
- unsigned long start_time;
+ unsigned long start_time = bio_start_io_acct(bio);
+ struct bvec_iter iter = bio->bi_iter;
- start_time = bio_start_io_acct(bio);
- bio_for_each_segment(bv, bio, iter) {
+ do {
u32 index = iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
u32 offset = (iter.bi_sector & (SECTORS_PER_PAGE - 1)) <<
SECTOR_SHIFT;
+ struct bio_vec bv = bio_iter_iovec(bio, iter);
+
+ bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
if (zram_bvec_write(zram, &bv, index, offset, bio) < 0) {
atomic64_inc(&zram->stats.failed_writes);
zram_slot_lock(zram, index);
zram_accessed(zram, index);
zram_slot_unlock(zram, index);
- }
+
+ bio_advance_iter_single(bio, &iter, bv.bv_len);
+ } while (iter.bi_size);
+
bio_end_io_acct(bio, start_time);
bio_endio(bio);
}
return 0;
}
-/*
- * Some AMD fTPM versions may cause stutter
- * https://www.amd.com/en/support/kb/faq/pa-410
- *
- * Fixes are available in two series of fTPM firmware:
- * 6.x.y.z series: 6.0.18.6 +
- * 3.x.y.z series: 3.57.y.5 +
- */
-#ifdef CONFIG_X86
-static bool tpm_amd_is_rng_defective(struct tpm_chip *chip)
-{
- u32 val1, val2;
- u64 version;
- int ret;
-
- if (!(chip->flags & TPM_CHIP_FLAG_TPM2))
- return false;
-
- ret = tpm_request_locality(chip);
- if (ret)
- return false;
-
- ret = tpm2_get_tpm_pt(chip, TPM2_PT_MANUFACTURER, &val1, NULL);
- if (ret)
- goto release;
- if (val1 != 0x414D4400U /* AMD */) {
- ret = -ENODEV;
- goto release;
- }
- ret = tpm2_get_tpm_pt(chip, TPM2_PT_FIRMWARE_VERSION_1, &val1, NULL);
- if (ret)
- goto release;
- ret = tpm2_get_tpm_pt(chip, TPM2_PT_FIRMWARE_VERSION_2, &val2, NULL);
-
-release:
- tpm_relinquish_locality(chip);
-
- if (ret)
- return false;
-
- version = ((u64)val1 << 32) | val2;
- if ((version >> 48) == 6) {
- if (version >= 0x0006000000180006ULL)
- return false;
- } else if ((version >> 48) == 3) {
- if (version >= 0x0003005700000005ULL)
- return false;
- } else {
- return false;
- }
-
- dev_warn(&chip->dev,
- "AMD fTPM version 0x%llx causes system stutter; hwrng disabled\n",
- version);
-
- return true;
-}
-#else
-static inline bool tpm_amd_is_rng_defective(struct tpm_chip *chip)
-{
- return false;
-}
-#endif /* CONFIG_X86 */
-
static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait)
{
struct tpm_chip *chip = container_of(rng, struct tpm_chip, hwrng);
return tpm_get_random(chip, data, max);
}
+static bool tpm_is_hwrng_enabled(struct tpm_chip *chip)
+{
+ if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM))
+ return false;
+ if (tpm_is_firmware_upgrade(chip))
+ return false;
+ if (chip->flags & TPM_CHIP_FLAG_HWRNG_DISABLED)
+ return false;
+ return true;
+}
+
static int tpm_add_hwrng(struct tpm_chip *chip)
{
- if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM) || tpm_is_firmware_upgrade(chip) ||
- tpm_amd_is_rng_defective(chip))
+ if (!tpm_is_hwrng_enabled(chip))
return 0;
snprintf(chip->hwrng_name, sizeof(chip->hwrng_name),
return 0;
out_hwrng:
- if (IS_ENABLED(CONFIG_HW_RANDOM_TPM) && !tpm_is_firmware_upgrade(chip))
+ if (tpm_is_hwrng_enabled(chip))
hwrng_unregister(&chip->hwrng);
out_ppi:
tpm_bios_log_teardown(chip);
void tpm_chip_unregister(struct tpm_chip *chip)
{
tpm_del_legacy_sysfs(chip);
- if (IS_ENABLED(CONFIG_HW_RANDOM_TPM) && !tpm_is_firmware_upgrade(chip) &&
- !tpm_amd_is_rng_defective(chip))
+ if (tpm_is_hwrng_enabled(chip))
hwrng_unregister(&chip->hwrng);
tpm_bios_log_teardown(chip);
if (chip->flags & TPM_CHIP_FLAG_TPM2 && !tpm_is_firmware_upgrade(chip))
return (cancel & CRB_CANCEL_INVOKE) == CRB_CANCEL_INVOKE;
}
+static int crb_check_flags(struct tpm_chip *chip)
+{
+ u32 val;
+ int ret;
+
+ ret = crb_request_locality(chip, 0);
+ if (ret)
+ return ret;
+
+ ret = tpm2_get_tpm_pt(chip, TPM2_PT_MANUFACTURER, &val, NULL);
+ if (ret)
+ goto release;
+
+ if (val == 0x414D4400U /* AMD */)
+ chip->flags |= TPM_CHIP_FLAG_HWRNG_DISABLED;
+
+release:
+ crb_relinquish_locality(chip, 0);
+
+ return ret;
+}
+
static const struct tpm_class_ops tpm_crb = {
.flags = TPM_OPS_AUTO_STARTUP,
.status = crb_status,
chip->acpi_dev_handle = device->handle;
chip->flags = TPM_CHIP_FLAG_TPM2;
+ rc = tpm_chip_bootstrap(chip);
+ if (rc)
+ goto out;
+
+ rc = crb_check_flags(chip);
+ if (rc)
+ goto out;
+
rc = tpm_chip_register(chip);
out:
},
{
.callback = tpm_tis_disable_irq,
+ .ident = "ThinkStation P620",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkStation P620"),
+ },
+ },
+ {
+ .callback = tpm_tis_disable_irq,
+ .ident = "TUXEDO InfinityBook S 15/17 Gen7",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "TUXEDO"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "TUXEDO InfinityBook S 15/17 Gen7"),
+ },
+ },
+ {
+ .callback = tpm_tis_disable_irq,
.ident = "UPX-TGL",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "AAEON"),
return 0;
}
-static ssize_t show_status(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t status_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
ssize_t ret;
return ret;
}
-static ssize_t store_status(struct kobject *a, struct kobj_attribute *b,
+static ssize_t status_store(struct device *a, struct device_attribute *b,
const char *buf, size_t count)
{
char *p = memchr(buf, '\n', count);
cpufreq_freq_attr_ro(amd_pstate_highest_perf);
cpufreq_freq_attr_rw(energy_performance_preference);
cpufreq_freq_attr_ro(energy_performance_available_preferences);
-define_one_global_rw(status);
+static DEVICE_ATTR_RW(status);
static struct freq_attr *amd_pstate_attr[] = {
&amd_pstate_max_freq,
};
static struct attribute *pstate_global_attributes[] = {
- &status.attr,
+ &dev_attr_status.attr,
NULL
};
}
}
-static bool psci_pd_try_set_osi_mode(void)
-{
- int ret;
-
- if (!psci_has_osi_support())
- return false;
-
- ret = psci_set_osi_mode(true);
- if (ret)
- return false;
-
- return true;
-}
-
static void psci_cpuidle_domain_sync_state(struct device *dev)
{
/*
{
struct device_node *np = pdev->dev.of_node;
struct device_node *node;
- bool use_osi;
+ bool use_osi = psci_has_osi_support();
int ret = 0, pd_count = 0;
if (!np)
return -ENODEV;
- /* If OSI mode is supported, let's try to enable it. */
- use_osi = psci_pd_try_set_osi_mode();
-
/*
* Parse child nodes for the "#power-domain-cells" property and
* initialize a genpd/genpd-of-provider pair when it's found.
continue;
ret = psci_pd_init(node, use_osi);
- if (ret)
- goto put_node;
+ if (ret) {
+ of_node_put(node);
+ goto exit;
+ }
pd_count++;
}
/* Bail out if not using the hierarchical CPU topology. */
if (!pd_count)
- goto no_pd;
+ return 0;
/* Link genpd masters/subdomains to model the CPU topology. */
ret = dt_idle_pd_init_topology(np);
if (ret)
goto remove_pd;
+ /* let's try to enable OSI. */
+ ret = psci_set_osi_mode(use_osi);
+ if (ret)
+ goto remove_pd;
+
pr_info("Initialized CPU PM domain topology using %s mode\n",
use_osi ? "OSI" : "PC");
return 0;
-put_node:
- of_node_put(node);
remove_pd:
+ dt_idle_pd_remove_topology(np);
psci_pd_remove();
+exit:
pr_err("failed to create CPU PM domains ret=%d\n", ret);
-no_pd:
- if (use_osi)
- psci_set_osi_mode(false);
return ret;
}
return 0;
}
+int dt_idle_pd_remove_topology(struct device_node *np)
+{
+ struct device_node *node;
+ struct of_phandle_args child, parent;
+ int ret;
+
+ for_each_child_of_node(np, node) {
+ if (of_parse_phandle_with_args(node, "power-domains",
+ "#power-domain-cells", 0, &parent))
+ continue;
+
+ child.np = node;
+ child.args_count = 0;
+ ret = of_genpd_remove_subdomain(&parent, &child);
+ of_node_put(parent.np);
+ if (ret) {
+ of_node_put(node);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
struct device *dt_idle_attach_cpu(int cpu, const char *name)
{
struct device *dev;
int dt_idle_pd_init_topology(struct device_node *np);
+int dt_idle_pd_remove_topology(struct device_node *np);
+
struct device *dt_idle_attach_cpu(int cpu, const char *name);
void dt_idle_detach_cpu(struct device *dev);
return 0;
}
+static inline int dt_idle_pd_remove_topology(struct device_node *np)
+{
+ return 0;
+}
+
static inline struct device *dt_idle_attach_cpu(int cpu, const char *name)
{
return NULL;
config FSL_EDMA
tristate "Freescale eDMA engine support"
depends on OF
+ depends on HAS_IOMEM
select DMA_ENGINE
select DMA_VIRTUAL_CHANNELS
help
config INTEL_IDMA64
tristate "Intel integrated DMA 64-bit support"
+ depends on HAS_IOMEM
select DMA_ENGINE
select DMA_VIRTUAL_CHANNELS
help
wq->threshold = 0;
wq->priority = 0;
wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES;
- clear_bit(WQ_FLAG_DEDICATED, &wq->flags);
- clear_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags);
- clear_bit(WQ_FLAG_ATS_DISABLE, &wq->flags);
+ wq->flags = 0;
memset(wq->name, 0, WQ_NAME_SIZE);
wq->max_xfer_bytes = WQ_DEFAULT_MAX_XFER;
idxd_wq_set_max_batch_size(idxd->data->type, wq, WQ_DEFAULT_MAX_BATCH);
return -EINVAL;
}
- chans = pdata->dma_channels;
+ if (!pdata->dma_channels) {
+ dev_info(&pdev->dev, "setting default channel number to 64");
+ chans = 64;
+ } else {
+ chans = pdata->dma_channels;
+ }
+
len = sizeof(*mcf_edma) + sizeof(*mcf_chan) * chans;
mcf_edma = devm_kzalloc(&pdev->dev, len, GFP_KERNEL);
if (!mcf_edma)
mcf_edma->drvdata = &mcf_data;
mcf_edma->big_endian = 1;
- if (!mcf_edma->n_chans) {
- dev_info(&pdev->dev, "setting default channel number to 64");
- mcf_edma->n_chans = 64;
- }
-
mutex_init(&mcf_edma->fsl_edma_mutex);
mcf_edma->membase = devm_platform_ioremap_resource(pdev, 0);
};
/**
- * struct owl_dma_pchan - Wrapper for DMA ENGINE channel
+ * struct owl_dma_vchan - Wrapper for DMA ENGINE channel
* @vc: wrapped virtual channel
* @pchan: the physical channel utilized by this channel
* @txd: active transaction on this channel
*/
BUSY,
/*
+ * Pause was called while descriptor was BUSY. Due to hardware
+ * limitations, only termination is possible for descriptors
+ * that have been paused.
+ */
+ PAUSED,
+ /*
* Sitting on the channel work_list but xfer done
* by PL330 core
*/
list_for_each_entry(desc, &pch->work_list, node) {
/* If already submitted */
- if (desc->status == BUSY)
+ if (desc->status == BUSY || desc->status == PAUSED)
continue;
ret = pl330_submit_req(pch->thread, desc);
{
struct dma_pl330_chan *pch = to_pchan(chan);
struct pl330_dmac *pl330 = pch->dmac;
+ struct dma_pl330_desc *desc;
unsigned long flags;
pm_runtime_get_sync(pl330->ddma.dev);
_stop(pch->thread);
spin_unlock(&pl330->lock);
+ list_for_each_entry(desc, &pch->work_list, node) {
+ if (desc->status == BUSY)
+ desc->status = PAUSED;
+ }
spin_unlock_irqrestore(&pch->lock, flags);
pm_runtime_mark_last_busy(pl330->ddma.dev);
pm_runtime_put_autosuspend(pl330->ddma.dev);
else if (running && desc == running)
transferred =
pl330_get_current_xferred_count(pch, desc);
- else if (desc->status == BUSY)
+ else if (desc->status == BUSY || desc->status == PAUSED)
/*
* Busy but not running means either just enqueued,
* or finished and not yet marked done
case DONE:
ret = DMA_COMPLETE;
break;
+ case PAUSED:
+ ret = DMA_PAUSED;
+ break;
case PREP:
case BUSY:
ret = DMA_IN_PROGRESS;
val |= irq_start << shift;
irq_start++;
irq_num--;
+ if (!irq_num)
+ break;
}
/* write IRQ register */
ret = request_irq(irq, xdma_channel_isr, 0,
"xdma-c2h-channel", &xdev->c2h_chans[j]);
if (ret) {
- xdma_err(xdev, "H2C channel%d request irq%d failed: %d",
+ xdma_err(xdev, "C2H channel%d request irq%d failed: %d",
j, irq, ret);
goto failed_init_c2h;
}
}
reg_base = devm_ioremap_resource(&pdev->dev, res);
- if (!reg_base) {
+ if (IS_ERR(reg_base)) {
xdma_err(xdev, "ioremap failed");
goto failed;
}
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev);
int amdgpu_device_pci_reset(struct amdgpu_device *adev);
bool amdgpu_device_need_post(struct amdgpu_device *adev);
+bool amdgpu_sg_display_supported(struct amdgpu_device *adev);
bool amdgpu_device_pcie_dynamic_switching_supported(void);
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev);
bool amdgpu_device_aspm_support_quirk(void);
if (!p->gang_size) {
ret = -EINVAL;
- goto free_partial_kdata;
+ goto free_all_kdata;
}
for (i = 0; i < p->gang_size; ++i) {
}
/*
+ * On APUs with >= 64GB white flickering has been observed w/ SG enabled.
+ * Disable S/G on such systems until we have a proper fix.
+ * https://gitlab.freedesktop.org/drm/amd/-/issues/2354
+ * https://gitlab.freedesktop.org/drm/amd/-/issues/2735
+ */
+bool amdgpu_sg_display_supported(struct amdgpu_device *adev)
+{
+ switch (amdgpu_sg_display) {
+ case -1:
+ break;
+ case 0:
+ return false;
+ case 1:
+ return true;
+ default:
+ return false;
+ }
+ if ((totalram_pages() << (PAGE_SHIFT - 10)) +
+ (adev->gmc.real_vram_size / 1024) >= 64000000) {
+ DRM_WARN("Disabling S/G due to >=64GB RAM\n");
+ return false;
+ }
+ return true;
+}
+
+/*
* Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
* speed switching. Until we have confirmation from Intel that a specific host
* supports it, it's safer that we keep it disabled for all.
case IP_VERSION(11, 0, 3):
if ((adev->gfx.me_fw_version >= 1505) &&
(adev->gfx.pfp_fw_version >= 1600) &&
- (adev->gfx.mec_fw_version >= 512))
- adev->gfx.cp_gfx_shadow = true;
+ (adev->gfx.mec_fw_version >= 512)) {
+ if (amdgpu_sriov_vf(adev))
+ adev->gfx.cp_gfx_shadow = true;
+ else
+ adev->gfx.cp_gfx_shadow = false;
+ }
break;
default:
adev->gfx.cp_gfx_shadow = false;
int ret;
int retry_loop;
+ /* Wait for bootloader to signify that it is ready having bit 31 of
+ * C2PMSG_35 set to 1. All other bits are expected to be cleared.
+ * If there is an error in processing command, bits[7:0] will be set.
+ * This is applicable for PSP v13.0.6 and newer.
+ */
for (retry_loop = 0; retry_loop < 10; retry_loop++) {
- /* Wait for bootloader to signify that is
- ready having bit 31 of C2PMSG_35 set to 1 */
- ret = psp_wait_for(psp,
- SOC15_REG_OFFSET(MP0, 0, regMP0_SMN_C2PMSG_35),
- 0x80000000,
- 0x80000000,
- false);
+ ret = psp_wait_for(
+ psp, SOC15_REG_OFFSET(MP0, 0, regMP0_SMN_C2PMSG_35),
+ 0x80000000, 0xffffffff, false);
if (ret == 0)
return 0;
if (ignore_crat)
return true;
-#ifndef KFD_SUPPORT_IOMMU_V2
ret = true;
-#else
- ret = false;
-#endif
return ret;
}
kfd_device_info_set_event_interrupt_class(kfd);
- /* Raven */
- if (gc_version == IP_VERSION(9, 1, 0) ||
- gc_version == IP_VERSION(9, 2, 2))
- kfd->device_info.needs_iommu_device = true;
-
if (gc_version < IP_VERSION(11, 0, 0)) {
/* Navi2x+, Navi1x+ */
if (gc_version == IP_VERSION(10, 3, 6))
asic_type != CHIP_TONGA)
kfd->device_info.supports_cwsr = true;
- if (asic_type == CHIP_KAVERI ||
- asic_type == CHIP_CARRIZO)
- kfd->device_info.needs_iommu_device = true;
-
if (asic_type != CHIP_HAWAII && !vf)
kfd->device_info.needs_pci_atomics = true;
}
uint32_t gfx_target_version = 0;
switch (adev->asic_type) {
-#ifdef KFD_SUPPORT_IOMMU_V2
#ifdef CONFIG_DRM_AMDGPU_CIK
case CHIP_KAVERI:
gfx_target_version = 70000;
if (!vf)
f2g = &gfx_v8_kfd2kgd;
break;
-#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
case CHIP_HAWAII:
gfx_target_version = 70001;
gfx_target_version = 90000;
f2g = &gfx_v9_kfd2kgd;
break;
-#ifdef KFD_SUPPORT_IOMMU_V2
/* Raven */
case IP_VERSION(9, 1, 0):
case IP_VERSION(9, 2, 2):
if (!vf)
f2g = &gfx_v9_kfd2kgd;
break;
-#endif
/* Vega12 */
case IP_VERSION(9, 2, 1):
gfx_target_version = 90004;
}
switch (dev->adev->asic_type) {
- case CHIP_CARRIZO:
- device_queue_manager_init_vi(&dqm->asic_ops);
- break;
-
case CHIP_KAVERI:
- device_queue_manager_init_cik(&dqm->asic_ops);
- break;
-
case CHIP_HAWAII:
device_queue_manager_init_cik_hawaii(&dqm->asic_ops);
break;
+ case CHIP_CARRIZO:
case CHIP_TONGA:
case CHIP_FIJI:
case CHIP_POLARIS10:
}
break;
}
- if (init_data.flags.gpu_vm_support &&
- (amdgpu_sg_display == 0))
- init_data.flags.gpu_vm_support = false;
+ if (init_data.flags.gpu_vm_support)
+ init_data.flags.gpu_vm_support = amdgpu_sg_display_supported(adev);
if (init_data.flags.gpu_vm_support)
adev->mode_info.gpu_vm_support = true;
if (computed_streams[i])
continue;
- if (!res_pool->funcs->remove_stream_from_ctx ||
+ if (res_pool->funcs->remove_stream_from_ctx &&
res_pool->funcs->remove_stream_from_ctx(stream->ctx->dc, dc_state, stream) != DC_OK)
return -EINVAL;
dal_gpio_destroy_irq(&hpd);
/* ensure that the panel is detected */
- ASSERT(edp_hpd_high);
+ if (!edp_hpd_high)
+ DC_LOG_DC("%s: wait timed out!\n", __func__);
}
void dce110_edp_power_control(
int cur_rom_en = 0;
if (color_format == CURSOR_MODE_COLOR_PRE_MULTIPLIED_ALPHA ||
- color_format == CURSOR_MODE_COLOR_UN_PRE_MULTIPLIED_ALPHA)
- cur_rom_en = 1;
+ color_format == CURSOR_MODE_COLOR_UN_PRE_MULTIPLIED_ALPHA) {
+ if (cursor_attributes->attribute_flags.bits.ENABLE_CURSOR_DEGAMMA) {
+ cur_rom_en = 1;
+ }
+ }
REG_UPDATE_3(CURSOR0_CONTROL,
CUR0_MODE, color_format,
/*
* For SMU 13.0.4/11, PMFW will handle the features disablement properly
- * for gpu reset case. Driver involvement is unnecessary.
+ * for gpu reset and S0i3 cases. Driver involvement is unnecessary.
*/
- if (amdgpu_in_reset(adev)) {
+ if (amdgpu_in_reset(adev) || adev->in_s0ix) {
switch (adev->ip_versions[MP1_HWIP][0]) {
case IP_VERSION(13, 0, 4):
case IP_VERSION(13, 0, 11):
struct smu_13_0_0_powerplay_table *powerplay_table =
table_context->power_play_table;
struct smu_baco_context *smu_baco = &smu->smu_baco;
+#if 0
PPTable_t *pptable = smu->smu_table.driver_pptable;
const OverDriveLimits_t * const overdrive_upperlimits =
&pptable->SkuTable.OverDriveLimitsBasicMax;
const OverDriveLimits_t * const overdrive_lowerlimits =
&pptable->SkuTable.OverDriveLimitsMin;
+#endif
if (powerplay_table->platform_caps & SMU_13_0_0_PP_PLATFORM_CAP_HARDWAREDC)
smu->dc_controlled_by_gpio = true;
if (powerplay_table->platform_caps & SMU_13_0_0_PP_PLATFORM_CAP_MACO)
smu_baco->maco_support = true;
+ /*
+ * We are in the transition to a new OD mechanism.
+ * Disable the OD feature support for SMU13 temporarily.
+ * TODO: get this reverted when new OD mechanism online
+ */
+#if 0
if (!overdrive_lowerlimits->FeatureCtrlMask ||
!overdrive_upperlimits->FeatureCtrlMask)
smu->od_enabled = false;
- table_context->thermal_controller_type =
- powerplay_table->thermal_controller_type;
-
/*
* Instead of having its own buffer space and get overdrive_table copied,
* smu->od_settings just points to the actual overdrive_table
*/
smu->od_settings = &powerplay_table->overdrive_table;
+#else
+ smu->od_enabled = false;
+#endif
+
+ table_context->thermal_controller_type =
+ powerplay_table->thermal_controller_type;
return 0;
}
(OverDriveTableExternal_t *)smu->smu_table.overdrive_table;
struct smu_13_0_dpm_table *single_dpm_table;
struct smu_13_0_pcie_table *pcie_table;
- const int link_width[] = {0, 1, 2, 4, 8, 12, 16};
uint32_t gen_speed, lane_width;
int i, curr_freq, size = 0;
int32_t min_value, max_value;
(pcie_table->pcie_lane[i] == 6) ? "x16" : "",
pcie_table->clk_freq[i],
(gen_speed == DECODE_GEN_SPEED(pcie_table->pcie_gen[i])) &&
- (lane_width == DECODE_LANE_WIDTH(link_width[pcie_table->pcie_lane[i]])) ?
+ (lane_width == DECODE_LANE_WIDTH(pcie_table->pcie_lane[i])) ?
"*" : "");
break;
gpu_metrics->average_socket_power =
SMUQ10_TO_UINT(metrics->SocketPower);
- /* Energy is reported in 15.625mJ units */
- gpu_metrics->energy_accumulator =
- SMUQ10_TO_UINT(metrics->SocketEnergyAcc);
+ /* Energy counter reported in 15.259uJ (2^-16) units */
+ gpu_metrics->energy_accumulator = metrics->SocketEnergyAcc;
gpu_metrics->current_gfxclk =
SMUQ10_TO_UINT(metrics->GfxclkFrequency[xcc0]);
struct smu_baco_context *smu_baco = &smu->smu_baco;
PPTable_t *smc_pptable = table_context->driver_pptable;
BoardTable_t *BoardTable = &smc_pptable->BoardTable;
+#if 0
const OverDriveLimits_t * const overdrive_upperlimits =
&smc_pptable->SkuTable.OverDriveLimitsBasicMax;
const OverDriveLimits_t * const overdrive_lowerlimits =
&smc_pptable->SkuTable.OverDriveLimitsMin;
+#endif
if (powerplay_table->platform_caps & SMU_13_0_7_PP_PLATFORM_CAP_HARDWAREDC)
smu->dc_controlled_by_gpio = true;
if (smu_baco->platform_support && (BoardTable->HsrEnabled || BoardTable->VddqOffEnabled))
smu_baco->maco_support = true;
+#if 0
if (!overdrive_lowerlimits->FeatureCtrlMask ||
!overdrive_upperlimits->FeatureCtrlMask)
smu->od_enabled = false;
- table_context->thermal_controller_type =
- powerplay_table->thermal_controller_type;
-
/*
* Instead of having its own buffer space and get overdrive_table copied,
* smu->od_settings just points to the actual overdrive_table
*/
smu->od_settings = &powerplay_table->overdrive_table;
+#else
+ smu->od_enabled = false;
+#endif
+
+ table_context->thermal_controller_type =
+ powerplay_table->thermal_controller_type;
return 0;
}
};
int int_status[3], i;
- if (it6505->enable_drv_hold || pm_runtime_get_if_in_use(dev) <= 0)
+ if (it6505->enable_drv_hold || !it6505->powered)
return IRQ_HANDLED;
+ pm_runtime_get_sync(dev);
+
int_status[0] = it6505_read(it6505, INT_STATUS_01);
int_status[1] = it6505_read(it6505, INT_STATUS_02);
int_status[2] = it6505_read(it6505, INT_STATUS_03);
dsi->lanes = 4;
dsi->format = MIPI_DSI_FMT_RGB888;
dsi->mode_flags = MIPI_DSI_MODE_VIDEO | MIPI_DSI_MODE_VIDEO_SYNC_PULSE |
- MIPI_DSI_MODE_VIDEO_HSE | MIPI_DSI_MODE_VIDEO_NO_HSA |
- MIPI_DSI_MODE_VIDEO_NO_HFP | MIPI_DSI_MODE_VIDEO_NO_HBP |
- MIPI_DSI_MODE_NO_EOT_PACKET;
+ MIPI_DSI_MODE_VIDEO_HSE;
ret = devm_mipi_dsi_attach(dev, dsi);
if (ret < 0) {
int ret;
if (obj->import_attach) {
+ /* Reset both vm_ops and vm_private_data, so we don't end up with
+ * vm_ops pointing to our implementation if the dma-buf backend
+ * doesn't set those fields.
+ */
vma->vm_private_data = NULL;
+ vma->vm_ops = NULL;
+
ret = dma_buf_mmap(obj->dma_buf, vma, 0);
/* Drop the reference drm_gem_mmap_obj() acquired.*/
/* Determine display colour depth for everything except LVDS now,
* DP requires this before mode_valid() is called.
*/
- if (connector->connector_type != DRM_MODE_CONNECTOR_LVDS && nv_connector->native_mode)
+ if (connector->connector_type != DRM_MODE_CONNECTOR_LVDS)
nouveau_connector_detect_depth(connector);
/* Find the native mode if this is a digital panel, if we didn't
#include "head.h"
#include "ior.h"
+#include <drm/display/drm_dp.h>
+
#include <subdev/bios.h>
#include <subdev/bios/init.h>
#include <subdev/gpio.h>
return outp->dp.rates != 0;
}
+/* XXX: This is a big fat hack, and this is just drm_dp_read_dpcd_caps()
+ * converted to work inside nvkm. This is a temporary holdover until we start
+ * passing the drm_dp_aux device through NVKM
+ */
+static int
+nvkm_dp_read_dpcd_caps(struct nvkm_outp *outp)
+{
+ struct nvkm_i2c_aux *aux = outp->dp.aux;
+ u8 dpcd_ext[DP_RECEIVER_CAP_SIZE];
+ int ret;
+
+ ret = nvkm_rdaux(aux, DPCD_RC00_DPCD_REV, outp->dp.dpcd, DP_RECEIVER_CAP_SIZE);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Prior to DP1.3 the bit represented by
+ * DP_EXTENDED_RECEIVER_CAP_FIELD_PRESENT was reserved.
+ * If it is set DP_DPCD_REV at 0000h could be at a value less than
+ * the true capability of the panel. The only way to check is to
+ * then compare 0000h and 2200h.
+ */
+ if (!(outp->dp.dpcd[DP_TRAINING_AUX_RD_INTERVAL] &
+ DP_EXTENDED_RECEIVER_CAP_FIELD_PRESENT))
+ return 0;
+
+ ret = nvkm_rdaux(aux, DP_DP13_DPCD_REV, dpcd_ext, sizeof(dpcd_ext));
+ if (ret < 0)
+ return ret;
+
+ if (outp->dp.dpcd[DP_DPCD_REV] > dpcd_ext[DP_DPCD_REV]) {
+ OUTP_DBG(outp, "Extended DPCD rev less than base DPCD rev (%d > %d)\n",
+ outp->dp.dpcd[DP_DPCD_REV], dpcd_ext[DP_DPCD_REV]);
+ return 0;
+ }
+
+ if (!memcmp(outp->dp.dpcd, dpcd_ext, sizeof(dpcd_ext)))
+ return 0;
+
+ memcpy(outp->dp.dpcd, dpcd_ext, sizeof(dpcd_ext));
+
+ return 0;
+}
+
void
nvkm_dp_enable(struct nvkm_outp *outp, bool auxpwr)
{
memset(outp->dp.lttpr, 0x00, sizeof(outp->dp.lttpr));
}
- if (!nvkm_rdaux(aux, DPCD_RC00_DPCD_REV, outp->dp.dpcd, sizeof(outp->dp.dpcd))) {
+ if (!nvkm_dp_read_dpcd_caps(outp)) {
const u8 rates[] = { 0x1e, 0x14, 0x0a, 0x06, 0 };
const u8 *rate;
int rate_max;
extern const struct gf100_grctx_func gk110_grctx;
void gk110_grctx_generate_r419eb0(struct gf100_gr *);
+void gk110_grctx_generate_r419f78(struct gf100_gr *);
extern const struct gf100_grctx_func gk110b_grctx;
extern const struct gf100_grctx_func gk208_grctx;
gk104_grctx_generate_r419f78(struct gf100_gr *gr)
{
struct nvkm_device *device = gr->base.engine.subdev.device;
- nvkm_mask(device, 0x419f78, 0x00000001, 0x00000000);
+
+ /* bit 3 set disables loads in fp helper invocations, we need it enabled */
+ nvkm_mask(device, 0x419f78, 0x00000009, 0x00000000);
}
void
nvkm_mask(device, 0x419eb0, 0x00001000, 0x00001000);
}
+void
+gk110_grctx_generate_r419f78(struct gf100_gr *gr)
+{
+ struct nvkm_device *device = gr->base.engine.subdev.device;
+
+ /* bit 3 set disables loads in fp helper invocations, we need it enabled */
+ nvkm_mask(device, 0x419f78, 0x00000008, 0x00000000);
+}
+
const struct gf100_grctx_func
gk110_grctx = {
.main = gf100_grctx_generate_main,
.gpc_tpc_nr = gk104_grctx_generate_gpc_tpc_nr,
.r418800 = gk104_grctx_generate_r418800,
.r419eb0 = gk110_grctx_generate_r419eb0,
+ .r419f78 = gk110_grctx_generate_r419f78,
};
.gpc_tpc_nr = gk104_grctx_generate_gpc_tpc_nr,
.r418800 = gk104_grctx_generate_r418800,
.r419eb0 = gk110_grctx_generate_r419eb0,
+ .r419f78 = gk110_grctx_generate_r419f78,
};
.dist_skip_table = gf117_grctx_generate_dist_skip_table,
.gpc_tpc_nr = gk104_grctx_generate_gpc_tpc_nr,
.r418800 = gk104_grctx_generate_r418800,
+ .r419f78 = gk110_grctx_generate_r419f78,
};
.r406500 = gm107_grctx_generate_r406500,
.gpc_tpc_nr = gk104_grctx_generate_gpc_tpc_nr,
.r419e00 = gm107_grctx_generate_r419e00,
+ .r419f78 = gk110_grctx_generate_r419f78,
};
return gk20a_gr_av_to_init_(blob, 64, 0x00100000, ppack);
}
-int
-tu102_gr_load(struct gf100_gr *gr, int ver, const struct gf100_gr_fwif *fwif)
-{
- int ret;
-
- ret = gm200_gr_load(gr, ver, fwif);
- if (ret)
- return ret;
-
- return gk20a_gr_load_net(gr, "gr/", "sw_veid_bundle_init", ver, tu102_gr_av_to_init_veid,
- &gr->bundle_veid);
-}
-
static const struct gf100_gr_fwif
tu102_gr_fwif[] = {
{ 0, gm200_gr_load, &tu102_gr, &gp108_gr_fecs_acr, &gp108_gr_gpccs_acr },
* need align with 2 pixel.
*/
if (fb->format->is_yuv && ((new_plane_state->src.x1 >> 16) % 2)) {
- DRM_ERROR("Invalid Source: Yuv format not support odd xpos\n");
+ DRM_DEBUG_KMS("Invalid Source: Yuv format not support odd xpos\n");
return -EINVAL;
}
if (fb->format->is_yuv && new_plane_state->rotation & DRM_MODE_REFLECT_Y) {
- DRM_ERROR("Invalid Source: Yuv format does not support this rotation\n");
+ DRM_DEBUG_KMS("Invalid Source: Yuv format does not support this rotation\n");
return -EINVAL;
}
struct vop *vop = to_vop(crtc);
if (!vop->data->afbc) {
- DRM_ERROR("vop does not support AFBC\n");
+ DRM_DEBUG_KMS("vop does not support AFBC\n");
return -EINVAL;
}
return ret;
if (new_plane_state->src.x1 || new_plane_state->src.y1) {
- DRM_ERROR("AFBC does not support offset display, xpos=%d, ypos=%d, offset=%d\n",
- new_plane_state->src.x1,
- new_plane_state->src.y1, fb->offsets[0]);
+ DRM_DEBUG_KMS("AFBC does not support offset display, " \
+ "xpos=%d, ypos=%d, offset=%d\n",
+ new_plane_state->src.x1, new_plane_state->src.y1,
+ fb->offsets[0]);
return -EINVAL;
}
if (new_plane_state->rotation && new_plane_state->rotation != DRM_MODE_ROTATE_0) {
- DRM_ERROR("No rotation support in AFBC, rotation=%d\n",
- new_plane_state->rotation);
+ DRM_DEBUG_KMS("No rotation support in AFBC, rotation=%d\n",
+ new_plane_state->rotation);
return -EINVAL;
}
}
#include <linux/crc16.h>
#include <linux/debugfs.h>
+#include <linux/delay.h>
#include <linux/hid.h>
#include <linux/hwmon.h>
#include <linux/jiffies.h>
+#include <linux/ktime.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>
#define CTRL_REPORT_ID 0x03
#define AQUAERO_CTRL_REPORT_ID 0x0b
+#define CTRL_REPORT_DELAY 200 /* ms */
+
/* The HID report that the official software always sends
* after writing values, currently same for all devices
*/
int secondary_ctrl_report_size;
u8 *secondary_ctrl_report;
+ ktime_t last_ctrl_report_op;
+ int ctrl_report_delay; /* Delay between two ctrl report operations, in ms */
+
int buffer_size;
u8 *buffer;
int checksum_start;
return 0;
}
+static void aqc_delay_ctrl_report(struct aqc_data *priv)
+{
+ /*
+ * If previous read or write is too close to this one, delay the current operation
+ * to give the device enough time to process the previous one.
+ */
+ if (priv->ctrl_report_delay) {
+ s64 delta = ktime_ms_delta(ktime_get(), priv->last_ctrl_report_op);
+
+ if (delta < priv->ctrl_report_delay)
+ msleep(priv->ctrl_report_delay - delta);
+ }
+}
+
/* Expects the mutex to be locked */
static int aqc_get_ctrl_data(struct aqc_data *priv)
{
int ret;
+ aqc_delay_ctrl_report(priv);
+
memset(priv->buffer, 0x00, priv->buffer_size);
ret = hid_hw_raw_request(priv->hdev, priv->ctrl_report_id, priv->buffer, priv->buffer_size,
HID_FEATURE_REPORT, HID_REQ_GET_REPORT);
if (ret < 0)
ret = -ENODATA;
+ priv->last_ctrl_report_op = ktime_get();
+
return ret;
}
int ret;
u16 checksum;
+ aqc_delay_ctrl_report(priv);
+
/* Checksum is not needed for Aquaero */
if (priv->kind != aquaero) {
/* Init and xorout value for CRC-16/USB is 0xffff */
ret = hid_hw_raw_request(priv->hdev, priv->ctrl_report_id, priv->buffer, priv->buffer_size,
HID_FEATURE_REPORT, HID_REQ_SET_REPORT);
if (ret < 0)
- return ret;
+ goto record_access_and_ret;
/* The official software sends this report after every change, so do it here as well */
ret = hid_hw_raw_request(priv->hdev, priv->secondary_ctrl_report_id,
priv->secondary_ctrl_report, priv->secondary_ctrl_report_size,
HID_FEATURE_REPORT, HID_REQ_SET_REPORT);
+
+record_access_and_ret:
+ priv->last_ctrl_report_op = ktime_get();
+
return ret;
}
priv->buffer_size = AQUAERO_CTRL_REPORT_SIZE;
priv->temp_ctrl_offset = AQUAERO_TEMP_CTRL_OFFSET;
+ priv->ctrl_report_delay = CTRL_REPORT_DELAY;
priv->temp_label = label_temp_sensors;
priv->virtual_temp_label = label_virtual_temp_sensors;
priv->temp_ctrl_offset = D5NEXT_TEMP_CTRL_OFFSET;
priv->buffer_size = D5NEXT_CTRL_REPORT_SIZE;
+ priv->ctrl_report_delay = CTRL_REPORT_DELAY;
priv->power_cycle_count_offset = D5NEXT_POWER_CYCLES;
priv->temp_ctrl_offset = OCTO_TEMP_CTRL_OFFSET;
priv->buffer_size = OCTO_CTRL_REPORT_SIZE;
+ priv->ctrl_report_delay = CTRL_REPORT_DELAY;
priv->power_cycle_count_offset = OCTO_POWER_CYCLES;
priv->temp_ctrl_offset = QUADRO_TEMP_CTRL_OFFSET;
priv->buffer_size = QUADRO_CTRL_REPORT_SIZE;
+ priv->ctrl_report_delay = CTRL_REPORT_DELAY;
priv->flow_pulses_ctrl_offset = QUADRO_FLOW_PULSES_CTRL_OFFSET;
priv->power_cycle_count_offset = QUADRO_POWER_CYCLES;
enum chips {pfe1100, pfe3000};
/*
- * Disable status check for pfe3000 devices, because some devices report
- * communication error (invalid command) for VOUT_MODE command (0x20)
- * although correct VOUT_MODE (0x16) is returned: it leads to incorrect
- * exponent in linear mode.
+ * Disable status check because some devices report communication error
+ * (invalid command) for VOUT_MODE command (0x20) although the correct
+ * VOUT_MODE (0x16) is returned: it leads to incorrect exponent in linear
+ * mode.
+ * This affects both pfe3000 and pfe1100.
*/
-static struct pmbus_platform_data pfe3000_plat_data = {
+static struct pmbus_platform_data pfe_plat_data = {
.flags = PMBUS_SKIP_STATUS_CHECK,
};
int model;
model = (int)i2c_match_id(pfe_device_id, client)->driver_data;
+ client->dev.platform_data = &pfe_plat_data;
/*
* PFE3000-12-069RA devices may not stay in page 0 during device
* probe which leads to probe failure (read status word failed).
* So let's set the device to page 0 at the beginning.
*/
- if (model == pfe3000) {
- client->dev.platform_data = &pfe3000_plat_data;
+ if (model == pfe3000)
i2c_smbus_write_byte_data(client, PMBUS_PAGE, 0);
- }
return pmbus_do_probe(client, &pfe_driver_info[model]);
}
extern int dsp_cmx_conf(struct dsp *dsp, u32 conf_id);
extern void dsp_cmx_receive(struct dsp *dsp, struct sk_buff *skb);
extern void dsp_cmx_hdlc(struct dsp *dsp, struct sk_buff *skb);
-extern void dsp_cmx_send(void *arg);
+extern void dsp_cmx_send(struct timer_list *arg);
extern void dsp_cmx_transmit(struct dsp *dsp, struct sk_buff *skb);
extern int dsp_cmx_del_conf_member(struct dsp *dsp);
extern int dsp_cmx_del_conf(struct dsp_conf *conf);
static int dsp_count_valid; /* if we have last sample count */
void
-dsp_cmx_send(void *arg)
+dsp_cmx_send(struct timer_list *arg)
{
struct dsp_conf *conf;
struct dsp_conf_member *member;
}
/* set sample timer */
- timer_setup(&dsp_spl_tl, (void *)dsp_cmx_send, 0);
+ timer_setup(&dsp_spl_tl, dsp_cmx_send, 0);
dsp_spl_tl.expires = jiffies + dsp_tics;
dsp_spl_jiffies = dsp_spl_tl.expires;
add_timer(&dsp_spl_tl);
pkt->extradata_size = 0;
pkt->shdr.hdr.size =
- struct_size((struct hfi_session_set_buffers_pkt *)0,
- buffer_info, bd->num_buffers);
+ struct_size_t(struct hfi_session_set_buffers_pkt,
+ buffer_info, bd->num_buffers);
}
pkt->response_req = bd->response_required;
return;
}
for (len = 0; len < remain && len < host->fifo_width;) {
- /* SCR data must be read in big endian. */
- if (data->mrq->cmd->opcode == SD_APP_SEND_SCR)
- *sgp = ioread32be(host->base +
- REG_DATA_WINDOW);
- else
- *sgp = ioread32(host->base +
- REG_DATA_WINDOW);
+ *sgp = ioread32(host->base + REG_DATA_WINDOW);
sgp++;
len += 4;
}
bool enable_cmd_dat_delay;
};
+static void *sdhci_f_sdhost_priv(struct sdhci_host *host)
+{
+ struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+
+ return sdhci_pltfm_priv(pltfm_host);
+}
+
static void sdhci_f_sdh30_soft_voltage_switch(struct sdhci_host *host)
{
- struct f_sdhost_priv *priv = sdhci_priv(host);
+ struct f_sdhost_priv *priv = sdhci_f_sdhost_priv(host);
u32 ctrl = 0;
usleep_range(2500, 3000);
static void sdhci_f_sdh30_reset(struct sdhci_host *host, u8 mask)
{
- struct f_sdhost_priv *priv = sdhci_priv(host);
+ struct f_sdhost_priv *priv = sdhci_f_sdhost_priv(host);
u32 ctl;
if (sdhci_readw(host, SDHCI_CLOCK_CONTROL) == 0)
.set_uhs_signaling = sdhci_set_uhs_signaling,
};
+static const struct sdhci_pltfm_data sdhci_f_sdh30_pltfm_data = {
+ .ops = &sdhci_f_sdh30_ops,
+ .quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC
+ | SDHCI_QUIRK_INVERTED_WRITE_PROTECT,
+ .quirks2 = SDHCI_QUIRK2_SUPPORT_SINGLE
+ | SDHCI_QUIRK2_TUNING_WORK_AROUND,
+};
+
static int sdhci_f_sdh30_probe(struct platform_device *pdev)
{
struct sdhci_host *host;
struct device *dev = &pdev->dev;
- int irq, ctrl = 0, ret = 0;
+ int ctrl = 0, ret = 0;
struct f_sdhost_priv *priv;
+ struct sdhci_pltfm_host *pltfm_host;
u32 reg = 0;
- irq = platform_get_irq(pdev, 0);
- if (irq < 0)
- return irq;
-
- host = sdhci_alloc_host(dev, sizeof(struct f_sdhost_priv));
+ host = sdhci_pltfm_init(pdev, &sdhci_f_sdh30_pltfm_data,
+ sizeof(struct f_sdhost_priv));
if (IS_ERR(host))
return PTR_ERR(host);
- priv = sdhci_priv(host);
+ pltfm_host = sdhci_priv(host);
+ priv = sdhci_pltfm_priv(pltfm_host);
priv->dev = dev;
- host->quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC |
- SDHCI_QUIRK_INVERTED_WRITE_PROTECT;
- host->quirks2 = SDHCI_QUIRK2_SUPPORT_SINGLE |
- SDHCI_QUIRK2_TUNING_WORK_AROUND;
-
priv->enable_cmd_dat_delay = device_property_read_bool(dev,
"fujitsu,cmd-dat-delay-select");
if (ret)
goto err;
- platform_set_drvdata(pdev, host);
-
- host->hw_name = "f_sdh30";
- host->ops = &sdhci_f_sdh30_ops;
- host->irq = irq;
-
- host->ioaddr = devm_platform_ioremap_resource(pdev, 0);
- if (IS_ERR(host->ioaddr)) {
- ret = PTR_ERR(host->ioaddr);
- goto err;
- }
-
if (dev_of_node(dev)) {
sdhci_get_of_property(pdev);
err_clk:
clk_disable_unprepare(priv->clk_iface);
err:
- sdhci_free_host(host);
+ sdhci_pltfm_free(pdev);
+
return ret;
}
static int sdhci_f_sdh30_remove(struct platform_device *pdev)
{
struct sdhci_host *host = platform_get_drvdata(pdev);
- struct f_sdhost_priv *priv = sdhci_priv(host);
-
- sdhci_remove_host(host, readl(host->ioaddr + SDHCI_INT_STATUS) ==
- 0xffffffff);
+ struct f_sdhost_priv *priv = sdhci_f_sdhost_priv(host);
reset_control_assert(priv->rst);
clk_disable_unprepare(priv->clk);
clk_disable_unprepare(priv->clk_iface);
- sdhci_free_host(host);
- platform_set_drvdata(pdev, NULL);
+ sdhci_pltfm_unregister(pdev);
return 0;
}
bond_dev->hw_features = BOND_VLAN_FEATURES |
NETIF_F_HW_VLAN_CTAG_RX |
- NETIF_F_HW_VLAN_CTAG_FILTER;
+ NETIF_F_HW_VLAN_CTAG_FILTER |
+ NETIF_F_HW_VLAN_STAG_RX |
+ NETIF_F_HW_VLAN_STAG_FILTER;
bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
bond_dev->features |= bond_dev->hw_features;
struct felix *felix = ocelot_to_felix(ocelot);
struct dsa_port *dp;
+ rtnl_lock();
if (felix->tag_proto_ops)
felix->tag_proto_ops->teardown(ds);
+ rtnl_unlock();
dsa_switch_for_each_available_port(dp, ds)
ocelot_deinit_port(ocelot, dp->index);
static int enetc_pf_register_with_ierb(struct pci_dev *pdev)
{
- struct device_node *node = pdev->dev.of_node;
struct platform_device *ierb_pdev;
struct device_node *ierb_node;
- /* Don't register with the IERB if the PF itself is disabled */
- if (!node || !of_device_is_available(node))
- return 0;
-
ierb_node = of_find_compatible_node(NULL, NULL,
"fsl,ls1028a-enetc-ierb");
if (!ierb_node || !of_device_is_available(ierb_node))
return enetc_ierb_register_pf(ierb_pdev, pdev);
}
-static int enetc_pf_probe(struct pci_dev *pdev,
- const struct pci_device_id *ent)
+static struct enetc_si *enetc_psi_create(struct pci_dev *pdev)
{
- struct device_node *node = pdev->dev.of_node;
- struct enetc_ndev_priv *priv;
- struct net_device *ndev;
struct enetc_si *si;
- struct enetc_pf *pf;
int err;
- err = enetc_pf_register_with_ierb(pdev);
- if (err == -EPROBE_DEFER)
- return err;
- if (err)
- dev_warn(&pdev->dev,
- "Could not register with IERB driver: %pe, please update the device tree\n",
- ERR_PTR(err));
-
- err = enetc_pci_probe(pdev, KBUILD_MODNAME, sizeof(*pf));
- if (err)
- return dev_err_probe(&pdev->dev, err, "PCI probing failed\n");
+ err = enetc_pci_probe(pdev, KBUILD_MODNAME, sizeof(struct enetc_pf));
+ if (err) {
+ dev_err_probe(&pdev->dev, err, "PCI probing failed\n");
+ goto out;
+ }
si = pci_get_drvdata(pdev);
if (!si->hw.port || !si->hw.global) {
err = -ENODEV;
dev_err(&pdev->dev, "could not map PF space, probing a VF?\n");
- goto err_map_pf_space;
+ goto out_pci_remove;
}
err = enetc_setup_cbdr(&pdev->dev, &si->hw, ENETC_CBDR_DEFAULT_SIZE,
&si->cbd_ring);
if (err)
- goto err_setup_cbdr;
+ goto out_pci_remove;
err = enetc_init_port_rfs_memory(si);
if (err) {
dev_err(&pdev->dev, "Failed to initialize RFS memory\n");
- goto err_init_port_rfs;
+ goto out_teardown_cbdr;
}
err = enetc_init_port_rss_memory(si);
if (err) {
dev_err(&pdev->dev, "Failed to initialize RSS memory\n");
- goto err_init_port_rss;
+ goto out_teardown_cbdr;
}
- if (node && !of_device_is_available(node)) {
- dev_info(&pdev->dev, "device is disabled, skipping\n");
- err = -ENODEV;
- goto err_device_disabled;
+ return si;
+
+out_teardown_cbdr:
+ enetc_teardown_cbdr(&si->cbd_ring);
+out_pci_remove:
+ enetc_pci_remove(pdev);
+out:
+ return ERR_PTR(err);
+}
+
+static void enetc_psi_destroy(struct pci_dev *pdev)
+{
+ struct enetc_si *si = pci_get_drvdata(pdev);
+
+ enetc_teardown_cbdr(&si->cbd_ring);
+ enetc_pci_remove(pdev);
+}
+
+static int enetc_pf_probe(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ struct device_node *node = pdev->dev.of_node;
+ struct enetc_ndev_priv *priv;
+ struct net_device *ndev;
+ struct enetc_si *si;
+ struct enetc_pf *pf;
+ int err;
+
+ err = enetc_pf_register_with_ierb(pdev);
+ if (err == -EPROBE_DEFER)
+ return err;
+ if (err)
+ dev_warn(&pdev->dev,
+ "Could not register with IERB driver: %pe, please update the device tree\n",
+ ERR_PTR(err));
+
+ si = enetc_psi_create(pdev);
+ if (IS_ERR(si)) {
+ err = PTR_ERR(si);
+ goto err_psi_create;
}
pf = enetc_si_priv(si);
si->ndev = NULL;
free_netdev(ndev);
err_alloc_netdev:
-err_init_port_rss:
-err_init_port_rfs:
-err_device_disabled:
err_setup_mac_addresses:
- enetc_teardown_cbdr(&si->cbd_ring);
-err_setup_cbdr:
-err_map_pf_space:
- enetc_pci_remove(pdev);
-
+ enetc_psi_destroy(pdev);
+err_psi_create:
return err;
}
enetc_free_msix(priv);
enetc_free_si_resources(priv);
- enetc_teardown_cbdr(&si->cbd_ring);
free_netdev(si->ndev);
- enetc_pci_remove(pdev);
+ enetc_psi_destroy(pdev);
+}
+
+static void enetc_fixup_clear_rss_rfs(struct pci_dev *pdev)
+{
+ struct device_node *node = pdev->dev.of_node;
+ struct enetc_si *si;
+
+ /* Only apply quirk for disabled functions. For the ones
+ * that are enabled, enetc_pf_probe() will apply it.
+ */
+ if (node && of_device_is_available(node))
+ return;
+
+ si = enetc_psi_create(pdev);
+ if (si)
+ enetc_psi_destroy(pdev);
}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_FREESCALE, ENETC_DEV_ID_PF,
+ enetc_fixup_clear_rss_rfs);
static const struct pci_device_id enetc_pf_id_table[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_FREESCALE, ENETC_DEV_ID_PF) },
if (result) {
if (item_len < strlen(result[i]))
break;
- strscpy(pos, result[i], strlen(result[i]));
+ memcpy(pos, result[i], strlen(result[i]));
} else {
- strscpy(pos, items[i].name, strlen(items[i].name));
+ memcpy(pos, items[i].name, strlen(items[i].name));
}
pos += item_len;
len -= item_len;
if (!if_running)
return;
+ if (test_and_set_bit(HNS3_NIC_STATE_DOWN, &priv->state))
+ return;
+
netif_carrier_off(ndev);
netif_tx_disable(ndev);
if (!if_running)
return;
- hns3_nic_reset_all_ring(priv->ae_handle);
+ if (hns3_nic_resetting(ndev))
+ return;
+
+ if (!test_bit(HNS3_NIC_STATE_DOWN, &priv->state))
+ return;
+
+ if (hns3_nic_reset_all_ring(priv->ae_handle))
+ return;
+
+ clear_bit(HNS3_NIC_STATE_DOWN, &priv->state);
for (i = 0; i < priv->vector_num; i++)
hns3_vector_enable(&priv->tqp_vector[i]);
if (result) {
if (item_len < strlen(result[i]))
break;
- strscpy(pos, result[i], strlen(result[i]));
+ memcpy(pos, result[i], strlen(result[i]));
} else {
- strscpy(pos, items[i].name, strlen(items[i].name));
+ memcpy(pos, items[i].name, strlen(items[i].name));
}
pos += item_len;
len -= item_len;
static void hclge_sync_promisc_mode(struct hclge_dev *hdev);
static void hclge_sync_fd_table(struct hclge_dev *hdev);
static void hclge_update_fec_stats(struct hclge_dev *hdev);
+static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret,
+ int wait_cnt);
static struct hnae3_ae_algo ae_algo;
static void hclge_cfg_mac_mode(struct hclge_dev *hdev, bool enable)
{
+#define HCLGE_LINK_STATUS_WAIT_CNT 3
+
struct hclge_desc desc;
struct hclge_config_mac_mode_cmd *req =
(struct hclge_config_mac_mode_cmd *)desc.data;
req->txrx_pad_fcs_loop_en = cpu_to_le32(loop_en);
ret = hclge_cmd_send(&hdev->hw, &desc, 1);
- if (ret)
+ if (ret) {
dev_err(&hdev->pdev->dev,
"mac enable fail, ret =%d.\n", ret);
+ return;
+ }
+
+ if (!enable)
+ hclge_mac_link_status_wait(hdev, HCLGE_LINK_STATUS_DOWN,
+ HCLGE_LINK_STATUS_WAIT_CNT);
}
static int hclge_config_switch_param(struct hclge_dev *hdev, int vfid,
} while (++i < HCLGE_PHY_LINK_STATUS_NUM);
}
-static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret)
+static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret,
+ int wait_cnt)
{
-#define HCLGE_MAC_LINK_STATUS_NUM 100
-
int link_status;
int i = 0;
int ret;
return 0;
msleep(HCLGE_LINK_STATUS_MS);
- } while (++i < HCLGE_MAC_LINK_STATUS_NUM);
+ } while (++i < wait_cnt);
return -EBUSY;
}
static int hclge_mac_phy_link_status_wait(struct hclge_dev *hdev, bool en,
bool is_phy)
{
+#define HCLGE_MAC_LINK_STATUS_NUM 100
+
int link_ret;
link_ret = en ? HCLGE_LINK_STATUS_UP : HCLGE_LINK_STATUS_DOWN;
if (is_phy)
hclge_phy_link_status_wait(hdev, link_ret);
- return hclge_mac_link_status_wait(hdev, link_ret);
+ return hclge_mac_link_status_wait(hdev, link_ret,
+ HCLGE_MAC_LINK_STATUS_NUM);
}
static int hclge_set_app_loopback(struct hclge_dev *hdev, bool en)
u32 rx_pause, tx_pause;
u8 flowctl;
- if (!phydev->link || !phydev->autoneg)
+ if (!phydev->link)
return 0;
+ if (!phydev->autoneg)
+ return hclge_mac_pause_setup_hw(hdev);
+
local_advertising = linkmode_adv_to_lcl_adv_t(phydev->advertising);
if (phydev->pause)
return 0;
}
-static int hclge_mac_pause_setup_hw(struct hclge_dev *hdev)
+int hclge_mac_pause_setup_hw(struct hclge_dev *hdev)
{
bool tx_en, rx_en;
u8 pfc_bitmap);
int hclge_mac_pause_en_cfg(struct hclge_dev *hdev, bool tx, bool rx);
int hclge_pause_addr_cfg(struct hclge_dev *hdev, const u8 *mac_addr);
+int hclge_mac_pause_setup_hw(struct hclge_dev *hdev);
void hclge_pfc_rx_stats_get(struct hclge_dev *hdev, u64 *stats);
void hclge_pfc_tx_stats_get(struct hclge_dev *hdev, u64 *stats);
int hclge_tm_qs_shaper_cfg(struct hclge_vport *vport, int max_tx_rate);
static union sub_crq *ibmvnic_next_scrq(struct ibmvnic_adapter *,
struct ibmvnic_sub_crq_queue *);
static int ibmvnic_poll(struct napi_struct *napi, int data);
+static int reset_sub_crq_queues(struct ibmvnic_adapter *adapter);
+static inline void reinit_init_done(struct ibmvnic_adapter *adapter);
static void send_query_map(struct ibmvnic_adapter *adapter);
static int send_request_map(struct ibmvnic_adapter *, dma_addr_t, u32, u8);
static int send_request_unmap(struct ibmvnic_adapter *, u8);
static void free_long_term_buff(struct ibmvnic_adapter *adapter,
struct ibmvnic_long_term_buff *ltb);
static void ibmvnic_disable_irqs(struct ibmvnic_adapter *adapter);
+static void flush_reset_queue(struct ibmvnic_adapter *adapter);
struct ibmvnic_stat {
char name[ETH_GSTRING_LEN];
static int ibmvnic_login(struct net_device *netdev)
{
+ unsigned long flags, timeout = msecs_to_jiffies(20000);
struct ibmvnic_adapter *adapter = netdev_priv(netdev);
- unsigned long timeout = msecs_to_jiffies(20000);
int retry_count = 0;
int retries = 10;
bool retry;
if (!wait_for_completion_timeout(&adapter->init_done,
timeout)) {
- netdev_warn(netdev, "Login timed out, retrying...\n");
- retry = true;
- adapter->init_done_rc = 0;
- retry_count++;
- continue;
+ netdev_warn(netdev, "Login timed out\n");
+ adapter->login_pending = false;
+ goto partial_reset;
}
if (adapter->init_done_rc == ABORTED) {
"SCRQ irq initialization failed\n");
return rc;
}
+ /* Default/timeout error handling, reset and start fresh */
} else if (adapter->init_done_rc) {
netdev_warn(netdev, "Adapter login failed, init_done_rc = %d\n",
adapter->init_done_rc);
- return -EIO;
+
+partial_reset:
+ /* adapter login failed, so free any CRQs or sub-CRQs
+ * and register again before attempting to login again.
+ * If we don't do this then the VIOS may think that
+ * we are already logged in and reject any subsequent
+ * attempts
+ */
+ netdev_warn(netdev,
+ "Freeing and re-registering CRQs before attempting to login again\n");
+ retry = true;
+ adapter->init_done_rc = 0;
+ release_sub_crqs(adapter, true);
+ /* Much of this is similar logic as ibmvnic_probe(),
+ * we are essentially re-initializing communication
+ * with the server. We really should not run any
+ * resets/failovers here because this is already a form
+ * of reset and we do not want parallel resets occurring
+ */
+ do {
+ reinit_init_done(adapter);
+ /* Clear any failovers we got in the previous
+ * pass since we are re-initializing the CRQ
+ */
+ adapter->failover_pending = false;
+ release_crq_queue(adapter);
+ /* If we don't sleep here then we risk an
+ * unnecessary failover event from the VIOS.
+ * This is a known VIOS issue caused by a vnic
+ * device freeing and registering a CRQ too
+ * quickly.
+ */
+ msleep(1500);
+ /* Avoid any resets, since we are currently
+ * resetting.
+ */
+ spin_lock_irqsave(&adapter->rwi_lock, flags);
+ flush_reset_queue(adapter);
+ spin_unlock_irqrestore(&adapter->rwi_lock,
+ flags);
+
+ rc = init_crq_queue(adapter);
+ if (rc) {
+ netdev_err(netdev, "login recovery: init CRQ failed %d\n",
+ rc);
+ return -EIO;
+ }
+
+ rc = ibmvnic_reset_init(adapter, false);
+ if (rc)
+ netdev_err(netdev, "login recovery: Reset init failed %d\n",
+ rc);
+ /* IBMVNIC_CRQ_INIT will return EAGAIN if it
+ * fails, since ibmvnic_reset_init will free
+ * irq's in failure, we won't be able to receive
+ * new CRQs so we need to keep trying. probe()
+ * handles this similarly.
+ */
+ } while (rc == -EAGAIN && retry_count++ < retries);
}
} while (retry);
static void release_login_buffer(struct ibmvnic_adapter *adapter)
{
+ if (!adapter->login_buf)
+ return;
+
+ dma_unmap_single(&adapter->vdev->dev, adapter->login_buf_token,
+ adapter->login_buf_sz, DMA_TO_DEVICE);
kfree(adapter->login_buf);
adapter->login_buf = NULL;
}
static void release_login_rsp_buffer(struct ibmvnic_adapter *adapter)
{
+ if (!adapter->login_rsp_buf)
+ return;
+
+ dma_unmap_single(&adapter->vdev->dev, adapter->login_rsp_buf_token,
+ adapter->login_rsp_buf_sz, DMA_FROM_DEVICE);
kfree(adapter->login_rsp_buf);
adapter->login_rsp_buf = NULL;
}
if (rc) {
adapter->login_pending = false;
netdev_err(adapter->netdev, "Failed to send login, rc=%d\n", rc);
- goto buf_rsp_map_failed;
+ goto buf_send_failed;
}
return 0;
+buf_send_failed:
+ dma_unmap_single(dev, rsp_buffer_token, rsp_buffer_size,
+ DMA_FROM_DEVICE);
buf_rsp_map_failed:
kfree(login_rsp_buffer);
adapter->login_rsp_buf = NULL;
int num_tx_pools;
int num_rx_pools;
u64 *size_array;
+ u32 rsp_len;
int i;
/* CHECK: Test/set of login_pending does not need to be atomic
}
adapter->login_pending = false;
- dma_unmap_single(dev, adapter->login_buf_token, adapter->login_buf_sz,
- DMA_TO_DEVICE);
- dma_unmap_single(dev, adapter->login_rsp_buf_token,
- adapter->login_rsp_buf_sz, DMA_FROM_DEVICE);
-
/* If the number of queues requested can't be allocated by the
* server, the login response will return with code 1. We will need
* to resend the login buffer with fewer queues requested.
ibmvnic_reset(adapter, VNIC_RESET_FATAL);
return -EIO;
}
+
+ rsp_len = be32_to_cpu(login_rsp->len);
+ if (be32_to_cpu(login->login_rsp_len) < rsp_len ||
+ rsp_len <= be32_to_cpu(login_rsp->off_txsubm_subcrqs) ||
+ rsp_len <= be32_to_cpu(login_rsp->off_rxadd_subcrqs) ||
+ rsp_len <= be32_to_cpu(login_rsp->off_rxadd_buff_size) ||
+ rsp_len <= be32_to_cpu(login_rsp->off_supp_tx_desc)) {
+ /* This can happen if a login request times out and there are
+ * 2 outstanding login requests sent, the LOGIN_RSP crq
+ * could have been for the older login request. So we are
+ * parsing the newer response buffer which may be incomplete
+ */
+ dev_err(dev, "FATAL: Login rsp offsets/lengths invalid\n");
+ ibmvnic_reset(adapter, VNIC_RESET_FATAL);
+ return -EIO;
+ }
+
size_array = (u64 *)((u8 *)(adapter->login_rsp_buf) +
be32_to_cpu(adapter->login_rsp_buf->off_rxadd_buff_size));
/* variable buffer sizes are not supported, so just read the
if (fsp->flow_type & FLOW_MAC_EXT)
return -EINVAL;
+ spin_lock_bh(&adapter->fdir_fltr_lock);
if (adapter->fdir_active_fltr >= IAVF_MAX_FDIR_FILTERS) {
+ spin_unlock_bh(&adapter->fdir_fltr_lock);
dev_err(&adapter->pdev->dev,
"Unable to add Flow Director filter because VF reached the limit of max allowed filters (%u)\n",
IAVF_MAX_FDIR_FILTERS);
return -ENOSPC;
}
- spin_lock_bh(&adapter->fdir_fltr_lock);
if (iavf_find_fdir_fltr_by_loc(adapter, fsp->location)) {
dev_err(&adapter->pdev->dev, "Failed to add Flow Director filter, it already exists\n");
spin_unlock_bh(&adapter->fdir_fltr_lock);
case ETHTOOL_GRXCLSRLCNT:
if (!FDIR_FLTR_SUPPORT(adapter))
break;
+ spin_lock_bh(&adapter->fdir_fltr_lock);
cmd->rule_cnt = adapter->fdir_active_fltr;
+ spin_unlock_bh(&adapter->fdir_fltr_lock);
cmd->data = IAVF_MAX_FDIR_FILTERS;
ret = 0;
break;
bool iavf_fdir_is_dup_fltr(struct iavf_adapter *adapter, struct iavf_fdir_fltr *fltr)
{
struct iavf_fdir_fltr *tmp;
+ bool ret = false;
+ spin_lock_bh(&adapter->fdir_fltr_lock);
list_for_each_entry(tmp, &adapter->fdir_list_head, list) {
if (tmp->flow_type != fltr->flow_type)
continue;
!memcmp(&tmp->ip_data, &fltr->ip_data,
sizeof(fltr->ip_data)) &&
!memcmp(&tmp->ext_data, &fltr->ext_data,
- sizeof(fltr->ext_data)))
- return true;
+ sizeof(fltr->ext_data))) {
+ ret = true;
+ break;
+ }
}
+ spin_unlock_bh(&adapter->fdir_fltr_lock);
- return false;
+ return ret;
}
/**
u32 qbv_config_change_errors;
bool qbv_transition;
unsigned int qbv_count;
+ /* Access to oper_gate_closed, admin_gate_closed and qbv_transition
+ * are protected by the qbv_tx_lock.
+ */
+ spinlock_t qbv_tx_lock;
/* OS defined structs */
struct pci_dev *pdev;
adapter->nfc_rule_count = 0;
spin_lock_init(&adapter->stats64_lock);
+ spin_lock_init(&adapter->qbv_tx_lock);
/* Assume MSI-X interrupts, will be checked during IRQ allocation */
adapter->flags |= IGC_FLAG_HAS_MSIX;
return igc_tsn_offload_apply(adapter);
}
-static int igc_tsn_clear_schedule(struct igc_adapter *adapter)
+static int igc_qbv_clear_schedule(struct igc_adapter *adapter)
{
+ unsigned long flags;
int i;
adapter->base_time = 0;
adapter->cycle_time = NSEC_PER_SEC;
adapter->taprio_offload_enable = false;
adapter->qbv_config_change_errors = 0;
- adapter->qbv_transition = false;
adapter->qbv_count = 0;
for (i = 0; i < adapter->num_tx_queues; i++) {
ring->start_time = 0;
ring->end_time = NSEC_PER_SEC;
ring->max_sdu = 0;
+ }
+
+ spin_lock_irqsave(&adapter->qbv_tx_lock, flags);
+
+ adapter->qbv_transition = false;
+
+ for (i = 0; i < adapter->num_tx_queues; i++) {
+ struct igc_ring *ring = adapter->tx_ring[i];
+
ring->oper_gate_closed = false;
ring->admin_gate_closed = false;
}
+ spin_unlock_irqrestore(&adapter->qbv_tx_lock, flags);
+
+ return 0;
+}
+
+static int igc_tsn_clear_schedule(struct igc_adapter *adapter)
+{
+ igc_qbv_clear_schedule(adapter);
+
return 0;
}
struct igc_hw *hw = &adapter->hw;
u32 start_time = 0, end_time = 0;
struct timespec64 now;
+ unsigned long flags;
size_t n;
int i;
start_time += e->interval;
}
+ spin_lock_irqsave(&adapter->qbv_tx_lock, flags);
+
/* Check whether a queue gets configured.
* If not, set the start and end time to be end time.
*/
}
}
+ spin_unlock_irqrestore(&adapter->qbv_tx_lock, flags);
+
for (i = 0; i < adapter->num_tx_queues; i++) {
struct igc_ring *ring = adapter->tx_ring[i];
struct net_device *dev = adapter->netdev;
{
struct igc_adapter *adapter = container_of(timer, struct igc_adapter,
hrtimer);
+ unsigned long flags;
unsigned int i;
+ spin_lock_irqsave(&adapter->qbv_tx_lock, flags);
+
adapter->qbv_transition = true;
for (i = 0; i < adapter->num_tx_queues; i++) {
struct igc_ring *tx_ring = adapter->tx_ring[i];
}
}
adapter->qbv_transition = false;
+
+ spin_unlock_irqrestore(&adapter->qbv_tx_lock, flags);
+
return HRTIMER_NORESTART;
}
static bool __prestera_fi_is_direct(struct fib_info *fi)
{
- struct fib_nh *fib_nh;
+ struct fib_nh_common *fib_nhc;
if (fib_info_num_path(fi) == 1) {
- fib_nh = fib_info_nh(fi, 0);
- if (fib_nh->fib_nh_gw_family == AF_UNSPEC)
+ fib_nhc = fib_info_nhc(fi, 0);
+ if (fib_nhc->nhc_gw_family == AF_UNSPEC)
return true;
}
__prestera_util_kern_n_is_reachable_v4(u32 tb_id, __be32 *addr,
struct net_device *dev)
{
- struct fib_nh *fib_nh;
+ struct fib_nh_common *fib_nhc;
struct fib_result res;
bool reachable;
if (!prestera_util_kern_get_route(&res, tb_id, addr))
if (prestera_fi_is_direct(res.fi)) {
- fib_nh = fib_info_nh(res.fi, 0);
- if (dev == fib_nh->fib_nh_dev)
+ fib_nhc = fib_info_nhc(res.fi, 0);
+ if (dev == fib_nhc->nhc_dev)
reachable = true;
}
if (info->family == AF_INET) {
fen4_info = container_of(info, struct fib_entry_notifier_info,
info);
- return &fib_info_nh(fen4_info->fi, n)->nh_common;
+ return fib_info_nhc(fen4_info->fi, n);
} else if (info->family == AF_INET6) {
fen6_info = container_of(info, struct fib6_entry_notifier_info,
info);
/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. */
#include "reporter_vnic.h"
+#include "en_stats.h"
#include "devlink.h"
#define VNIC_ENV_GET64(vnic_env_stats, c) \
if (err)
return err;
- err = devlink_fmsg_u64_pair_put(fmsg, "total_error_queues",
- VNIC_ENV_GET64(&vnic, total_error_queues));
- if (err)
- return err;
-
- err = devlink_fmsg_u64_pair_put(fmsg, "send_queue_priority_update_flow",
- VNIC_ENV_GET64(&vnic, send_queue_priority_update_flow));
- if (err)
- return err;
-
- err = devlink_fmsg_u64_pair_put(fmsg, "comp_eq_overrun",
- VNIC_ENV_GET64(&vnic, comp_eq_overrun));
- if (err)
- return err;
-
- err = devlink_fmsg_u64_pair_put(fmsg, "async_eq_overrun",
- VNIC_ENV_GET64(&vnic, async_eq_overrun));
- if (err)
- return err;
-
- err = devlink_fmsg_u64_pair_put(fmsg, "cq_overrun",
- VNIC_ENV_GET64(&vnic, cq_overrun));
- if (err)
- return err;
-
- err = devlink_fmsg_u64_pair_put(fmsg, "invalid_command",
- VNIC_ENV_GET64(&vnic, invalid_command));
- if (err)
- return err;
-
- err = devlink_fmsg_u64_pair_put(fmsg, "quota_exceeded_command",
- VNIC_ENV_GET64(&vnic, quota_exceeded_command));
- if (err)
- return err;
-
- err = devlink_fmsg_u64_pair_put(fmsg, "nic_receive_steering_discard",
- VNIC_ENV_GET64(&vnic, nic_receive_steering_discard));
- if (err)
- return err;
-
- err = devlink_fmsg_u64_pair_put(fmsg, "generated_pkt_steering_fail",
- VNIC_ENV_GET64(&vnic, generated_pkt_steering_fail));
- if (err)
- return err;
-
- err = devlink_fmsg_u64_pair_put(fmsg, "handled_pkt_steering_fail",
- VNIC_ENV_GET64(&vnic, handled_pkt_steering_fail));
- if (err)
- return err;
+ if (MLX5_CAP_GEN(dev, vnic_env_queue_counters)) {
+ err = devlink_fmsg_u32_pair_put(fmsg, "total_error_queues",
+ VNIC_ENV_GET(&vnic, total_error_queues));
+ if (err)
+ return err;
+
+ err = devlink_fmsg_u32_pair_put(fmsg, "send_queue_priority_update_flow",
+ VNIC_ENV_GET(&vnic,
+ send_queue_priority_update_flow));
+ if (err)
+ return err;
+ }
+
+ if (MLX5_CAP_GEN(dev, eq_overrun_count)) {
+ err = devlink_fmsg_u32_pair_put(fmsg, "comp_eq_overrun",
+ VNIC_ENV_GET(&vnic, comp_eq_overrun));
+ if (err)
+ return err;
+
+ err = devlink_fmsg_u32_pair_put(fmsg, "async_eq_overrun",
+ VNIC_ENV_GET(&vnic, async_eq_overrun));
+ if (err)
+ return err;
+ }
+
+ if (MLX5_CAP_GEN(dev, vnic_env_cq_overrun)) {
+ err = devlink_fmsg_u32_pair_put(fmsg, "cq_overrun",
+ VNIC_ENV_GET(&vnic, cq_overrun));
+ if (err)
+ return err;
+ }
+
+ if (MLX5_CAP_GEN(dev, invalid_command_count)) {
+ err = devlink_fmsg_u32_pair_put(fmsg, "invalid_command",
+ VNIC_ENV_GET(&vnic, invalid_command));
+ if (err)
+ return err;
+ }
+
+ if (MLX5_CAP_GEN(dev, quota_exceeded_count)) {
+ err = devlink_fmsg_u32_pair_put(fmsg, "quota_exceeded_command",
+ VNIC_ENV_GET(&vnic, quota_exceeded_command));
+ if (err)
+ return err;
+ }
+
+ if (MLX5_CAP_GEN(dev, nic_receive_steering_discard)) {
+ err = devlink_fmsg_u64_pair_put(fmsg, "nic_receive_steering_discard",
+ VNIC_ENV_GET64(&vnic,
+ nic_receive_steering_discard));
+ if (err)
+ return err;
+ }
+
+ if (MLX5_CAP_GEN(dev, vnic_env_cnt_steering_fail)) {
+ err = devlink_fmsg_u64_pair_put(fmsg, "generated_pkt_steering_fail",
+ VNIC_ENV_GET64(&vnic,
+ generated_pkt_steering_fail));
+ if (err)
+ return err;
+
+ err = devlink_fmsg_u64_pair_put(fmsg, "handled_pkt_steering_fail",
+ VNIC_ENV_GET64(&vnic, handled_pkt_steering_fail));
+ if (err)
+ return err;
+ }
err = devlink_fmsg_obj_nest_end(fmsg);
if (err)
attr = mlx5e_tc_get_encap_attr(flow);
esw_attr = attr->esw_attr;
- if (flow_flag_test(flow, SLOW))
+ if (flow_flag_test(flow, SLOW)) {
mlx5e_tc_unoffload_from_slow_path(esw, flow);
- else
+ } else {
mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->attr);
+ mlx5e_tc_unoffload_flow_post_acts(flow);
+ }
mlx5e_tc_detach_mod_hdr(priv, flow, attr);
attr->modify_hdr = NULL;
static int mlx5e_nic_init(struct mlx5_core_dev *mdev,
struct net_device *netdev)
{
+ const bool take_rtnl = netdev->reg_state == NETREG_REGISTERED;
struct mlx5e_priv *priv = netdev_priv(netdev);
struct mlx5e_flow_steering *fs;
int err;
mlx5_core_err(mdev, "TLS initialization failed, %d\n", err);
mlx5e_health_create_reporters(priv);
+
+ /* If netdev is already registered (e.g. move from uplink to nic profile),
+ * RTNL lock must be held before triggering netdev notifiers.
+ */
+ if (take_rtnl)
+ rtnl_lock();
+
/* update XDP supported features */
mlx5e_set_xdp_feature(netdev);
+ if (take_rtnl)
+ rtnl_unlock();
+
return 0;
}
{
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
struct mlx5_flow_attr *attr = flow->attr;
- struct mlx5_esw_flow_attr *esw_attr;
- esw_attr = attr->esw_attr;
mlx5e_put_flow_tunnel_id(flow);
remove_unready_flow(flow);
mlx5_tc_ct_match_del(get_ct_priv(priv), &flow->attr->ct_attr);
- if (esw_attr->int_port)
- mlx5e_tc_int_port_put(mlx5e_get_int_port_priv(priv), esw_attr->int_port);
-
- if (esw_attr->dest_int_port)
- mlx5e_tc_int_port_put(mlx5e_get_int_port_priv(priv), esw_attr->dest_int_port);
-
if (flow_flag_test(flow, L3_TO_L2_DECAP))
mlx5e_detach_decap(priv, flow);
mlx5_free_flow_attr_actions(struct mlx5e_tc_flow *flow, struct mlx5_flow_attr *attr)
{
struct mlx5_core_dev *counter_dev = get_flow_counter_dev(flow);
+ struct mlx5_esw_flow_attr *esw_attr;
if (!attr)
return;
mlx5e_tc_detach_mod_hdr(flow->priv, flow, attr);
}
+ if (mlx5e_is_eswitch_flow(flow)) {
+ esw_attr = attr->esw_attr;
+
+ if (esw_attr->int_port)
+ mlx5e_tc_int_port_put(mlx5e_get_int_port_priv(flow->priv),
+ esw_attr->int_port);
+
+ if (esw_attr->dest_int_port)
+ mlx5e_tc_int_port_put(mlx5e_get_int_port_priv(flow->priv),
+ esw_attr->dest_int_port);
+ }
+
mlx5_tc_ct_delete_flow(get_ct_priv(flow->priv), attr);
free_branch_attr(flow, attr->branch_true);
} else if (mlx5_core_is_ec_vf_vport(esw->dev, vport_num)) {
memcpy(dl_port->attrs.switch_id.id, ppid.id, ppid.id_len);
dl_port->attrs.switch_id.id_len = ppid.id_len;
- devlink_port_attrs_pci_vf_set(dl_port, controller_num, pfnum,
+ devlink_port_attrs_pci_vf_set(dl_port, 0, pfnum,
vport_num - 1, false);
}
return dl_port;
for (i = 0; i < ldev->ports; i++) {
for (j = 0; j < ldev->buckets; j++) {
idx = i * ldev->buckets + j;
- if (ldev->v2p_map[i] == ports[i])
+ if (ldev->v2p_map[idx] == ports[idx])
continue;
dest.vport.vhca_id = MLX5_CAP_GEN(ldev->pf[ports[idx] - 1].dev,
clock = container_of(timer, struct mlx5_clock, timer);
mdev = container_of(clock, struct mlx5_core_dev, clock);
+ if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+ goto out;
+
write_seqlock_irqsave(&clock->lock, flags);
timecounter_read(&timer->tc);
mlx5_update_clock_info_page(mdev);
write_sequnlock_irqrestore(&clock->lock, flags);
+
+out:
schedule_delayed_work(&timer->overflow_work, timer->overflow_period);
}
mlx5_enter_error_state(dev, false);
mlx5_error_sw_reset(dev);
- mlx5_unload_one(dev, true);
+ mlx5_unload_one(dev, false);
mlx5_drain_health_wq(dev);
mlx5_pci_disable_device(dev);
static inline int mlx5_vport_to_func_id(const struct mlx5_core_dev *dev, u16 vport, bool ec_vf_func)
{
- return ec_vf_func ? vport - mlx5_core_ec_vf_vport_base(dev)
+ return ec_vf_func ? vport - mlx5_core_ec_vf_vport_base(dev) + 1
: vport;
}
host_total_vfs = MLX5_GET(query_esw_functions_out, out,
host_params_context.host_total_vfs);
kvfree(out);
- if (host_total_vfs)
- return host_total_vfs;
+ return host_total_vfs;
}
done:
u32 chunk_size;
u32 index;
- chunk_size = ilog2(num_of_actions);
+ chunk_size = ilog2(roundup_pow_of_two(num_of_actions));
/* HW modify action index granularity is at least 64B */
chunk_size = max_t(u32, chunk_size, DR_CHUNK_SIZE_8);
#include <linux/ethtool.h>
#include <linux/filter.h>
#include <linux/mm.h>
+#include <linux/pci.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>
static int mana_dealloc_queues(struct net_device *ndev)
{
struct mana_port_context *apc = netdev_priv(ndev);
+ unsigned long timeout = jiffies + 120 * HZ;
struct gdma_dev *gd = apc->ac->gdma_dev;
struct mana_txq *txq;
+ struct sk_buff *skb;
int i, err;
+ u32 tsleep;
if (apc->port_is_up)
return -EINVAL;
* to false, but it doesn't matter since mana_start_xmit() drops any
* new packets due to apc->port_is_up being false.
*
- * Drain all the in-flight TX packets
+ * Drain all the in-flight TX packets.
+ * A timeout of 120 seconds for all the queues is used.
+ * This will break the while loop when h/w is not responding.
+ * This value of 120 has been decided here considering max
+ * number of queues.
*/
+
for (i = 0; i < apc->num_queues; i++) {
txq = &apc->tx_qp[i].txq;
-
- while (atomic_read(&txq->pending_sends) > 0)
- usleep_range(1000, 2000);
+ tsleep = 1000;
+ while (atomic_read(&txq->pending_sends) > 0 &&
+ time_before(jiffies, timeout)) {
+ usleep_range(tsleep, tsleep + 1000);
+ tsleep <<= 1;
+ }
+ if (atomic_read(&txq->pending_sends)) {
+ err = pcie_flr(to_pci_dev(gd->gdma_context->dev));
+ if (err) {
+ netdev_err(ndev, "flr failed %d with %d pkts pending in txq %u\n",
+ err, atomic_read(&txq->pending_sends),
+ txq->gdma_txq_id);
+ }
+ break;
+ }
}
+ for (i = 0; i < apc->num_queues; i++) {
+ txq = &apc->tx_qp[i].txq;
+ while ((skb = skb_dequeue(&txq->pending_skbs))) {
+ mana_unmap_skb(skb, apc);
+ dev_kfree_skb_any(skb);
+ }
+ atomic_set(&txq->pending_sends, 0);
+ }
/* We're 100% sure the queues can no longer be woken up, because
* we're sure now mana_poll_tx_cq() can't be running.
*/
static void ionic_tx_timeout_work(struct work_struct *ws)
{
struct ionic_lif *lif = container_of(ws, struct ionic_lif, tx_timeout_work);
+ int err;
if (test_bit(IONIC_LIF_F_FW_RESET, lif->state))
return;
mutex_lock(&lif->queue_lock);
ionic_stop_queues_reconfig(lif);
- ionic_start_queues_reconfig(lif);
+ err = ionic_start_queues_reconfig(lif);
mutex_unlock(&lif->queue_lock);
+
+ if (err)
+ dev_err(lif->ionic->dev, "%s: Restarting queues failed\n", __func__);
}
static void ionic_tx_timeout(struct net_device *netdev, unsigned int txqueue)
if (err) {
dev_err(lif->ionic->dev,
"CMB restore failed: %d\n", err);
- goto errout;
+ goto err_out;
}
}
- ionic_start_queues_reconfig(lif);
- } else {
- /* This was detached in ionic_stop_queues_reconfig() */
- netif_device_attach(lif->netdev);
+ err = ionic_start_queues_reconfig(lif);
+ if (err) {
+ dev_err(lif->ionic->dev,
+ "CMB reconfig failed: %d\n", err);
+ goto err_out;
+ }
}
-errout:
+err_out:
+ /* This was detached in ionic_stop_queues_reconfig() */
+ netif_device_attach(lif->netdev);
+
return err;
}
u64_stats_update_begin(&rxsc_stats->syncp);
rxsc_stats->stats.InPktsLate++;
u64_stats_update_end(&rxsc_stats->syncp);
- secy->netdev->stats.rx_dropped++;
+ DEV_STATS_INC(secy->netdev, rx_dropped);
return false;
}
rxsc_stats->stats.InPktsNotValid++;
u64_stats_update_end(&rxsc_stats->syncp);
this_cpu_inc(rx_sa->stats->InPktsNotValid);
- secy->netdev->stats.rx_errors++;
+ DEV_STATS_INC(secy->netdev, rx_errors);
return false;
}
u64_stats_update_begin(&secy_stats->syncp);
secy_stats->stats.InPktsNoTag++;
u64_stats_update_end(&secy_stats->syncp);
- macsec->secy.netdev->stats.rx_dropped++;
+ DEV_STATS_INC(macsec->secy.netdev, rx_dropped);
continue;
}
u64_stats_update_begin(&secy_stats->syncp);
secy_stats->stats.InPktsBadTag++;
u64_stats_update_end(&secy_stats->syncp);
- secy->netdev->stats.rx_errors++;
+ DEV_STATS_INC(secy->netdev, rx_errors);
goto drop_nosa;
}
u64_stats_update_begin(&rxsc_stats->syncp);
rxsc_stats->stats.InPktsNotUsingSA++;
u64_stats_update_end(&rxsc_stats->syncp);
- secy->netdev->stats.rx_errors++;
+ DEV_STATS_INC(secy->netdev, rx_errors);
if (active_rx_sa)
this_cpu_inc(active_rx_sa->stats->InPktsNotUsingSA);
goto drop_nosa;
u64_stats_update_begin(&rxsc_stats->syncp);
rxsc_stats->stats.InPktsLate++;
u64_stats_update_end(&rxsc_stats->syncp);
- macsec->secy.netdev->stats.rx_dropped++;
+ DEV_STATS_INC(macsec->secy.netdev, rx_dropped);
goto drop;
}
}
if (ret == NET_RX_SUCCESS)
count_rx(dev, len);
else
- macsec->secy.netdev->stats.rx_dropped++;
+ DEV_STATS_INC(macsec->secy.netdev, rx_dropped);
rcu_read_unlock();
u64_stats_update_begin(&secy_stats->syncp);
secy_stats->stats.InPktsNoSCI++;
u64_stats_update_end(&secy_stats->syncp);
- macsec->secy.netdev->stats.rx_errors++;
+ DEV_STATS_INC(macsec->secy.netdev, rx_errors);
continue;
}
secy_stats->stats.InPktsUnknownSCI++;
u64_stats_update_end(&secy_stats->syncp);
} else {
- macsec->secy.netdev->stats.rx_dropped++;
+ DEV_STATS_INC(macsec->secy.netdev, rx_dropped);
}
}
if (!secy->operational) {
kfree_skb(skb);
- dev->stats.tx_dropped++;
+ DEV_STATS_INC(dev, tx_dropped);
return NETDEV_TX_OK;
}
skb = macsec_encrypt(skb, dev);
if (IS_ERR(skb)) {
if (PTR_ERR(skb) != -EINPROGRESS)
- dev->stats.tx_dropped++;
+ DEV_STATS_INC(dev, tx_dropped);
return NETDEV_TX_OK;
}
dev_fetch_sw_netstats(s, dev->tstats);
- s->rx_dropped = dev->stats.rx_dropped;
- s->tx_dropped = dev->stats.tx_dropped;
- s->rx_errors = dev->stats.rx_errors;
+ s->rx_dropped = atomic_long_read(&dev->stats.__rx_dropped);
+ s->tx_dropped = atomic_long_read(&dev->stats.__tx_dropped);
+ s->rx_errors = atomic_long_read(&dev->stats.__rx_errors);
}
static int macsec_get_iflink(const struct net_device *dev)
phy_write_mmd(phydev, MDIO_MMD_PCS, offsets[i],
mac[(i * 2) + 1] | (mac[(i * 2)] << 8));
- /* Enable WOL function */
- ret = phy_modify_mmd(phydev, MDIO_MMD_PCS, AT803X_PHY_MMD3_WOL_CTRL,
- 0, AT803X_WOL_EN);
- if (ret)
- return ret;
+ /* Enable WOL function for 1588 */
+ if (phydev->drv->phy_id == ATH8031_PHY_ID) {
+ ret = phy_modify_mmd(phydev, MDIO_MMD_PCS,
+ AT803X_PHY_MMD3_WOL_CTRL,
+ 0, AT803X_WOL_EN);
+ if (ret)
+ return ret;
+ }
/* Enable WOL interrupt */
ret = phy_modify(phydev, AT803X_INTR_ENABLE, 0, AT803X_INTR_ENABLE_WOL);
if (ret)
return ret;
} else {
- /* Disable WoL function */
- ret = phy_modify_mmd(phydev, MDIO_MMD_PCS, AT803X_PHY_MMD3_WOL_CTRL,
- AT803X_WOL_EN, 0);
- if (ret)
- return ret;
+ /* Disable WoL function for 1588 */
+ if (phydev->drv->phy_id == ATH8031_PHY_ID) {
+ ret = phy_modify_mmd(phydev, MDIO_MMD_PCS,
+ AT803X_PHY_MMD3_WOL_CTRL,
+ AT803X_WOL_EN, 0);
+ if (ret)
+ return ret;
+ }
/* Disable WOL interrupt */
ret = phy_modify(phydev, AT803X_INTR_ENABLE, AT803X_INTR_ENABLE_WOL, 0);
if (ret)
wol->supported = WAKE_MAGIC;
wol->wolopts = 0;
- value = phy_read_mmd(phydev, MDIO_MMD_PCS, AT803X_PHY_MMD3_WOL_CTRL);
+ value = phy_read(phydev, AT803X_INTR_ENABLE);
if (value < 0)
return;
- if (value & AT803X_WOL_EN)
+ if (value & AT803X_INTR_ENABLE_WOL)
wol->wolopts |= WAKE_MAGIC;
}
if (phydev->drv->phy_id == ATH8031_PHY_ID) {
int ccr = phy_read(phydev, AT803X_REG_CHIP_CONFIG);
int mode_cfg;
- struct ethtool_wolinfo wol = {
- .wolopts = 0,
- };
if (ccr < 0)
return ccr;
break;
}
- /* Disable WOL by default */
- ret = at803x_set_wol(phydev, &wol);
- if (ret < 0) {
- phydev_err(phydev, "failed to disable WOL on probe: %d\n", ret);
+ /* Disable WoL in 1588 register which is enabled
+ * by default
+ */
+ ret = phy_modify_mmd(phydev, MDIO_MMD_PCS,
+ AT803X_PHY_MMD3_WOL_CTRL,
+ AT803X_WOL_EN, 0);
+ if (ret)
return ret;
- }
}
return 0;
.flags = PHY_POLL_CABLE_TEST,
.config_init = at803x_config_init,
.link_change_notify = at803x_link_change_notify,
- .set_wol = at803x_set_wol,
- .get_wol = at803x_get_wol,
.suspend = at803x_suspend,
.resume = at803x_resume,
/* PHY_BASIC_FEATURES */
if (zerocopy)
return false;
- if (SKB_DATA_ALIGN(len + TUN_RX_PAD) +
+ if (SKB_DATA_ALIGN(len + TUN_RX_PAD + XDP_PACKET_HEADROOM) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
return false;
return vninode;
}
+static void vxlan_vni_free(struct vxlan_vni_node *vninode)
+{
+ free_percpu(vninode->stats);
+ kfree(vninode);
+}
+
static int vxlan_vni_add(struct vxlan_dev *vxlan,
struct vxlan_vni_group *vg,
u32 vni, union vxlan_addr *group,
&vninode->vnode,
vxlan_vni_rht_params);
if (err) {
- kfree(vninode);
+ vxlan_vni_free(vninode);
return err;
}
struct vxlan_vni_node *v;
v = container_of(rcu, struct vxlan_vni_node, rcu);
- free_percpu(v->stats);
- kfree(v);
+ vxlan_vni_free(v);
}
static int vxlan_vni_del(struct vxlan_dev *vxlan,
#include "allowedips.h"
#include "peer.h"
-enum { MAX_ALLOWEDIPS_BITS = 128 };
+enum { MAX_ALLOWEDIPS_DEPTH = 129 };
static struct kmem_cache *node_cache;
struct allowedips_node __rcu *p, unsigned int *len)
{
if (rcu_access_pointer(p)) {
- if (WARN_ON(IS_ENABLED(DEBUG) && *len >= MAX_ALLOWEDIPS_BITS))
+ if (WARN_ON(IS_ENABLED(DEBUG) && *len >= MAX_ALLOWEDIPS_DEPTH))
return;
stack[(*len)++] = rcu_dereference_raw(p);
}
static void root_free_rcu(struct rcu_head *rcu)
{
- struct allowedips_node *node, *stack[MAX_ALLOWEDIPS_BITS] = {
+ struct allowedips_node *node, *stack[MAX_ALLOWEDIPS_DEPTH] = {
container_of(rcu, struct allowedips_node, rcu) };
unsigned int len = 1;
static void root_remove_peer_lists(struct allowedips_node *root)
{
- struct allowedips_node *node, *stack[MAX_ALLOWEDIPS_BITS] = { root };
+ struct allowedips_node *node, *stack[MAX_ALLOWEDIPS_DEPTH] = { root };
unsigned int len = 1;
while (len > 0 && (node = stack[--len])) {
wg_allowedips_remove_by_peer(&t, a, &mutex);
test_negative(4, a, 192, 168, 0, 1);
- /* These will hit the WARN_ON(len >= MAX_ALLOWEDIPS_BITS) in free_node
+ /* These will hit the WARN_ON(len >= MAX_ALLOWEDIPS_DEPTH) in free_node
* if something goes wrong.
*/
- for (i = 0; i < MAX_ALLOWEDIPS_BITS; ++i) {
- part = cpu_to_be64(~(1LLU << (i % 64)));
- memset(&ip, 0xff, 16);
- memcpy((u8 *)&ip + (i < 64) * 8, &part, 8);
+ for (i = 0; i < 64; ++i) {
+ part = cpu_to_be64(~0LLU << i);
+ memset(&ip, 0xff, 8);
+ memcpy((u8 *)&ip + 8, &part, 8);
+ wg_allowedips_insert_v6(&t, &ip, 128, a, &mutex);
+ memcpy(&ip, &part, 8);
+ memset((u8 *)&ip + 8, 0, 8);
wg_allowedips_insert_v6(&t, &ip, 128, a, &mutex);
}
-
+ memset(&ip, 0, 16);
+ wg_allowedips_insert_v6(&t, &ip, 128, a, &mutex);
wg_allowedips_free(&t, &mutex);
wg_allowedips_init(&t);
struct wmi_tlv *tlv;
void *ptr;
int i, ret, len;
- u32 *tmp_ptr;
- u8 extraie_len_with_pad = 0;
+ u32 *tmp_ptr, extraie_len_with_pad = 0;
struct ath12k_wmi_hint_short_ssid_arg *s_ssid = NULL;
struct ath12k_wmi_hint_bssid_arg *hint_bssid = NULL;
params_size -= BRCMF_SCAN_PARAMS_V2_FIXED_SIZE;
params_size += BRCMF_SCAN_PARAMS_FIXED_SIZE;
params_v1 = kzalloc(params_size, GFP_KERNEL);
+ if (!params_v1) {
+ err = -ENOMEM;
+ goto exit_params;
+ }
params_v1->version = cpu_to_le32(BRCMF_ESCAN_REQ_VERSION);
brcmf_scan_params_v2_to_v1(¶ms->params_v2_le, ¶ms_v1->params_le);
kfree(params);
bphy_err(drvr, "error (%d)\n", err);
}
+exit_params:
kfree(params);
exit:
return err;
u32 reg;
int ret;
- if (chip_id != RTL8852A && chip_id != RTL8852B)
+ if (chip_id != RTL8852B)
return 0;
ret = rtw89_mac_check_mac_en(rtwdev, mac_idx, RTW89_CMAC_SEL);
struct gnttab_map_grant_ref *gop = queue->tx_map_ops + *map_ops;
struct xen_netif_tx_request *txp = first;
- nr_slots = shinfo->nr_frags + 1;
+ nr_slots = shinfo->nr_frags + frag_overflow + 1;
copy_count(skb) = 0;
XENVIF_TX_CB(skb)->split_mask = 0;
}
}
- for (shinfo->nr_frags = 0; shinfo->nr_frags < nr_slots;
- shinfo->nr_frags++, gop++) {
+ for (shinfo->nr_frags = 0; nr_slots > 0 && shinfo->nr_frags < MAX_SKB_FRAGS;
+ shinfo->nr_frags++, gop++, nr_slots--) {
index = pending_index(queue->pending_cons++);
pending_idx = queue->pending_ring[index];
xenvif_tx_create_map_op(queue, pending_idx, txp,
txp++;
}
- if (frag_overflow) {
+ if (nr_slots > 0) {
shinfo = skb_shinfo(nskb);
frags = shinfo->frags;
- for (shinfo->nr_frags = 0; shinfo->nr_frags < frag_overflow;
+ for (shinfo->nr_frags = 0; shinfo->nr_frags < nr_slots;
shinfo->nr_frags++, txp++, gop++) {
index = pending_index(queue->pending_cons++);
pending_idx = queue->pending_ring[index];
}
skb_shinfo(skb)->frag_list = nskb;
+ } else if (nskb) {
+ /* A frag_list skb was allocated but it is no longer needed
+ * because enough slots were converted to copy ops above.
+ */
+ kfree_skb(nskb);
}
(*copy_ops) = cop - queue->tx_copy_ops;
*/
nvme_mpath_clear_ctrl_paths(ctrl);
+ /*
+ * Unquiesce io queues so any pending IO won't hang, especially
+ * those submitted from scan work
+ */
+ nvme_unquiesce_io_queues(ctrl);
+
/* prevent racing with ns scanning */
flush_work(&ctrl->scan_work);
* removing the namespaces' disks; fail all the queues now to avoid
* potentially having to clean up the failed sync later.
*/
- if (ctrl->state == NVME_CTRL_DEAD) {
+ if (ctrl->state == NVME_CTRL_DEAD)
nvme_mark_namespaces_dead(ctrl);
- nvme_unquiesce_io_queues(ctrl);
- }
/* this is a no-op when called from the controller reset handler */
nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
if (!(ioucmd->flags & IORING_URING_CMD_POLLED))
return 0;
- rcu_read_lock();
req = READ_ONCE(ioucmd->cookie);
if (req && blk_rq_is_poll(req))
ret = blk_rq_poll(req, iob, poll_flags);
- rcu_read_unlock();
return ret;
}
#ifdef CONFIG_NVME_MULTIPATH
{ PCI_DEVICE(0x1d97, 0x2263), /* SPCC */
.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE(0x144d, 0xa80b), /* Samsung PM9B1 256G and 512G */
- .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES |
+ NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x144d, 0xa809), /* Samsung MZALQ256HBJD 256G */
.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE(0x144d, 0xa802), /* Samsung SM953 */
goto out_cleanup_tagset;
if (!new) {
+ nvme_start_freeze(&ctrl->ctrl);
nvme_unquiesce_io_queues(&ctrl->ctrl);
if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
/*
* to be safe.
*/
ret = -ENODEV;
+ nvme_unfreeze(&ctrl->ctrl);
goto out_wait_freeze_timed_out;
}
blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
bool remove)
{
if (ctrl->ctrl.queue_count > 1) {
- nvme_start_freeze(&ctrl->ctrl);
nvme_quiesce_io_queues(&ctrl->ctrl);
nvme_sync_io_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
goto out_cleanup_connect_q;
if (!new) {
+ nvme_start_freeze(ctrl);
nvme_unquiesce_io_queues(ctrl);
if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
/*
* to be safe.
*/
ret = -ENODEV;
+ nvme_unfreeze(ctrl);
goto out_wait_freeze_timed_out;
}
blk_mq_update_nr_hw_queues(ctrl->tagset,
if (ctrl->queue_count <= 1)
return;
nvme_quiesce_admin_queue(ctrl);
- nvme_start_freeze(ctrl);
nvme_quiesce_io_queues(ctrl);
nvme_sync_io_queues(ctrl);
nvme_tcp_stop_io_queues(ctrl);
MODULE_PARM_DESC(sba_reserve_agpgart, "Reserve half of IO pdir as AGPGART");
#endif
+struct proc_dir_entry *proc_runway_root __ro_after_init;
+struct proc_dir_entry *proc_mckinley_root __ro_after_init;
/************************************
** SBA register read and write support
#ifdef CONFIG_PROC_FS
switch (dev->id.hversion) {
case PLUTO_MCKINLEY_PORT:
+ if (!proc_mckinley_root)
+ proc_mckinley_root = proc_mkdir("bus/mckinley", NULL);
root = proc_mckinley_root;
break;
case ASTRO_RUNWAY_PORT:
case IKE_MERCED_PORT:
default:
+ if (!proc_runway_root)
+ proc_runway_root = proc_mkdir("bus/runway", NULL);
root = proc_runway_root;
break;
}
#include <linux/pci.h>
#include <linux/errno.h>
#include <linux/ioport.h>
+#include <linux/of.h>
#include <linux/proc_fs.h>
#include <linux/slab.h>
*/
void pci_bus_add_device(struct pci_dev *dev)
{
+ struct device_node *dn = dev->dev.of_node;
int retval;
/*
pci_proc_attach_device(dev);
pci_bridge_d3_update(dev);
- dev->match_driver = true;
+ dev->match_driver = !dn || of_device_is_available(dn);
retval = device_attach(&dev->dev);
if (retval < 0 && retval != -EPROBE_DEFER)
pci_warn(dev, "device attach failed (%d)\n", retval);
depends on MVEBU_MBUS
depends on ARM
depends on OF
- depends on BROKEN
select PCI_BRIDGE_EMUL
help
Add support for Marvell EBU PCIe controller. This PCIe controller
if (ret)
goto err_remove_edma;
- if (dw_pcie_link_up(pci)) {
- dw_pcie_print_link_status(pci);
- } else {
+ if (!dw_pcie_link_up(pci)) {
ret = dw_pcie_start_link(pci);
if (ret)
goto err_remove_edma;
-
- if (pci->ops && pci->ops->start_link) {
- ret = dw_pcie_wait_for_link(pci);
- if (ret)
- goto err_stop_link;
- }
}
+ /* Ignore errors, the link may come up later */
+ dw_pcie_wait_for_link(pci);
+
bridge->sysdata = pp;
ret = pci_host_probe(bridge);
dw_pcie_writel_atu(pci, dir, index, PCIE_ATU_REGION_CTRL2, 0);
}
-void dw_pcie_print_link_status(struct dw_pcie *pci)
-{
- u32 offset, val;
-
- offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
- val = dw_pcie_readw_dbi(pci, offset + PCI_EXP_LNKSTA);
-
- dev_info(pci->dev, "PCIe Gen.%u x%u link up\n",
- FIELD_GET(PCI_EXP_LNKSTA_CLS, val),
- FIELD_GET(PCI_EXP_LNKSTA_NLW, val));
-}
-
int dw_pcie_wait_for_link(struct dw_pcie *pci)
{
+ u32 offset, val;
int retries;
/* Check if the link is up or not */
return -ETIMEDOUT;
}
- dw_pcie_print_link_status(pci);
+ offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
+ val = dw_pcie_readw_dbi(pci, offset + PCI_EXP_LNKSTA);
+
+ dev_info(pci->dev, "PCIe Gen.%u x%u link up\n",
+ FIELD_GET(PCI_EXP_LNKSTA_CLS, val),
+ FIELD_GET(PCI_EXP_LNKSTA_NLW, val));
return 0;
}
void dw_pcie_iatu_detect(struct dw_pcie *pci);
int dw_pcie_edma_detect(struct dw_pcie *pci);
void dw_pcie_edma_remove(struct dw_pcie *pci);
-void dw_pcie_print_link_status(struct dw_pcie *pci);
static inline void dw_pcie_writel_dbi(struct dw_pcie *pci, u32 reg, u32 val)
{
acpiphp_native_scan_bridge(dev);
}
} else {
+ LIST_HEAD(add_list);
int max, pass;
acpiphp_rescan_slot(slot);
if (pass && dev->subordinate) {
check_hotplug_bridge(slot, dev);
pcibios_resource_survey_bus(dev->subordinate);
+ if (pci_is_root_bus(bus))
+ __pci_bus_size_bridges(dev->subordinate, &add_list);
}
}
}
- pci_assign_unassigned_bridge_resources(bus->self);
+ if (pci_is_root_bus(bus))
+ __pci_bus_assign_resources(bus, &add_list, NULL);
+ else
+ pci_assign_unassigned_bridge_resources(bus->self);
}
acpiphp_sanitize_bus(bus);
if (!node)
return 0;
- if (!of_device_is_available(node)) {
- of_node_put(node);
- return -ENODEV;
- }
-
device_set_node(&dev->dev, of_fwnode_handle(node));
return 0;
}
.fsync = gfs2_fsync,
.lock = gfs2_lock,
.flock = gfs2_flock,
- .splice_read = filemap_splice_read,
+ .splice_read = copy_splice_read,
.splice_write = gfs2_file_splice_write,
.setlease = simple_nosetlease,
.fallocate = gfs2_fallocate,
.open = gfs2_open,
.release = gfs2_release,
.fsync = gfs2_fsync,
- .splice_read = filemap_splice_read,
+ .splice_read = copy_splice_read,
.splice_write = gfs2_file_splice_write,
.setlease = generic_setlease,
.fallocate = gfs2_fallocate,
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+ struct super_block *sb = sdp->sd_vfs;
struct gfs2_bufdata *bd;
struct gfs2_meta_header *mh;
struct gfs2_trans *tr = current->journal_info;
+ bool withdraw = false;
lock_buffer(bh);
if (buffer_pinned(bh)) {
(unsigned long long)bd->bd_bh->b_blocknr);
BUG();
}
- if (unlikely(test_bit(SDF_FROZEN, &sdp->sd_flags))) {
- fs_info(sdp, "GFS2:adding buf while frozen\n");
- gfs2_assert_withdraw(sdp, 0);
- }
if (unlikely(gfs2_withdrawn(sdp))) {
fs_info(sdp, "GFS2:adding buf while withdrawn! 0x%llx\n",
(unsigned long long)bd->bd_bh->b_blocknr);
+ goto out_unlock;
+ }
+ if (unlikely(sb->s_writers.frozen == SB_FREEZE_COMPLETE)) {
+ fs_info(sdp, "GFS2:adding buf while frozen\n");
+ withdraw = true;
+ goto out_unlock;
}
gfs2_pin(sdp, bd->bd_bh);
mh->__pad0 = cpu_to_be64(0);
tr->tr_num_buf_new++;
out_unlock:
gfs2_log_unlock(sdp);
+ if (withdraw)
+ gfs2_assert_withdraw(sdp, 0);
out:
unlock_buffer(bh);
}
int __nilfs_mark_inode_dirty(struct inode *inode, int flags)
{
+ struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
struct buffer_head *ibh;
int err;
+ /*
+ * Do not dirty inodes after the log writer has been detached
+ * and its nilfs_root struct has been freed.
+ */
+ if (unlikely(nilfs_purging(nilfs)))
+ return 0;
+
err = nilfs_load_inode_block(inode, &ibh);
if (unlikely(err)) {
nilfs_warn(inode->i_sb,
nilfs_segctor_destroy(nilfs->ns_writer);
nilfs->ns_writer = NULL;
}
+ set_nilfs_purging(nilfs);
/* Force to free the list of dirty files */
spin_lock(&nilfs->ns_inode_lock);
up_write(&nilfs->ns_segctor_sem);
nilfs_dispose_list(nilfs, &garbage_list, 1);
+ clear_nilfs_purging(nilfs);
}
THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
THE_NILFS_GC_RUNNING, /* gc process is running */
THE_NILFS_SB_DIRTY, /* super block is dirty */
+ THE_NILFS_PURGING, /* disposing dirty files for cleanup */
};
/**
THE_NILFS_FNS(DISCONTINUED, discontinued)
THE_NILFS_FNS(GC_RUNNING, gc_running)
THE_NILFS_FNS(SB_DIRTY, sb_dirty)
+THE_NILFS_FNS(PURGING, purging)
/*
* Mount option operations
static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
{
+ struct file *file = iocb->ki_filp;
+ char *buf = file->private_data;
loff_t *fpos = &iocb->ki_pos;
size_t phdrs_offset, notes_offset, data_offset;
size_t page_offline_frozen = 1;
case KCORE_VMEMMAP:
case KCORE_TEXT:
/*
- * We use _copy_to_iter() to bypass usermode hardening
- * which would otherwise prevent this operation.
+ * Sadly we must use a bounce buffer here to be able to
+ * make use of copy_from_kernel_nofault(), as these
+ * memory regions might not always be mapped on all
+ * architectures.
*/
- if (_copy_to_iter((char *)start, tsz, iter) != tsz) {
+ if (copy_from_kernel_nofault(buf, (void *)start, tsz)) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
+ ret = -EFAULT;
+ goto out;
+ }
+ /*
+ * We know the bounce buffer is safe to copy from, so
+ * use _copy_to_iter() directly.
+ */
+ } else if (_copy_to_iter(buf, tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
if (ret)
return ret;
+ filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!filp->private_data)
+ return -ENOMEM;
+
if (kcore_need_update)
kcore_update_ram();
if (i_size_read(inode) != proc_root_kcore->size) {
return 0;
}
+static int release_kcore(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
static const struct proc_ops kcore_proc_ops = {
.proc_read_iter = read_kcore_iter,
.proc_open = open_kcore,
+ .proc_release = release_kcore,
.proc_lseek = default_llseek,
};
}
if (smb2_req_struct_sizes[command] != pdu->StructureSize2) {
- if (command == SMB2_OPLOCK_BREAK_HE &&
- le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_20 &&
- le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_21) {
+ if (!(command == SMB2_OPLOCK_BREAK_HE &&
+ (le16_to_cpu(pdu->StructureSize2) == OP_BREAK_STRUCT_SIZE_20 ||
+ le16_to_cpu(pdu->StructureSize2) == OP_BREAK_STRUCT_SIZE_21))) {
/* special case for SMB2.1 lease break message */
ksmbd_debug(SMB,
- "Illegal request size %d for oplock break\n",
- le16_to_cpu(pdu->StructureSize2));
+ "Illegal request size %u for command %d\n",
+ le16_to_cpu(pdu->StructureSize2), command);
return 1;
}
}
break;
buf_len -= next;
eabuf = (struct smb2_ea_info *)((char *)eabuf + next);
- if (next < (u32)eabuf->EaNameLength + le16_to_cpu(eabuf->EaValueLength))
+ if (buf_len < sizeof(struct smb2_ea_info)) {
+ rc = -EINVAL;
break;
+ }
+ if (buf_len < sizeof(struct smb2_ea_info) + eabuf->EaNameLength +
+ le16_to_cpu(eabuf->EaValueLength)) {
+ rc = -EINVAL;
+ break;
+ }
} while (next != 0);
kfree(attr_name);
/** UTF-8 or UTF-16 string. Nul terminated. */
union {
- u8 utf8[2];
- u16 utf16[1];
- u16 ucs2[1]; /* misnomer, use utf16. */
+ u8 legacy_padding[2];
+ DECLARE_FLEX_ARRAY(u8, utf8);
+ DECLARE_FLEX_ARRAY(u16, utf16);
} string;
};
VMMDEV_ASSERT_SIZE(shfl_string, 6);
static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb)
{
bio->bi_opf |= REQ_POLLED;
- if (!is_sync_kiocb(kiocb))
+ if (kiocb->ki_flags & IOCB_NOWAIT)
bio->bi_opf |= REQ_NOWAIT;
}
bool multiple_queues;
bool has_elevator;
- bool nowait;
struct list_head cb_list; /* md requires an unplug callback */
};
char *buf);
extern ssize_t cpu_show_retbleed(struct device *dev,
struct device_attribute *attr, char *buf);
+extern ssize_t cpu_show_spec_rstack_overflow(struct device *dev,
+ struct device_attribute *attr, char *buf);
extern __printf(4, 5)
struct device *cpu_device_create(struct device *parent, void *drvdata,
enum sk_psock_state_bits {
SK_PSOCK_TX_ENABLED,
+ SK_PSOCK_RX_STRP_ENABLED,
};
struct sk_psock_link {
TPM_CHIP_FLAG_FIRMWARE_POWER_MANAGED = BIT(6),
TPM_CHIP_FLAG_FIRMWARE_UPGRADE = BIT(7),
TPM_CHIP_FLAG_SUSPENDED = BIT(8),
+ TPM_CHIP_FLAG_HWRNG_DISABLED = BIT(9),
};
#define to_tpm_chip(d) container_of(d, struct tpm_chip, dev)
if (WARN_ON(iftype >= NL80211_IFTYPE_MAX))
return NULL;
+ if (iftype == NL80211_IFTYPE_AP_VLAN)
+ iftype = NL80211_IFTYPE_AP;
+
for (i = 0; i < sband->n_iftype_data; i++) {
const struct ieee80211_sband_iftype_data *data =
&sband->iftype_data[i];
*
* @list: table set list node
* @bindings: list of set bindings
+ * @refs: internal refcounting for async set destruction
* @table: table this set belongs to
* @net: netnamespace this set belongs to
* @name: name of the set
struct nft_set {
struct list_head list;
struct list_head bindings;
+ refcount_t refs;
struct nft_table *table;
possible_net_t net;
char *name;
struct list_head pending_update;
/* runtime data below here */
const struct nft_set_ops *ops ____cacheline_aligned;
- u16 flags:14,
+ u16 flags:13,
+ dead:1,
genmask:2;
u8 klen;
u8 dlen;
struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
const struct nft_set *set);
-void *nft_set_catchall_gc(const struct nft_set *set);
static inline unsigned long nft_set_gc_interval(const struct nft_set *set)
{
void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
const struct nft_set *set, void *elem);
-/**
- * struct nft_set_gc_batch_head - nf_tables set garbage collection batch
- *
- * @rcu: rcu head
- * @set: set the elements belong to
- * @cnt: count of elements
- */
-struct nft_set_gc_batch_head {
- struct rcu_head rcu;
- const struct nft_set *set;
- unsigned int cnt;
-};
-
-#define NFT_SET_GC_BATCH_SIZE ((PAGE_SIZE - \
- sizeof(struct nft_set_gc_batch_head)) / \
- sizeof(void *))
-
-/**
- * struct nft_set_gc_batch - nf_tables set garbage collection batch
- *
- * @head: GC batch head
- * @elems: garbage collection elements
- */
-struct nft_set_gc_batch {
- struct nft_set_gc_batch_head head;
- void *elems[NFT_SET_GC_BATCH_SIZE];
-};
-
-struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
- gfp_t gfp);
-void nft_set_gc_batch_release(struct rcu_head *rcu);
-
-static inline void nft_set_gc_batch_complete(struct nft_set_gc_batch *gcb)
-{
- if (gcb != NULL)
- call_rcu(&gcb->head.rcu, nft_set_gc_batch_release);
-}
-
-static inline struct nft_set_gc_batch *
-nft_set_gc_batch_check(const struct nft_set *set, struct nft_set_gc_batch *gcb,
- gfp_t gfp)
-{
- if (gcb != NULL) {
- if (gcb->head.cnt + 1 < ARRAY_SIZE(gcb->elems))
- return gcb;
- nft_set_gc_batch_complete(gcb);
- }
- return nft_set_gc_batch_alloc(set, gfp);
-}
-
-static inline void nft_set_gc_batch_add(struct nft_set_gc_batch *gcb,
- void *elem)
-{
- gcb->elems[gcb->head.cnt++] = elem;
-}
-
struct nft_expr_ops;
/**
* struct nft_expr_type - nf_tables expression type
#endif /* IS_ENABLED(CONFIG_NF_TABLES) */
-/*
- * We use a free bit in the genmask field to indicate the element
- * is busy, meaning it is currently being processed either by
- * the netlink API or GC.
- *
- * Even though the genmask is only a single byte wide, this works
- * because the extension structure if fully constant once initialized,
- * so there are no non-atomic write accesses unless it is already
- * marked busy.
- */
-#define NFT_SET_ELEM_BUSY_MASK (1 << 2)
+#define NFT_SET_ELEM_DEAD_MASK (1 << 2)
#if defined(__LITTLE_ENDIAN_BITFIELD)
-#define NFT_SET_ELEM_BUSY_BIT 2
+#define NFT_SET_ELEM_DEAD_BIT 2
#elif defined(__BIG_ENDIAN_BITFIELD)
-#define NFT_SET_ELEM_BUSY_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2)
+#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2)
#else
#error
#endif
-static inline int nft_set_elem_mark_busy(struct nft_set_ext *ext)
+static inline void nft_set_elem_dead(struct nft_set_ext *ext)
{
unsigned long *word = (unsigned long *)ext;
BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);
- return test_and_set_bit(NFT_SET_ELEM_BUSY_BIT, word);
+ set_bit(NFT_SET_ELEM_DEAD_BIT, word);
}
-static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext)
+static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext)
{
unsigned long *word = (unsigned long *)ext;
- clear_bit(NFT_SET_ELEM_BUSY_BIT, word);
+ BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);
+ return test_bit(NFT_SET_ELEM_DEAD_BIT, word);
}
/**
#define nft_trans_flowtable_flags(trans) \
(((struct nft_trans_flowtable *)trans->data)->flags)
+#define NFT_TRANS_GC_BATCHCOUNT 256
+
+struct nft_trans_gc {
+ struct list_head list;
+ struct net *net;
+ struct nft_set *set;
+ u32 seq;
+ u8 count;
+ void *priv[NFT_TRANS_GC_BATCHCOUNT];
+ struct rcu_head rcu;
+};
+
+struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
+ unsigned int gc_seq, gfp_t gfp);
+void nft_trans_gc_destroy(struct nft_trans_gc *trans);
+
+struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
+ unsigned int gc_seq, gfp_t gfp);
+void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc);
+
+struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp);
+void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans);
+
+void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv);
+
+struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc,
+ unsigned int gc_seq);
+
+void nft_setelem_data_deactivate(const struct net *net,
+ const struct nft_set *set,
+ struct nft_set_elem *elem);
+
int __init nft_chain_filter_init(void);
void nft_chain_filter_fini(void);
struct mutex commit_mutex;
u64 table_handle;
unsigned int base_seq;
+ unsigned int gc_seq;
};
extern unsigned int nf_tables_net_id;
__field(const void *, skaddr)
__field(__u16, sport)
__field(__u16, dport)
+ __field(__u16, family)
__array(__u8, saddr, 4)
__array(__u8, daddr, 4)
__array(__u8, saddr_v6, 16)
__entry->sport = ntohs(inet->inet_sport);
__entry->dport = ntohs(inet->inet_dport);
+ __entry->family = sk->sk_family;
p32 = (__be32 *) __entry->saddr;
*p32 = inet->inet_saddr;
__entry->cong_state = ca_state;
),
- TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c cong_state=%u",
+ TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c cong_state=%u",
+ show_family_name(__entry->family),
__entry->sport, __entry->dport,
__entry->saddr, __entry->daddr,
__entry->saddr_v6, __entry->daddr_v6,
* - use the kernel virtual address of the shared io_uring context
* (instead of the userspace-provided address, which has to be 0UL
* anyway).
+ * - use the same pgoff which the get_unmapped_area() uses to
+ * calculate the page colouring.
* For architectures without such aliasing requirements, the
* architecture will return any suitable mapping because addr is 0.
*/
pgoff = 0; /* has been translated to ptr above */
#ifdef SHM_COLOUR
addr = (uintptr_t) ptr;
+ pgoff = addr >> PAGE_SHIFT;
#else
addr = 0UL;
#endif
{
/*
* Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
- * it'll always -EAGAIN
+ * it'll always -EAGAIN. Note that we test for __O_TMPFILE because
+ * O_TMPFILE includes O_DIRECTORY, which isn't a flag we need to force
+ * async for.
*/
- return open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE);
+ return open->how.flags & (O_TRUNC | O_CREAT | __O_TMPFILE);
}
static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
int error;
if (!hibernation_available())
- return 0;
+ return n;
if (len && buf[len-1] == '\n')
len--;
#include <linux/sched/debug.h>
#include <linux/nmi.h>
#include <linux/kvm_para.h>
+#include <linux/delay.h>
#include "workqueue_internal.h"
* Per-cpu work items which run for longer than the following threshold are
* automatically considered CPU intensive and excluded from concurrency
* management to prevent them from noticeably delaying other per-cpu work items.
+ * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter.
+ * The actual value is initialized in wq_cpu_intensive_thresh_init().
*/
-static unsigned long wq_cpu_intensive_thresh_us = 10000;
+static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;
module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
static bool wq_disable_numa;
!system_freezable_power_efficient_wq);
}
+static void __init wq_cpu_intensive_thresh_init(void)
+{
+ unsigned long thresh;
+ unsigned long bogo;
+
+ /* if the user set it to a specific value, keep it */
+ if (wq_cpu_intensive_thresh_us != ULONG_MAX)
+ return;
+
+ /*
+ * The default of 10ms is derived from the fact that most modern (as of
+ * 2023) processors can do a lot in 10ms and that it's just below what
+ * most consider human-perceivable. However, the kernel also runs on a
+ * lot slower CPUs including microcontrollers where the threshold is way
+ * too low.
+ *
+ * Let's scale up the threshold upto 1 second if BogoMips is below 4000.
+ * This is by no means accurate but it doesn't have to be. The mechanism
+ * is still useful even when the threshold is fully scaled up. Also, as
+ * the reports would usually be applicable to everyone, some machines
+ * operating on longer thresholds won't significantly diminish their
+ * usefulness.
+ */
+ thresh = 10 * USEC_PER_MSEC;
+
+ /* see init/calibrate.c for lpj -> BogoMIPS calculation */
+ bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1);
+ if (bogo < 4000)
+ thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC);
+
+ pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n",
+ loops_per_jiffy, bogo, thresh);
+
+ wq_cpu_intensive_thresh_us = thresh;
+}
+
/**
* workqueue_init - bring workqueue subsystem fully online
*
struct worker_pool *pool;
int cpu, bkt;
+ wq_cpu_intensive_thresh_init();
+
/*
* It'd be simpler to initialize NUMA in workqueue_init_early() but
* CPU to node mapping may not be available that early on some
help
Say Y here to enable reporting of concurrency-managed per-cpu work
items that hog CPUs for longer than
- workqueue.cpu_intensive_threshold_us. Workqueue automatically
+ workqueue.cpu_intensive_thresh_us. Workqueue automatically
detects and excludes them from concurrency management to prevent
them from stalling other per-cpu work items. Occassional
triggering may not necessarily indicate a problem. Repeated
failed:
while (sgtable->nents > sgtable->orig_nents)
- put_page(sg_page(&sgtable->sgl[--sgtable->nents]));
+ unpin_user_page(sg_page(&sgtable->sgl[--sgtable->nents]));
return res;
}
/*
* Check if the pageblock has already been marked skipped.
- * Only the aligned PFN is checked as the caller isolates
+ * Only the first PFN is checked as the caller isolates
* COMPACT_CLUSTER_MAX at a time so the second call must
* not falsely conclude that the block should be skipped.
*/
- if (!valid_page && pageblock_aligned(low_pfn)) {
+ if (!valid_page && (pageblock_aligned(low_pfn) ||
+ low_pfn == cc->zone->zone_start_pfn)) {
if (!isolation_suitable(cc, page)) {
low_pfn = end_pfn;
folio = NULL;
* before making it "skip" so other compaction instances do
* not scan the same block.
*/
- if (pageblock_aligned(low_pfn) &&
+ if ((pageblock_aligned(low_pfn) ||
+ low_pfn == cc->zone->zone_start_pfn) &&
!fast_find_block && !isolation_suitable(cc, page))
continue;
return NULL;
filter->type = type;
filter->matching = matching;
+ INIT_LIST_HEAD(&filter->list);
return filter;
}
unsigned int order) { }
#endif
+static inline void __clear_hugetlb_destructor(struct hstate *h,
+ struct folio *folio)
+{
+ lockdep_assert_held(&hugetlb_lock);
+
+ /*
+ * Very subtle
+ *
+ * For non-gigantic pages set the destructor to the normal compound
+ * page dtor. This is needed in case someone takes an additional
+ * temporary ref to the page, and freeing is delayed until they drop
+ * their reference.
+ *
+ * For gigantic pages set the destructor to the null dtor. This
+ * destructor will never be called. Before freeing the gigantic
+ * page destroy_compound_gigantic_folio will turn the folio into a
+ * simple group of pages. After this the destructor does not
+ * apply.
+ *
+ */
+ if (hstate_is_gigantic(h))
+ folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
+ else
+ folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
+}
+
/*
- * Remove hugetlb folio from lists, and update dtor so that the folio appears
- * as just a compound page.
+ * Remove hugetlb folio from lists.
+ * If vmemmap exists for the folio, update dtor so that the folio appears
+ * as just a compound page. Otherwise, wait until after allocating vmemmap
+ * to update dtor.
*
* A reference is held on the folio, except in the case of demote.
*
}
/*
- * Very subtle
- *
- * For non-gigantic pages set the destructor to the normal compound
- * page dtor. This is needed in case someone takes an additional
- * temporary ref to the page, and freeing is delayed until they drop
- * their reference.
- *
- * For gigantic pages set the destructor to the null dtor. This
- * destructor will never be called. Before freeing the gigantic
- * page destroy_compound_gigantic_folio will turn the folio into a
- * simple group of pages. After this the destructor does not
- * apply.
- *
- * This handles the case where more than one ref is held when and
- * after update_and_free_hugetlb_folio is called.
- *
- * In the case of demote we do not ref count the page as it will soon
- * be turned into a page of smaller size.
+ * We can only clear the hugetlb destructor after allocating vmemmap
+ * pages. Otherwise, someone (memory error handling) may try to write
+ * to tail struct pages.
+ */
+ if (!folio_test_hugetlb_vmemmap_optimized(folio))
+ __clear_hugetlb_destructor(h, folio);
+
+ /*
+ * In the case of demote we do not ref count the page as it will soon
+ * be turned into a page of smaller size.
*/
if (!demote)
folio_ref_unfreeze(folio, 1);
- if (hstate_is_gigantic(h))
- folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
- else
- folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
h->nr_huge_pages--;
h->nr_huge_pages_node[nid]--;
{
int i;
struct page *subpage;
+ bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio);
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
if (unlikely(folio_test_hwpoison(folio)))
folio_clear_hugetlb_hwpoison(folio);
+ /*
+ * If vmemmap pages were allocated above, then we need to clear the
+ * hugetlb destructor under the hugetlb lock.
+ */
+ if (clear_dtor) {
+ spin_lock_irq(&hugetlb_lock);
+ __clear_hugetlb_destructor(h, folio);
+ spin_unlock_irq(&hugetlb_lock);
+ }
+
for (i = 0; i < pages_per_huge_page(h); i++) {
subpage = folio_page(folio, i);
subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
anon_vma->root == vma->anon_vma->root) {
return page; /* still no need to copy it */
}
+ if (PageHWPoison(page))
+ return ERR_PTR(-EHWPOISON);
if (!PageUptodate(page))
return page; /* let do_swap_page report the error */
{
struct folio *folio;
struct page *p;
- int ret = -EBUSY;
+ int ret = -EBUSY, ghp;
unsigned long count = 1;
bool huge = false;
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
goto unlock_mutex;
}
+ if (folio_test_slab(folio) || PageTable(&folio->page) || folio_test_reserved(folio))
+ goto unlock_mutex;
+
+ /*
+ * Note that folio->_mapcount is overloaded in SLAB, so the simple test
+ * in folio_mapped() has to be done after folio_test_slab() is checked.
+ */
if (folio_mapped(folio)) {
unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
pfn, &unpoison_rs);
goto unlock_mutex;
}
- if (folio_test_slab(folio) || PageTable(&folio->page) || folio_test_reserved(folio))
- goto unlock_mutex;
-
- ret = get_hwpoison_page(p, MF_UNPOISON);
- if (!ret) {
+ ghp = get_hwpoison_page(p, MF_UNPOISON);
+ if (!ghp) {
if (PageHuge(p)) {
huge = true;
count = folio_free_raw_hwp(folio, false);
- if (count == 0) {
- ret = -EBUSY;
+ if (count == 0)
goto unlock_mutex;
- }
}
ret = folio_test_clear_hwpoison(folio) ? 0 : -EBUSY;
- } else if (ret < 0) {
- if (ret == -EHWPOISON) {
+ } else if (ghp < 0) {
+ if (ghp == -EHWPOISON) {
ret = put_page_back_buddy(p) ? 0 : -EBUSY;
- } else
+ } else {
+ ret = ghp;
unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
pfn, &unpoison_rs);
+ }
} else {
if (PageHuge(p)) {
huge = true;
count = folio_free_raw_hwp(folio, false);
if (count == 0) {
- ret = -EBUSY;
folio_put(folio);
goto unlock_mutex;
}
if (mmap_read_lock_killable(mm))
return 0;
+ /* Untag the address before looking up the VMA */
+ addr = untagged_addr_remote(mm, addr);
+
/* Avoid triggering the temporary warning in __get_user_pages */
if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
return 0;
struct page *swapcache;
spinlock_t *ptl;
pte_t *pte, new_pte, old_pte;
- bool hwposioned = false;
+ bool hwpoisoned = PageHWPoison(page);
int ret = 1;
swapcache = page;
if (unlikely(!page))
return -ENOMEM;
else if (unlikely(PTR_ERR(page) == -EHWPOISON))
- hwposioned = true;
+ hwpoisoned = true;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte),
old_pte = ptep_get(pte);
- if (unlikely(hwposioned || !PageUptodate(page))) {
+ if (unlikely(hwpoisoned || !PageUptodate(page))) {
swp_entry_t swp_entry;
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
- if (hwposioned) {
+ if (hwpoisoned) {
swp_entry = make_hwpoison_entry(swapcache);
page = swapcache;
} else {
static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
{
+ struct zs_pool *pool;
struct zspage *zspage;
/*
VM_BUG_ON_PAGE(PageIsolated(page), page);
zspage = get_zspage(page);
- migrate_write_lock(zspage);
+ pool = zspage->pool;
+ spin_lock(&pool->lock);
inc_zspage_isolation(zspage);
- migrate_write_unlock(zspage);
+ spin_unlock(&pool->lock);
return true;
}
kunmap_atomic(s_addr);
replace_sub_page(class, zspage, newpage, page);
+ dec_zspage_isolation(zspage);
/*
* Since we complete the data copy and set up new zspage structure,
* it's okay to release the pool's lock.
*/
spin_unlock(&pool->lock);
- dec_zspage_isolation(zspage);
migrate_write_unlock(zspage);
get_page(newpage);
static void zs_page_putback(struct page *page)
{
+ struct zs_pool *pool;
struct zspage *zspage;
VM_BUG_ON_PAGE(!PageIsolated(page), page);
zspage = get_zspage(page);
- migrate_write_lock(zspage);
+ pool = zspage->pool;
+ spin_lock(&pool->lock);
dec_zspage_isolation(zspage);
- migrate_write_unlock(zspage);
+ spin_unlock(&pool->lock);
}
static const struct movable_operations zsmalloc_mops = {
dev->name);
vlan_vid_add(dev, htons(ETH_P_8021Q), 0);
}
- if (event == NETDEV_DOWN &&
- (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER))
+ if (event == NETDEV_DOWN)
vlan_vid_del(dev, htons(ETH_P_8021Q), 0);
vlan_info = rtnl_dereference(dev->vlan_info);
if (unlikely(data_end > data_hard_end))
return -EINVAL;
- /* ALL drivers MUST init xdp->frame_sz, chicken check below */
- if (unlikely(xdp->frame_sz > PAGE_SIZE)) {
- WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz);
- return -EINVAL;
- }
-
if (unlikely(data_end < xdp->data + ETH_HLEN))
return -EINVAL;
int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
{
+ int ret;
+
static const struct strp_callbacks cb = {
.rcv_msg = sk_psock_strp_read,
.read_sock_done = sk_psock_strp_read_done,
.parse_msg = sk_psock_strp_parse,
};
- return strp_init(&psock->strp, sk, &cb);
+ ret = strp_init(&psock->strp, sk, &cb);
+ if (!ret)
+ sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED);
+
+ return ret;
}
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
static void sk_psock_done_strp(struct sk_psock *psock)
{
/* Parser has been stopped */
- if (psock->progs.stream_parser)
+ if (sk_psock_test_state(psock, SK_PSOCK_RX_STRP_ENABLED))
strp_done(&psock->strp);
}
#else
spin_unlock(&sk->sk_peer_lock);
if (!peer_pid)
- return -ESRCH;
+ return -ENODATA;
pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
put_pid(peer_pid);
list_for_each_entry_safe(link, tmp, &psock->link, list) {
if (link->link_raw == link_raw) {
struct bpf_map *map = link->map;
- struct bpf_stab *stab = container_of(map, struct bpf_stab,
- map);
- if (psock->saved_data_ready && stab->progs.stream_parser)
+ struct sk_psock_progs *progs = sock_map_progs(map);
+
+ if (psock->saved_data_ready && progs->stream_parser)
strp_stop = true;
- if (psock->saved_data_ready && stab->progs.stream_verdict)
+ if (psock->saved_data_ready && progs->stream_verdict)
verdict_stop = true;
- if (psock->saved_data_ready && stab->progs.skb_verdict)
+ if (psock->saved_data_ready && progs->skb_verdict)
verdict_stop = true;
list_del(&link->list);
sk_psock_free_link(link);
/* And store cached results */
icsk->icsk_pmtu_cookie = pmtu;
- dp->dccps_mss_cache = cur_mps;
+ WRITE_ONCE(dp->dccps_mss_cache, cur_mps);
return cur_mps;
}
return dccp_getsockopt_service(sk, len,
(__be32 __user *)optval, optlen);
case DCCP_SOCKOPT_GET_CUR_MPS:
- val = dp->dccps_mss_cache;
+ val = READ_ONCE(dp->dccps_mss_cache);
break;
case DCCP_SOCKOPT_AVAILABLE_CCIDS:
return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
trace_dccp_probe(sk, len);
- if (len > dp->dccps_mss_cache)
+ if (len > READ_ONCE(dp->dccps_mss_cache))
return -EMSGSIZE;
lock_sock(sk);
goto out_discard;
}
+ /* We need to check dccps_mss_cache after socket is locked. */
+ if (len > dp->dccps_mss_cache) {
+ rc = -EMSGSIZE;
+ goto out_discard;
+ }
+
skb_reserve(skb, sk->sk_prot->max_header);
rc = memcpy_from_msg(skb_put(skb, len), msg, len);
if (rc != 0)
.un.frag.__unused = 0,
.un.frag.mtu = htons(mtu),
};
- icmph->checksum = ip_compute_csum(icmph, len);
+ icmph->checksum = csum_fold(skb_checksum(skb, 0, len, 0));
skb_reset_transport_header(skb);
niph = skb_push(skb, sizeof(*niph));
&rtm_dump_nexthop_cb, &filter);
if (err < 0) {
if (likely(skb->len))
- goto out;
- goto out_err;
+ err = skb->len;
}
-out:
- err = skb->len;
-out_err:
cb->seq = net->nexthop.seq;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
return err;
dd->filter.res_bucket_nh_id != nhge->nh->id)
continue;
+ dd->ctx->bucket_index = bucket_index;
err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
RTM_NEWNEXTHOPBUCKET, portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
cb->extack);
- if (err < 0) {
- if (likely(skb->len))
- goto out;
- goto out_err;
- }
+ if (err)
+ return err;
}
dd->ctx->done_nh_idx = dd->ctx->nh.idx + 1;
- bucket_index = 0;
+ dd->ctx->bucket_index = 0;
-out:
- err = skb->len;
-out_err:
- dd->ctx->bucket_index = bucket_index;
- return err;
+ return 0;
}
static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb,
if (err < 0) {
if (likely(skb->len))
- goto out;
- goto out_err;
+ err = skb->len;
}
-out:
- err = skb->len;
-out_err:
cb->seq = net->nexthop.seq;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
return err;
static inline int ndisc_is_useropt(const struct net_device *dev,
struct nd_opt_hdr *opt)
{
- return opt->nd_opt_type == ND_OPT_RDNSS ||
+ return opt->nd_opt_type == ND_OPT_PREFIX_INFO ||
+ opt->nd_opt_type == ND_OPT_RDNSS ||
opt->nd_opt_type == ND_OPT_DNSSL ||
opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL ||
opt->nd_opt_type == ND_OPT_PREF64 ||
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
- if (flags & MPTCP_CF_FASTCLOSE) {
+ if ((flags & MPTCP_CF_FASTCLOSE) && !__mptcp_check_fallback(msk)) {
/* be sure to force the tcp_disconnect() path,
* to generate the egress reset
*/
if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags))
__mptcp_clean_una_wakeup(sk);
- if (unlikely(&msk->cb_flags)) {
+ if (unlikely(msk->cb_flags)) {
/* be sure to set the current sk state before tacking actions
* depending on sk_state, that is processing MPTCP_ERROR_REPORT
*/
u32 subflow_id;
u32 setsockopt_seq;
char ca_name[TCP_CA_NAME_MAX];
- struct mptcp_sock *dl_next;
};
#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)
{
struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
- struct mptcp_sock *msk, *next, *head = NULL;
- struct request_sock *req;
- struct sock *sk;
+ struct request_sock *req, *head, *tail;
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk, *ssk;
- /* build a list of all unaccepted mptcp sockets */
+ /* Due to lock dependencies no relevant lock can be acquired under rskq_lock.
+ * Splice the req list, so that accept() can not reach the pending ssk after
+ * the listener socket is released below.
+ */
spin_lock_bh(&queue->rskq_lock);
- for (req = queue->rskq_accept_head; req; req = req->dl_next) {
- struct mptcp_subflow_context *subflow;
- struct sock *ssk = req->sk;
+ head = queue->rskq_accept_head;
+ tail = queue->rskq_accept_tail;
+ queue->rskq_accept_head = NULL;
+ queue->rskq_accept_tail = NULL;
+ spin_unlock_bh(&queue->rskq_lock);
+
+ if (!head)
+ return;
+ /* can't acquire the msk socket lock under the subflow one,
+ * or will cause ABBA deadlock
+ */
+ release_sock(listener_ssk);
+
+ for (req = head; req; req = req->dl_next) {
+ ssk = req->sk;
if (!sk_is_mptcp(ssk))
continue;
if (!subflow || !subflow->conn)
continue;
- /* skip if already in list */
sk = subflow->conn;
- msk = mptcp_sk(sk);
- if (msk->dl_next || msk == head)
- continue;
-
sock_hold(sk);
- msk->dl_next = head;
- head = msk;
- }
- spin_unlock_bh(&queue->rskq_lock);
- if (!head)
- return;
-
- /* can't acquire the msk socket lock under the subflow one,
- * or will cause ABBA deadlock
- */
- release_sock(listener_ssk);
-
- for (msk = head; msk; msk = next) {
- sk = (struct sock *)msk;
lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
- next = msk->dl_next;
- msk->dl_next = NULL;
-
__mptcp_unaccepted_force_close(sk);
release_sock(sk);
/* we are still under the listener msk socket lock */
lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING);
+
+ /* restore the listener queue, to let the TCP code clean it up */
+ spin_lock_bh(&queue->rskq_lock);
+ WARN_ON_ONCE(queue->rskq_accept_head);
+ queue->rskq_accept_head = head;
+ queue->rskq_accept_tail = tail;
+ spin_unlock_bh(&queue->rskq_lock);
}
static int subflow_ulp_init(struct sock *sk)
static LIST_HEAD(nf_tables_objects);
static LIST_HEAD(nf_tables_flowtables);
static LIST_HEAD(nf_tables_destroy_list);
+static LIST_HEAD(nf_tables_gc_list);
static DEFINE_SPINLOCK(nf_tables_destroy_list_lock);
+static DEFINE_SPINLOCK(nf_tables_gc_list_lock);
enum {
NFT_VALIDATE_SKIP = 0,
static void nf_tables_trans_destroy_work(struct work_struct *w);
static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work);
+static void nft_trans_gc_work(struct work_struct *work);
+static DECLARE_WORK(trans_gc_work, nft_trans_gc_work);
+
static void nft_ctx_init(struct nft_ctx *ctx,
struct net *net,
const struct sk_buff *skb,
return __nft_trans_set_add(ctx, msg_type, set, NULL);
}
-static void nft_setelem_data_deactivate(const struct net *net,
- const struct nft_set *set,
- struct nft_set_elem *elem);
-
static int nft_mapelem_deactivate(const struct nft_ctx *ctx,
struct nft_set *set,
const struct nft_set_iter *iter,
INIT_LIST_HEAD(&set->bindings);
INIT_LIST_HEAD(&set->catchall_list);
+ refcount_set(&set->refs, 1);
set->table = table;
write_pnet(&set->net, net);
set->ops = ops;
}
}
+static void nft_set_put(struct nft_set *set)
+{
+ if (refcount_dec_and_test(&set->refs)) {
+ kfree(set->name);
+ kvfree(set);
+ }
+}
+
static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
{
int i;
set->ops->destroy(ctx, set);
nft_set_catchall_destroy(ctx, set);
- kfree(set->name);
- kvfree(set);
+ nft_set_put(set);
}
static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info,
const struct nft_set_iter *iter,
struct nft_set_elem *elem)
{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
struct nft_set_dump_args *args;
+ if (nft_set_elem_expired(ext))
+ return 0;
+
args = container_of(iter, struct nft_set_dump_args, iter);
return nf_tables_fill_setelem(args->skb, set, elem, args->reset);
}
list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
ext = nft_set_elem_ext(set, catchall->elem);
if (nft_set_elem_active(ext, genmask) &&
- !nft_set_elem_expired(ext))
+ !nft_set_elem_expired(ext) &&
+ !nft_set_elem_is_dead(ext))
return ext;
}
}
EXPORT_SYMBOL_GPL(nft_set_catchall_lookup);
-void *nft_set_catchall_gc(const struct nft_set *set)
-{
- struct nft_set_elem_catchall *catchall, *next;
- struct nft_set_ext *ext;
- void *elem = NULL;
-
- list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
- ext = nft_set_elem_ext(set, catchall->elem);
-
- if (!nft_set_elem_expired(ext) ||
- nft_set_elem_mark_busy(ext))
- continue;
-
- elem = catchall->elem;
- list_del_rcu(&catchall->list);
- kfree_rcu(catchall, rcu);
- break;
- }
-
- return elem;
-}
-EXPORT_SYMBOL_GPL(nft_set_catchall_gc);
-
static int nft_setelem_catchall_insert(const struct net *net,
struct nft_set *set,
const struct nft_set_elem *elem,
if (nft_setelem_is_catchall(set, elem)) {
nft_set_elem_change_active(net, set, ext);
- nft_set_elem_clear_busy(ext);
} else {
set->ops->activate(net, set, elem);
}
list_for_each_entry(catchall, &set->catchall_list, list) {
ext = nft_set_elem_ext(set, catchall->elem);
- if (!nft_is_active(net, ext) ||
- nft_set_elem_mark_busy(ext))
+ if (!nft_is_active(net, ext))
continue;
kfree(elem->priv);
goto err_elem_free;
}
- ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
+ ext->genmask = nft_genmask_cur(ctx->net);
err = nft_setelem_insert(ctx->net, set, &elem, &ext2, flags);
if (err) {
nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use);
}
-static void nft_setelem_data_deactivate(const struct net *net,
- const struct nft_set *set,
- struct nft_set_elem *elem)
+void nft_setelem_data_deactivate(const struct net *net,
+ const struct nft_set *set,
+ struct nft_set_elem *elem)
{
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
ext = nft_set_elem_ext(set, catchall->elem);
- if (!nft_set_elem_active(ext, genmask) ||
- nft_set_elem_mark_busy(ext))
+ if (!nft_set_elem_active(ext, genmask))
continue;
elem.priv = catchall->elem;
return err;
}
-void nft_set_gc_batch_release(struct rcu_head *rcu)
-{
- struct nft_set_gc_batch *gcb;
- unsigned int i;
-
- gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu);
- for (i = 0; i < gcb->head.cnt; i++)
- nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true);
- kfree(gcb);
-}
-
-struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
- gfp_t gfp)
-{
- struct nft_set_gc_batch *gcb;
-
- gcb = kzalloc(sizeof(*gcb), gfp);
- if (gcb == NULL)
- return gcb;
- gcb->head.set = set;
- return gcb;
-}
-
/*
* Stateful objects
*/
list_del_rcu(&chain->list);
}
+static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx,
+ struct nft_trans_gc *trans)
+{
+ void **priv = trans->priv;
+ unsigned int i;
+
+ for (i = 0; i < trans->count; i++) {
+ struct nft_set_elem elem = {
+ .priv = priv[i],
+ };
+
+ nft_setelem_data_deactivate(ctx->net, trans->set, &elem);
+ nft_setelem_remove(ctx->net, trans->set, &elem);
+ }
+}
+
+void nft_trans_gc_destroy(struct nft_trans_gc *trans)
+{
+ nft_set_put(trans->set);
+ put_net(trans->net);
+ kfree(trans);
+}
+
+static void nft_trans_gc_trans_free(struct rcu_head *rcu)
+{
+ struct nft_set_elem elem = {};
+ struct nft_trans_gc *trans;
+ struct nft_ctx ctx = {};
+ unsigned int i;
+
+ trans = container_of(rcu, struct nft_trans_gc, rcu);
+ ctx.net = read_pnet(&trans->set->net);
+
+ for (i = 0; i < trans->count; i++) {
+ elem.priv = trans->priv[i];
+ if (!nft_setelem_is_catchall(trans->set, &elem))
+ atomic_dec(&trans->set->nelems);
+
+ nf_tables_set_elem_destroy(&ctx, trans->set, elem.priv);
+ }
+
+ nft_trans_gc_destroy(trans);
+}
+
+static bool nft_trans_gc_work_done(struct nft_trans_gc *trans)
+{
+ struct nftables_pernet *nft_net;
+ struct nft_ctx ctx = {};
+
+ nft_net = nft_pernet(trans->net);
+
+ mutex_lock(&nft_net->commit_mutex);
+
+ /* Check for race with transaction, otherwise this batch refers to
+ * stale objects that might not be there anymore. Skip transaction if
+ * set has been destroyed from control plane transaction in case gc
+ * worker loses race.
+ */
+ if (READ_ONCE(nft_net->gc_seq) != trans->seq || trans->set->dead) {
+ mutex_unlock(&nft_net->commit_mutex);
+ return false;
+ }
+
+ ctx.net = trans->net;
+ ctx.table = trans->set->table;
+
+ nft_trans_gc_setelem_remove(&ctx, trans);
+ mutex_unlock(&nft_net->commit_mutex);
+
+ return true;
+}
+
+static void nft_trans_gc_work(struct work_struct *work)
+{
+ struct nft_trans_gc *trans, *next;
+ LIST_HEAD(trans_gc_list);
+
+ spin_lock(&nf_tables_destroy_list_lock);
+ list_splice_init(&nf_tables_gc_list, &trans_gc_list);
+ spin_unlock(&nf_tables_destroy_list_lock);
+
+ list_for_each_entry_safe(trans, next, &trans_gc_list, list) {
+ list_del(&trans->list);
+ if (!nft_trans_gc_work_done(trans)) {
+ nft_trans_gc_destroy(trans);
+ continue;
+ }
+ call_rcu(&trans->rcu, nft_trans_gc_trans_free);
+ }
+}
+
+struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
+ unsigned int gc_seq, gfp_t gfp)
+{
+ struct net *net = read_pnet(&set->net);
+ struct nft_trans_gc *trans;
+
+ trans = kzalloc(sizeof(*trans), gfp);
+ if (!trans)
+ return NULL;
+
+ refcount_inc(&set->refs);
+ trans->set = set;
+ trans->net = get_net(net);
+ trans->seq = gc_seq;
+
+ return trans;
+}
+
+void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv)
+{
+ trans->priv[trans->count++] = priv;
+}
+
+static void nft_trans_gc_queue_work(struct nft_trans_gc *trans)
+{
+ spin_lock(&nf_tables_gc_list_lock);
+ list_add_tail(&trans->list, &nf_tables_gc_list);
+ spin_unlock(&nf_tables_gc_list_lock);
+
+ schedule_work(&trans_gc_work);
+}
+
+static int nft_trans_gc_space(struct nft_trans_gc *trans)
+{
+ return NFT_TRANS_GC_BATCHCOUNT - trans->count;
+}
+
+struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
+ unsigned int gc_seq, gfp_t gfp)
+{
+ if (nft_trans_gc_space(gc))
+ return gc;
+
+ nft_trans_gc_queue_work(gc);
+
+ return nft_trans_gc_alloc(gc->set, gc_seq, gfp);
+}
+
+void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans)
+{
+ if (trans->count == 0) {
+ nft_trans_gc_destroy(trans);
+ return;
+ }
+
+ nft_trans_gc_queue_work(trans);
+}
+
+struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp)
+{
+ if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net)))
+ return NULL;
+
+ if (nft_trans_gc_space(gc))
+ return gc;
+
+ call_rcu(&gc->rcu, nft_trans_gc_trans_free);
+
+ return nft_trans_gc_alloc(gc->set, 0, gfp);
+}
+
+void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans)
+{
+ WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net));
+
+ if (trans->count == 0) {
+ nft_trans_gc_destroy(trans);
+ return;
+ }
+
+ call_rcu(&trans->rcu, nft_trans_gc_trans_free);
+}
+
+struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc,
+ unsigned int gc_seq)
+{
+ struct nft_set_elem_catchall *catchall;
+ const struct nft_set *set = gc->set;
+ struct nft_set_ext *ext;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+
+ if (!nft_set_elem_expired(ext))
+ continue;
+ if (nft_set_elem_is_dead(ext))
+ goto dead_elem;
+
+ nft_set_elem_dead(ext);
+dead_elem:
+ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+ if (!gc)
+ return NULL;
+
+ nft_trans_gc_elem_add(gc, catchall->elem);
+ }
+
+ return gc;
+}
+
static void nf_tables_module_autoload_cleanup(struct net *net)
{
struct nftables_pernet *nft_net = nft_pernet(net);
{
struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_trans *trans, *next;
+ unsigned int base_seq, gc_seq;
LIST_HEAD(set_update_list);
struct nft_trans_elem *te;
struct nft_chain *chain;
struct nft_table *table;
- unsigned int base_seq;
LIST_HEAD(adl);
int err;
WRITE_ONCE(nft_net->base_seq, base_seq);
+ /* Bump gc counter, it becomes odd, this is the busy mark. */
+ gc_seq = READ_ONCE(nft_net->gc_seq);
+ WRITE_ONCE(nft_net->gc_seq, ++gc_seq);
+
/* step 3. Start new generation, rules_gen_X now in use. */
net->nft.gencursor = nft_gencursor_next(net);
break;
case NFT_MSG_DELSET:
case NFT_MSG_DESTROYSET:
+ nft_trans_set(trans)->dead = 1;
list_del_rcu(&nft_trans_set(trans)->list);
nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
trans->msg_type, GFP_KERNEL);
nft_commit_notify(net, NETLINK_CB(skb).portid);
nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
nf_tables_commit_audit_log(&adl, nft_net->base_seq);
+
+ WRITE_ONCE(nft_net->gc_seq, ++gc_seq);
nf_tables_commit_release(net);
return 0;
INIT_LIST_HEAD(&nft_net->notify_list);
mutex_init(&nft_net->commit_mutex);
nft_net->base_seq = 1;
+ nft_net->gc_seq = 0;
return 0;
}
WARN_ON_ONCE(!list_empty(&nft_net->notify_list));
}
+static void nf_tables_exit_batch(struct list_head *net_exit_list)
+{
+ flush_work(&trans_gc_work);
+}
+
static struct pernet_operations nf_tables_net_ops = {
.init = nf_tables_init_net,
.pre_exit = nf_tables_pre_exit_net,
.exit = nf_tables_exit_net,
+ .exit_batch = nf_tables_exit_batch,
.id = &nf_tables_net_id,
.size = sizeof(struct nftables_pernet),
};
nft_chain_filter_fini();
nft_chain_route_fini();
unregister_pernet_subsys(&nf_tables_net_ops);
+ cancel_work_sync(&trans_gc_work);
cancel_work_sync(&trans_destroy_work);
rcu_barrier();
rhltable_destroy(&nft_objname_ht);
if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen))
return 1;
+ if (nft_set_elem_is_dead(&he->ext))
+ return 1;
if (nft_set_elem_expired(&he->ext))
return 1;
if (!nft_set_elem_active(&he->ext, x->genmask))
struct nft_rhash_elem *he = elem->priv;
nft_set_elem_change_active(net, set, &he->ext);
- nft_set_elem_clear_busy(&he->ext);
}
static bool nft_rhash_flush(const struct net *net,
{
struct nft_rhash_elem *he = priv;
- if (!nft_set_elem_mark_busy(&he->ext) ||
- !nft_is_active(net, &he->ext)) {
- nft_set_elem_change_active(net, set, &he->ext);
- return true;
- }
- return false;
+ nft_set_elem_change_active(net, set, &he->ext);
+
+ return true;
}
static void *nft_rhash_deactivate(const struct net *net,
rcu_read_lock();
he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
- if (he != NULL &&
- !nft_rhash_flush(net, set, he))
- he = NULL;
+ if (he)
+ nft_set_elem_change_active(net, set, &he->ext);
rcu_read_unlock();
if (he == NULL)
return false;
- return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0;
+ nft_set_elem_dead(&he->ext);
+
+ return true;
}
static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
if (iter->count < iter->skip)
goto cont;
- if (nft_set_elem_expired(&he->ext))
- goto cont;
if (!nft_set_elem_active(&he->ext, iter->genmask))
goto cont;
static void nft_rhash_gc(struct work_struct *work)
{
+ struct nftables_pernet *nft_net;
struct nft_set *set;
struct nft_rhash_elem *he;
struct nft_rhash *priv;
- struct nft_set_gc_batch *gcb = NULL;
struct rhashtable_iter hti;
+ struct nft_trans_gc *gc;
+ struct net *net;
+ u32 gc_seq;
priv = container_of(work, struct nft_rhash, gc_work.work);
set = nft_set_container_of(priv);
+ net = read_pnet(&set->net);
+ nft_net = nft_pernet(net);
+ gc_seq = READ_ONCE(nft_net->gc_seq);
+
+ gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
+ if (!gc)
+ goto done;
rhashtable_walk_enter(&priv->ht, &hti);
rhashtable_walk_start(&hti);
while ((he = rhashtable_walk_next(&hti))) {
if (IS_ERR(he)) {
- if (PTR_ERR(he) != -EAGAIN)
- break;
+ if (PTR_ERR(he) != -EAGAIN) {
+ nft_trans_gc_destroy(gc);
+ gc = NULL;
+ goto try_later;
+ }
continue;
}
+ /* Ruleset has been updated, try later. */
+ if (READ_ONCE(nft_net->gc_seq) != gc_seq) {
+ nft_trans_gc_destroy(gc);
+ gc = NULL;
+ goto try_later;
+ }
+
+ if (nft_set_elem_is_dead(&he->ext))
+ goto dead_elem;
+
if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPRESSIONS) &&
nft_rhash_expr_needs_gc_run(set, &he->ext))
goto needs_gc_run;
if (!nft_set_elem_expired(&he->ext))
continue;
needs_gc_run:
- if (nft_set_elem_mark_busy(&he->ext))
- continue;
+ nft_set_elem_dead(&he->ext);
+dead_elem:
+ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+ if (!gc)
+ goto try_later;
- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
- if (gcb == NULL)
- break;
- rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params);
- atomic_dec(&set->nelems);
- nft_set_gc_batch_add(gcb, he);
+ nft_trans_gc_elem_add(gc, he);
}
+
+ gc = nft_trans_gc_catchall(gc, gc_seq);
+
+try_later:
+ /* catchall list iteration requires rcu read side lock. */
rhashtable_walk_stop(&hti);
rhashtable_walk_exit(&hti);
- he = nft_set_catchall_gc(set);
- if (he) {
- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
- if (gcb)
- nft_set_gc_batch_add(gcb, he);
- }
- nft_set_gc_batch_complete(gcb);
+ if (gc)
+ nft_trans_gc_queue_async_done(gc);
+
+done:
queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
nft_set_gc_interval(set));
}
return err;
INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc);
- if (set->flags & NFT_SET_TIMEOUT)
+ if (set->flags & (NFT_SET_TIMEOUT | NFT_SET_EVAL))
nft_rhash_gc_init(set);
return 0;
};
cancel_delayed_work_sync(&priv->gc_work);
- rcu_barrier();
rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy,
(void *)&rhash_ctx);
}
goto out;
if (last) {
- if (nft_set_elem_expired(&f->mt[b].e->ext) ||
- (genmask &&
+ if ((genmask &&
!nft_set_elem_active(&f->mt[b].e->ext, genmask)))
goto next_match;
static void *nft_pipapo_get(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem, unsigned int flags)
{
- return pipapo_get(net, set, (const u8 *)elem->key.val.data,
- nft_genmask_cur(net));
+ struct nft_pipapo_elem *ret;
+
+ ret = pipapo_get(net, set, (const u8 *)elem->key.val.data,
+ nft_genmask_cur(net));
+ if (IS_ERR(ret))
+ return ret;
+
+ if (nft_set_elem_expired(&ret->ext))
+ return ERR_PTR(-ENOENT);
+
+ return ret;
}
/**
}
}
+static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set,
+ struct nft_pipapo_elem *e)
+
+{
+ struct nft_set_elem elem = {
+ .priv = e,
+ };
+
+ nft_setelem_data_deactivate(net, set, &elem);
+}
+
/**
* pipapo_gc() - Drop expired entries from set, destroy start and end elements
* @set: nftables API set representation
* @m: Matching data
*/
-static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
+static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m)
{
+ struct nft_set *set = (struct nft_set *) _set;
struct nft_pipapo *priv = nft_set_priv(set);
+ struct net *net = read_pnet(&set->net);
int rules_f0, first_rule = 0;
struct nft_pipapo_elem *e;
+ struct nft_trans_gc *gc;
+
+ gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
+ if (!gc)
+ return;
while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
f--;
i--;
e = f->mt[rulemap[i].to].e;
- if (nft_set_elem_expired(&e->ext) &&
- !nft_set_elem_mark_busy(&e->ext)) {
+
+ /* synchronous gc never fails, there is no need to set on
+ * NFT_SET_ELEM_DEAD_BIT.
+ */
+ if (nft_set_elem_expired(&e->ext)) {
priv->dirty = true;
- pipapo_drop(m, rulemap);
- rcu_barrier();
- nft_set_elem_destroy(set, e, true);
+ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
+ if (!gc)
+ break;
+
+ nft_pipapo_gc_deactivate(net, set, e);
+ pipapo_drop(m, rulemap);
+ nft_trans_gc_elem_add(gc, e);
/* And check again current first rule, which is now the
* first we haven't checked.
}
}
- e = nft_set_catchall_gc(set);
- if (e)
- nft_set_elem_destroy(set, e, true);
-
- priv->last_gc = jiffies;
+ gc = nft_trans_gc_catchall(gc, 0);
+ if (gc) {
+ nft_trans_gc_queue_sync_done(gc);
+ priv->last_gc = jiffies;
+ }
}
/**
return;
nft_set_elem_change_active(net, set, &e->ext);
- nft_set_elem_clear_busy(&e->ext);
}
/**
goto cont;
e = f->mt[r].e;
- if (nft_set_elem_expired(&e->ext))
- goto cont;
elem.priv = e;
set->klen);
}
+static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe)
+{
+ return nft_set_elem_expired(&rbe->ext) ||
+ nft_set_elem_is_dead(&rbe->ext);
+}
+
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
const u32 *key, const struct nft_set_ext **ext,
unsigned int seq)
continue;
}
- if (nft_set_elem_expired(&rbe->ext))
+ if (nft_rbtree_elem_expired(rbe))
return false;
if (nft_rbtree_interval_end(rbe)) {
if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
nft_set_elem_active(&interval->ext, genmask) &&
- !nft_set_elem_expired(&interval->ext) &&
+ !nft_rbtree_elem_expired(interval) &&
nft_rbtree_interval_start(interval)) {
*ext = &interval->ext;
return true;
return rbe;
}
+static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
+ struct nft_rbtree *priv,
+ struct nft_rbtree_elem *rbe)
+{
+ struct nft_set_elem elem = {
+ .priv = rbe,
+ };
+
+ nft_setelem_data_deactivate(net, set, &elem);
+ rb_erase(&rbe->node, &priv->root);
+}
+
static int nft_rbtree_gc_elem(const struct nft_set *__set,
struct nft_rbtree *priv,
struct nft_rbtree_elem *rbe,
{
struct nft_set *set = (struct nft_set *)__set;
struct rb_node *prev = rb_prev(&rbe->node);
+ struct net *net = read_pnet(&set->net);
struct nft_rbtree_elem *rbe_prev;
- struct nft_set_gc_batch *gcb;
+ struct nft_trans_gc *gc;
- gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC);
- if (!gcb)
+ gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
+ if (!gc)
return -ENOMEM;
/* search for end interval coming before this element.
if (prev) {
rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
+ nft_rbtree_gc_remove(net, set, priv, rbe_prev);
- rb_erase(&rbe_prev->node, &priv->root);
- atomic_dec(&set->nelems);
- nft_set_gc_batch_add(gcb, rbe_prev);
+ /* There is always room in this trans gc for this element,
+ * memory allocation never actually happens, hence, the warning
+ * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT,
+ * this is synchronous gc which never fails.
+ */
+ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
+ if (WARN_ON_ONCE(!gc))
+ return -ENOMEM;
+
+ nft_trans_gc_elem_add(gc, rbe_prev);
}
- rb_erase(&rbe->node, &priv->root);
- atomic_dec(&set->nelems);
+ nft_rbtree_gc_remove(net, set, priv, rbe);
+ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
+ if (WARN_ON_ONCE(!gc))
+ return -ENOMEM;
+
+ nft_trans_gc_elem_add(gc, rbe);
- nft_set_gc_batch_add(gcb, rbe);
- nft_set_gc_batch_complete(gcb);
+ nft_trans_gc_queue_sync_done(gc);
return 0;
}
struct nft_rbtree_elem *rbe = elem->priv;
nft_set_elem_change_active(net, set, &rbe->ext);
- nft_set_elem_clear_busy(&rbe->ext);
}
static bool nft_rbtree_flush(const struct net *net,
{
struct nft_rbtree_elem *rbe = priv;
- if (!nft_set_elem_mark_busy(&rbe->ext) ||
- !nft_is_active(net, &rbe->ext)) {
- nft_set_elem_change_active(net, set, &rbe->ext);
- return true;
- }
- return false;
+ nft_set_elem_change_active(net, set, &rbe->ext);
+
+ return true;
}
static void *nft_rbtree_deactivate(const struct net *net,
if (iter->count < iter->skip)
goto cont;
- if (nft_set_elem_expired(&rbe->ext))
- goto cont;
if (!nft_set_elem_active(&rbe->ext, iter->genmask))
goto cont;
static void nft_rbtree_gc(struct work_struct *work)
{
- struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
- struct nft_set_gc_batch *gcb = NULL;
+ struct nft_rbtree_elem *rbe, *rbe_end = NULL;
+ struct nftables_pernet *nft_net;
struct nft_rbtree *priv;
+ struct nft_trans_gc *gc;
struct rb_node *node;
struct nft_set *set;
+ unsigned int gc_seq;
struct net *net;
- u8 genmask;
priv = container_of(work, struct nft_rbtree, gc_work.work);
set = nft_set_container_of(priv);
net = read_pnet(&set->net);
- genmask = nft_genmask_cur(net);
+ nft_net = nft_pernet(net);
+ gc_seq = READ_ONCE(nft_net->gc_seq);
+
+ gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
+ if (!gc)
+ goto done;
write_lock_bh(&priv->lock);
write_seqcount_begin(&priv->count);
for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
+
+ /* Ruleset has been updated, try later. */
+ if (READ_ONCE(nft_net->gc_seq) != gc_seq) {
+ nft_trans_gc_destroy(gc);
+ gc = NULL;
+ goto try_later;
+ }
+
rbe = rb_entry(node, struct nft_rbtree_elem, node);
- if (!nft_set_elem_active(&rbe->ext, genmask))
- continue;
+ if (nft_set_elem_is_dead(&rbe->ext))
+ goto dead_elem;
/* elements are reversed in the rbtree for historical reasons,
* from highest to lowest value, that is why end element is
if (!nft_set_elem_expired(&rbe->ext))
continue;
- if (nft_set_elem_mark_busy(&rbe->ext)) {
- rbe_end = NULL;
+ nft_set_elem_dead(&rbe->ext);
+
+ if (!rbe_end)
continue;
- }
- if (rbe_prev) {
- rb_erase(&rbe_prev->node, &priv->root);
- rbe_prev = NULL;
- }
- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
- if (!gcb)
- break;
+ nft_set_elem_dead(&rbe_end->ext);
- atomic_dec(&set->nelems);
- nft_set_gc_batch_add(gcb, rbe);
- rbe_prev = rbe;
+ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+ if (!gc)
+ goto try_later;
- if (rbe_end) {
- atomic_dec(&set->nelems);
- nft_set_gc_batch_add(gcb, rbe_end);
- rb_erase(&rbe_end->node, &priv->root);
- rbe_end = NULL;
- }
- node = rb_next(node);
- if (!node)
- break;
+ nft_trans_gc_elem_add(gc, rbe_end);
+ rbe_end = NULL;
+dead_elem:
+ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+ if (!gc)
+ goto try_later;
+
+ nft_trans_gc_elem_add(gc, rbe);
}
- if (rbe_prev)
- rb_erase(&rbe_prev->node, &priv->root);
+
+ gc = nft_trans_gc_catchall(gc, gc_seq);
+
+try_later:
write_seqcount_end(&priv->count);
write_unlock_bh(&priv->lock);
- rbe = nft_set_catchall_gc(set);
- if (rbe) {
- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
- if (gcb)
- nft_set_gc_batch_add(gcb, rbe);
- }
- nft_set_gc_batch_complete(gcb);
-
+ if (gc)
+ nft_trans_gc_queue_async_done(gc);
+done:
queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
nft_set_gc_interval(set));
}
{
union tpacket_uhdr h;
+ /* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */
+
h.raw = frame;
switch (po->tp_version) {
case TPACKET_V1:
- h.h1->tp_status = status;
+ WRITE_ONCE(h.h1->tp_status, status);
flush_dcache_page(pgv_to_page(&h.h1->tp_status));
break;
case TPACKET_V2:
- h.h2->tp_status = status;
+ WRITE_ONCE(h.h2->tp_status, status);
flush_dcache_page(pgv_to_page(&h.h2->tp_status));
break;
case TPACKET_V3:
- h.h3->tp_status = status;
+ WRITE_ONCE(h.h3->tp_status, status);
flush_dcache_page(pgv_to_page(&h.h3->tp_status));
break;
default:
smp_rmb();
+ /* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */
+
h.raw = frame;
switch (po->tp_version) {
case TPACKET_V1:
flush_dcache_page(pgv_to_page(&h.h1->tp_status));
- return h.h1->tp_status;
+ return READ_ONCE(h.h1->tp_status);
case TPACKET_V2:
flush_dcache_page(pgv_to_page(&h.h2->tp_status));
- return h.h2->tp_status;
+ return READ_ONCE(h.h2->tp_status);
case TPACKET_V3:
flush_dcache_page(pgv_to_page(&h.h3->tp_status));
- return h.h3->tp_status;
+ return READ_ONCE(h.h3->tp_status);
default:
WARN(1, "TPACKET version not supported.\n");
BUG();
sk->sk_state = SMC_INIT;
sk->sk_destruct = smc_destruct;
sk->sk_protocol = protocol;
- WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem));
- WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem));
+ WRITE_ONCE(sk->sk_sndbuf, 2 * READ_ONCE(net->smc.sysctl_wmem));
+ WRITE_ONCE(sk->sk_rcvbuf, 2 * READ_ONCE(net->smc.sysctl_rmem));
smc = smc_sk(sk);
INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
INIT_WORK(&smc->connect_work, smc_connect_work);
return rc;
}
+/* copy only relevant settings and flags of SOL_SOCKET level from smc to
+ * clc socket (since smc is not called for these options from net/core)
+ */
+
+#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
+ (1UL << SOCK_KEEPOPEN) | \
+ (1UL << SOCK_LINGER) | \
+ (1UL << SOCK_BROADCAST) | \
+ (1UL << SOCK_TIMESTAMP) | \
+ (1UL << SOCK_DBG) | \
+ (1UL << SOCK_RCVTSTAMP) | \
+ (1UL << SOCK_RCVTSTAMPNS) | \
+ (1UL << SOCK_LOCALROUTE) | \
+ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
+ (1UL << SOCK_RXQ_OVFL) | \
+ (1UL << SOCK_WIFI_STATUS) | \
+ (1UL << SOCK_NOFCS) | \
+ (1UL << SOCK_FILTER_LOCKED) | \
+ (1UL << SOCK_TSTAMP_NEW))
+
+/* if set, use value set by setsockopt() - else use IPv4 or SMC sysctl value */
+static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk,
+ unsigned long mask)
+{
+ struct net *nnet = sock_net(nsk);
+
+ nsk->sk_userlocks = osk->sk_userlocks;
+ if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) {
+ nsk->sk_sndbuf = osk->sk_sndbuf;
+ } else {
+ if (mask == SK_FLAGS_SMC_TO_CLC)
+ WRITE_ONCE(nsk->sk_sndbuf,
+ READ_ONCE(nnet->ipv4.sysctl_tcp_wmem[1]));
+ else
+ WRITE_ONCE(nsk->sk_sndbuf,
+ 2 * READ_ONCE(nnet->smc.sysctl_wmem));
+ }
+ if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) {
+ nsk->sk_rcvbuf = osk->sk_rcvbuf;
+ } else {
+ if (mask == SK_FLAGS_SMC_TO_CLC)
+ WRITE_ONCE(nsk->sk_rcvbuf,
+ READ_ONCE(nnet->ipv4.sysctl_tcp_rmem[1]));
+ else
+ WRITE_ONCE(nsk->sk_rcvbuf,
+ 2 * READ_ONCE(nnet->smc.sysctl_rmem));
+ }
+}
+
static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
unsigned long mask)
{
/* options we don't get control via setsockopt for */
nsk->sk_type = osk->sk_type;
- nsk->sk_sndbuf = osk->sk_sndbuf;
- nsk->sk_rcvbuf = osk->sk_rcvbuf;
nsk->sk_sndtimeo = osk->sk_sndtimeo;
nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
nsk->sk_mark = READ_ONCE(osk->sk_mark);
nsk->sk_flags &= ~mask;
nsk->sk_flags |= osk->sk_flags & mask;
+
+ smc_adjust_sock_bufsizes(nsk, osk, mask);
}
-#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
- (1UL << SOCK_KEEPOPEN) | \
- (1UL << SOCK_LINGER) | \
- (1UL << SOCK_BROADCAST) | \
- (1UL << SOCK_TIMESTAMP) | \
- (1UL << SOCK_DBG) | \
- (1UL << SOCK_RCVTSTAMP) | \
- (1UL << SOCK_RCVTSTAMPNS) | \
- (1UL << SOCK_LOCALROUTE) | \
- (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
- (1UL << SOCK_RXQ_OVFL) | \
- (1UL << SOCK_WIFI_STATUS) | \
- (1UL << SOCK_NOFCS) | \
- (1UL << SOCK_FILTER_LOCKED) | \
- (1UL << SOCK_TSTAMP_NEW))
-/* copy only relevant settings and flags of SOL_SOCKET level from smc to
- * clc socket (since smc is not called for these options from net/core)
- */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
sock_hold(lsk); /* sock_put in smc_listen_work */
INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
smc_copy_sock_settings_to_smc(new_smc);
- new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
- new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
sock_hold(&new_smc->sk); /* sock_put in passive closing */
if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work))
sock_put(&new_smc->sk);
struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */
struct smc_buf_desc *rmb_desc; /* RMBE descriptor */
- int rmbe_size_short;/* compressed notation */
+ int rmbe_size_comp; /* compressed notation */
int rmbe_update_limit;
/* lower limit for consumer
* cursor update
clc->d0.gid =
conn->lgr->smcd->ops->get_local_gid(conn->lgr->smcd);
clc->d0.token = conn->rmb_desc->token;
- clc->d0.dmbe_size = conn->rmbe_size_short;
+ clc->d0.dmbe_size = conn->rmbe_size_comp;
clc->d0.dmbe_idx = 0;
memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
if (version == SMC_V1) {
clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu);
break;
}
- clc->r0.rmbe_size = conn->rmbe_size_short;
+ clc->r0.rmbe_size = conn->rmbe_size_comp;
clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ?
cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) :
cpu_to_be64((u64)sg_dma_address
struct smc_connection *conn = &smc->conn;
struct smc_link_group *lgr = conn->lgr;
struct list_head *buf_list;
- int bufsize, bufsize_short;
+ int bufsize, bufsize_comp;
struct rw_semaphore *lock; /* lock buffer list */
bool is_dgraded = false;
- int sk_buf_size;
if (is_rmb)
/* use socket recv buffer size (w/o overhead) as start value */
- sk_buf_size = smc->sk.sk_rcvbuf;
+ bufsize = smc->sk.sk_rcvbuf / 2;
else
/* use socket send buffer size (w/o overhead) as start value */
- sk_buf_size = smc->sk.sk_sndbuf;
+ bufsize = smc->sk.sk_sndbuf / 2;
- for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb);
- bufsize_short >= 0; bufsize_short--) {
+ for (bufsize_comp = smc_compress_bufsize(bufsize, is_smcd, is_rmb);
+ bufsize_comp >= 0; bufsize_comp--) {
if (is_rmb) {
lock = &lgr->rmbs_lock;
- buf_list = &lgr->rmbs[bufsize_short];
+ buf_list = &lgr->rmbs[bufsize_comp];
} else {
lock = &lgr->sndbufs_lock;
- buf_list = &lgr->sndbufs[bufsize_short];
+ buf_list = &lgr->sndbufs[bufsize_comp];
}
- bufsize = smc_uncompress_bufsize(bufsize_short);
+ bufsize = smc_uncompress_bufsize(bufsize_comp);
/* check for reusable slot in the link group */
- buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
+ buf_desc = smc_buf_get_slot(bufsize_comp, lock, buf_list);
if (buf_desc) {
buf_desc->is_dma_need_sync = 0;
SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize);
if (is_rmb) {
conn->rmb_desc = buf_desc;
- conn->rmbe_size_short = bufsize_short;
- smc->sk.sk_rcvbuf = bufsize;
+ conn->rmbe_size_comp = bufsize_comp;
+ smc->sk.sk_rcvbuf = bufsize * 2;
atomic_set(&conn->bytes_to_rcv, 0);
conn->rmbe_update_limit =
smc_rmb_wnd_update_limit(buf_desc->len);
smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
} else {
conn->sndbuf_desc = buf_desc;
- smc->sk.sk_sndbuf = bufsize;
+ smc->sk.sk_sndbuf = bufsize * 2;
atomic_set(&conn->sndbuf_space, bufsize);
}
return 0;
static int min_sndbuf = SMC_BUF_MIN_SIZE;
static int min_rcvbuf = SMC_BUF_MIN_SIZE;
+static int max_sndbuf = INT_MAX / 2;
+static int max_rcvbuf = INT_MAX / 2;
+static const int net_smc_wmem_init = (64 * 1024);
+static const int net_smc_rmem_init = (64 * 1024);
static struct ctl_table smc_table[] = {
{
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &min_sndbuf,
+ .extra2 = &max_sndbuf,
},
{
.procname = "rmem",
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &min_rcvbuf,
+ .extra2 = &max_rcvbuf,
},
{ }
};
net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE;
net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS;
net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME;
- WRITE_ONCE(net->smc.sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]));
- WRITE_ONCE(net->smc.sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]));
+ WRITE_ONCE(net->smc.sysctl_wmem, net_smc_wmem_init);
+ WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init);
return 0;
static LIST_HEAD(tls_device_down_list);
static DEFINE_SPINLOCK(tls_device_lock);
+static struct page *dummy_page;
+
static void tls_device_free_ctx(struct tls_context *ctx)
{
if (ctx->tx_conf == TLS_HW) {
return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags);
}
-static int tls_device_record_close(struct sock *sk,
- struct tls_context *ctx,
- struct tls_record_info *record,
- struct page_frag *pfrag,
- unsigned char record_type)
+static void tls_device_record_close(struct sock *sk,
+ struct tls_context *ctx,
+ struct tls_record_info *record,
+ struct page_frag *pfrag,
+ unsigned char record_type)
{
struct tls_prot_info *prot = &ctx->prot_info;
- int ret;
+ struct page_frag dummy_tag_frag;
/* append tag
* device will fill in the tag, we just need to append a placeholder
* use socket memory to improve coalescing (re-using a single buffer
* increases frag count)
- * if we can't allocate memory now, steal some back from data
+ * if we can't allocate memory now use the dummy page
*/
- if (likely(skb_page_frag_refill(prot->tag_size, pfrag,
- sk->sk_allocation))) {
- ret = 0;
- tls_append_frag(record, pfrag, prot->tag_size);
- } else {
- ret = prot->tag_size;
- if (record->len <= prot->overhead_size)
- return -ENOMEM;
+ if (unlikely(pfrag->size - pfrag->offset < prot->tag_size) &&
+ !skb_page_frag_refill(prot->tag_size, pfrag, sk->sk_allocation)) {
+ dummy_tag_frag.page = dummy_page;
+ dummy_tag_frag.offset = 0;
+ pfrag = &dummy_tag_frag;
}
+ tls_append_frag(record, pfrag, prot->tag_size);
/* fill prepend */
tls_fill_prepend(ctx, skb_frag_address(&record->frags[0]),
record->len - prot->overhead_size,
record_type);
- return ret;
}
static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx,
if (done || record->len >= max_open_record_len ||
(record->num_frags >= MAX_SKB_FRAGS - 1)) {
- rc = tls_device_record_close(sk, tls_ctx, record,
- pfrag, record_type);
- if (rc) {
- if (rc > 0) {
- size += rc;
- } else {
- size = orig_size;
- destroy_record(record);
- ctx->open_record = NULL;
- break;
- }
- }
+ tls_device_record_close(sk, tls_ctx, record,
+ pfrag, record_type);
rc = tls_push_record(sk,
tls_ctx,
{
int err;
- destruct_wq = alloc_workqueue("ktls_device_destruct", 0, 0);
- if (!destruct_wq)
+ dummy_page = alloc_page(GFP_KERNEL);
+ if (!dummy_page)
return -ENOMEM;
+ destruct_wq = alloc_workqueue("ktls_device_destruct", 0, 0);
+ if (!destruct_wq) {
+ err = -ENOMEM;
+ goto err_free_dummy;
+ }
+
err = register_netdevice_notifier(&tls_dev_notifier);
if (err)
- destroy_workqueue(destruct_wq);
+ goto err_destroy_wq;
+ return 0;
+
+err_destroy_wq:
+ destroy_workqueue(destruct_wq);
+err_free_dummy:
+ put_page(dummy_page);
return err;
}
unregister_netdevice_notifier(&tls_dev_notifier);
destroy_workqueue(destruct_wq);
clean_acked_data_flush();
+ put_page(dummy_page);
}
ctx->splicing_pages = true;
while (1) {
- if (sg_is_last(sg))
- msg.msg_flags = flags;
-
/* is sending application-limited? */
tcp_rate_check_app_limited(sk);
p = sg_page(sg);
if (!wiphy->mbssid_max_interfaces)
return ERR_PTR(-EINVAL);
- nla_for_each_nested(nl_elems, attrs, rem_elems)
+ nla_for_each_nested(nl_elems, attrs, rem_elems) {
+ if (num_elems >= 255)
+ return ERR_PTR(-EINVAL);
num_elems++;
+ }
elems = kzalloc(struct_size(elems, elem, num_elems), GFP_KERNEL);
if (!elems)
err = xp_alloc_tx_descs(xs->pool, xs);
if (err) {
xp_put_pool(xs->pool);
+ xs->pool = NULL;
sockfd_put(sock);
goto out_unlock;
}
#include <linux/sysctl.h>
#include "internal.h"
-struct ctl_table key_sysctls[] = {
+static struct ctl_table key_sysctls[] = {
{
.procname = "maxkeys",
.data = &key_quota_maxkeys,
* Defines x86 CPU feature bits
*/
#define NCAPINTS 21 /* N 32-bit words worth of info */
-#define NBUGINTS 1 /* N 32-bit bug flags */
+#define NBUGINTS 2 /* N 32-bit bug flags */
/*
* Note: If the comment begins with a quoted string, that string is used
#define MSR_AMD64_DE_CFG 0xc0011029
#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1
#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE BIT_ULL(MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT)
+#define MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT 9
#define MSR_AMD64_BU_CFG2 0xc001102a
#define MSR_AMD64_IBSFETCHCTL 0xc0011030
bool arch_is_rethunk(struct symbol *sym)
{
- return !strcmp(sym->name, "__x86_return_thunk");
+ return !strcmp(sym->name, "__x86_return_thunk") ||
+ !strcmp(sym->name, "srso_untrain_ret") ||
+ !strcmp(sym->name, "srso_safe_ret") ||
+ !strcmp(sym->name, "__ret");
}
static void __machine__remove_thread(struct machine *machine, struct thread_rb_node *nd,
struct thread *th, bool lock);
-static int append_inlines(struct callchain_cursor *cursor, struct map_symbol *ms, u64 ip);
static struct dso *machine__kernel_dso(struct machine *machine)
{
ms.maps = maps__get(al.maps);
ms.map = map__get(al.map);
ms.sym = al.sym;
-
- if (!branch && append_inlines(cursor, &ms, ip) == 0)
- goto out;
-
srcline = callchain_srcline(&ms, al.addr);
err = callchain_cursor_append(cursor, ip, &ms,
branch, flags, nr_loop_iter,
*/
if (config->aggr_mode == AGGR_THREAD && config->system_wide)
return true;
+
+ /* Tool events have the software PMU but are only gathered on 1. */
+ if (evsel__is_tool(counter))
+ return true;
+
/*
* Skip value 0 when it's an uncore event and the given aggr id
* does not belong to the PMU cpumask.
nr_threads = 2;
pthread_barrier_init(&worker_barrier, NULL, nr_threads);
- threads = malloc(nr_threads * sizeof(pthread_t *));
+ threads = malloc(nr_threads * sizeof(*threads));
for (i = 0; i < nr_threads; i++) {
arg = i;
xbpf_prog_detach2(verdict, sock_map, BPF_SK_MSG_VERDICT);
}
+static void redir_partial(int family, int sotype, int sock_map, int parser_map)
+{
+ int s, c0, c1, p0, p1;
+ int err, n, key, value;
+ char buf[] = "abc";
+
+ key = 0;
+ value = sizeof(buf) - 1;
+ err = xbpf_map_update_elem(parser_map, &key, &value, 0);
+ if (err)
+ return;
+
+ s = socket_loopback(family, sotype | SOCK_NONBLOCK);
+ if (s < 0)
+ goto clean_parser_map;
+
+ err = create_socket_pairs(s, family, sotype, &c0, &c1, &p0, &p1);
+ if (err)
+ goto close_srv;
+
+ err = add_to_sockmap(sock_map, p0, p1);
+ if (err)
+ goto close;
+
+ n = xsend(c1, buf, sizeof(buf), 0);
+ if (n < sizeof(buf))
+ FAIL("incomplete write");
+
+ n = xrecv_nonblock(c0, buf, sizeof(buf), 0);
+ if (n != sizeof(buf) - 1)
+ FAIL("expect %zu, received %d", sizeof(buf) - 1, n);
+
+close:
+ xclose(c0);
+ xclose(p0);
+ xclose(c1);
+ xclose(p1);
+close_srv:
+ xclose(s);
+
+clean_parser_map:
+ key = 0;
+ value = 0;
+ xbpf_map_update_elem(parser_map, &key, &value, 0);
+}
+
+static void test_skb_redir_partial(struct test_sockmap_listen *skel,
+ struct bpf_map *inner_map, int family,
+ int sotype)
+{
+ int verdict = bpf_program__fd(skel->progs.prog_stream_verdict);
+ int parser = bpf_program__fd(skel->progs.prog_stream_parser);
+ int parser_map = bpf_map__fd(skel->maps.parser_map);
+ int sock_map = bpf_map__fd(inner_map);
+ int err;
+
+ err = xbpf_prog_attach(parser, sock_map, BPF_SK_SKB_STREAM_PARSER, 0);
+ if (err)
+ return;
+
+ err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_STREAM_VERDICT, 0);
+ if (err)
+ goto detach;
+
+ redir_partial(family, sotype, sock_map, parser_map);
+
+ xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_STREAM_VERDICT);
+detach:
+ xbpf_prog_detach2(parser, sock_map, BPF_SK_SKB_STREAM_PARSER);
+}
+
static void test_reuseport_select_listening(int family, int sotype,
int sock_map, int verd_map,
int reuseport_prog)
} tests[] = {
TEST(test_skb_redir_to_connected),
TEST(test_skb_redir_to_listening),
+ TEST(test_skb_redir_partial),
TEST(test_msg_redir_to_connected),
TEST(test_msg_redir_to_listening),
};
if (n < 1)
goto out;
- n = recv(mode == REDIR_INGRESS ? u0 : u1, &b, sizeof(b), MSG_DONTWAIT);
+ n = xrecv_nonblock(mode == REDIR_INGRESS ? u0 : u1, &b, sizeof(b), 0);
if (n < 0)
FAIL("%s: recv() err, errno=%d", log_prefix, errno);
if (n == 0)
__type(value, unsigned int);
} verdict_map SEC(".maps");
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, int);
+} parser_map SEC(".maps");
+
bool test_sockmap = false; /* toggled by user-space */
bool test_ingress = false; /* toggled by user-space */
SEC("sk_skb/stream_parser")
int prog_stream_parser(struct __sk_buff *skb)
{
+ int *value;
+ __u32 key = 0;
+
+ value = bpf_map_lookup_elem(&parser_map, &key);
+ if (value && *value)
+ return *value;
+
return skb->len;
}
goto cleanup;
cg_write(cg, "memory.high", "1M");
+
+ /* wait for RCU freeing */
+ sleep(1);
+
slab1 = cg_read_key_long(cg, "memory.stat", "slab ");
if (slab1 <= 0)
goto cleanup;
printf("Size must be greater than 0\n");
return KSFT_FAIL;
}
+ break;
case 't':
{
int tmp = atoi(optarg);
run_cmd "$IP link set dev lo up"
+ # Dump should not loop endlessly when maximum nexthop ID is configured.
+ run_cmd "$IP nexthop add id $((2**32-1)) blackhole"
+ run_cmd "timeout 5 $IP nexthop"
+ log_test $? 0 "Maximum nexthop ID dump"
+
#
# groups
#
run_cmd "$IP nexthop bucket list fdb"
log_test $? 255 "Dump all nexthop buckets with invalid 'fdb' keyword"
+ # Dump should not loop endlessly when maximum nexthop ID is configured.
+ run_cmd "$IP nexthop add id $((2**32-1)) group 1/2 type resilient buckets 4"
+ run_cmd "timeout 5 $IP nexthop bucket"
+ log_test $? 0 "Maximum nexthop ID dump"
+
#
# resilient nexthop buckets get requests
#
grep -q "permanent"
check_err $? "Entry not added as \"permanent\" when should"
bridge -d -s mdb show dev br0 vid 10 | grep "$grp_key" | \
- grep -q "0.00"
+ grep -q " 0.00"
check_err $? "\"permanent\" entry has a pending group timer"
bridge mdb del dev br0 port $swp1 $grp_key vid 10
grep -q "temp"
check_err $? "Entry not added as \"temp\" when should"
bridge -d -s mdb show dev br0 vid 10 | grep "$grp_key" | \
- grep -q "0.00"
+ grep -q " 0.00"
check_fail $? "\"temp\" entry has an unpending group timer"
bridge mdb del dev br0 port $swp1 $grp_key vid 10
grep -q "permanent"
check_err $? "Entry not marked as \"permanent\" after replace"
bridge -d -s mdb show dev br0 vid 10 | grep "$grp_key" | \
- grep -q "0.00"
+ grep -q " 0.00"
check_err $? "Entry has a pending group timer after replace"
bridge mdb replace dev br0 port $swp1 $grp_key vid 10 temp
grep -q "temp"
check_err $? "Entry not marked as \"temp\" after replace"
bridge -d -s mdb show dev br0 vid 10 | grep "$grp_key" | \
- grep -q "0.00"
+ grep -q " 0.00"
check_fail $? "Entry has an unpending group timer after replace"
bridge mdb del dev br0 port $swp1 $grp_key vid 10
__fwd_test_host_ip()
{
local grp=$1; shift
+ local dmac=$1; shift
local src=$1; shift
local mode=$1; shift
local name
# Packet should only be flooded to multicast router ports when there is
# no matching MDB entry. The bridge is not configured as a multicast
# router port.
- $MZ $mode $h1.10 -c 1 -p 128 -A $src -B $grp -t udp -q
+ $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $src -B $grp -t udp -q
tc_check_packets "dev br0 ingress" 1 0
check_err $? "Packet locally received after flood"
# Install a regular port group entry and expect the packet to not be
# locally received.
bridge mdb add dev br0 port $swp2 grp $grp temp vid 10
- $MZ $mode $h1.10 -c 1 -p 128 -A $src -B $grp -t udp -q
+ $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $src -B $grp -t udp -q
tc_check_packets "dev br0 ingress" 1 0
check_err $? "Packet locally received after installing a regular entry"
# Add a host entry and expect the packet to be locally received.
bridge mdb add dev br0 port br0 grp $grp temp vid 10
- $MZ $mode $h1.10 -c 1 -p 128 -A $src -B $grp -t udp -q
+ $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $src -B $grp -t udp -q
tc_check_packets "dev br0 ingress" 1 1
check_err $? "Packet not locally received after adding a host entry"
# Remove the host entry and expect the packet to not be locally
# received.
bridge mdb del dev br0 port br0 grp $grp vid 10
- $MZ $mode $h1.10 -c 1 -p 128 -A $src -B $grp -t udp -q
+ $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $src -B $grp -t udp -q
tc_check_packets "dev br0 ingress" 1 1
check_err $? "Packet locally received after removing a host entry"
fwd_test_host_ip()
{
- __fwd_test_host_ip "239.1.1.1" "192.0.2.1" "-4"
- __fwd_test_host_ip "ff0e::1" "2001:db8:1::1" "-6"
+ __fwd_test_host_ip "239.1.1.1" "01:00:5e:01:01:01" "192.0.2.1" "-4"
+ __fwd_test_host_ip "ff0e::1" "33:33:00:00:00:01" "2001:db8:1::1" "-6"
}
fwd_test_host_l2()
__fwd_test_port_ip()
{
local grp=$1; shift
+ local dmac=$1; shift
local valid_src=$1; shift
local invalid_src=$1; shift
local mode=$1; shift
vlan_ethtype $eth_type vlan_id 10 dst_ip $grp \
src_ip $invalid_src action drop
- $MZ $mode $h1.10 -c 1 -p 128 -A $valid_src -B $grp -t udp -q
+ $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $valid_src -B $grp -t udp -q
tc_check_packets "dev $h2 ingress" 1 0
check_err $? "Packet from valid source received on H2 before adding entry"
- $MZ $mode $h1.10 -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
+ $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
tc_check_packets "dev $h2 ingress" 2 0
check_err $? "Packet from invalid source received on H2 before adding entry"
bridge mdb add dev br0 port $swp2 grp $grp vid 10 \
filter_mode $filter_mode source_list $src_list
- $MZ $mode $h1.10 -c 1 -p 128 -A $valid_src -B $grp -t udp -q
+ $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $valid_src -B $grp -t udp -q
tc_check_packets "dev $h2 ingress" 1 1
check_err $? "Packet from valid source not received on H2 after adding entry"
- $MZ $mode $h1.10 -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
+ $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
tc_check_packets "dev $h2 ingress" 2 0
check_err $? "Packet from invalid source received on H2 after adding entry"
bridge mdb replace dev br0 port $swp2 grp $grp vid 10 \
filter_mode exclude
- $MZ $mode $h1.10 -c 1 -p 128 -A $valid_src -B $grp -t udp -q
+ $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $valid_src -B $grp -t udp -q
tc_check_packets "dev $h2 ingress" 1 2
check_err $? "Packet from valid source not received on H2 after allowing all sources"
- $MZ $mode $h1.10 -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
+ $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
tc_check_packets "dev $h2 ingress" 2 1
check_err $? "Packet from invalid source not received on H2 after allowing all sources"
bridge mdb del dev br0 port $swp2 grp $grp vid 10
- $MZ $mode $h1.10 -c 1 -p 128 -A $valid_src -B $grp -t udp -q
+ $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $valid_src -B $grp -t udp -q
tc_check_packets "dev $h2 ingress" 1 2
check_err $? "Packet from valid source received on H2 after deleting entry"
- $MZ $mode $h1.10 -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
+ $MZ $mode $h1.10 -a own -b $dmac -c 1 -p 128 -A $invalid_src -B $grp -t udp -q
tc_check_packets "dev $h2 ingress" 2 1
check_err $? "Packet from invalid source received on H2 after deleting entry"
fwd_test_port_ip()
{
- __fwd_test_port_ip "239.1.1.1" "192.0.2.1" "192.0.2.2" "-4" "exclude"
- __fwd_test_port_ip "ff0e::1" "2001:db8:1::1" "2001:db8:1::2" "-6" \
+ __fwd_test_port_ip "239.1.1.1" "01:00:5e:01:01:01" "192.0.2.1" "192.0.2.2" "-4" "exclude"
+ __fwd_test_port_ip "ff0e::1" "33:33:00:00:00:01" "2001:db8:1::1" "2001:db8:1::2" "-6" \
"exclude"
- __fwd_test_port_ip "239.1.1.1" "192.0.2.1" "192.0.2.2" "-4" "include"
- __fwd_test_port_ip "ff0e::1" "2001:db8:1::1" "2001:db8:1::2" "-6" \
+ __fwd_test_port_ip "239.1.1.1" "01:00:5e:01:01:01" "192.0.2.1" "192.0.2.2" "-4" "include"
+ __fwd_test_port_ip "ff0e::1" "33:33:00:00:00:01" "2001:db8:1::1" "2001:db8:1::2" "-6" \
"include"
}
filter_mode include source_list 192.0.2.1
# IS_IN ( 192.0.2.2 )
- $MZ $h1.10 -c 1 -A 192.0.2.1 -B 239.1.1.1 \
+ $MZ $h1.10 -c 1 -a own -b 01:00:5e:01:01:01 -A 192.0.2.1 -B 239.1.1.1 \
-t ip proto=2,p=$(igmpv3_is_in_get 239.1.1.1 192.0.2.2) -q
bridge -d mdb show dev br0 vid 10 | grep 239.1.1.1 | grep -q 192.0.2.2
filter_mode include source_list 192.0.2.1
# IS_IN ( 192.0.2.2 )
- $MZ $h1.10 -c 1 -A 192.0.2.1 -B 239.1.1.1 \
+ $MZ $h1.10 -a own -b 01:00:5e:01:01:01 -c 1 -A 192.0.2.1 -B 239.1.1.1 \
-t ip proto=2,p=$(igmpv3_is_in_get 239.1.1.1 192.0.2.2) -q
bridge -d mdb show dev br0 vid 10 | grep 239.1.1.1 | grep -v "src" | \
# IS_IN ( 2001:db8:1::2 )
local p=$(mldv2_is_in_get fe80::1 ff0e::1 2001:db8:1::2)
- $MZ -6 $h1.10 -c 1 -A fe80::1 -B ff0e::1 \
+ $MZ -6 $h1.10 -a own -b 33:33:00:00:00:01 -c 1 -A fe80::1 -B ff0e::1 \
-t ip hop=1,next=0,p="$p" -q
bridge -d mdb show dev br0 vid 10 | grep ff0e::1 | \
filter_mode include source_list 2001:db8:1::1
# IS_IN ( 2001:db8:1::2 )
- $MZ -6 $h1.10 -c 1 -A fe80::1 -B ff0e::1 \
+ $MZ -6 $h1.10 -a own -b 33:33:00:00:00:01 -c 1 -A fe80::1 -B ff0e::1 \
-t ip hop=1,next=0,p="$p" -q
bridge -d mdb show dev br0 vid 10 | grep ff0e::1 | grep -v "src" | \
ctrl_mldv2_is_in_test
}
+if ! bridge mdb help 2>&1 | grep -q "replace"; then
+ echo "SKIP: iproute2 too old, missing bridge mdb replace support"
+ exit $ksft_skip
+fi
+
trap cleanup EXIT
setup_prepare
local IPs=$(seq -f 192.0.2.%g 1 $((n - 1)))
local peer=$(locus_dev_peer $locus)
local GRP=239.1.1.${grp}
- $MZ $peer -c 1 -A 192.0.2.1 -B $GRP \
+ local dmac=01:00:5e:01:01:$(printf "%02x" $grp)
+ $MZ $peer -a own -b $dmac -c 1 -A 192.0.2.1 -B $GRP \
-t ip proto=2,p=$(igmpv3_is_in_get $GRP $IPs) -q
sleep 1
local peer=$(locus_dev_peer $locus)
local GRP=239.1.1.${grp}
- $MZ $peer -c 1 -A 192.0.2.1 -B 224.0.0.2 \
+ local dmac=01:00:5e:00:00:02
+ $MZ $peer -a own -b $dmac -c 1 -A 192.0.2.1 -B 224.0.0.2 \
-t ip proto=2,p=$(igmpv2_leave_get $GRP) -q
sleep 1
! bridge mdb show dev br0 | grep -q $GRP
local peer=$(locus_dev_peer $locus)
local SIP=fe80::1
local GRP=ff0e::${grp}
+ local dmac=33:33:00:00:00:$(printf "%02x" $grp)
local p=$(mldv2_is_in_get $SIP $GRP $IPs)
- $MZ -6 $peer -c 1 -A $SIP -B $GRP -t ip hop=1,next=0,p="$p" -q
+ $MZ -6 $peer -a own -b $dmac -c 1 -A $SIP -B $GRP \
+ -t ip hop=1,next=0,p="$p" -q
sleep 1
local nn=$(bridge mdb show dev br0 | grep $GRP | wc -l)
local peer=$(locus_dev_peer $locus)
local SIP=fe80::1
local GRP=ff0e::${grp}
+ local dmac=33:33:00:00:00:$(printf "%02x" $grp)
local p=$(mldv1_done_get $SIP $GRP)
- $MZ -6 $peer -c 1 -A $SIP -B $GRP -t ip hop=1,next=0,p="$p" -q
+ $MZ -6 $peer -a own -b $dmac -c 1 -A $SIP -B $GRP \
+ -t ip hop=1,next=0,p="$p" -q
sleep 1
! bridge mdb show dev br0 | grep -q $GRP
}
switch_destroy
}
+if ! bridge link help 2>&1 | grep -q "mcast_max_groups"; then
+ echo "SKIP: iproute2 too old, missing bridge \"mcast_max_groups\" support"
+ exit $ksft_skip
+fi
+
trap cleanup EXIT
setup_prepare
ethtool -s $h1 autoneg on
}
+skip_on_veth
+
trap cleanup EXIT
setup_prepare
ip link set dev $swp3 down
}
+skip_on_veth
+
setup_prepare
tests_run
setup_prepare()
{
- check_ethtool_mm_support
- check_tc_fp_support
- require_command lldptool
- bail_on_lldpad "autoconfigure the MAC Merge layer" "configure it manually"
-
h1=${NETIFS[p1]}
h2=${NETIFS[p2]}
h1_destroy
}
+check_ethtool_mm_support
+check_tc_fp_support
+require_command lldptool
+bail_on_lldpad "autoconfigure the MAC Merge layer" "configure it manually"
+
+for netif in ${NETIFS[@]}; do
+ ethtool --show-mm $netif 2>&1 &> /dev/null
+ if [[ $? -ne 0 ]]; then
+ echo "SKIP: $netif does not support MAC Merge"
+ exit $ksft_skip
+ fi
+done
+
trap cleanup EXIT
setup_prepare
test_stats g2a rx
}
+skip_on_veth
+
trap cleanup EXIT
setup_prepare
NUM_NETIFS=4
source lib.sh
+require_command $TROUTE6
+
h1_create()
{
simple_if_init $h1 2001:1:1::2/64
REQUIRE_MTOOLS=${REQUIRE_MTOOLS:=no}
STABLE_MAC_ADDRS=${STABLE_MAC_ADDRS:=no}
TCPDUMP_EXTRA_FLAGS=${TCPDUMP_EXTRA_FLAGS:=}
+TROUTE6=${TROUTE6:=traceroute6}
relative_path="${BASH_SOURCE%/*}"
if [[ "$relative_path" == "${BASH_SOURCE}" ]]; then
fi
}
+skip_on_veth()
+{
+ local kind=$(ip -j -d link show dev ${NETIFS[p1]} |
+ jq -r '.[].linkinfo.info_kind')
+
+ if [[ $kind == veth ]]; then
+ echo "SKIP: Test cannot be run with veth pairs"
+ exit $ksft_skip
+ fi
+}
+
if [[ "$(id -u)" -ne 0 ]]; then
echo "SKIP: need root privileges"
exit $ksft_skip
for ((i = 1; i <= NUM_NETIFS; ++i)); do
local j=$((i+1))
+ if [ -z ${NETIFS[p$i]} ]; then
+ echo "SKIP: Cannot create interface. Name not specified"
+ exit $ksft_skip
+ fi
+
ip link show dev ${NETIFS[p$i]} &> /dev/null
if [[ $? -ne 0 ]]; then
ip link add ${NETIFS[p$i]} type veth \
source tc_common.sh
source lib.sh
+require_command ncat
+
tcflags="skip_hw"
h1_create()
ip_proto icmp \
action drop
- ip vrf exec v$h1 nc --recv-only -w10 -l -p 12345 -o $mirred_e2i_tf2 &
+ ip vrf exec v$h1 ncat --recv-only -w10 -l -p 12345 -o $mirred_e2i_tf2 &
local rpid=$!
- ip vrf exec v$h1 nc -w1 --send-only 192.0.2.2 12345 <$mirred_e2i_tf1
+ ip vrf exec v$h1 ncat -w1 --send-only 192.0.2.2 12345 <$mirred_e2i_tf1
wait -n $rpid
cmp -s $mirred_e2i_tf1 $mirred_e2i_tf2
check_err $? "server output check failed"
tc_check_packets "dev $h2 ingress" 101 1
check_fail $? "Matched on a wrong filter"
- tc_check_packets "dev $h2 ingress" 102 1
- check_err $? "Did not match on correct filter"
+ tc_check_packets "dev $h2 ingress" 102 0
+ check_fail $? "Did not match on correct filter"
tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
tc_check_packets "dev $h2 ingress" 101 1
check_fail $? "Matched on a wrong filter"
- tc_check_packets "dev $h2 ingress" 102 1
- check_err $? "Did not match on correct filter"
+ tc_check_packets "dev $h2 ingress" 102 0
+ check_fail $? "Did not match on correct filter"
tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
local proto=$1; shift
local sip=$1; shift
local dip=$1; shift
+ local dmac=$1; shift
local mode=$1; shift
local name=$1; shift
action pass
# Before adding MDB entry.
- $MZ $mode $h1 -t ip -A $sip -B $dip -c 1 -p 100 -q
+ $MZ $mode $h1 -a own -b $dmac -t ip -A $sip -B $dip -c 1 -p 100 -q
tc_check_packets "dev $swp2 egress" 101 1
check_err $? "Unregistered multicast filter was not hit before adding MDB entry"
# Adding MDB entry.
bridge mdb replace dev br1 port $swp2 grp $dip permanent
- $MZ $mode $h1 -t ip -A $sip -B $dip -c 1 -p 100 -q
+ $MZ $mode $h1 -a own -b $dmac -t ip -A $sip -B $dip -c 1 -p 100 -q
tc_check_packets "dev $swp2 egress" 101 1
check_err $? "Unregistered multicast filter was hit after adding MDB entry"
# Deleting MDB entry.
bridge mdb del dev br1 port $swp2 grp $dip
- $MZ $mode $h1 -t ip -A $sip -B $dip -c 1 -p 100 -q
+ $MZ $mode $h1 -a own -b $dmac -t ip -A $sip -B $dip -c 1 -p 100 -q
tc_check_packets "dev $swp2 egress" 101 2
check_err $? "Unregistered multicast filter was not hit after deleting MDB entry"
local proto="ipv4"
local sip=192.0.2.1
local dip=239.1.1.1
+ local dmac=01:00:5e:01:01:01
local mode="-4"
local name="IPv4"
- test_l2_miss_multicast_common $proto $sip $dip $mode $name
+ test_l2_miss_multicast_common $proto $sip $dip $dmac $mode $name
}
test_l2_miss_multicast_ipv6()
local proto="ipv6"
local sip=2001:db8:1::1
local dip=ff0e::1
+ local dmac=33:33:00:00:00:01
local mode="-6"
local name="IPv6"
- test_l2_miss_multicast_common $proto $sip $dip $mode $name
+ test_l2_miss_multicast_common $proto $sip $dip $dmac $mode $name
}
test_l2_miss_multicast()
local i
tc filter add dev $swp1 ingress protocol ip pref 100 handle 100 \
- flower ip_flags nofrag action drop
+ flower src_ip 192.0.2.1 dst_ip 192.0.2.2 ip_proto udp \
+ ip_flags nofrag action drop
tc filter add dev $swp1 ingress protocol ip pref 101 handle 101 \
- flower ip_flags firstfrag action drop
+ flower src_ip 192.0.2.1 dst_ip 192.0.2.2 ip_proto udp \
+ ip_flags firstfrag action drop
tc filter add dev $swp1 ingress protocol ip pref 102 handle 102 \
- flower ip_flags nofirstfrag action drop
+ flower src_ip 192.0.2.1 dst_ip 192.0.2.2 ip_proto udp \
+ ip_flags nofirstfrag action drop
# test 'nofrag' set
tc filter add dev h1-et egress protocol all pref 1 handle 1 matchall $tcflags \
local addr=$3
if [ $ip_mptcp -eq 1 ]; then
+ [ $id -ne 0 ] && addr=''
ip -n $ns mptcp endpoint delete id $id $addr
else
ip netns exec $ns ./pm_nl_ctl del $id $addr
fi
if [ $ip_mptcp -eq 1 ]; then
+ # get line and trim trailing whitespace
line=$(ip -n $ns mptcp endpoint show $id)
+ line="${line% }"
# the dump order is: address id flags port dev
- expected_line="$addr"
- [ -n "$addr" ] && expected_line="$expected_line $addr"
+ [ -n "$addr" ] && expected_line="$addr"
expected_line="$expected_line $id"
[ -n "$_flags" ] && expected_line="$expected_line ${_flags//","/" "}"
[ -n "$dev" ] && expected_line="$expected_line $dev"
tcpdump_pids=
nettest_pids=
socat_pids=
+tmpoutfile=
err() {
err_buf="${err_buf}${1}
ip link del veth_A-R1 2>/dev/null
ovs-vsctl --if-exists del-port vxlan_a 2>/dev/null
ovs-vsctl --if-exists del-br ovs_br0 2>/dev/null
+ rm -f "$tmpoutfile"
}
mtu() {
check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on bridged ${type} interface"
pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on locally bridged ${type} interface"
+
+ tmpoutfile=$(mktemp)
+
+ # Flush Exceptions, retry with TCP
+ run_cmd ${ns_a} ip route flush cached ${dst}
+ run_cmd ${ns_b} ip route flush cached ${dst}
+ run_cmd ${ns_c} ip route flush cached ${dst}
+
+ for target in "${ns_a}" "${ns_c}" ; do
+ if [ ${family} -eq 4 ]; then
+ TCPDST=TCP:${dst}:50000
+ else
+ TCPDST="TCP:[${dst}]:50000"
+ fi
+ ${ns_b} socat -T 3 -u -6 TCP-LISTEN:50000 STDOUT > $tmpoutfile &
+
+ sleep 1
+
+ dd if=/dev/zero of=/dev/stdout status=none bs=1M count=1 | ${target} socat -T 3 -u STDIN $TCPDST,connect-timeout=3
+
+ size=$(du -sb $tmpoutfile)
+ size=${size%%/tmp/*}
+
+ [ $size -ne 1048576 ] && err "File size $size mismatches exepcted value in locally bridged vxlan test" && return 1
+ done
+
+ rm -f "$tmpoutfile"
+
+ # Check that exceptions were created
+ pmtu="$(route_get_dst_pmtu_from_exception "${ns_c}" ${dst})"
+ check_pmtu_value ${exp_mtu} "${pmtu}" "tcp: exceeding link layer MTU on bridged ${type} interface"
+ pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
+ check_pmtu_value ${exp_mtu} "${pmtu}" "tcp exceeding link layer MTU on locally bridged ${type} interface"
}
test_pmtu_ipv4_br_vxlan4_exception() {
CLANG_FLAGS += -no-integrated-as
endif
+top_srcdir = ../../../..
+
CFLAGS += -O2 -Wall -g -I./ $(KHDR_INCLUDES) -L$(OUTPUT) -Wl,-rpath=./ \
- $(CLANG_FLAGS)
+ $(CLANG_FLAGS) -I$(top_srcdir)/tools/include
LDLIBS += -lpthread -ldl
# Own dependencies because we only want to build against 1st prerequisite, but
#include <sys/auxv.h>
#include <linux/auxvec.h>
+#include <linux/compiler.h>
+
#include "../kselftest.h"
#include "rseq.h"