diff --git a/.gitignore b/.gitignore index 2258e906f01c..23871de69072 100644 --- a/.gitignore +++ b/.gitignore @@ -44,6 +44,7 @@ *.tab.[ch] *.tar *.xz +*.zst Module.symvers modules.builtin modules.order diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 7bc83f3d9bdf..1924845c879c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3033,6 +3033,8 @@ no5lvl [X86-64] Disable 5-level paging mode. Forces kernel to use 4-level paging instead. + nofsgsbase [X86] Disables FSGSBASE instructions. + no_console_suspend [HW] Never suspend the console Disable suspending of consoles during suspend and diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 0329a4d3fa9e..360914b4f346 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -119,6 +119,21 @@ all zones are compacted such that free memory is available in contiguous blocks where possible. This can be important for example in the allocation of huge pages although processes will also directly compact memory as required. +compaction_proactiveness +======================== + +This tunable takes a value in the range [0, 100] with a default value of +20. This tunable determines how aggressively compaction is done in the +background. Setting it to 0 disables proactive compaction. + +Note that compaction has a non-trivial system-wide impact as pages +belonging to different processes are moved around, which could also lead +to latency spikes in unsuspecting applications. The kernel employs +various heuristics to avoid wasting CPU cycles if it detects that +proactive compaction is not being effective. + +Be careful when setting it to extreme values like 100, as that may +cause excessive background compaction activity. compact_unevictable_allowed =========================== diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 38b606991065..3baba687a6fe 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -47,6 +47,7 @@ fixes/update part 1.1 Stefani Seibold June 9 2009 3.10 /proc//timerslack_ns - Task timerslack value 3.11 /proc//patch_state - Livepatch patch operation state 3.12 /proc//arch_status - Task architecture specific information + 3.13 /proc//ksm - Remote KSM 4 Configuring procfs 4.1 Mount options @@ -2131,6 +2132,19 @@ AVX512_elapsed_ms: the task is unlikely an AVX512 user, but depends on the workload and the scheduling scenario, it also could be a false negative mentioned above. +3.13 /proc//ksm - Remote KSM +------------------------------------ +This write-only file allows marking memory of another task for merging +and unmerging via KSM. + +The following actions are available: + + * mark task's memory as mergeable: + # echo merge > /proc//ksm + + * unmerging all the task's memory: + # echo unmerge > /proc//ksm + Configuring procfs ------------------ diff --git a/Documentation/x86/boot.rst b/Documentation/x86/boot.rst index 5325c71ca877..7fafc7ac00d7 100644 --- a/Documentation/x86/boot.rst +++ b/Documentation/x86/boot.rst @@ -782,9 +782,9 @@ Protocol: 2.08+ uncompressed data should be determined using the standard magic numbers. The currently supported compression formats are gzip (magic numbers 1F 8B or 1F 9E), bzip2 (magic number 42 5A), LZMA - (magic number 5D 00), XZ (magic number FD 37), and LZ4 (magic number - 02 21). 
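The /proc/<pid>/ksm file introduced in the proc.rst hunk above accepts only the literal strings "merge" and "unmerge". A minimal user-space sketch of driving it (illustration only, not part of the patch; it assumes CONFIG_KSM is enabled and that the caller has PTRACE_MODE_ATTACH access to the target task)::

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/types.h>
    #include <unistd.h>

    /* Hypothetical helper: action is either "merge" or "unmerge". */
    static int ksm_control(pid_t pid, const char *action)
    {
            char path[64];
            int fd, ret = 0;

            snprintf(path, sizeof(path), "/proc/%d/ksm", (int)pid);
            fd = open(path, O_WRONLY);
            if (fd < 0)
                    return -1;
            /* The kernel side (ksm_write) returns the byte count on success. */
            if (write(fd, action, strlen(action)) != (ssize_t)strlen(action))
                    ret = -1;
            close(fd);
            return ret;
    }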
The uncompressed payload is currently always ELF (magic - number 7F 45 4C 46). + (magic number 5D 00), XZ (magic number FD 37), LZ4 (magic number + 02 21) and ZSTD (magic number 28 B5). The uncompressed payload is + currently always ELF (magic number 7F 45 4C 46). ============ ============== Field name: payload_length diff --git a/Documentation/x86/x86_64/fsgs.rst b/Documentation/x86/x86_64/fsgs.rst new file mode 100644 index 000000000000..50960e09e1f6 --- /dev/null +++ b/Documentation/x86/x86_64/fsgs.rst @@ -0,0 +1,199 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Using FS and GS segments in user space applications +=================================================== + +The x86 architecture supports segmentation. Instructions which access +memory can use segment register based addressing mode. The following +notation is used to address a byte within a segment: + + Segment-register:Byte-address + +The segment base address is added to the Byte-address to compute the +resulting virtual address which is accessed. This allows to access multiple +instances of data with the identical Byte-address, i.e. the same code. The +selection of a particular instance is purely based on the base-address in +the segment register. + +In 32-bit mode the CPU provides 6 segments, which also support segment +limits. The limits can be used to enforce address space protections. + +In 64-bit mode the CS/SS/DS/ES segments are ignored and the base address is +always 0 to provide a full 64bit address space. The FS and GS segments are +still functional in 64-bit mode. + +Common FS and GS usage +------------------------------ + +The FS segment is commonly used to address Thread Local Storage (TLS). FS +is usually managed by runtime code or a threading library. Variables +declared with the '__thread' storage class specifier are instantiated per +thread and the compiler emits the FS: address prefix for accesses to these +variables. Each thread has its own FS base address so common code can be +used without complex address offset calculations to access the per thread +instances. Applications should not use FS for other purposes when they use +runtimes or threading libraries which manage the per thread FS. + +The GS segment has no common use and can be used freely by +applications. GCC and Clang support GS based addressing via address space +identifiers. + +Reading and writing the FS/GS base address +------------------------------------------ + +There exist two mechanisms to read and write the FS/GS base address: + + - the arch_prctl() system call + + - the FSGSBASE instruction family + +Accessing FS/GS base with arch_prctl() +-------------------------------------- + + The arch_prctl(2) based mechanism is available on all 64-bit CPUs and all + kernel versions. + + Reading the base: + + arch_prctl(ARCH_GET_FS, &fsbase); + arch_prctl(ARCH_GET_GS, &gsbase); + + Writing the base: + + arch_prctl(ARCH_SET_FS, fsbase); + arch_prctl(ARCH_SET_GS, gsbase); + + The ARCH_SET_GS prctl may be disabled depending on kernel configuration + and security settings. + +Accessing FS/GS base with the FSGSBASE instructions +--------------------------------------------------- + + With the Ivy Bridge CPU generation Intel introduced a new set of + instructions to access the FS and GS base registers directly from user + space. These instructions are also supported on AMD Family 17H CPUs. 
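(A compilable sketch of the arch_prctl() mechanism from the previous section, given here before the instruction table below; illustration only. It assumes the raw syscall is used, since a glibc wrapper is not always available, with ARCH_GET_FS/ARCH_GET_GS/ARCH_SET_GS taken from <asm/prctl.h>)::

    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <asm/prctl.h>      /* ARCH_GET_FS, ARCH_GET_GS, ARCH_SET_GS */

    int main(void)
    {
            unsigned long fsbase = 0, gsbase = 0;

            /* Read the current FS and GS base addresses. */
            syscall(SYS_arch_prctl, ARCH_GET_FS, &fsbase);
            syscall(SYS_arch_prctl, ARCH_GET_GS, &gsbase);
            printf("FS base: %#lx, GS base: %#lx\n", fsbase, gsbase);

            /* Point the GS base at a local variable; note that ARCH_SET_GS
             * may be disabled by kernel configuration or security settings. */
            syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)&gsbase);
            return 0;
    }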
The + following instructions are available: + + =============== =========================== + RDFSBASE %reg Read the FS base register + RDGSBASE %reg Read the GS base register + WRFSBASE %reg Write the FS base register + WRGSBASE %reg Write the GS base register + =============== =========================== + + The instructions avoid the overhead of the arch_prctl() syscall and allow + more flexible usage of the FS/GS addressing modes in user space + applications. This does not prevent conflicts between threading libraries + and runtimes which utilize FS and applications which want to use it for + their own purpose. + +FSGSBASE instructions enablement +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + The instructions are enumerated in CPUID leaf 7, bit 0 of EBX. If + available /proc/cpuinfo shows 'fsgsbase' in the flag entry of the CPUs. + + The availability of the instructions does not enable them + automatically. The kernel has to enable them explicitly in CR4. The + reason for this is that older kernels make assumptions about the values in + the GS register and enforce them when GS base is set via + arch_prctl(). Allowing user space to write arbitrary values to GS base + would violate these assumptions and cause malfunction. + + On kernels which do not enable FSGSBASE the execution of the FSGSBASE + instructions will fault with a #UD exception. + + The kernel provides reliable information about the enabled state in the + ELF AUX vector. If the HWCAP2_FSGSBASE bit is set in the AUX vector, the + kernel has FSGSBASE instructions enabled and applications can use them. + The following code example shows how this detection works:: + + #include + #include + + /* Will be eventually in asm/hwcap.h */ + #ifndef HWCAP2_FSGSBASE + #define HWCAP2_FSGSBASE (1 << 1) + #endif + + .... + + unsigned val = getauxval(AT_HWCAP2); + + if (val & HWCAP2_FSGSBASE) + printf("FSGSBASE enabled\n"); + +FSGSBASE instructions compiler support +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +GCC version 4.6.4 and newer provide instrinsics for the FSGSBASE +instructions. Clang 5 supports them as well. + + =================== =========================== + _readfsbase_u64() Read the FS base register + _readfsbase_u64() Read the GS base register + _writefsbase_u64() Write the FS base register + _writegsbase_u64() Write the GS base register + =================== =========================== + +To utilize these instrinsics must be included in the source +code and the compiler option -mfsgsbase has to be added. + +Compiler support for FS/GS based addressing +------------------------------------------- + +GCC version 6 and newer provide support for FS/GS based addressing via +Named Address Spaces. GCC implements the following address space +identifiers for x86: + + ========= ==================================== + __seg_fs Variable is addressed relative to FS + __seg_gs Variable is addressed relative to GS + ========= ==================================== + +The preprocessor symbols __SEG_FS and __SEG_GS are defined when these +address spaces are supported. Code which implements fallback modes should +check whether these symbols are defined. Usage example:: + + #ifdef __SEG_GS + + long data0 = 0; + long data1 = 1; + + long __seg_gs *ptr; + + /* Check whether FSGSBASE is enabled by the kernel (HWCAP2_FSGSBASE) */ + .... 
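      /*
       * Illustration only (not part of the original example): the check can
       * reuse the getauxval() based detection shown earlier in this document.
       * This assumes the surrounding code runs in a function that may return
       * (e.g. main()) and that <sys/auxv.h> has been included.
       */
      if (!(getauxval(AT_HWCAP2) & HWCAP2_FSGSBASE))
              return 1;   /* FSGSBASE not enabled by the kernel, bail out */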
+ + /* Set GS base to point to data0 */ + _writegsbase_u64(&data0); + + /* Access offset 0 of GS */ + ptr = 0; + printf("data0 = %ld\n", *ptr); + + /* Set GS base to point to data1 */ + _writegsbase_u64(&data1); + /* ptr still addresses offset 0! */ + printf("data1 = %ld\n", *ptr); + + +Clang does not provide the GCC address space identifiers, but it provides +address spaces via an attribute based mechanism in Clang 2.6 and newer +versions: + + ==================================== ===================================== + __attribute__((address_space(256)) Variable is addressed relative to GS + __attribute__((address_space(257)) Variable is addressed relative to FS + ==================================== ===================================== + +FS/GS based addressing with inline assembly +------------------------------------------- + +In case the compiler does not support address spaces, inline assembly can +be used for FS/GS based addressing mode:: + + mov %fs:offset, %reg + mov %gs:offset, %reg + + mov %reg, %fs:offset + mov %reg, %gs:offset diff --git a/Documentation/x86/x86_64/index.rst b/Documentation/x86/x86_64/index.rst index d6eaaa5a35fc..a56070fc8e77 100644 --- a/Documentation/x86/x86_64/index.rst +++ b/Documentation/x86/x86_64/index.rst @@ -14,3 +14,4 @@ x86_64 Support fake-numa-for-cpusets cpu-hotplug-spec machinecheck + fsgs diff --git a/Makefile b/Makefile index b668725a2a62..7964846b1aeb 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 7 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -pf1 NAME = Kleptomaniac Octopus # *DOCUMENTATION* diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2d3f963fd6f1..897b3253147b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -185,6 +185,7 @@ config X86 select HAVE_KERNEL_LZMA select HAVE_KERNEL_LZO select HAVE_KERNEL_XZ + select HAVE_KERNEL_ZSTD select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE select HAVE_FUNCTION_ERROR_INJECTION diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index bc3a497c029c..997e50da4303 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -123,6 +123,7 @@ config MPENTIUMM config MPENTIUM4 bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon" depends on X86_32 + select X86_P6_NOP ---help--- Select this for Intel Pentium 4 chips. This includes the Pentium 4, Pentium D, P4-based Celeron and Xeon, and @@ -155,9 +156,8 @@ config MPENTIUM4 -Paxville -Dempsey - config MK6 - bool "K6/K6-II/K6-III" + bool "AMD K6/K6-II/K6-III" depends on X86_32 ---help--- Select this for an AMD K6-family processor. Enables use of @@ -165,7 +165,7 @@ config MK6 flags to GCC. config MK7 - bool "Athlon/Duron/K7" + bool "AMD Athlon/Duron/K7" depends on X86_32 ---help--- Select this for an AMD Athlon K7-family processor. Enables use of @@ -173,12 +173,90 @@ config MK7 flags to GCC. config MK8 - bool "Opteron/Athlon64/Hammer/K8" + bool "AMD Opteron/Athlon64/Hammer/K8" ---help--- Select this for an AMD Opteron or Athlon64 Hammer-family processor. Enables use of some extended instructions, and passes appropriate optimization flags to GCC. +config MK8SSE3 + bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3" + ---help--- + Select this for improved AMD Opteron or Athlon64 Hammer-family processors. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + +config MK10 + bool "AMD 61xx/7x50/PhenomX3/X4/II/K10" + ---help--- + Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50, + Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor. 
+ Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + +config MBARCELONA + bool "AMD Barcelona" + ---help--- + Select this for AMD Family 10h Barcelona processors. + + Enables -march=barcelona + +config MBOBCAT + bool "AMD Bobcat" + ---help--- + Select this for AMD Family 14h Bobcat processors. + + Enables -march=btver1 + +config MJAGUAR + bool "AMD Jaguar" + ---help--- + Select this for AMD Family 16h Jaguar processors. + + Enables -march=btver2 + +config MBULLDOZER + bool "AMD Bulldozer" + ---help--- + Select this for AMD Family 15h Bulldozer processors. + + Enables -march=bdver1 + +config MPILEDRIVER + bool "AMD Piledriver" + ---help--- + Select this for AMD Family 15h Piledriver processors. + + Enables -march=bdver2 + +config MSTEAMROLLER + bool "AMD Steamroller" + ---help--- + Select this for AMD Family 15h Steamroller processors. + + Enables -march=bdver3 + +config MEXCAVATOR + bool "AMD Excavator" + ---help--- + Select this for AMD Family 15h Excavator processors. + + Enables -march=bdver4 + +config MZEN + bool "AMD Zen" + ---help--- + Select this for AMD Family 17h Zen processors. + + Enables -march=znver1 + +config MZEN2 + bool "AMD Zen 2" + ---help--- + Select this for AMD Family 17h Zen 2 processors. + + Enables -march=znver2 + config MCRUSOE bool "Crusoe" depends on X86_32 @@ -260,6 +338,7 @@ config MVIAC7 config MPSC bool "Intel P4 / older Netburst based Xeon" + select X86_P6_NOP depends on X86_64 ---help--- Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey @@ -269,8 +348,19 @@ config MPSC using the cpu family field in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. +config MATOM + bool "Intel Atom" + select X86_P6_NOP + ---help--- + + Select this for the Intel Atom platform. Intel Atom CPUs have an + in-order pipelining architecture and thus can benefit from + accordingly optimized code. Use a recent GCC with specific Atom + support in order to fully benefit from selecting this option. + config MCORE2 - bool "Core 2/newer Xeon" + bool "Intel Core 2" + select X86_P6_NOP ---help--- Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and @@ -278,14 +368,133 @@ config MCORE2 family in /proc/cpuinfo. Newer ones have 6 and older ones 15 (not a typo) -config MATOM - bool "Intel Atom" + Enables -march=core2 + +config MNEHALEM + bool "Intel Nehalem" + select X86_P6_NOP ---help--- - Select this for the Intel Atom platform. Intel Atom CPUs have an - in-order pipelining architecture and thus can benefit from - accordingly optimized code. Use a recent GCC with specific Atom - support in order to fully benefit from selecting this option. + Select this for 1st Gen Core processors in the Nehalem family. + + Enables -march=nehalem + +config MWESTMERE + bool "Intel Westmere" + select X86_P6_NOP + ---help--- + + Select this for the Intel Westmere formerly Nehalem-C family. + + Enables -march=westmere + +config MSILVERMONT + bool "Intel Silvermont" + select X86_P6_NOP + ---help--- + + Select this for the Intel Silvermont platform. + + Enables -march=silvermont + +config MGOLDMONT + bool "Intel Goldmont" + select X86_P6_NOP + ---help--- + + Select this for the Intel Goldmont platform including Apollo Lake and Denverton. + + Enables -march=goldmont + +config MGOLDMONTPLUS + bool "Intel Goldmont Plus" + select X86_P6_NOP + ---help--- + + Select this for the Intel Goldmont Plus platform including Gemini Lake. 
+ + Enables -march=goldmont-plus + +config MSANDYBRIDGE + bool "Intel Sandy Bridge" + select X86_P6_NOP + ---help--- + + Select this for 2nd Gen Core processors in the Sandy Bridge family. + + Enables -march=sandybridge + +config MIVYBRIDGE + bool "Intel Ivy Bridge" + select X86_P6_NOP + ---help--- + + Select this for 3rd Gen Core processors in the Ivy Bridge family. + + Enables -march=ivybridge + +config MHASWELL + bool "Intel Haswell" + select X86_P6_NOP + ---help--- + + Select this for 4th Gen Core processors in the Haswell family. + + Enables -march=haswell + +config MBROADWELL + bool "Intel Broadwell" + select X86_P6_NOP + ---help--- + + Select this for 5th Gen Core processors in the Broadwell family. + + Enables -march=broadwell + +config MSKYLAKE + bool "Intel Skylake" + select X86_P6_NOP + ---help--- + + Select this for 6th Gen Core processors in the Skylake family. + + Enables -march=skylake + +config MSKYLAKEX + bool "Intel Skylake X" + select X86_P6_NOP + ---help--- + + Select this for 6th Gen Core processors in the Skylake X family. + + Enables -march=skylake-avx512 + +config MCANNONLAKE + bool "Intel Cannon Lake" + select X86_P6_NOP + ---help--- + + Select this for 8th Gen Core processors + + Enables -march=cannonlake + +config MICELAKE + bool "Intel Ice Lake" + select X86_P6_NOP + ---help--- + + Select this for 10th Gen Core processors in the Ice Lake family. + + Enables -march=icelake-client + +config MCASCADELAKE + bool "Intel Cascade Lake" + select X86_P6_NOP + ---help--- + + Select this for Xeon processors in the Cascade Lake family. + + Enables -march=cascadelake config GENERIC_CPU bool "Generic-x86-64" @@ -294,6 +503,19 @@ config GENERIC_CPU Generic x86-64 CPU. Run equally well on all x86-64 CPUs. +config MNATIVE + bool "Native optimizations autodetected by GCC" + ---help--- + + GCC 4.2 and above support -march=native, which automatically detects + the optimum settings to use based on your processor. -march=native + also detects and applies additional settings beyond -march specific + to your CPU, (eg. -msse4). Unless you have a specific reason not to + (e.g. distcc cross-compiling), you should probably be using + -march=native rather than anything listed below. 
+ + Enables -march=native + endchoice config X86_GENERIC @@ -318,7 +540,7 @@ config X86_INTERNODE_CACHE_SHIFT config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "6" if MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MJAGUAR || MPENTIUMM || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MNATIVE || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU default "4" if MELAN || M486SX || M486 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX @@ -336,35 +558,36 @@ config X86_ALIGNMENT_16 config X86_INTEL_USERCOPY def_bool y - depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 + depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK8SSE3 || MK7 || MEFFICEON || MCORE2 || MK10 || MBARCELONA || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MNATIVE config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MK10 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MATOM || MNATIVE config X86_USE_3DNOW def_bool y depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML -# -# P6_NOPs are a relatively minor optimization that require a family >= -# 6 processor, except that it is broken on certain VIA chips. -# Furthermore, AMD chips prefer a totally different sequence of NOPs -# (which work on all CPUs). In addition, it looks like Virtual PC -# does not understand them. -# -# As a result, disallow these if we're not compiling for X86_64 (these -# NOPs do work on all x86-64 capable chips); the list of processors in -# the right-hand clause are the cores that benefit from this optimization. -# config X86_P6_NOP - def_bool y - depends on X86_64 - depends on (MCORE2 || MPENTIUM4 || MPSC) + default n + bool "Support for P6_NOPs on Intel chips" + depends on (MCORE2 || MPENTIUM4 || MPSC || MATOM || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MNATIVE) + ---help--- + P6_NOPs are a relatively minor optimization that require a family >= + 6 processor, except that it is broken on certain VIA chips. + Furthermore, AMD chips prefer a totally different sequence of NOPs + (which work on all CPUs). In addition, it looks like Virtual PC + does not understand them. 
+ + As a result, disallow these if we're not compiling for X86_64 (these + NOPs do work on all x86-64 capable chips); the list of processors in + the right-hand clause are the cores that benefit from this optimization. + + Say Y if you have Intel CPU newer than Pentium Pro, N otherwise. config X86_TSC def_bool y - depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 + depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MNATIVE || MATOM) || X86_64 config X86_CMPXCHG64 def_bool y @@ -374,7 +597,7 @@ config X86_CMPXCHG64 # generates cmov. config X86_CMOV def_bool y - depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) + depends on (MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MJAGUAR || MK7 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MNATIVE || MATOM || MGEODE_LX) config X86_MINIMUM_CPU_FAMILY int diff --git a/arch/x86/Makefile b/arch/x86/Makefile index b65ec63c7db7..6e2b5c7e76a6 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -119,13 +119,53 @@ else KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) + cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native) cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) + cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-mtune=k8) + cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10) + cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona) + cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1) + cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2) + cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1) + cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2) + cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3) + cflags-$(CONFIG_MEXCAVATOR) += $(call cc-option,-march=bdver4) + cflags-$(CONFIG_MZEN) += $(call cc-option,-march=znver1) + cflags-$(CONFIG_MZEN2) += $(call cc-option,-march=znver2) cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) cflags-$(CONFIG_MCORE2) += \ - $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) - cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \ - $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) + $(call cc-option,-march=core2,$(call cc-option,-mtune=core2)) + cflags-$(CONFIG_MNEHALEM) += \ + $(call cc-option,-march=nehalem,$(call cc-option,-mtune=nehalem)) + cflags-$(CONFIG_MWESTMERE) += \ + $(call cc-option,-march=westmere,$(call cc-option,-mtune=westmere)) + cflags-$(CONFIG_MSILVERMONT) += \ + 
$(call cc-option,-march=silvermont,$(call cc-option,-mtune=silvermont)) + cflags-$(CONFIG_MGOLDMONT) += \ + $(call cc-option,-march=goldmont,$(call cc-option,-mtune=goldmont)) + cflags-$(CONFIG_MGOLDMONTPLUS) += \ + $(call cc-option,-march=goldmont-plus,$(call cc-option,-mtune=goldmont-plus)) + cflags-$(CONFIG_MSANDYBRIDGE) += \ + $(call cc-option,-march=sandybridge,$(call cc-option,-mtune=sandybridge)) + cflags-$(CONFIG_MIVYBRIDGE) += \ + $(call cc-option,-march=ivybridge,$(call cc-option,-mtune=ivybridge)) + cflags-$(CONFIG_MHASWELL) += \ + $(call cc-option,-march=haswell,$(call cc-option,-mtune=haswell)) + cflags-$(CONFIG_MBROADWELL) += \ + $(call cc-option,-march=broadwell,$(call cc-option,-mtune=broadwell)) + cflags-$(CONFIG_MSKYLAKE) += \ + $(call cc-option,-march=skylake,$(call cc-option,-mtune=skylake)) + cflags-$(CONFIG_MSKYLAKEX) += \ + $(call cc-option,-march=skylake-avx512,$(call cc-option,-mtune=skylake-avx512)) + cflags-$(CONFIG_MCANNONLAKE) += \ + $(call cc-option,-march=cannonlake,$(call cc-option,-mtune=cannonlake)) + cflags-$(CONFIG_MICELAKE) += \ + $(call cc-option,-march=icelake-client,$(call cc-option,-mtune=icelake-client)) + cflags-$(CONFIG_MCASCADELAKE) += \ + $(call cc-option,-march=cascadelake,$(call cc-option,-mtune=cascadelake)) + cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell) \ + $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) KBUILD_CFLAGS += $(cflags-y) diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index cd3056759880..2c81838df533 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -24,7 +24,19 @@ cflags-$(CONFIG_MK6) += -march=k6 # Please note, that patches that add -march=athlon-xp and friends are pointless. # They make zero difference whatsosever to performance at this time. 
cflags-$(CONFIG_MK7) += -march=athlon +cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native) cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) +cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-march=athlon) +cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10,-march=athlon) +cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona,-march=athlon) +cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1,-march=athlon) +cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2,-march=athlon) +cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1,-march=athlon) +cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2,-march=athlon) +cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3,-march=athlon) +cflags-$(CONFIG_MEXCAVATOR) += $(call cc-option,-march=bdver4,-march=athlon) +cflags-$(CONFIG_MZEN) += $(call cc-option,-march=znver1,-march=athlon) +cflags-$(CONFIG_MZEN2) += $(call cc-option,-march=znver2,-march=athlon) cflags-$(CONFIG_MCRUSOE) += -march=i686 -falign-functions=0 -falign-jumps=0 -falign-loops=0 cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) -falign-functions=0 -falign-jumps=0 -falign-loops=0 cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) @@ -33,8 +45,22 @@ cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) -falign-fu cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) cflags-$(CONFIG_MVIAC7) += -march=i686 cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) -cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \ - $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) +cflags-$(CONFIG_MNEHALEM) += -march=i686 $(call tune,nehalem) +cflags-$(CONFIG_MWESTMERE) += -march=i686 $(call tune,westmere) +cflags-$(CONFIG_MSILVERMONT) += -march=i686 $(call tune,silvermont) +cflags-$(CONFIG_MGOLDMONT) += -march=i686 $(call tune,goldmont) +cflags-$(CONFIG_MGOLDMONTPLUS) += -march=i686 $(call tune,goldmont-plus) +cflags-$(CONFIG_MSANDYBRIDGE) += -march=i686 $(call tune,sandybridge) +cflags-$(CONFIG_MIVYBRIDGE) += -march=i686 $(call tune,ivybridge) +cflags-$(CONFIG_MHASWELL) += -march=i686 $(call tune,haswell) +cflags-$(CONFIG_MBROADWELL) += -march=i686 $(call tune,broadwell) +cflags-$(CONFIG_MSKYLAKE) += -march=i686 $(call tune,skylake) +cflags-$(CONFIG_MSKYLAKEX) += -march=i686 $(call tune,skylake-avx512) +cflags-$(CONFIG_MCANNONLAKE) += -march=i686 $(call tune,cannonlake) +cflags-$(CONFIG_MICELAKE) += -march=i686 $(call tune,icelake-client) +cflags-$(CONFIG_MCASCADELAKE) += -march=i686 $(call tune,cascadelake) +cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell,$(call cc-option,-march=core2,-march=i686)) \ + $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) # AMD Elan support cflags-$(CONFIG_MELAN) += -march=i486 diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 5f7c262bcc99..7f0aaf9c064a 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -24,7 +24,7 @@ OBJECT_FILES_NON_STANDARD := y KCOV_INSTRUMENT := n targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ - vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 + vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 vmlinux.bin.zst KBUILD_CFLAGS := -m$(BITS) -O2 KBUILD_CFLAGS += -fno-strict-aliasing $(call cc-option, -fPIE, -fPIC) @@ -143,6 +143,8 @@ $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE $(call 
if_changed,lzo) $(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE $(call if_changed,lz4) +$(obj)/vmlinux.bin.zst: $(vmlinux.bin.all-y) FORCE + $(call if_changed,zstd) suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 @@ -150,6 +152,7 @@ suffix-$(CONFIG_KERNEL_LZMA) := lzma suffix-$(CONFIG_KERNEL_XZ) := xz suffix-$(CONFIG_KERNEL_LZO) := lzo suffix-$(CONFIG_KERNEL_LZ4) := lz4 +suffix-$(CONFIG_KERNEL_ZSTD) := zst quiet_cmd_mkpiggy = MKPIGGY $@ cmd_mkpiggy = $(obj)/mkpiggy $< > $@ diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 9652d5c2afda..39e592d0e0b4 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -77,6 +77,10 @@ static int lines, cols; #ifdef CONFIG_KERNEL_LZ4 #include "../../../../lib/decompress_unlz4.c" #endif + +#ifdef CONFIG_KERNEL_ZSTD +#include "../../../../lib/decompress_unzstd.c" +#endif /* * NOTE: When adding a new decompressor, please update the analysis in * ../header.S. diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 735ad7f21ab0..6dbd7e9f74c9 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -539,8 +539,14 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr # the size-dependent part now grows so fast. # # extra_bytes = (uncompressed_size >> 8) + 65536 +# +# ZSTD compressed data grows by at most 3 bytes per 128K, and only has a 22 +# byte fixed overhead but has a maximum block size of 128K, so it needs a +# larger margin. +# +# extra_bytes = (uncompressed_size >> 8) + 131072 -#define ZO_z_extra_bytes ((ZO_z_output_len >> 8) + 65536) +#define ZO_z_extra_bytes ((ZO_z_output_len >> 8) + 131072) #if ZO_z_output_len > ZO_z_input_len # define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - \ ZO_z_input_len) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 1c7f13bb6728..57335f948bf7 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -6,6 +6,7 @@ #include #include #include +#include /* @@ -341,6 +342,12 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm +.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req + rdgsbase \save_reg + GET_PERCPU_BASE \scratch_reg + wrgsbase \scratch_reg +.endm + #endif /* CONFIG_X86_64 */ .macro STACKLEAK_ERASE @@ -349,6 +356,39 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm +#ifdef CONFIG_SMP + +/* + * CPU/node NR is loaded from the limit (size) field of a special segment + * descriptor entry in GDT. + */ +.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req + movq $__CPUNODE_SEG, \reg + lsl \reg, \reg +.endm + +/* + * Fetch the per-CPU GSBASE value for this processor and put it in @reg. + * We normally use %gs for accessing per-CPU data, but we are setting up + * %gs here and obviously can not use %gs itself to access per-CPU data. + */ +.macro GET_PERCPU_BASE reg:req + ALTERNATIVE \ + "LOAD_CPU_AND_NODE_SEG_LIMIT \reg", \ + "RDPID \reg", \ + X86_FEATURE_RDPID + andq $VDSO_CPUNODE_MASK, \reg + movq __per_cpu_offset(, \reg, 8), \reg +.endm + +#else + +.macro GET_PERCPU_BASE reg:req + movq pcpu_unit_offsets(%rip), \reg +.endm + +#endif /* CONFIG_SMP */ + /* * This does 'call enter_from_user_mode' unless we can avoid it based on * kernel config or using the static jump infrastructure. 
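The GET_PERCPU_BASE macro above obtains the CPU (and node) number either from the limit field of the special CPUNODE GDT segment via LSL, or directly via RDPID when X86_FEATURE_RDPID is available. For reference, a user-space sketch of the RDPID variant (illustration only; it assumes a CPU and toolchain that support RDPID, and relies on the kernel programming IA32_TSC_AUX as (node << 12) | cpu, the layout behind VDSO_CPUNODE_MASK)::

    #include <stdio.h>

    /* Read the current CPU number with RDPID. Sketch only. */
    static inline unsigned int rdpid_cpu(void)
    {
            unsigned long id;

            asm volatile("rdpid %0" : "=r" (id));
            return (unsigned int)(id & 0xfff);  /* low 12 bits: CPU number */
    }

    int main(void)
    {
            printf("running on CPU %u\n", rdpid_cpu());
            return 0;
    }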
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3063aa9090f9..53246c470607 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -38,6 +38,7 @@ #include #include #include +#include #include #include "calling.h" @@ -921,7 +922,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt .endif .if \paranoid - /* this procedure expect "no swapgs" flag in ebx */ jmp paranoid_exit .else jmp error_exit @@ -1211,24 +1211,21 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1 #endif /* - * Save all registers in pt_regs, and switch gs if needed. - * Use slow, but surefire "are we in kernel?" check. - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + * Save all registers in pt_regs. Return GSBASE related information + * in EBX depending on the availability of the FSGSBASE instructions: + * + * FSGSBASE R/EBX + * N 0 -> SWAPGS on exit + * 1 -> no SWAPGS on exit + * + * Y GSBASE value at entry, must be restored in paranoid_exit */ SYM_CODE_START_LOCAL(paranoid_entry) UNWIND_HINT_FUNC cld PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 - movl $1, %ebx - movl $MSR_GS_BASE, %ecx - rdmsr - testl %edx, %edx - js 1f /* negative -> in kernel */ - SWAPGS - xorl %ebx, %ebx -1: /* * Always stash CR3 in %r14. This value will be restored, * verbatim, at exit. Needed if paranoid_entry interrupted @@ -1238,9 +1235,51 @@ SYM_CODE_START_LOCAL(paranoid_entry) * This is also why CS (stashed in the "iret frame" by the * hardware at entry) can not be used: this may be a return * to kernel code, but with a user CR3 value. + * + * Switching CR3 does not depend on kernel GSBASE so it can + * be done before switching to the kernel GSBASE. This is + * required for FSGSBASE because the kernel GSBASE has to + * be retrieved from a kernel internal table. */ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 + /* + * Handling GSBASE depends on the availability of FSGSBASE. + * + * Without FSGSBASE the kernel enforces that negative GSBASE + * values indicate kernel GSBASE. With FSGSBASE no assumptions + * can be made about the GSBASE value when entering from user + * space. + */ + ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE + + /* + * Read the current GSBASE and store it in %rbx unconditionally, + * retrieve and set the current CPUs kernel GSBASE. The stored value + * has to be restored in paranoid_exit unconditionally. + * + * The MSR write ensures that no subsequent load is based on a + * mispredicted GSBASE. No extra FENCE required. + */ + SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx + ret + +.Lparanoid_entry_checkgs: + /* EBX = 1 -> kernel GSBASE active, no restore required */ + movl $1, %ebx + /* + * The kernel-enforced convention is a negative GSBASE indicates + * a kernel value. No SWAPGS needed on entry and exit. + */ + movl $MSR_GS_BASE, %ecx + rdmsr + testl %edx, %edx + jns .Lparanoid_entry_swapgs + ret + +.Lparanoid_entry_swapgs: + SWAPGS + /* * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an * unconditional CR3 write, even in the PTI case. So do an lfence @@ -1248,6 +1287,8 @@ SYM_CODE_START_LOCAL(paranoid_entry) */ FENCE_SWAPGS_KERNEL_ENTRY + /* EBX = 0 -> SWAPGS required on exit */ + xorl %ebx, %ebx ret SYM_CODE_END(paranoid_entry) @@ -1258,27 +1299,48 @@ SYM_CODE_END(paranoid_entry) * * We may be returning to very strange contexts (e.g. very early * in syscall entry), so checking for preemption here would - * be complicated. 
Fortunately, we there's no good reason - * to try to handle preemption here. + * be complicated. Fortunately, there's no good reason to try + * to handle preemption here. * - * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) + * R/EBX contains the GSBASE related information depending on the + * availability of the FSGSBASE instructions: + * + * FSGSBASE R/EBX + * N 0 -> SWAPGS on exit + * 1 -> no SWAPGS on exit + * + * Y User space GSBASE, must be restored unconditionally */ SYM_CODE_START_LOCAL(paranoid_exit) UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF_DEBUG - testl %ebx, %ebx /* swapgs needed? */ - jnz .Lparanoid_exit_no_swapgs - TRACE_IRQS_IRETQ - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 - SWAPGS_UNSAFE_STACK - jmp restore_regs_and_return_to_kernel -.Lparanoid_exit_no_swapgs: + /* + * The order of operations is important. IRQ tracing requires + * kernel GSBASE and CR3. RESTORE_CR3 requires kernel GSBASE. + * + * NB to anyone to try to optimize this code: this code does + * not execute at all for exceptions from user mode. Those + * exceptions go through error_exit instead. + */ TRACE_IRQS_IRETQ_DEBUG - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 - jmp restore_regs_and_return_to_kernel + + RESTORE_CR3 scratch_reg=%rax save_reg=%r14 + + /* Handle the three GSBASE cases */ + ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE + + /* With FSGSBASE enabled, unconditionally restore GSBASE */ + wrgsbase %rbx + jmp restore_regs_and_return_to_kernel + +.Lparanoid_exit_checkgs: + /* On non-FSGSBASE systems, conditionally do SWAPGS */ + testl %ebx, %ebx + jnz restore_regs_and_return_to_kernel + + /* We are returning to a context with user GSBASE */ + SWAPGS_UNSAFE_STACK + jmp restore_regs_and_return_to_kernel SYM_CODE_END(paranoid_exit) /* @@ -1686,10 +1748,27 @@ end_repeat_nmi: /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 - testl %ebx, %ebx /* swapgs needed? */ + /* + * The above invocation of paranoid_entry stored the GSBASE + * related information in R/EBX depending on the availability + * of FSGSBASE. + * + * If FSGSBASE is enabled, restore the saved GSBASE value + * unconditionally, otherwise take the conditional SWAPGS path. 
+ */ + ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE + + wrgsbase %rbx + jmp nmi_restore + +nmi_no_fsgsbase: + /* EBX == 0 -> invoke SWAPGS */ + testl %ebx, %ebx jnz nmi_restore + nmi_swapgs: SWAPGS_UNSAFE_STACK + nmi_restore: POP_REGS diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 680c320363db..d6dd43d25d9f 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -24,9 +24,11 @@ # error "Invalid value for CONFIG_PHYSICAL_ALIGN" #endif -#ifdef CONFIG_KERNEL_BZIP2 +#if defined(CONFIG_KERNEL_BZIP2) # define BOOT_HEAP_SIZE 0x400000 -#else /* !CONFIG_KERNEL_BZIP2 */ +#elif defined(CONFIG_KERNEL_ZSTD) +# define BOOT_HEAP_SIZE 0x30000 +#else # define BOOT_HEAP_SIZE 0x10000 #endif diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index bca4c743de77..aefd53767a5d 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -19,36 +19,63 @@ extern unsigned long x86_gsbase_read_task(struct task_struct *task); extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase); extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase); -/* Helper functions for reading/writing FS/GS base */ +/* Must be protected by X86_FEATURE_FSGSBASE check. */ -static inline unsigned long x86_fsbase_read_cpu(void) +static __always_inline unsigned long rdfsbase(void) { unsigned long fsbase; - rdmsrl(MSR_FS_BASE, fsbase); + asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory"); return fsbase; } -static inline unsigned long x86_gsbase_read_cpu_inactive(void) +static __always_inline unsigned long rdgsbase(void) { unsigned long gsbase; - rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory"); return gsbase; } -static inline void x86_fsbase_write_cpu(unsigned long fsbase) +static __always_inline void wrfsbase(unsigned long fsbase) { - wrmsrl(MSR_FS_BASE, fsbase); + asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory"); } -static inline void x86_gsbase_write_cpu_inactive(unsigned long gsbase) +static __always_inline void wrgsbase(unsigned long gsbase) { - wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory"); } +#include + +/* Helper functions for reading/writing FS/GS base */ + +static inline unsigned long x86_fsbase_read_cpu(void) +{ + unsigned long fsbase; + + if (static_cpu_has(X86_FEATURE_FSGSBASE)) + fsbase = rdfsbase(); + else + rdmsrl(MSR_FS_BASE, fsbase); + + return fsbase; +} + +static inline void x86_fsbase_write_cpu(unsigned long fsbase) +{ + if (static_cpu_has(X86_FEATURE_FSGSBASE)) + wrfsbase(fsbase); + else + wrmsrl(MSR_FS_BASE, fsbase); +} + +extern unsigned long x86_gsbase_read_cpu_inactive(void); +extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase); + #endif /* CONFIG_X86_64 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h index f5a796da07f8..d063841a17e3 100644 --- a/arch/x86/include/asm/inst.h +++ b/arch/x86/include/asm/inst.h @@ -306,6 +306,21 @@ .endif MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2 .endm + +.macro RDPID opd + REG_TYPE rdpid_opd_type \opd + .if rdpid_opd_type == REG_TYPE_R64 + R64_NUM rdpid_opd \opd + .else + R32_NUM rdpid_opd \opd + .endif + .byte 0xf3 + .if rdpid_opd > 7 + PFX_REX rdpid_opd 0 + .endif + .byte 0x0f, 0xc7 + MODRM 0xc0 rdpid_opd 0x7 +.endm #endif #endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3bcf27caf6c9..809bc013db70 100644 
--- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -456,10 +456,8 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu) DECLARE_PER_CPU(unsigned int, irq_count); extern asmlinkage void ignore_sysret(void); -#if IS_ENABLED(CONFIG_KVM) /* Save actual FS/GS selectors and bases to current->thread */ -void save_fsgs_for_kvm(void); -#endif +void current_save_fsgs(void); #else /* X86_64 */ #ifdef CONFIG_STACKPROTECTOR /* diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h index 75884d2cdec3..0cf864d2d110 100644 --- a/arch/x86/include/asm/vermagic.h +++ b/arch/x86/include/asm/vermagic.h @@ -17,6 +17,36 @@ #define MODULE_PROC_FAMILY "586MMX " #elif defined CONFIG_MCORE2 #define MODULE_PROC_FAMILY "CORE2 " +#elif defined CONFIG_MNATIVE +#define MODULE_PROC_FAMILY "NATIVE " +#elif defined CONFIG_MNEHALEM +#define MODULE_PROC_FAMILY "NEHALEM " +#elif defined CONFIG_MWESTMERE +#define MODULE_PROC_FAMILY "WESTMERE " +#elif defined CONFIG_MSILVERMONT +#define MODULE_PROC_FAMILY "SILVERMONT " +#elif defined CONFIG_MGOLDMONT +#define MODULE_PROC_FAMILY "GOLDMONT " +#elif defined CONFIG_MGOLDMONTPLUS +#define MODULE_PROC_FAMILY "GOLDMONTPLUS " +#elif defined CONFIG_MSANDYBRIDGE +#define MODULE_PROC_FAMILY "SANDYBRIDGE " +#elif defined CONFIG_MIVYBRIDGE +#define MODULE_PROC_FAMILY "IVYBRIDGE " +#elif defined CONFIG_MHASWELL +#define MODULE_PROC_FAMILY "HASWELL " +#elif defined CONFIG_MBROADWELL +#define MODULE_PROC_FAMILY "BROADWELL " +#elif defined CONFIG_MSKYLAKE +#define MODULE_PROC_FAMILY "SKYLAKE " +#elif defined CONFIG_MSKYLAKEX +#define MODULE_PROC_FAMILY "SKYLAKEX " +#elif defined CONFIG_MCANNONLAKE +#define MODULE_PROC_FAMILY "CANNONLAKE " +#elif defined CONFIG_MICELAKE +#define MODULE_PROC_FAMILY "ICELAKE " +#elif defined CONFIG_MCASCADELAKE +#define MODULE_PROC_FAMILY "CASCADELAKE " #elif defined CONFIG_MATOM #define MODULE_PROC_FAMILY "ATOM " #elif defined CONFIG_M686 @@ -35,6 +65,28 @@ #define MODULE_PROC_FAMILY "K7 " #elif defined CONFIG_MK8 #define MODULE_PROC_FAMILY "K8 " +#elif defined CONFIG_MK8SSE3 +#define MODULE_PROC_FAMILY "K8SSE3 " +#elif defined CONFIG_MK10 +#define MODULE_PROC_FAMILY "K10 " +#elif defined CONFIG_MBARCELONA +#define MODULE_PROC_FAMILY "BARCELONA " +#elif defined CONFIG_MBOBCAT +#define MODULE_PROC_FAMILY "BOBCAT " +#elif defined CONFIG_MBULLDOZER +#define MODULE_PROC_FAMILY "BULLDOZER " +#elif defined CONFIG_MPILEDRIVER +#define MODULE_PROC_FAMILY "PILEDRIVER " +#elif defined CONFIG_MSTEAMROLLER +#define MODULE_PROC_FAMILY "STEAMROLLER " +#elif defined CONFIG_MJAGUAR +#define MODULE_PROC_FAMILY "JAGUAR " +#elif defined CONFIG_MEXCAVATOR +#define MODULE_PROC_FAMILY "EXCAVATOR " +#elif defined CONFIG_MZEN +#define MODULE_PROC_FAMILY "ZEN " +#elif defined CONFIG_MZEN2 +#define MODULE_PROC_FAMILY "ZEN2 " #elif defined CONFIG_MELAN #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h index 8b2effe6efb8..5fdfcb47000f 100644 --- a/arch/x86/include/uapi/asm/hwcap2.h +++ b/arch/x86/include/uapi/asm/hwcap2.h @@ -5,4 +5,7 @@ /* MONITOR/MWAIT enabled in Ring 3 */ #define HWCAP2_RING3MWAIT (1 << 0) +/* Kernel allows FSGSBASE instructions available in Ring 3 */ +#define HWCAP2_FSGSBASE BIT(1) + #endif diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ed54b3b21c39..487603ea51cd 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -450,14 +450,12 @@ static void __init 
spectre_v1_select_mitigation(void) * If FSGSBASE is enabled, the user can put a kernel address in * GS, in which case SMAP provides no protection. * - * [ NOTE: Don't check for X86_FEATURE_FSGSBASE until the - * FSGSBASE enablement patches have been merged. ] - * * If FSGSBASE is disabled, the user can only put a user space * address in GS. That makes an attack harder, but still * possible if there's no SMAP protection. */ - if (!smap_works_speculatively()) { + if (boot_cpu_has(X86_FEATURE_FSGSBASE) || + !smap_works_speculatively()) { /* * Mitigation can be provided from SWAPGS itself or * PTI as the CR3 write in the Meltdown mitigation diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index bed0cb83fe24..b5a086ea3425 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -418,6 +418,22 @@ static void __init setup_cr_pinning(void) static_key_enable(&cr_pinning.key); } +static __init int x86_nofsgsbase_setup(char *arg) +{ + /* Require an exact match without trailing characters. */ + if (strlen(arg)) + return 0; + + /* Do not emit a message if the feature is not present. */ + if (!boot_cpu_has(X86_FEATURE_FSGSBASE)) + return 1; + + setup_clear_cpu_cap(X86_FEATURE_FSGSBASE); + pr_info("FSGSBASE disabled via kernel command line\n"); + return 1; +} +__setup("nofsgsbase", x86_nofsgsbase_setup); + /* * Protection Keys are not available in 32-bit mode. */ @@ -1478,6 +1494,12 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_smap(c); setup_umip(c); + /* Enable FSGSBASE instructions if available. */ + if (cpu_has(c, X86_FEATURE_FSGSBASE)) { + cr4_set_bits(X86_CR4_FSGSBASE); + elf_hwcap2 |= HWCAP2_FSGSBASE; + } + /* * The vendor-specific functions might have changed features. * Now we do "generic changes." diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 35638f1c5791..bd8d496acc58 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -140,10 +140,12 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); #ifdef CONFIG_X86_64 - savesegment(gs, p->thread.gsindex); - p->thread.gsbase = p->thread.gsindex ? 0 : current->thread.gsbase; - savesegment(fs, p->thread.fsindex); - p->thread.fsbase = p->thread.fsindex ? 0 : current->thread.fsbase; + current_save_fsgs(); + p->thread.fsindex = current->thread.fsindex; + p->thread.fsbase = current->thread.fsbase; + p->thread.gsindex = current->thread.gsindex; + p->thread.gsbase = current->thread.gsbase; + savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); #else diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 5ef9d8f25b0e..0bcb48a1264a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -150,6 +150,40 @@ enum which_selector { GS }; +/* + * Out of line to be protected from kprobes. It is not used on Xen + * paravirt. When paravirt support is needed, it needs to be renamed + * with native_ prefix. + */ +static noinline unsigned long __rdgsbase_inactive(void) +{ + unsigned long gsbase; + + lockdep_assert_irqs_disabled(); + + native_swapgs(); + gsbase = rdgsbase(); + native_swapgs(); + + return gsbase; +} +NOKPROBE_SYMBOL(__rdgsbase_inactive); + +/* + * Out of line to be protected from kprobes. It is not used on Xen + * paravirt. When paravirt support is needed, it needs to be renamed + * with native_ prefix. 
+ */ +static noinline void __wrgsbase_inactive(unsigned long gsbase) +{ + lockdep_assert_irqs_disabled(); + + native_swapgs(); + wrgsbase(gsbase); + native_swapgs(); +} +NOKPROBE_SYMBOL(__wrgsbase_inactive); + /* * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are * not available. The goal is to be reasonably fast on non-FSGSBASE systems. @@ -199,22 +233,35 @@ static __always_inline void save_fsgs(struct task_struct *task) { savesegment(fs, task->thread.fsindex); savesegment(gs, task->thread.gsindex); - save_base_legacy(task, task->thread.fsindex, FS); - save_base_legacy(task, task->thread.gsindex, GS); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* + * If FSGSBASE is enabled, we can't make any useful guesses + * about the base, and user code expects us to save the current + * value. Fortunately, reading the base directly is efficient. + */ + task->thread.fsbase = rdfsbase(); + task->thread.gsbase = __rdgsbase_inactive(); + } else { + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); + } } -#if IS_ENABLED(CONFIG_KVM) /* * While a process is running,current->thread.fsbase and current->thread.gsbase - * may not match the corresponding CPU registers (see save_base_legacy()). KVM - * wants an efficient way to save and restore FSBASE and GSBASE. - * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE. + * may not match the corresponding CPU registers (see save_base_legacy()). */ -void save_fsgs_for_kvm(void) +void current_save_fsgs(void) { + unsigned long flags; + + /* Interrupts need to be off for FSGSBASE */ + local_irq_save(flags); save_fsgs(current); + local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(save_fsgs_for_kvm); +#if IS_ENABLED(CONFIG_KVM) +EXPORT_SYMBOL_GPL(current_save_fsgs); #endif static __always_inline void loadseg(enum which_selector which, @@ -279,10 +326,22 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, struct thread_struct *next) { - load_seg_legacy(prev->fsindex, prev->fsbase, - next->fsindex, next->fsbase, FS); - load_seg_legacy(prev->gsindex, prev->gsbase, - next->gsindex, next->gsbase, GS); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* Update the FS and GS selectors if they could have changed. */ + if (unlikely(prev->fsindex || next->fsindex)) + loadseg(FS, next->fsindex); + if (unlikely(prev->gsindex || next->gsindex)) + loadseg(GS, next->gsindex); + + /* Update the bases. 
*/ + wrfsbase(next->fsbase); + __wrgsbase_inactive(next->gsbase); + } else { + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); + } } static unsigned long x86_fsgsbase_read_task(struct task_struct *task, @@ -328,13 +387,44 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task, return base; } +unsigned long x86_gsbase_read_cpu_inactive(void) +{ + unsigned long gsbase; + + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + unsigned long flags; + + local_irq_save(flags); + gsbase = __rdgsbase_inactive(); + local_irq_restore(flags); + } else { + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + } + + return gsbase; +} + +void x86_gsbase_write_cpu_inactive(unsigned long gsbase) +{ + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + unsigned long flags; + + local_irq_save(flags); + __wrgsbase_inactive(gsbase); + local_irq_restore(flags); + } else { + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + } +} + unsigned long x86_fsbase_read_task(struct task_struct *task) { unsigned long fsbase; if (task == current) fsbase = x86_fsbase_read_cpu(); - else if (task->thread.fsindex == 0) + else if (static_cpu_has(X86_FEATURE_FSGSBASE) || + (task->thread.fsindex == 0)) fsbase = task->thread.fsbase; else fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); @@ -348,7 +438,8 @@ unsigned long x86_gsbase_read_task(struct task_struct *task) if (task == current) gsbase = x86_gsbase_read_cpu_inactive(); - else if (task->thread.gsindex == 0) + else if (static_cpu_has(X86_FEATURE_FSGSBASE) || + (task->thread.gsindex == 0)) gsbase = task->thread.gsbase; else gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index f0e1ddbc2fd7..cc56efb75d27 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -380,25 +380,12 @@ static int putreg(struct task_struct *child, case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_MAX) return -EIO; - /* - * When changing the FS base, use do_arch_prctl_64() - * to set the index to zero and to set the base - * as requested. - * - * NB: This behavior is nonsensical and likely needs to - * change when FSGSBASE support is added. - */ - if (child->thread.fsbase != value) - return do_arch_prctl_64(child, ARCH_SET_FS, value); + x86_fsbase_write_task(child, value); return 0; case offsetof(struct user_regs_struct,gs_base): - /* - * Exactly the same here as the %fs handling above. 
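/*
 * Illustration (user space, not part of this kernel hunk): with the fs_base
 * change above, a debugger can set a stopped tracee's FS base through the
 * ptrace register interface, which now lands in x86_fsbase_write_task().
 */
#include <stddef.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>

static long set_tracee_fsbase(pid_t pid, unsigned long fsbase)
{
        return ptrace(PTRACE_POKEUSER, pid,
                      offsetof(struct user_regs_struct, fs_base), fsbase);
}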
- */ if (value >= TASK_SIZE_MAX) return -EIO; - if (child->thread.gsbase != value) - return do_arch_prctl_64(child, ARCH_SET_GS, value); + x86_gsbase_write_task(child, value); return 0; #endif } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 89c766fad889..309e6dc975d5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1167,7 +1167,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) gs_base = cpu_kernelmode_gs_base(cpu); if (likely(is_64bit_mm(current->mm))) { - save_fsgs_for_kvm(); + current_save_fsgs(); fs_sel = current->thread.fsindex; gs_sel = current->thread.gsindex; fs_base = current->thread.fsbase; diff --git a/drivers/hwmon/applesmc.c b/drivers/hwmon/applesmc.c index ec93b8d673f5..0433c77b2571 100644 --- a/drivers/hwmon/applesmc.c +++ b/drivers/hwmon/applesmc.c @@ -46,6 +46,7 @@ #define APPLESMC_MIN_WAIT 0x0010 #define APPLESMC_RETRY_WAIT 0x0100 #define APPLESMC_MAX_WAIT 0x20000 +#define APPLESMC_UDELAY_MAX 20000 #define APPLESMC_READ_CMD 0x10 #define APPLESMC_WRITE_CMD 0x11 @@ -157,14 +158,23 @@ static struct workqueue_struct *applesmc_led_wq; static int wait_read(void) { u8 status; - int us; - for (us = APPLESMC_MIN_WAIT; us < APPLESMC_MAX_WAIT; us <<= 1) { + unsigned int us; + + for (us = APPLESMC_MIN_WAIT; us < APPLESMC_UDELAY_MAX; us <<= 1) { udelay(us); status = inb(APPLESMC_CMD_PORT); /* read: wait for smc to settle */ if (status & 0x01) return 0; } + /* switch to mdelay for longer sleeps */ + for (; us < APPLESMC_MAX_WAIT; us <<= 1) { + mdelay(us); + status = inb(APPLESMC_CMD_PORT); + /* read: wait for smc to settle */ + if (status & 0x01) + return 0; + } pr_warn("wait_read() fail: 0x%02x\n", status); return -EIO; @@ -177,10 +187,10 @@ static int wait_read(void) static int send_byte(u8 cmd, u16 port) { u8 status; - int us; + unsigned int us; outb(cmd, port); - for (us = APPLESMC_MIN_WAIT; us < APPLESMC_MAX_WAIT; us <<= 1) { + for (us = APPLESMC_MIN_WAIT; us < APPLESMC_UDELAY_MAX; us <<= 1) { udelay(us); status = inb(APPLESMC_CMD_PORT); /* write: wait for smc to settle */ @@ -190,6 +200,23 @@ static int send_byte(u8 cmd, u16 port) if (status & 0x04) return 0; /* timeout: give up */ + if (us << 1 == APPLESMC_UDELAY_MAX) + break; + /* busy: long wait and resend */ + udelay(APPLESMC_RETRY_WAIT); + outb(cmd, port); + } + /* switch to mdelay for longer sleeps */ + for (; us < APPLESMC_MAX_WAIT; us <<= 1) { + mdelay(us); + status = inb(APPLESMC_CMD_PORT); + /* write: wait for smc to settle */ + if (status & 0x02) + continue; + /* ready: cmd accepted, return */ + if (status & 0x04) + return 0; + /* timeout: give up */ if (us << 1 == APPLESMC_MAX_WAIT) break; /* busy: long wait and resend */ diff --git a/drivers/platform/x86/dell-wmi.c b/drivers/platform/x86/dell-wmi.c index 86e8dd6a8b33..c23a3e1c880d 100644 --- a/drivers/platform/x86/dell-wmi.c +++ b/drivers/platform/x86/dell-wmi.c @@ -173,6 +173,11 @@ static const struct key_entry dell_wmi_keymap_type_0000[] = { /* Dell Support Center key */ { KE_IGNORE, 0xe06e, { KEY_RESERVED } }, + /* Dell Vostro 3360 multimedia keys with mangled DSDT */ + { KE_KEY, 0xe0f1, { KEY_PROG1 } }, + { KE_KEY, 0xe0f2, { KEY_PROG2 } }, + { KE_KEY, 0xe0f3, { KEY_PROG3 } }, + { KE_IGNORE, 0xe0f7, { KEY_MUTE } }, { KE_IGNORE, 0xe0f8, { KEY_VOLUMEDOWN } }, { KE_IGNORE, 0xe0f9, { KEY_VOLUMEUP } }, diff --git a/fs/proc/base.c b/fs/proc/base.c index eb2255e95f62..fd6e12f23a86 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -96,6 +96,8 @@ #include #include #include +#include +#include #include #include 
"internal.h" #include "fd.h" @@ -3115,6 +3117,149 @@ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, } #endif /* CONFIG_STACKLEAK_METRICS */ +#ifdef CONFIG_KSM +static int ksm_open(struct inode *inode, struct file *file) +{ + struct task_struct *task; + struct mm_struct *mm; + int err; + + task = get_proc_task(inode); + if (!task) { + err = -ESRCH; + goto out; + } + if (task->flags & PF_KTHREAD) { + put_task_struct(task); + err = -EINVAL; + goto out; + } + + mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); + put_task_struct(task); + if (!mm) { + err = -EINVAL; + goto out; + } + if (IS_ERR(mm)) { + err = PTR_ERR(mm); + goto out; + } + + /* ensure this mm_struct can't be freed */ + mmgrab(mm); + /* but do not pin its memory */ + mmput(mm); + + err = 0; + file->private_data = mm; + +out: + return err; +} + +static ssize_t ksm_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + char kbuf[PROC_NUMBUF]; + char *pos; + int behaviour; + struct mm_struct *mm = file->private_data; + int err; + int last_err; + struct vm_area_struct *vma; + + if (!mm) { + err = -EINVAL; + goto out; + } + + /* Only allow a very narrow range of strings to be written */ + if ((*ppos != 0) || (count >= sizeof(kbuf))) { + err = -EINVAL; + goto out; + } + + /* What was written? */ + if (copy_from_user(kbuf, buf, count)) { + err = -EFAULT; + goto out; + } + kbuf[count] = '\0'; + pos = kbuf; + + /* What is being requested? */ + if (strncmp(pos, "merge", 5) == 0) { + pos += 5; + behaviour = MADV_MERGEABLE; + } + else if (strncmp(pos, "unmerge", 7) == 0) { + pos += 7; + behaviour = MADV_UNMERGEABLE; + } + else { + err = -EINVAL; + goto out; + } + + /* Verify there is not trailing junk on the line */ + pos = skip_spaces(pos); + if (*pos != '\0') { + err = -EINVAL; + goto out; + } + + if (!mmget_not_zero(mm)) { + err = -EINVAL; + goto out; + } + + down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) { + err = -EINVAL; + goto skip_mm; + } + + err = 0; + + vma = mm->mmap; + while (vma) { + if (behaviour == MADV_MERGEABLE) + last_err = ksm_madvise_merge(vma->vm_mm, vma, &vma->vm_flags); + else + last_err = ksm_madvise_unmerge(vma, vma->vm_start, vma->vm_end, &vma->vm_flags); + if (last_err) + err = last_err; + vma = vma->vm_next; + } + +skip_mm: + up_write(&mm->mmap_sem); + + mmput(mm); + +out: + return err ? 
err : count; +} + +static int ksm_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + + if (mm) + mmdrop(mm); + + return 0; +} + +static const struct file_operations proc_ksm_operations = { + .open = ksm_open, + .write = ksm_write, + .llseek = noop_llseek, + .release = ksm_release, +}; +#endif /* CONFIG_KSM */ + /* * Thread groups */ @@ -3228,6 +3373,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif +#ifdef CONFIG_KSM + REG("ksm", S_IRUGO|S_IWUSR, proc_ksm_operations), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 4b898cdbdf05..ccd28978b296 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -85,11 +85,13 @@ static inline unsigned long compact_gap(unsigned int order) #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; +extern int sysctl_compaction_proactiveness; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; extern int sysctl_compact_unevictable_allowed; +extern int extfrag_for_order(struct zone *zone, unsigned int order); extern int fragmentation_index(struct zone *zone, unsigned int order); extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, diff --git a/include/linux/decompress/unzstd.h b/include/linux/decompress/unzstd.h new file mode 100644 index 000000000000..56d539ae880f --- /dev/null +++ b/include/linux/decompress/unzstd.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_DECOMPRESS_UNZSTD_H +#define LINUX_DECOMPRESS_UNZSTD_H + +int unzstd(unsigned char *inbuf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), + unsigned char *output, + long *pos, + void (*error_fn)(char *x)); +#endif diff --git a/include/linux/ksm.h b/include/linux/ksm.h index e48b1e453ff5..a91a7cfc87a1 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -19,6 +19,10 @@ struct stable_node; struct mem_cgroup; #ifdef CONFIG_KSM +int ksm_madvise_merge(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long *vm_flags); +int ksm_madvise_unmerge(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long *vm_flags); int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); int __ksm_enter(struct mm_struct *mm); diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h index a89eb0accd5e..a3e760886b8e 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h @@ -21,6 +21,7 @@ #define FUTEX_WAKE_BITSET 10 #define FUTEX_WAIT_REQUEUE_PI 11 #define FUTEX_CMP_REQUEUE_PI 12 +#define FUTEX_WAIT_MULTIPLE 31 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 @@ -40,6 +41,8 @@ FUTEX_PRIVATE_FLAG) #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ FUTEX_PRIVATE_FLAG) +#define FUTEX_WAIT_MULTIPLE_PRIVATE (FUTEX_WAIT_MULTIPLE | \ + FUTEX_PRIVATE_FLAG) /* * Support for robust futexes: the kernel cleans up held futexes at @@ -150,4 +153,21 @@ struct robust_list_head { (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \ | ((oparg & 0xfff) << 12) | (cmparg & 0xfff)) +/* + * Maximum number of multiple futexes to wait for + */ +#define FUTEX_MULTIPLE_MAX_COUNT 128 + +/** + * 
struct futex_wait_block - Block of futexes to be waited for + * @uaddr: User address of the futex + * @val: Futex value expected by userspace + * @bitset: Bitset for the optional bitmasked wakeup + */ +struct futex_wait_block { + __u32 __user *uaddr; + __u32 val; + __u32 bitset; +}; + #endif /* _UAPI_LINUX_FUTEX_H */ diff --git a/init/Kconfig b/init/Kconfig index 74a5ac65644f..923772555b21 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -160,13 +160,16 @@ config HAVE_KERNEL_LZO config HAVE_KERNEL_LZ4 bool +config HAVE_KERNEL_ZSTD + bool + config HAVE_KERNEL_UNCOMPRESSED bool choice prompt "Kernel compression mode" default KERNEL_GZIP - depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO || HAVE_KERNEL_LZ4 || HAVE_KERNEL_UNCOMPRESSED + depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO || HAVE_KERNEL_LZ4 || HAVE_KERNEL_ZSTD || HAVE_KERNEL_UNCOMPRESSED help The linux kernel is a kind of self-extracting executable. Several compression algorithms are available, which differ @@ -245,6 +248,16 @@ config KERNEL_LZ4 is about 8% bigger than LZO. But the decompression speed is faster than LZO. +config KERNEL_ZSTD + bool "ZSTD" + depends on HAVE_KERNEL_ZSTD + help + ZSTD is a compression algorithm targeting intermediate compression + with fast decompression speed. It will compress better than GZIP and + decompress around the same speed as LZO, but slower than LZ4. You + will need at least 192 KB RAM or more for booting. The zstd command + line tools is required for compression. + config KERNEL_UNCOMPRESSED bool "None" depends on HAVE_KERNEL_UNCOMPRESSED @@ -1102,6 +1115,22 @@ config USER_NS If unsure, say N. +config USER_NS_UNPRIVILEGED + bool "Allow unprivileged users to create namespaces" + default y + depends on USER_NS + help + When disabled, unprivileged users will not be able to create + new namespaces. Allowing users to create their own namespaces + has been part of several recent local privilege escalation + exploits, so if you need user namespaces but are + paranoid^Wsecurity-conscious you want to disable this. + + This setting can be overridden at runtime via the + kernel.unprivileged_userns_clone sysctl. + + If unsure, say Y. + config PID_NS bool "PID Namespaces" default y @@ -1240,7 +1269,6 @@ config CC_OPTIMIZE_FOR_PERFORMANCE config CC_OPTIMIZE_FOR_PERFORMANCE_O3 bool "Optimize more for performance (-O3)" - depends on ARC help Choosing this option will pass "-O3" to your compiler to optimize the kernel yet more for performance. diff --git a/kernel/fork.c b/kernel/fork.c index 48ed22774efa..ec61454a18d5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -106,6 +106,11 @@ #define CREATE_TRACE_POINTS #include +#ifdef CONFIG_USER_NS +extern int unprivileged_userns_clone; +#else +#define unprivileged_userns_clone 0 +#endif /* * Minimum number of threads to boot the kernel @@ -1848,6 +1853,10 @@ static __latent_entropy struct task_struct *copy_process( if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); + if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. 
@@ -2948,6 +2957,12 @@ int ksys_unshare(unsigned long unshare_flags) if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; + if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { + err = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto bad_unshare_out; + } + err = check_unshare_flags(unshare_flags); if (err) goto bad_unshare_out; diff --git a/kernel/futex.c b/kernel/futex.c index b59532862bc0..2fbfb0b808c1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -214,6 +214,8 @@ struct futex_pi_state { * @rt_waiter: rt_waiter storage for use with requeue_pi * @requeue_pi_key: the requeue_pi target futex key * @bitset: bitset for the optional bitmasked wakeup + * @uaddr: userspace address of futex + * @uval: expected futex's value * * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so * we can wake only the relevant ones (hashed queues may be shared). @@ -236,6 +238,8 @@ struct futex_q { struct rt_mutex_waiter *rt_waiter; union futex_key *requeue_pi_key; u32 bitset; + u32 __user *uaddr; + u32 uval; } __randomize_layout; static const struct futex_q futex_q_init = { @@ -2346,6 +2350,29 @@ static int unqueue_me(struct futex_q *q) return ret; } +/** + * unqueue_multiple() - Remove several futexes from their futex_hash_bucket + * @q: The list of futexes to unqueue + * @count: Number of futexes in the list + * + * Helper to unqueue a list of futexes. This can't fail. + * + * Return: + * - >=0 - Index of the last futex that was awoken; + * - -1 - If no futex was awoken + */ +static int unqueue_multiple(struct futex_q *q, int count) +{ + int ret = -1; + int i; + + for (i = 0; i < count; i++) { + if (!unqueue_me(&q[i])) + ret = i; + } + return ret; +} + /* * PI futexes can not be requeued and must remove themself from the * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry @@ -2709,6 +2736,211 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, return ret; } +/** + * futex_wait_multiple_setup() - Prepare to wait and enqueue multiple futexes + * @qs: The corresponding futex list + * @count: The size of the lists + * @flags: Futex flags (FLAGS_SHARED, etc.) + * @awaken: Index of the last awoken futex + * + * Prepare multiple futexes in a single step and enqueue them. This may fail if + * the futex list is invalid or if any futex was already awoken. On success the + * task is ready to interruptible sleep. + * + * Return: + * - 1 - One of the futexes was awaken by another thread + * - 0 - Success + * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL + */ +static int futex_wait_multiple_setup(struct futex_q *qs, int count, + unsigned int flags, int *awaken) +{ + struct futex_hash_bucket *hb; + int ret, i; + u32 uval; + + /* + * Enqueuing multiple futexes is tricky, because we need to + * enqueue each futex in the list before dealing with the next + * one to avoid deadlocking on the hash bucket. But, before + * enqueuing, we need to make sure that current->state is + * TASK_INTERRUPTIBLE, so we don't absorb any awake events, which + * cannot be done before the get_futex_key of the next key, + * because it calls get_user_pages, which can sleep. Thus, we + * fetch the list of futexes keys in two steps, by first pinning + * all the memory keys in the futex key, and only then we read + * each key and queue the corresponding futex. 
+ */ +retry: + for (i = 0; i < count; i++) { + qs[i].key = FUTEX_KEY_INIT; + ret = get_futex_key(qs[i].uaddr, flags & FLAGS_SHARED, + &qs[i].key, FUTEX_READ); + if (unlikely(ret)) { + for (--i; i >= 0; i--) + put_futex_key(&qs[i].key); + return ret; + } + } + + set_current_state(TASK_INTERRUPTIBLE); + + for (i = 0; i < count; i++) { + struct futex_q *q = &qs[i]; + + hb = queue_lock(q); + + ret = get_futex_value_locked(&uval, q->uaddr); + if (ret) { + /* + * We need to try to handle the fault, which + * cannot be done without sleep, so we need to + * undo all the work already done, to make sure + * we don't miss any wake ups. Therefore, clean + * up, handle the fault and retry from the + * beginning. + */ + queue_unlock(hb); + + /* + * Keys 0..(i-1) are implicitly put + * on unqueue_multiple. + */ + put_futex_key(&q->key); + + *awaken = unqueue_multiple(qs, i); + + __set_current_state(TASK_RUNNING); + + /* + * On a real fault, prioritize the error even if + * some other futex was awoken. Userspace gave + * us a bad address, -EFAULT them. + */ + ret = get_user(uval, q->uaddr); + if (ret) + return ret; + + /* + * Even if the page fault was handled, If + * something was already awaken, we can safely + * give up and succeed to give a hint for userspace to + * acquire the right futex faster. + */ + if (*awaken >= 0) + return 1; + + goto retry; + } + + if (uval != q->uval) { + queue_unlock(hb); + + put_futex_key(&qs[i].key); + + /* + * If something was already awaken, we can + * safely ignore the error and succeed. + */ + *awaken = unqueue_multiple(qs, i); + __set_current_state(TASK_RUNNING); + if (*awaken >= 0) + return 1; + + return -EWOULDBLOCK; + } + + /* + * The bucket lock can't be held while dealing with the + * next futex. Queue each futex at this moment so hb can + * be unlocked. + */ + queue_me(&qs[i], hb); + } + return 0; +} + +/** + * futex_wait_multiple() - Prepare to wait on and enqueue several futexes + * @qs: The list of futexes to wait on + * @op: Operation code from futex's syscall + * @count: The number of objects + * @abs_time: Timeout before giving up and returning to userspace + * + * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function + * sleeps on a group of futexes and returns on the first futex that + * triggered, or after the timeout has elapsed. + * + * Return: + * - >=0 - Hint to the futex that was awoken + * - <0 - On error + */ +static int futex_wait_multiple(struct futex_q *qs, int op, + u32 count, ktime_t *abs_time) +{ + struct hrtimer_sleeper timeout, *to; + int ret, flags = 0, hint = 0; + unsigned int i; + + if (!(op & FUTEX_PRIVATE_FLAG)) + flags |= FLAGS_SHARED; + + if (op & FUTEX_CLOCK_REALTIME) + flags |= FLAGS_CLOCKRT; + + to = futex_setup_timer(abs_time, &timeout, flags, 0); + while (1) { + ret = futex_wait_multiple_setup(qs, count, flags, &hint); + if (ret) { + if (ret > 0) { + /* A futex was awaken during setup */ + ret = hint; + } + break; + } + + if (to) + hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); + + /* + * Avoid sleeping if another thread already tried to + * wake us. + */ + for (i = 0; i < count; i++) { + if (plist_node_empty(&qs[i].list)) + break; + } + + if (i == count && (!to || to->task)) + freezable_schedule(); + + ret = unqueue_multiple(qs, count); + + __set_current_state(TASK_RUNNING); + + if (ret >= 0) + break; + if (to && !to->task) { + ret = -ETIMEDOUT; + break; + } else if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + /* + * The final case is a spurious wakeup, for + * which just retry. 
+ */ + } + + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + + return ret; +} + static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) { @@ -3833,6 +4065,43 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, return -ENOSYS; } +/** + * futex_read_wait_block - Read an array of futex_wait_block from userspace + * @uaddr: Userspace address of the block + * @count: Number of blocks to be read + * + * This function creates and allocate an array of futex_q (we zero it to + * initialize the fields) and then, for each futex_wait_block element from + * userspace, fill a futex_q element with proper values. + */ +inline struct futex_q *futex_read_wait_block(u32 __user *uaddr, u32 count) +{ + unsigned int i; + struct futex_q *qs; + struct futex_wait_block fwb; + struct futex_wait_block __user *entry = + (struct futex_wait_block __user *)uaddr; + + if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) + return ERR_PTR(-EINVAL); + + qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); + if (!qs) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { + kfree(qs); + return ERR_PTR(-EFAULT); + } + + qs[i].uaddr = fwb.uaddr; + qs[i].uval = fwb.val; + qs[i].bitset = fwb.bitset; + } + + return qs; +} SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, @@ -3845,7 +4114,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || cmd == FUTEX_WAIT_BITSET || - cmd == FUTEX_WAIT_REQUEUE_PI)) { + cmd == FUTEX_WAIT_REQUEUE_PI || + cmd == FUTEX_WAIT_MULTIPLE)) { if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) return -EFAULT; if (get_timespec64(&ts, utime)) @@ -3854,7 +4124,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, return -EINVAL; t = timespec64_to_ktime(ts); - if (cmd == FUTEX_WAIT) + if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) t = ktime_add_safe(ktime_get(), t); tp = &t; } @@ -3866,6 +4136,25 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) val2 = (u32) (unsigned long) utime; + if (cmd == FUTEX_WAIT_MULTIPLE) { + int ret; + struct futex_q *qs; + +#ifdef CONFIG_X86_X32 + if (unlikely(in_x32_syscall())) + return -ENOSYS; +#endif + qs = futex_read_wait_block(uaddr, val); + + if (IS_ERR(qs)) + return PTR_ERR(qs); + + ret = futex_wait_multiple(qs, op, val, tp); + kfree(qs); + + return ret; + } + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } @@ -4028,6 +4317,58 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, #endif /* CONFIG_COMPAT */ #ifdef CONFIG_COMPAT_32BIT_TIME +/** + * struct compat_futex_wait_block - Block of futexes to be waited for + * @uaddr: User address of the futex (compatible pointer) + * @val: Futex value expected by userspace + * @bitset: Bitset for the optional bitmasked wakeup + */ +struct compat_futex_wait_block { + compat_uptr_t uaddr; + __u32 pad; + __u32 val; + __u32 bitset; +}; + +/** + * compat_futex_read_wait_block - Read an array of futex_wait_block from + * userspace + * @uaddr: Userspace address of the block + * @count: Number of blocks to be read + * + * This function does the same as futex_read_wait_block(), except that it + * converts the pointer to the futex from the compat version to the regular one. 
+ */ +inline struct futex_q *compat_futex_read_wait_block(u32 __user *uaddr, + u32 count) +{ + unsigned int i; + struct futex_q *qs; + struct compat_futex_wait_block fwb; + struct compat_futex_wait_block __user *entry = + (struct compat_futex_wait_block __user *)uaddr; + + if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) + return ERR_PTR(-EINVAL); + + qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); + if (!qs) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { + kfree(qs); + return ERR_PTR(-EFAULT); + } + + qs[i].uaddr = compat_ptr(fwb.uaddr); + qs[i].uval = fwb.val; + qs[i].bitset = fwb.bitset; + } + + return qs; +} + SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) @@ -4039,14 +4380,15 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || cmd == FUTEX_WAIT_BITSET || - cmd == FUTEX_WAIT_REQUEUE_PI)) { + cmd == FUTEX_WAIT_REQUEUE_PI || + cmd == FUTEX_WAIT_MULTIPLE)) { if (get_old_timespec32(&ts, utime)) return -EFAULT; if (!timespec64_valid(&ts)) return -EINVAL; t = timespec64_to_ktime(ts); - if (cmd == FUTEX_WAIT) + if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) t = ktime_add_safe(ktime_get(), t); tp = &t; } @@ -4054,6 +4396,19 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) val2 = (int) (unsigned long) utime; + if (cmd == FUTEX_WAIT_MULTIPLE) { + int ret; + struct futex_q *qs = compat_futex_read_wait_block(uaddr, val); + + if (IS_ERR(qs)) + return PTR_ERR(qs); + + ret = futex_wait_multiple(qs, op, val, tp); + kfree(qs); + + return ret; + } + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } #endif /* CONFIG_COMPAT_32BIT_TIME */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a176d8727a3..c15054889162 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -110,6 +110,9 @@ extern int core_uses_pid; extern char core_pattern[]; extern unsigned int core_pipe_limit; #endif +#ifdef CONFIG_USER_NS +extern int unprivileged_userns_clone; +#endif extern int pid_max; extern int pid_max_min, pid_max_max; extern int percpu_pagelist_fraction; @@ -534,6 +537,15 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif +#ifdef CONFIG_USER_NS + { + .procname = "unprivileged_userns_clone", + .data = &unprivileged_userns_clone, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", @@ -1458,6 +1470,15 @@ static struct ctl_table vm_table[] = { .mode = 0200, .proc_handler = sysctl_compaction_handler, }, + { + .procname = "compaction_proactiveness", + .data = &sysctl_compaction_proactiveness, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, { .procname = "extfrag_threshold", .data = &sysctl_extfrag_threshold, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 8eadadc478f9..c36ecd19562c 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -21,6 +21,13 @@ #include #include +/* sysctl */ +#ifdef CONFIG_USER_NS_UNPRIVILEGED +int unprivileged_userns_clone = 1; +#else +int unprivileged_userns_clone; +#endif + static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); diff --git a/lib/Kconfig b/lib/Kconfig index 
5d53f9609c25..e883aecb9279 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -336,6 +336,10 @@ config DECOMPRESS_LZ4 select LZ4_DECOMPRESS tristate +config DECOMPRESS_ZSTD + select ZSTD_DECOMPRESS + tristate + # # Generic allocator support is selected if needed # diff --git a/lib/Makefile b/lib/Makefile index 685aee60de1d..46a4c7a39beb 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -163,6 +163,7 @@ lib-$(CONFIG_DECOMPRESS_LZMA) += decompress_unlzma.o lib-$(CONFIG_DECOMPRESS_XZ) += decompress_unxz.o lib-$(CONFIG_DECOMPRESS_LZO) += decompress_unlzo.o lib-$(CONFIG_DECOMPRESS_LZ4) += decompress_unlz4.o +lib-$(CONFIG_DECOMPRESS_ZSTD) += decompress_unzstd.o obj-$(CONFIG_TEXTSEARCH) += textsearch.o obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o diff --git a/lib/decompress.c b/lib/decompress.c index 857ab1af1ef3..ab3fc90ffc64 100644 --- a/lib/decompress.c +++ b/lib/decompress.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -37,6 +38,9 @@ #ifndef CONFIG_DECOMPRESS_LZ4 # define unlz4 NULL #endif +#ifndef CONFIG_DECOMPRESS_ZSTD +# define unzstd NULL +#endif struct compress_format { unsigned char magic[2]; @@ -52,6 +56,7 @@ static const struct compress_format compressed_formats[] __initconst = { { {0xfd, 0x37}, "xz", unxz }, { {0x89, 0x4c}, "lzo", unlzo }, { {0x02, 0x21}, "lz4", unlz4 }, + { {0x28, 0xb5}, "zstd", unzstd }, { {0, 0}, NULL, NULL } }; diff --git a/lib/decompress_unzstd.c b/lib/decompress_unzstd.c new file mode 100644 index 000000000000..f317afab502f --- /dev/null +++ b/lib/decompress_unzstd.c @@ -0,0 +1,342 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Important notes about in-place decompression + * + * At least on x86, the kernel is decompressed in place: the compressed data + * is placed to the end of the output buffer, and the decompressor overwrites + * most of the compressed data. There must be enough safety margin to + * guarantee that the write position is always behind the read position. + * + * The safety margin for ZSTD with a 128 KB block size is calculated below. + * Note that the margin with ZSTD is bigger than with GZIP or XZ! + * + * The worst case for in-place decompression is that the beginning of + * the file is compressed extremely well, and the rest of the file is + * uncompressible. Thus, we must look for worst-case expansion when the + * compressor is encoding uncompressible data. + * + * The structure of the .zst file in case of a compresed kernel is as follows. + * Maximum sizes (as bytes) of the fields are in parenthesis. + * + * Frame Header: (18) + * Blocks: (N) + * Checksum: (4) + * + * The frame header and checksum overhead is at most 22 bytes. + * + * ZSTD stores the data in blocks. Each block has a header whose size is + * a 3 bytes. After the block header, there is up to 128 KB of payload. + * The maximum uncompressed size of the payload is 128 KB. The minimum + * uncompressed size of the payload is never less than the payload size + * (excluding the block header). + * + * The assumption, that the uncompressed size of the payload is never + * smaller than the payload itself, is valid only when talking about + * the payload as a whole. It is possible that the payload has parts where + * the decompressor consumes more input than it produces output. Calculating + * the worst case for this would be tricky. Instead of trying to do that, + * let's simply make sure that the decompressor never overwrites any bytes + * of the payload which it is currently reading. + * + * Now we have enough information to calculate the safety margin. 
We need + * - 22 bytes for the .zst file format headers; + * - 3 bytes per every 128 KiB of uncompressed size (one block header per + * block); and + * - 128 KiB (biggest possible zstd block size) to make sure that the + * decompressor never overwrites anything from the block it is currently + * reading. + * + * We get the following formula: + * + * safety_margin = 22 + uncompressed_size * 3 / 131072 + 131072 + * <= 22 + (uncompressed_size >> 15) + 131072 + */ + +/* + * Preboot environments #include "path/to/decompress_unzstd.c". + * All of the source files we depend on must be #included. + * zstd's only source dependeny is xxhash, which has no source + * dependencies. + * + * zstd and xxhash avoid declaring themselves as modules + * when ZSTD_PREBOOT and XXH_PREBOOT are defined. + */ +#ifdef STATIC +# define ZSTD_PREBOOT +# define XXH_PREBOOT +# include "xxhash.c" +# include "zstd/entropy_common.c" +# include "zstd/fse_decompress.c" +# include "zstd/huf_decompress.c" +# include "zstd/zstd_common.c" +# include "zstd/decompress.c" +#endif + +#include +#include +#include + +/* 128MB is the maximum window size supported by zstd. */ +#define ZSTD_WINDOWSIZE_MAX (1 << ZSTD_WINDOWLOG_MAX) +/* Size of the input and output buffers in multi-call mode. + * Pick a larger size because it isn't used during kernel decompression, + * since that is single pass, and we have to allocate a large buffer for + * zstd's window anyways. The larger size speeds up initramfs decompression. + */ +#define ZSTD_IOBUF_SIZE (1 << 17) + +static int INIT handle_zstd_error(size_t ret, void (*error)(char *x)) +{ + const int err = ZSTD_getErrorCode(ret); + + if (!ZSTD_isError(ret)) + return 0; + + switch (err) { + case ZSTD_error_memory_allocation: + error("ZSTD decompressor ran out of memory"); + break; + case ZSTD_error_prefix_unknown: + error("Input is not in the ZSTD format (wrong magic bytes)"); + break; + case ZSTD_error_dstSize_tooSmall: + case ZSTD_error_corruption_detected: + case ZSTD_error_checksum_wrong: + error("ZSTD-compressed data is corrupt"); + break; + default: + error("ZSTD-compressed data is probably corrupt"); + break; + } + return -1; +} + +/* + * Handle the case where we have the entire input and output in one segment. + * We can allocate less memory (no circular buffer for the sliding window), + * and avoid some memcpy() calls. + */ +static int INIT decompress_single(const u8 *in_buf, long in_len, u8 *out_buf, + long out_len, long *in_pos, + void (*error)(char *x)) +{ + const size_t wksp_size = ZSTD_DCtxWorkspaceBound(); + void *wksp = large_malloc(wksp_size); + ZSTD_DCtx *dctx = ZSTD_initDCtx(wksp, wksp_size); + int err; + size_t ret; + + if (dctx == NULL) { + error("Out of memory while allocating ZSTD_DCtx"); + err = -1; + goto out; + } + /* + * Find out how large the frame actually is, there may be junk at + * the end of the frame that ZSTD_decompressDCtx() can't handle. 
+ */ + ret = ZSTD_findFrameCompressedSize(in_buf, in_len); + err = handle_zstd_error(ret, error); + if (err) + goto out; + in_len = (long)ret; + + ret = ZSTD_decompressDCtx(dctx, out_buf, out_len, in_buf, in_len); + err = handle_zstd_error(ret, error); + if (err) + goto out; + + if (in_pos != NULL) + *in_pos = in_len; + + err = 0; +out: + if (wksp != NULL) + large_free(wksp); + return err; +} + +static int INIT __unzstd(unsigned char *in_buf, long in_len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), + unsigned char *out_buf, long out_len, + long *in_pos, + void (*error)(char *x)) +{ + ZSTD_inBuffer in; + ZSTD_outBuffer out; + ZSTD_frameParams params; + void *in_allocated = NULL; + void *out_allocated = NULL; + void *wksp = NULL; + size_t wksp_size; + ZSTD_DStream *dstream; + int err; + size_t ret; + + if (out_len == 0) + out_len = LONG_MAX; /* no limit */ + + if (fill == NULL && flush == NULL) + /* + * We can decompress faster and with less memory when we have a + * single chunk. + */ + return decompress_single(in_buf, in_len, out_buf, out_len, + in_pos, error); + + /* + * If in_buf is not provided, we must be using fill(), so allocate + * a large enough buffer. If it is provided, it must be at least + * ZSTD_IOBUF_SIZE large. + */ + if (in_buf == NULL) { + in_allocated = large_malloc(ZSTD_IOBUF_SIZE); + if (in_allocated == NULL) { + error("Out of memory while allocating input buffer"); + err = -1; + goto out; + } + in_buf = in_allocated; + in_len = 0; + } + /* Read the first chunk, since we need to decode the frame header. */ + if (fill != NULL) + in_len = fill(in_buf, ZSTD_IOBUF_SIZE); + if (in_len < 0) { + error("ZSTD-compressed data is truncated"); + err = -1; + goto out; + } + /* Set the first non-empty input buffer. */ + in.src = in_buf; + in.pos = 0; + in.size = in_len; + /* Allocate the output buffer if we are using flush(). */ + if (flush != NULL) { + out_allocated = large_malloc(ZSTD_IOBUF_SIZE); + if (out_allocated == NULL) { + error("Out of memory while allocating output buffer"); + err = -1; + goto out; + } + out_buf = out_allocated; + out_len = ZSTD_IOBUF_SIZE; + } + /* Set the output buffer. */ + out.dst = out_buf; + out.pos = 0; + out.size = out_len; + + /* + * We need to know the window size to allocate the ZSTD_DStream. + * Since we are streaming, we need to allocate a buffer for the sliding + * window. The window size varies from 1 KB to ZSTD_WINDOWSIZE_MAX + * (8 MB), so it is important to use the actual value so as not to + * waste memory when it is smaller. + */ + ret = ZSTD_getFrameParams(¶ms, in.src, in.size); + err = handle_zstd_error(ret, error); + if (err) + goto out; + if (ret != 0) { + error("ZSTD-compressed data has an incomplete frame header"); + err = -1; + goto out; + } + if (params.windowSize > ZSTD_WINDOWSIZE_MAX) { + error("ZSTD-compressed data has too large a window size"); + err = -1; + goto out; + } + + /* + * Allocate the ZSTD_DStream now that we know how much memory is + * required. + */ + wksp_size = ZSTD_DStreamWorkspaceBound(params.windowSize); + wksp = large_malloc(wksp_size); + dstream = ZSTD_initDStream(params.windowSize, wksp, wksp_size); + if (dstream == NULL) { + error("Out of memory while allocating ZSTD_DStream"); + err = -1; + goto out; + } + + /* + * Decompression loop: + * Read more data if necessary (error if no more data can be read). + * Call the decompression function, which returns 0 when finished. + * Flush any data produced if using flush(). 
+ */ + if (in_pos != NULL) + *in_pos = 0; + do { + /* + * If we need to reload data, either we have fill() and can + * try to get more data, or we don't and the input is truncated. + */ + if (in.pos == in.size) { + if (in_pos != NULL) + *in_pos += in.pos; + in_len = fill ? fill(in_buf, ZSTD_IOBUF_SIZE) : -1; + if (in_len < 0) { + error("ZSTD-compressed data is truncated"); + err = -1; + goto out; + } + in.pos = 0; + in.size = in_len; + } + /* Returns zero when the frame is complete. */ + ret = ZSTD_decompressStream(dstream, &out, &in); + err = handle_zstd_error(ret, error); + if (err) + goto out; + /* Flush all of the data produced if using flush(). */ + if (flush != NULL && out.pos > 0) { + if (out.pos != flush(out.dst, out.pos)) { + error("Failed to flush()"); + err = -1; + goto out; + } + out.pos = 0; + } + } while (ret != 0); + + if (in_pos != NULL) + *in_pos += in.pos; + + err = 0; +out: + if (in_allocated != NULL) + large_free(in_allocated); + if (out_allocated != NULL) + large_free(out_allocated); + if (wksp != NULL) + large_free(wksp); + return err; +} + +#ifndef ZSTD_PREBOOT +STATIC int INIT unzstd(unsigned char *buf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), + unsigned char *out_buf, + long *pos, + void (*error)(char *x)) +{ + return __unzstd(buf, len, fill, flush, out_buf, 0, pos, error); +} +#else +STATIC int INIT __decompress(unsigned char *buf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), + unsigned char *out_buf, long out_len, + long *pos, + void (*error)(char *x)) +{ + return __unzstd(buf, len, fill, flush, out_buf, out_len, pos, error); +} +#endif diff --git a/lib/xxhash.c b/lib/xxhash.c index aa61e2a3802f..b4364e011392 100644 --- a/lib/xxhash.c +++ b/lib/xxhash.c @@ -80,13 +80,11 @@ void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src) { memcpy(dst, src, sizeof(*dst)); } -EXPORT_SYMBOL(xxh32_copy_state); void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src) { memcpy(dst, src, sizeof(*dst)); } -EXPORT_SYMBOL(xxh64_copy_state); /*-*************************** * Simple Hash Functions @@ -151,7 +149,6 @@ uint32_t xxh32(const void *input, const size_t len, const uint32_t seed) return h32; } -EXPORT_SYMBOL(xxh32); static uint64_t xxh64_round(uint64_t acc, const uint64_t input) { @@ -234,7 +231,6 @@ uint64_t xxh64(const void *input, const size_t len, const uint64_t seed) return h64; } -EXPORT_SYMBOL(xxh64); /*-************************************************** * Advanced Hash Functions @@ -251,7 +247,6 @@ void xxh32_reset(struct xxh32_state *statePtr, const uint32_t seed) state.v4 = seed - PRIME32_1; memcpy(statePtr, &state, sizeof(state)); } -EXPORT_SYMBOL(xxh32_reset); void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed) { @@ -265,7 +260,6 @@ void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed) state.v4 = seed - PRIME64_1; memcpy(statePtr, &state, sizeof(state)); } -EXPORT_SYMBOL(xxh64_reset); int xxh32_update(struct xxh32_state *state, const void *input, const size_t len) { @@ -334,7 +328,6 @@ int xxh32_update(struct xxh32_state *state, const void *input, const size_t len) return 0; } -EXPORT_SYMBOL(xxh32_update); uint32_t xxh32_digest(const struct xxh32_state *state) { @@ -372,7 +365,6 @@ uint32_t xxh32_digest(const struct xxh32_state *state) return h32; } -EXPORT_SYMBOL(xxh32_digest); int xxh64_update(struct xxh64_state *state, const void *input, const size_t len) { @@ -439,7 +431,6 @@ int xxh64_update(struct 
xxh64_state *state, const void *input, const size_t len) return 0; } -EXPORT_SYMBOL(xxh64_update); uint64_t xxh64_digest(const struct xxh64_state *state) { @@ -494,7 +485,19 @@ uint64_t xxh64_digest(const struct xxh64_state *state) return h64; } + +#ifndef XXH_PREBOOT +EXPORT_SYMBOL(xxh32_copy_state); +EXPORT_SYMBOL(xxh64_copy_state); +EXPORT_SYMBOL(xxh32); +EXPORT_SYMBOL(xxh64); +EXPORT_SYMBOL(xxh32_reset); +EXPORT_SYMBOL(xxh64_reset); +EXPORT_SYMBOL(xxh32_update); +EXPORT_SYMBOL(xxh32_digest); +EXPORT_SYMBOL(xxh64_update); EXPORT_SYMBOL(xxh64_digest); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("xxHash"); +#endif diff --git a/lib/zstd/decompress.c b/lib/zstd/decompress.c index 269ee9a796c1..73ded63278cf 100644 --- a/lib/zstd/decompress.c +++ b/lib/zstd/decompress.c @@ -2490,6 +2490,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream *zds, ZSTD_outBuffer *output, ZSTD_inB } } +#ifndef ZSTD_PREBOOT EXPORT_SYMBOL(ZSTD_DCtxWorkspaceBound); EXPORT_SYMBOL(ZSTD_initDCtx); EXPORT_SYMBOL(ZSTD_decompressDCtx); @@ -2529,3 +2530,4 @@ EXPORT_SYMBOL(ZSTD_insertBlock); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("Zstd Decompressor"); +#endif diff --git a/lib/zstd/fse_decompress.c b/lib/zstd/fse_decompress.c index a84300e5a013..0b353530fb3f 100644 --- a/lib/zstd/fse_decompress.c +++ b/lib/zstd/fse_decompress.c @@ -47,6 +47,7 @@ ****************************************************************/ #include "bitstream.h" #include "fse.h" +#include "zstd_internal.h" #include #include #include /* memcpy, memset */ @@ -60,14 +61,6 @@ enum { FSE_static_assert = 1 / (int)(!!(c)) }; \ } /* use only *after* variable declarations */ -/* check and forward error code */ -#define CHECK_F(f) \ - { \ - size_t const e = f; \ - if (FSE_isError(e)) \ - return e; \ - } - /* ************************************************************** * Templates ****************************************************************/ diff --git a/lib/zstd/zstd_internal.h b/lib/zstd/zstd_internal.h index 1a79fab9e13a..dac753397f86 100644 --- a/lib/zstd/zstd_internal.h +++ b/lib/zstd/zstd_internal.h @@ -127,7 +127,14 @@ static const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG; * Shared functions to include for inlining *********************************************/ ZSTD_STATIC void ZSTD_copy8(void *dst, const void *src) { - memcpy(dst, src, 8); + /* + * zstd relies heavily on gcc being able to analyze and inline this + * memcpy() call, since it is called in a tight loop. Preboot mode + * is compiled in freestanding mode, which stops gcc from analyzing + * memcpy(). Use __builtin_memcpy() to tell gcc to analyze this as a + * regular memcpy(). + */ + __builtin_memcpy(dst, src, 8); } /*! ZSTD_wildcopy() : * custom version of memcpy(), can copy up to 7 bytes too many (8 bytes if length==0) */ @@ -137,13 +144,16 @@ ZSTD_STATIC void ZSTD_wildcopy(void *dst, const void *src, ptrdiff_t length) const BYTE* ip = (const BYTE*)src; BYTE* op = (BYTE*)dst; BYTE* const oend = op + length; - /* Work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81388. +#if defined(GCC_VERSION) && GCC_VERSION >= 70000 && GCC_VERSION < 70200 + /* + * Work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81388. * Avoid the bad case where the loop only runs once by handling the * special case separately. This doesn't trigger the bug because it * doesn't involve pointer/integer overflow. 
*/ if (length <= 8) return ZSTD_copy8(dst, src); +#endif do { ZSTD_copy8(op, ip); op += 8; diff --git a/mm/compaction.c b/mm/compaction.c index 46f0fcc93081..822ff72817d5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -50,6 +50,24 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order) #define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order) +/* + * Fragmentation score check interval for proactive compaction purposes. + */ +static const int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500; + +/* + * Page order with-respect-to which proactive compaction + * calculates external fragmentation, which is used as + * the "fragmentation score" of a node/zone. + */ +#if defined HPAGE_PMD_ORDER +#define COMPACTION_HPAGE_ORDER HPAGE_PMD_ORDER +#elif defined HUGETLB_PAGE_ORDER +#define COMPACTION_HPAGE_ORDER HUGETLB_PAGE_ORDER +#else +#define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT) +#endif + static unsigned long release_freepages(struct list_head *freelist) { struct page *page, *next; @@ -1855,6 +1873,76 @@ static inline bool is_via_compact_memory(int order) return order == -1; } +static bool kswapd_is_running(pg_data_t *pgdat) +{ + return pgdat->kswapd && (pgdat->kswapd->state == TASK_RUNNING); +} + +/* + * A zone's fragmentation score is the external fragmentation wrt to the + * COMPACTION_HPAGE_ORDER scaled by the zone's size. It returns a value + * in the range [0, 100]. + * + * The scaling factor ensures that proactive compaction focuses on larger + * zones like ZONE_NORMAL, rather than smaller, specialized zones like + * ZONE_DMA32. For smaller zones, the score value remains close to zero, + * and thus never exceeds the high threshold for proactive compaction. + */ +static int fragmentation_score_zone(struct zone *zone) +{ + unsigned long score; + + score = zone->present_pages * + extfrag_for_order(zone, COMPACTION_HPAGE_ORDER); + return div64_ul(score, zone->zone_pgdat->node_present_pages + 1); +} + +/* + * The per-node proactive (background) compaction process is started by its + * corresponding kcompactd thread when the node's fragmentation score + * exceeds the high threshold. The compaction process remains active till + * the node's score falls below the low threshold, or one of the back-off + * conditions is met. + */ +static int fragmentation_score_node(pg_data_t *pgdat) +{ + unsigned long score = 0; + int zoneid; + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct zone *zone; + + zone = &pgdat->node_zones[zoneid]; + score += fragmentation_score_zone(zone); + } + + return score; +} + +static int fragmentation_score_wmark(pg_data_t *pgdat, bool low) +{ + int wmark_low; + + /* + * Cap the low watermak to avoid excessive compaction + * activity in case a user sets the proactivess tunable + * close to 100 (maximum). + */ + wmark_low = max(100 - sysctl_compaction_proactiveness, 5); + return low ? 
wmark_low : min(wmark_low + 10, 100); +} + +static bool should_proactive_compact_node(pg_data_t *pgdat) +{ + int wmark_high; + + if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat)) + return false; + + wmark_high = fragmentation_score_wmark(pgdat, false); + return fragmentation_score_node(pgdat) > wmark_high; +} + static enum compact_result __compact_finished(struct compact_control *cc) { unsigned int order; @@ -1881,6 +1969,25 @@ static enum compact_result __compact_finished(struct compact_control *cc) return COMPACT_PARTIAL_SKIPPED; } + if (cc->proactive_compaction) { + int score, wmark_low; + pg_data_t *pgdat; + + pgdat = cc->zone->zone_pgdat; + if (kswapd_is_running(pgdat)) + return COMPACT_PARTIAL_SKIPPED; + + score = fragmentation_score_zone(cc->zone); + wmark_low = fragmentation_score_wmark(pgdat, true); + + if (score > wmark_low) + ret = COMPACT_CONTINUE; + else + ret = COMPACT_SUCCESS; + + goto out; + } + if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; @@ -1939,6 +2046,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) } } +out: if (cc->contended || fatal_signal_pending(current)) ret = COMPACT_CONTENDED; @@ -2412,6 +2520,41 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, return rc; } +/* + * Compact all zones within a node till each zone's fragmentation score + * reaches within proactive compaction thresholds (as determined by the + * proactiveness tunable). + * + * It is possible that the function returns before reaching score targets + * due to various back-off conditions, such as, contention on per-node or + * per-zone locks. + */ +static void proactive_compact_node(pg_data_t *pgdat) +{ + int zoneid; + struct zone *zone; + struct compact_control cc = { + .order = -1, + .mode = MIGRATE_SYNC_LIGHT, + .ignore_skip_hint = true, + .whole_zone = true, + .gfp_mask = GFP_KERNEL, + .proactive_compaction = true, + }; + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + cc.zone = zone; + + compact_zone(&cc, NULL); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } +} /* Compact all zones within a node */ static void compact_node(int nid) @@ -2458,6 +2601,13 @@ static void compact_nodes(void) /* The written value is actually unused, all memory is compacted */ int sysctl_compact_memory; +/* + * Tunable for proactive compaction. It determines how + * aggressively the kernel should compact memory in the + * background. It takes values in the range [0, 100]. 
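+ *
+ * For example, with the default value of 20, fragmentation_score_wmark()
+ * computes a low watermark of max(100 - 20, 5) = 80 and a high watermark
+ * of min(80 + 10, 100) = 90: proactive compaction starts once a node's
+ * fragmentation score exceeds 90 and keeps running until the score falls
+ * back to 80 (or a back-off condition is hit).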
+ */ +int __read_mostly sysctl_compaction_proactiveness = 20; + /* * This is the entry point for compacting all nodes via * /proc/sys/vm/compact_memory @@ -2637,6 +2787,7 @@ static int kcompactd(void *p) { pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; + unsigned int proactive_defer = 0; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -2652,12 +2803,34 @@ static int kcompactd(void *p) unsigned long pflags; trace_mm_compaction_kcompactd_sleep(pgdat->node_id); - wait_event_freezable(pgdat->kcompactd_wait, - kcompactd_work_requested(pgdat)); + if (wait_event_freezable_timeout(pgdat->kcompactd_wait, + kcompactd_work_requested(pgdat), + msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) { + + psi_memstall_enter(&pflags); + kcompactd_do_work(pgdat); + psi_memstall_leave(&pflags); + continue; + } - psi_memstall_enter(&pflags); - kcompactd_do_work(pgdat); - psi_memstall_leave(&pflags); + /* kcompactd wait timeout */ + if (should_proactive_compact_node(pgdat)) { + unsigned int prev_score, score; + + if (proactive_defer) { + proactive_defer--; + continue; + } + prev_score = fragmentation_score_node(pgdat); + proactive_compact_node(pgdat); + score = fragmentation_score_node(pgdat); + /* + * Defer proactive compaction if the fragmentation + * score did not go down i.e. no progress made. + */ + proactive_defer = score < prev_score ? + 0 : 1 << COMPACT_MAX_DEFER_SHIFT; + } } return 0; diff --git a/mm/internal.h b/mm/internal.h index b5634e78f01d..9671bccd97d5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -228,6 +228,7 @@ struct compact_control { bool no_set_skip_hint; /* Don't mark blocks for skipping */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ + bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock or sched contention */ bool rescan; /* Rescanning the same pageblock */ diff --git a/mm/ksm.c b/mm/ksm.c index 281c00129a2e..44a0d344648a 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2433,54 +2433,78 @@ static int ksm_scan_thread(void *nothing) return 0; } -int ksm_madvise(struct vm_area_struct *vma, unsigned long start, - unsigned long end, int advice, unsigned long *vm_flags) +int ksm_madvise_merge(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long *vm_flags) { - struct mm_struct *mm = vma->vm_mm; int err; - switch (advice) { - case MADV_MERGEABLE: - /* - * Be somewhat over-protective for now! - */ - if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | - VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_HUGETLB | VM_MIXEDMAP)) - return 0; /* just ignore the advice */ + /* + * Be somewhat over-protective for now! 
+ */ + if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_HUGETLB | VM_MIXEDMAP)) + return 0; /* just ignore the advice */ - if (vma_is_dax(vma)) - return 0; + if (vma_is_dax(vma)) + return 0; #ifdef VM_SAO - if (*vm_flags & VM_SAO) - return 0; + if (*vm_flags & VM_SAO) + return 0; #endif #ifdef VM_SPARC_ADI - if (*vm_flags & VM_SPARC_ADI) - return 0; + if (*vm_flags & VM_SPARC_ADI) + return 0; #endif - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { - err = __ksm_enter(mm); - if (err) - return err; - } + if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + err = __ksm_enter(mm); + if (err) + return err; + } - *vm_flags |= VM_MERGEABLE; - break; + *vm_flags |= VM_MERGEABLE; - case MADV_UNMERGEABLE: - if (!(*vm_flags & VM_MERGEABLE)) - return 0; /* just ignore the advice */ + return 0; +} - if (vma->anon_vma) { - err = unmerge_ksm_pages(vma, start, end); - if (err) - return err; - } +int ksm_madvise_unmerge(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long *vm_flags) +{ + int err; + + if (!(*vm_flags & VM_MERGEABLE)) + return 0; /* just ignore the advice */ + + if (vma->anon_vma) { + err = unmerge_ksm_pages(vma, start, end); + if (err) + return err; + } + + *vm_flags &= ~VM_MERGEABLE; + + return 0; +} + +int ksm_madvise(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int advice, unsigned long *vm_flags) +{ + struct mm_struct *mm = vma->vm_mm; + int err; - *vm_flags &= ~VM_MERGEABLE; + switch (advice) { + case MADV_MERGEABLE: + err = ksm_madvise_merge(mm, vma, vm_flags); + if (err) + return err; + break; + + case MADV_UNMERGEABLE: + err = ksm_madvise_unmerge(vma, start, end, vm_flags); + if (err) + return err; break; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 96d21a792b57..cc88f7533b8d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1074,6 +1074,24 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total); } +/* + * Calculates external fragmentation within a zone wrt the given order. + * It is defined as the percentage of pages found in blocks of size + * less than 1 << order. It returns values in range [0, 100]. + */ +int extfrag_for_order(struct zone *zone, unsigned int order) +{ + struct contig_page_info info; + + fill_contig_page_info(zone, order, &info); + if (info.free_pages == 0) + return 0; + + return div_u64((info.free_pages - + (info.free_blocks_suitable << order)) * 100, + info.free_pages); +} + /* Same as __fragmentation index but allocs contig_page_info on stack */ int fragmentation_index(struct zone *zone, unsigned int order) { diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 4b799737722c..41063ffd92b3 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -395,6 +395,21 @@ quiet_cmd_xzkern = XZKERN $@ quiet_cmd_xzmisc = XZMISC $@ cmd_xzmisc = cat $(real-prereqs) | xz --check=crc32 --lzma2=dict=1MiB > $@ +# ZSTD +# --------------------------------------------------------------------------- +# Appends the uncompressed size of the data using size_append. The .zst +# format has the size information available at the beginning of the file too, +# but it's in a more complex format and it's good to avoid changing the part +# of the boot code that reads the uncompressed size. +# Note that the bytes added by size_append will make the zstd tool think that +# the file is corrupt. This is expected. 
+ +quiet_cmd_zstd = ZSTD $@ +cmd_zstd = (cat $(filter-out FORCE,$^) | \ + zstd -19 && \ + $(call size_append, $(filter-out FORCE,$^))) > $@ || \ + (rm -f $@ ; false) + # ASM offsets # --------------------------------------------------------------------------- diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore index 0efcd494daab..03a4bedcebc2 100644 --- a/tools/testing/selftests/futex/functional/.gitignore +++ b/tools/testing/selftests/futex/functional/.gitignore @@ -6,3 +6,4 @@ futex_wait_private_mapped_file futex_wait_timeout futex_wait_uninitialized_heap futex_wait_wouldblock +futex_wait_multiple diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index 23207829ec75..26562f2d792d 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -14,7 +14,8 @@ TEST_GEN_FILES := \ futex_requeue_pi_signal_restart \ futex_requeue_pi_mismatched_ops \ futex_wait_uninitialized_heap \ - futex_wait_private_mapped_file + futex_wait_private_mapped_file \ + futex_wait_multiple TEST_PROGS := run.sh diff --git a/tools/testing/selftests/futex/functional/futex_wait_multiple.c b/tools/testing/selftests/futex/functional/futex_wait_multiple.c new file mode 100644 index 000000000000..b48422e79f42 --- /dev/null +++ b/tools/testing/selftests/futex/functional/futex_wait_multiple.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/****************************************************************************** + * + * Copyright © Collabora, Ltd., 2019 + * + * DESCRIPTION + * Test basic semantics of FUTEX_WAIT_MULTIPLE + * + * AUTHOR + * Gabriel Krisman Bertazi + * + * HISTORY + * 2019-Dec-13: Initial version by Krisman + * + *****************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include "futextest.h" +#include "logging.h" + +#define TEST_NAME "futex-wait-multiple" +#define timeout_ns 100000 +#define MAX_COUNT 128 +#define WAKE_WAIT_US 3000000 + +int ret = RET_PASS; +char *progname; +futex_t f[MAX_COUNT] = {0}; +struct futex_wait_block fwb[MAX_COUNT]; + +void usage(char *prog) +{ + printf("Usage: %s\n", prog); + printf(" -c Use color\n"); + printf(" -h Display this help message\n"); + printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", + VQUIET, VCRITICAL, VINFO); +} + +void test_count_overflow(void) +{ + futex_t f = FUTEX_INITIALIZER; + struct futex_wait_block fwb[MAX_COUNT+1]; + int res, i; + + ksft_print_msg("%s: Test a too big number of futexes\n", progname); + + for (i = 0; i < MAX_COUNT+1; i++) { + fwb[i].uaddr = &f; + fwb[i].val = f; + fwb[i].bitset = 0; + } + + res = futex_wait_multiple(fwb, MAX_COUNT+1, NULL, FUTEX_PRIVATE_FLAG); + +#ifdef __ILP32__ + if (res != -1 || errno != ENOSYS) { + ksft_test_result_fail("futex_wait_multiple returned %d\n", + res < 0 ? errno : res); + ret = RET_FAIL; + } else { + ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); + } +#else + if (res != -1 || errno != EINVAL) { + ksft_test_result_fail("futex_wait_multiple returned %d\n", + res < 0 ? 
errno : res); + ret = RET_FAIL; + } else { + ksft_test_result_pass("futex_wait_multiple count overflow succeed\n"); + } + +#endif /* __ILP32__ */ +} + +void *waiterfn(void *arg) +{ + int res; + + res = futex_wait_multiple(fwb, MAX_COUNT, NULL, FUTEX_PRIVATE_FLAG); + +#ifdef __ILP32__ + if (res != -1 || errno != ENOSYS) { + ksft_test_result_fail("futex_wait_multiple returned %d\n", + res < 0 ? errno : res); + ret = RET_FAIL; + } else { + ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); + } +#else + if (res < 0) + ksft_print_msg("waiter failed %d\n", res); + + info("futex_wait_multiple: Got hint futex %d was freed\n", res); +#endif /* __ILP32__ */ + + return NULL; +} + +void test_fwb_wakeup(void) +{ + int res, i; + pthread_t waiter; + + ksft_print_msg("%s: Test wake up in a list of futex\n", progname); + + for (i = 0; i < MAX_COUNT; i++) { + fwb[i].uaddr = &f[i]; + fwb[i].val = f[i]; + fwb[i].bitset = 0xffffffff; + } + + res = pthread_create(&waiter, NULL, waiterfn, NULL); + if (res) { + ksft_test_result_fail("Creating waiting thread failed"); + ksft_exit_fail(); + } + + usleep(WAKE_WAIT_US); + res = futex_wake(&(f[MAX_COUNT-1]), 1, FUTEX_PRIVATE_FLAG); + if (res != 1) { + ksft_test_result_fail("Failed to wake thread res=%d\n", res); + ksft_exit_fail(); + } + + pthread_join(waiter, NULL); + ksft_test_result_pass("%s succeed\n", __func__); +} + +int main(int argc, char *argv[]) +{ + int c; + + while ((c = getopt(argc, argv, "cht:v:")) != -1) { + switch (c) { + case 'c': + log_color(1); + break; + case 'h': + usage(basename(argv[0])); + exit(0); + case 'v': + log_verbosity(atoi(optarg)); + break; + default: + usage(basename(argv[0])); + exit(1); + } + } + + progname = basename(argv[0]); + + ksft_print_header(); + ksft_set_plan(2); + + test_count_overflow(); + +#ifdef __ILP32__ + // if it's a 32x binary, there's no futex to wakeup + ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); +#else + test_fwb_wakeup(); +#endif /* __ILP32__ */ + + ksft_print_cnts(); + return ret; +} diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c index ee55e6d389a3..2a63e1c2cfb6 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c +++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c @@ -11,6 +11,7 @@ * * HISTORY * 2009-Nov-6: Initial version by Darren Hart + * 2019-Dec-13: Add WAIT_MULTIPLE test by Krisman * *****************************************************************************/ @@ -41,6 +42,8 @@ int main(int argc, char *argv[]) { futex_t f1 = FUTEX_INITIALIZER; struct timespec to; + time_t secs; + struct futex_wait_block fwb = {&f1, f1, 0}; int res, ret = RET_PASS; int c; @@ -65,7 +68,7 @@ int main(int argc, char *argv[]) } ksft_print_header(); - ksft_set_plan(1); + ksft_set_plan(2); ksft_print_msg("%s: Block on a futex and wait for timeout\n", basename(argv[0])); ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns); @@ -79,8 +82,39 @@ int main(int argc, char *argv[]) if (!res || errno != ETIMEDOUT) { fail("futex_wait returned %d\n", ret < 0 ? 
errno : ret); ret = RET_FAIL; + } else + ksft_test_result_pass("futex_wait timeout succeeds\n"); + + info("Calling futex_wait_multiple on f1: %u @ %p\n", f1, &f1); + + /* Setup absolute time */ + ret = clock_gettime(CLOCK_REALTIME, &to); + secs = (to.tv_nsec + timeout_ns) / 1000000000; + to.tv_nsec = ((int64_t)to.tv_nsec + timeout_ns) % 1000000000; + to.tv_sec += secs; + info("to.tv_sec = %ld\n", to.tv_sec); + info("to.tv_nsec = %ld\n", to.tv_nsec); + + res = futex_wait_multiple(&fwb, 1, &to, + FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME); + +#ifdef __ILP32__ + if (res == -1 && errno == ENOSYS) { + ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); + } else { + ksft_test_result_fail("futex_wait_multiple returned %d\n", + res < 0 ? errno : res); + ret = RET_FAIL; } +#else + if (!res || errno != ETIMEDOUT) { + ksft_test_result_fail("futex_wait_multiple returned %d\n", + res < 0 ? errno : res); + ret = RET_FAIL; + } else + ksft_test_result_pass("futex_wait_multiple timeout succeeds\n"); +#endif /* __ILP32__ */ - print_result(TEST_NAME, ret); + ksft_print_cnts(); return ret; } diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c index 0ae390ff8164..bcbac042992d 100644 --- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c @@ -12,6 +12,7 @@ * * HISTORY * 2009-Nov-14: Initial version by Gowrishankar + * 2019-Dec-13: Add WAIT_MULTIPLE test by Krisman * *****************************************************************************/ @@ -40,6 +41,7 @@ int main(int argc, char *argv[]) { struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; futex_t f1 = FUTEX_INITIALIZER; + struct futex_wait_block fwb = {&f1, f1+1, 0}; int res, ret = RET_PASS; int c; @@ -61,7 +63,7 @@ int main(int argc, char *argv[]) } ksft_print_header(); - ksft_set_plan(1); + ksft_set_plan(2); ksft_print_msg("%s: Test the unexpected futex value in FUTEX_WAIT\n", basename(argv[0])); @@ -71,8 +73,30 @@ int main(int argc, char *argv[]) fail("futex_wait returned: %d %s\n", res ? errno : res, res ? strerror(errno) : ""); ret = RET_FAIL; + } else + ksft_test_result_pass("futex_wait wouldblock succeeds\n"); + + info("Calling futex_wait_multiple on f1: %u @ %p with val=%u\n", + f1, &f1, f1+1); + res = futex_wait_multiple(&fwb, 1, NULL, FUTEX_PRIVATE_FLAG); + +#ifdef __ILP32__ + if (res != -1 || errno != ENOSYS) { + ksft_test_result_fail("futex_wait_multiple returned %d\n", + res < 0 ? errno : res); + ret = RET_FAIL; + } else { + ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); + } +#else + if (!res || errno != EWOULDBLOCK) { + ksft_test_result_fail("futex_wait_multiple returned %d\n", + res < 0 ? 
errno : res);
+ ret = RET_FAIL;
} else
+ ksft_test_result_pass("futex_wait_multiple wouldblock succeeds\n");
+#endif /* __ILP32__ */
- print_result(TEST_NAME, ret);
+ ksft_print_cnts();
return ret;
}
diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh
index 1acb6ace1680..a8be94f28ff7 100755
--- a/tools/testing/selftests/futex/functional/run.sh
+++ b/tools/testing/selftests/futex/functional/run.sh
@@ -73,3 +73,6 @@ echo
echo
./futex_wait_uninitialized_heap $COLOR
./futex_wait_private_mapped_file $COLOR
+
+echo
+./futex_wait_multiple $COLOR
diff --git a/tools/testing/selftests/futex/include/futextest.h b/tools/testing/selftests/futex/include/futextest.h
index ddbcfc9b7bac..bb103bef4557 100644
--- a/tools/testing/selftests/futex/include/futextest.h
+++ b/tools/testing/selftests/futex/include/futextest.h
@@ -38,6 +38,14 @@ typedef volatile u_int32_t futex_t;
#ifndef FUTEX_CMP_REQUEUE_PI
#define FUTEX_CMP_REQUEUE_PI 12
#endif
+#ifndef FUTEX_WAIT_MULTIPLE
+#define FUTEX_WAIT_MULTIPLE 13
+struct futex_wait_block {
+ futex_t *uaddr;
+ futex_t val;
+ __u32 bitset;
+};
+#endif
#ifndef FUTEX_WAIT_REQUEUE_PI_PRIVATE
#define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \
FUTEX_PRIVATE_FLAG)
@@ -80,6 +88,20 @@ futex_wait(futex_t *uaddr, futex_t val, struct timespec *timeout, int opflags)
return futex(uaddr, FUTEX_WAIT, val, timeout, NULL, 0, opflags);
}
+/**
+ * futex_wait_multiple() - block on several futexes with optional timeout
+ * @fwb: wait block user space address
+ * @count: number of entries in fwb
+ * @timeout: absolute timeout
+ */
+static inline int
+futex_wait_multiple(struct futex_wait_block *fwb, int count,
+ struct timespec *timeout, int opflags)
+{
+ return futex(fwb, FUTEX_WAIT_MULTIPLE, count, timeout, NULL, 0,
+ opflags);
+}
+
/**
* futex_wake() - wake one or more tasks blocked on uaddr
* @nr_wake: wake up to this many tasks
diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c
index 15a329da59fa..9a4349813a30 100644
--- a/tools/testing/selftests/x86/fsgsbase.c
+++ b/tools/testing/selftests/x86/fsgsbase.c
@@ -489,11 +489,27 @@ static void test_ptrace_write_gsbase(void)
* selector value is changed or not by the GSBASE write in
* a ptracer.
*/
- if (gs == 0 && base == 0xFF) {
- printf("[OK]\tGS was reset as expected\n");
- } else {
+ if (gs != *shared_scratch) {
nerrs++;
- printf("[FAIL]\tGS=0x%lx, GSBASE=0x%lx (should be 0, 0xFF)\n", gs, base);
+ printf("[FAIL]\tGS changed to %lx\n", gs);
+
+ /*
+ * On older kernels, poking a nonzero value into the
+ * base would zero the selector. On newer kernels,
+ * this behavior has changed -- poking the base
+ * changes only the base and, if FSGSBASE is not
+ * available, this write may have no effect.
+ */
+ if (gs == 0)
+ printf("\tNote: this is expected behavior on older kernels.\n");
+ } else if (have_fsgsbase && (base != 0xFF)) {
+ nerrs++;
+ printf("[FAIL]\tGSBASE changed to %lx\n", base);
+ } else {
+ printf("[OK]\tGS remained 0x%hx", *shared_scratch);
+ if (have_fsgsbase)
+ printf(" and GSBASE changed to 0xFF");
+ printf("\n");
}
}
diff --git a/usr/Kconfig b/usr/Kconfig
index 96afb03b65f9..2599bc21c1b2 100644
--- a/usr/Kconfig
+++ b/usr/Kconfig
@@ -100,6 +100,15 @@ config RD_LZ4
Support loading of a LZ4 encoded initial ramdisk or cpio buffer
If unsure, say N.
+config RD_ZSTD
+ bool "Support initial ramdisk/ramfs compressed using ZSTD"
+ default y
+ depends on BLK_DEV_INITRD
+ select DECOMPRESS_ZSTD
+ help
+ Support loading of a ZSTD encoded initial ramdisk or cpio buffer.
+ If unsure, say N.
+
choice
prompt "Built-in initramfs compression mode"
depends on INITRAMFS_SOURCE != ""
@@ -196,6 +205,17 @@ config INITRAMFS_COMPRESSION_LZ4
If you choose this, keep in mind that most distros don't provide lz4
by default which could cause a build failure.
+config INITRAMFS_COMPRESSION_ZSTD
+ bool "ZSTD"
+ depends on RD_ZSTD
+ help
+ ZSTD is a compression algorithm targeting intermediate compression
+ with fast decompression speed. It will compress better than GZIP and
+ decompress around the same speed as LZO, but slower than LZ4.
+
+ If you choose this, keep in mind that you may need to install the zstd
+ tool to be able to compress the initramfs image.
+
config INITRAMFS_COMPRESSION_NONE
bool "None"
help
diff --git a/usr/Makefile b/usr/Makefile
index c12e6b15ce72..b1a81a40eab1 100644
--- a/usr/Makefile
+++ b/usr/Makefile
@@ -15,6 +15,7 @@ compress-$(CONFIG_INITRAMFS_COMPRESSION_LZMA) := lzma
compress-$(CONFIG_INITRAMFS_COMPRESSION_XZ) := xzmisc
compress-$(CONFIG_INITRAMFS_COMPRESSION_LZO) := lzo
compress-$(CONFIG_INITRAMFS_COMPRESSION_LZ4) := lz4
+compress-$(CONFIG_INITRAMFS_COMPRESSION_ZSTD) := zstd
obj-$(CONFIG_BLK_DEV_INITRD) := initramfs_data.o
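
For reference, a minimal, hypothetical userspace sketch (not part of the patch itself) of how the new
operation can be driven directly, using the same opcode value (FUTEX_WAIT_MULTIPLE == 13) and wait-block
layout declared in the futextest.h hunk above. The absolute CLOCK_REALTIME timeout mirrors the
futex_wait_timeout.c test, and the "return value is a hint for which futex woke us" behaviour follows the
waiter in futex_wait_multiple.c; on kernels without this series the raw futex() call is expected to fail
with ENOSYS.

/* Illustrative sketch only: relies on the out-of-tree FUTEX_WAIT_MULTIPLE opcode. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

#ifndef FUTEX_WAIT_MULTIPLE
#define FUTEX_WAIT_MULTIPLE 13		/* opcode used by this series */
#endif

/* Same layout as struct futex_wait_block in the futextest.h hunk above. */
struct futex_wait_block {
	uint32_t *uaddr;
	uint32_t val;
	uint32_t bitset;
};

static uint32_t fa, fb;

int main(void)
{
	struct futex_wait_block fwb[2] = {
		{ &fa, fa, 0xffffffff },	/* sleep while *uaddr == val */
		{ &fb, fb, 0xffffffff },
	};
	struct timespec to;
	long ret;

	/* FUTEX_WAIT_MULTIPLE takes an absolute CLOCK_REALTIME timeout. */
	clock_gettime(CLOCK_REALTIME, &to);
	to.tv_sec += 1;

	/* The number of wait blocks goes in the syscall's 'val' argument. */
	ret = syscall(SYS_futex, fwb,
		      FUTEX_WAIT_MULTIPLE | FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME,
		      2, &to, NULL, 0);
	if (ret < 0)
		printf("futex_wait_multiple: %s\n", strerror(errno));
	else
		printf("woken up, hint: futex index %ld\n", ret);

	return 0;
}

Pairing this with a second thread that stores a new value to fa or fb and calls FUTEX_WAKE on it should
wake the waiter, as exercised by test_fwb_wakeup() in the selftest above.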