Skip to content
Snippets Groups Projects
Commit 19dbc1a9 authored by Rocky Automation's avatar Rocky Automation :tv:
Browse files

import glibc-2.34-125.el9_5.1

parent c3b44e5b
No related branches found
Tags imports/r9/glibc-2.34-125.el9_5.1
No related merge requests found
Showing
with 3213 additions and 1 deletion
609d4863fbcbcc09c71823c7b05065d719529c7500836bda2bc760b3f16d09f7
a80b272db684a3386983a445e07d1b36c3e6fc723eb93ecf89cae7d08970fd83
From 317f1c0a8a71a862b1e600ff5386b08e02cf4b95 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Thu, 26 Jan 2023 08:26:18 -0800
Subject: [PATCH] x86-64: Add glibc.cpu.prefer_map_32bit_exec [BZ #28656]
Content-type: text/plain; charset=UTF-8
Crossing 2GB boundaries with indirect calls and jumps can use more
branch prediction resources on Intel Golden Cove CPU (see the
"Misprediction for Branches >2GB" section in Intel 64 and IA-32
Architectures Optimization Reference Manual.) There is visible
performance improvement on workloads with many PLT calls when executable
and shared libraries are mmapped below 2GB. Add the Prefer_MAP_32BIT_EXEC
bit so that mmap will try to map executable or denywrite pages in shared
libraries with MAP_32BIT first.
NB: Prefer_MAP_32BIT_EXEC reduces bits available for address space
layout randomization (ASLR), which is always disabled for SUID programs
and can only be enabled by the tunable, glibc.cpu.prefer_map_32bit_exec,
or the environment variable, LD_PREFER_MAP_32BIT_EXEC. This works only
between shared libraries or between shared libraries and executables with
addresses below 2GB. PIEs are usually loaded at a random address above
4GB by the kernel.
Conflicts:
manual/tunables.texi
(line numbers)
sysdeps/unix/sysv/linux/x86_64/64/dl-tunables.list
(merged local @order list)
sysdeps/x86/cpu-features.c
(line numbers)
---
manual/tunables.texi | 33 ++++++++++----
sysdeps/unix/sysv/linux/x86_64/64/Makefile | 25 +++++++++++
.../sysv/linux/x86_64/64/dl-tunables.list | 29 +++++++++++++
.../unix/sysv/linux/x86_64/64/mmap_internal.h | 43 +++++++++++++++++++
.../sysv/linux/x86_64/64/tst-map-32bit-1a.c | 34 +++++++++++++++
.../sysv/linux/x86_64/64/tst-map-32bit-1b.c | 1 +
.../sysv/linux/x86_64/64/tst-map-32bit-mod.c | 33 ++++++++++++++
sysdeps/x86/cpu-features.c | 15 +++++++
...cpu-features-preferred_feature_index_1.def | 1 +
9 files changed, 205 insertions(+), 9 deletions(-)
create mode 100644 sysdeps/unix/sysv/linux/x86_64/64/dl-tunables.list
create mode 100644 sysdeps/unix/sysv/linux/x86_64/64/mmap_internal.h
create mode 100644 sysdeps/unix/sysv/linux/x86_64/64/tst-map-32bit-1a.c
create mode 100644 sysdeps/unix/sysv/linux/x86_64/64/tst-map-32bit-1b.c
create mode 100644 sysdeps/unix/sysv/linux/x86_64/64/tst-map-32bit-mod.c
diff --git a/manual/tunables.texi b/manual/tunables.texi
index 0be7231e36..c76c5c53cd 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -35,27 +35,32 @@ tunables with minimum and maximum values:
@example
$ /lib64/ld-linux-x86-64.so.2 --list-tunables
glibc.rtld.nns: 0x4 (min: 0x1, max: 0x10)
-glibc.elision.skip_lock_after_retries: 3 (min: -2147483648, max: 2147483647)
+glibc.elision.skip_lock_after_retries: 3 (min: 0, max: 2147483647)
glibc.malloc.trim_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
glibc.malloc.perturb: 0 (min: 0, max: 255)
glibc.cpu.x86_shared_cache_size: 0x100000 (min: 0x0, max: 0xffffffffffffffff)
+glibc.pthread.rseq: 1 (min: 0, max: 1)
+glibc.cpu.prefer_map_32bit_exec: 0 (min: 0, max: 1)
glibc.mem.tagging: 0 (min: 0, max: 255)
-glibc.elision.tries: 3 (min: -2147483648, max: 2147483647)
+glibc.elision.tries: 3 (min: 0, max: 2147483647)
glibc.elision.enable: 0 (min: 0, max: 1)
-glibc.cpu.x86_rep_movsb_threshold: 0x1000 (min: 0x100, max: 0xffffffffffffffff)
+glibc.malloc.hugetlb: 0x0 (min: 0x0, max: 0xffffffffffffffff)
+glibc.cpu.x86_rep_movsb_threshold: 0x2000 (min: 0x100, max: 0xffffffffffffffff)
glibc.malloc.mxfast: 0x0 (min: 0x0, max: 0xffffffffffffffff)
-glibc.elision.skip_lock_busy: 3 (min: -2147483648, max: 2147483647)
-glibc.malloc.top_pad: 0x0 (min: 0x0, max: 0xffffffffffffffff)
+glibc.rtld.dynamic_sort: 2 (min: 1, max: 2)
+glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647)
+glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
-glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0x0fffffffffffffff)
+glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
glibc.cpu.x86_shstk:
+glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
-glibc.malloc.mmap_max: 0 (min: -2147483648, max: 2147483647)
-glibc.elision.skip_trylock_internal_abort: 3 (min: -2147483648, max: 2147483647)
+glibc.malloc.mmap_max: 0 (min: 0, max: 2147483647)
+glibc.elision.skip_trylock_internal_abort: 3 (min: 0, max: 2147483647)
glibc.malloc.tcache_unsorted_limit: 0x0 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.x86_ibt:
glibc.cpu.hwcaps:
-glibc.elision.skip_lock_internal_abort: 3 (min: -2147483648, max: 2147483647)
+glibc.elision.skip_lock_internal_abort: 3 (min: 0, max: 2147483647)
glibc.malloc.arena_max: 0x0 (min: 0x1, max: 0xffffffffffffffff)
glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.x86_data_cache_size: 0x8000 (min: 0x0, max: 0xffffffffffffffff)
@@ -569,6 +574,16 @@ instead.
This tunable is specific to i386 and x86-64.
@end deftp
+@deftp Tunable glibc.cpu.prefer_map_32bit_exec
+When this tunable is set to \code{1}, shared libraries of non-setuid
+programs will be loaded below 2GB with MAP_32BIT.
+
+Note that the @env{LD_PREFER_MAP_32BIT_EXEC} environment is an alias of
+this tunable.
+
+This tunable is specific to 64-bit x86-64.
+@end deftp
+
@node Memory Related Tunables
@section Memory Related Tunables
@cindex memory related tunables
diff --git a/sysdeps/unix/sysv/linux/x86_64/64/Makefile b/sysdeps/unix/sysv/linux/x86_64/64/Makefile
index a7b6dc5a53..8ff4f27786 100644
--- a/sysdeps/unix/sysv/linux/x86_64/64/Makefile
+++ b/sysdeps/unix/sysv/linux/x86_64/64/Makefile
@@ -1,2 +1,27 @@
# The default ABI is 64.
default-abi := 64
+
+ifeq ($(subdir),elf)
+ifneq ($(have-tunables),no)
+
+tests-map-32bit = \
+ tst-map-32bit-1a \
+ tst-map-32bit-1b \
+# tests-map-32bit
+tst-map-32bit-1a-no-pie = yes
+tst-map-32bit-1b-no-pie = yes
+tests += $(tests-map-32bit)
+
+modules-map-32bit = \
+ tst-map-32bit-mod \
+# modules-map-32bit
+modules-names += $(modules-map-32bit)
+
+$(objpfx)tst-map-32bit-mod.so: $(libsupport)
+tst-map-32bit-1a-ENV = LD_PREFER_MAP_32BIT_EXEC=1
+$(objpfx)tst-map-32bit-1a: $(objpfx)tst-map-32bit-mod.so
+tst-map-32bit-1b-ENV = GLIBC_TUNABLES=glibc.cpu.prefer_map_32bit_exec=1
+$(objpfx)tst-map-32bit-1b: $(objpfx)tst-map-32bit-mod.so
+
+endif
+endif
diff -rup a/sysdeps/unix/sysv/linux/x86_64/64/dl-tunables.list b/sysdeps/unix/sysv/linux/x86_64/64/dl-tunables.list
--- a/sysdeps/unix/sysv/linux/x86_64/64/dl-tunables.list 2024-03-06 17:52:50.968514369 -0500
+++ b/sysdeps/unix/sysv/linux/x86_64/64/dl-tunables.list 2024-03-06 17:55:48.778264896 -0500
@@ -1,3 +1,33 @@
+# x86-64 specific tunables.
+# Copyright (C) 2023 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+glibc {
+ cpu {
+ prefer_map_32bit_exec {
+ type: INT_32
+ minval: 0
+ maxval: 1
+ env_alias: LD_PREFER_MAP_32BIT_EXEC
+ security_level: SXID_IGNORE
+ }
+ }
+}
+
# Order of tunables in RHEL 9.1.z.
@order glibc.rtld.nns
@order glibc.elision.skip_lock_after_retries
@@ -35,3 +65,5 @@
@order glibc.malloc.check
@order glibc.gmon.minarcs
@order glibc.gmon.maxarcs
+# Order of tunables in RHEL 9.5.z
+@order glibc.cpu.prefer_map_32bit_exec
diff --git a/sysdeps/unix/sysv/linux/x86_64/64/mmap_internal.h b/sysdeps/unix/sysv/linux/x86_64/64/mmap_internal.h
new file mode 100644
index 0000000000..33dec3f805
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/x86_64/64/mmap_internal.h
@@ -0,0 +1,43 @@
+/* Linux mmap system call. x86-64 version.
+ Copyright (C) 2015-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public License as
+ published by the Free Software Foundation; either version 2.1 of the
+ License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef MMAP_X86_64_INTERNAL_H
+#define MMAP_X86_64_INTERNAL_H
+
+#include <ldsodefs.h>
+
+/* If the Prefer_MAP_32BIT_EXEC bit is set, try to map executable or
+ denywrite pages with MAP_32BIT first. */
+#define MMAP_PREPARE(addr, len, prot, flags, fd, offset) \
+ if ((addr) == NULL \
+ && (((prot) & PROT_EXEC) != 0 \
+ || ((flags) & MAP_DENYWRITE) != 0) \
+ && HAS_ARCH_FEATURE (Prefer_MAP_32BIT_EXEC)) \
+ { \
+ void *ret = (void*) INLINE_SYSCALL_CALL (mmap, (addr), (len), \
+ (prot), \
+ (flags) | MAP_32BIT, \
+ (fd), (offset)); \
+ if (ret != MAP_FAILED) \
+ return ret; \
+ }
+
+#include_next <mmap_internal.h>
+
+#endif
diff --git a/sysdeps/unix/sysv/linux/x86_64/64/tst-map-32bit-1a.c b/sysdeps/unix/sysv/linux/x86_64/64/tst-map-32bit-1a.c
new file mode 100644
index 0000000000..abc396589e
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/x86_64/64/tst-map-32bit-1a.c
@@ -0,0 +1,34 @@
+/* Check that LD_PREFER_MAP_32BIT_EXEC works in PDE and shared library.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <support/check.h>
+
+extern void dso_check_map_32bit (void);
+
+static int
+do_test (void)
+{
+ printf ("do_test: %p\n", do_test);
+ TEST_VERIFY ((uintptr_t) do_test < 0xffffffffUL);
+ dso_check_map_32bit ();
+ return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/unix/sysv/linux/x86_64/64/tst-map-32bit-1b.c b/sysdeps/unix/sysv/linux/x86_64/64/tst-map-32bit-1b.c
new file mode 100644
index 0000000000..34ab01c773
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/x86_64/64/tst-map-32bit-1b.c
@@ -0,0 +1 @@
+#include "tst-map-32bit-1a.c"
diff --git a/sysdeps/unix/sysv/linux/x86_64/64/tst-map-32bit-mod.c b/sysdeps/unix/sysv/linux/x86_64/64/tst-map-32bit-mod.c
new file mode 100644
index 0000000000..78d4b6133c
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/x86_64/64/tst-map-32bit-mod.c
@@ -0,0 +1,33 @@
+/* Check that LD_PREFER_MAP_32BIT_EXEC works in shared library.
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <support/check.h>
+
+static void
+dso_do_test (void)
+{
+}
+
+void
+dso_check_map_32bit (void)
+{
+ printf ("dso_do_test: %p\n", dso_do_test);
+ TEST_VERIFY ((uintptr_t) dso_do_test < 0xffffffffUL);
+}
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index a2197ed211..822688e21f 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -27,6 +27,16 @@
extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *)
attribute_hidden;
+# ifdef __LP64__
+static void
+TUNABLE_CALLBACK (set_prefer_map_32bit_exec) (tunable_val_t *valp)
+{
+ if (valp->numval)
+ GLRO(dl_x86_cpu_features).preferred[index_arch_Prefer_MAP_32BIT_EXEC]
+ |= bit_arch_Prefer_MAP_32BIT_EXEC;
+}
+# endif
+
# if CET_ENABLED
extern void TUNABLE_CALLBACK (set_x86_ibt) (tunable_val_t *)
attribute_hidden;
@@ -949,6 +959,11 @@ no_cpuid:
#if HAVE_TUNABLES
TUNABLE_GET (hwcaps, tunable_val_t *, TUNABLE_CALLBACK (set_hwcaps));
+# ifdef __LP64__
+ TUNABLE_GET (prefer_map_32bit_exec, tunable_val_t *,
+ TUNABLE_CALLBACK (set_prefer_map_32bit_exec));
+# endif
+
bool disable_xsave_features = false;
if (!CPU_FEATURE_USABLE_P (cpu_features, OSXSAVE))
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index e45f9cb159..d20c5b3196 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -26,6 +26,7 @@ BIT (I586)
BIT (I686)
BIT (Slow_SSE4_2)
BIT (AVX_Fast_Unaligned_Load)
+BIT (Prefer_MAP_32BIT_EXEC)
BIT (Prefer_No_VZEROUPPER)
BIT (Prefer_ERMS)
BIT (Prefer_No_AVX512)
--
2.39.3
From 188ecdb7774145050a6e167a277f45f03dac5fe8 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Wed, 22 Feb 2023 20:04:26 -0800
Subject: [PATCH] tunables.texi: Change \code{1} to @code{1}
Content-type: text/plain; charset=UTF-8
Update
317f1c0a8a x86-64: Add glibc.cpu.prefer_map_32bit_exec [BZ #28656]
---
manual/tunables.texi | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/manual/tunables.texi b/manual/tunables.texi
index c76c5c53cd..70dd2264c5 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -589,7 +589,7 @@ This tunable is specific to i386 and x86-64.
@end deftp
@deftp Tunable glibc.cpu.prefer_map_32bit_exec
-When this tunable is set to \code{1}, shared libraries of non-setuid
+When this tunable is set to @code{1}, shared libraries of non-setuid
programs will be loaded below 2GB with MAP_32BIT.
Note that the @env{LD_PREFER_MAP_32BIT_EXEC} environment is an alias of
--
2.39.3
commit 85860ad6eaf4c9739318f6b2a1ff7c2fa6b12ab5
Author: Florian Weimer <fweimer@redhat.com>
Date: Mon Aug 15 16:45:40 2022 +0200
malloc: Do not use MAP_NORESERVE to allocate heap segments
Address space for heap segments is reserved in a mmap call with
MAP_ANONYMOUS | MAP_PRIVATE and protection flags PROT_NONE. This
reservation does not count against the RSS limit of the process or
system. Backing memory is allocated using mprotect in alloc_new_heap
and grow_heap, and at this point, the allocator expects the kernel
to provide memory (subject to memory overcommit).
The SIGSEGV that might generate due to MAP_NORESERVE (according to
the mmap manual page) does not seem to occur in practice, it's always
SIGKILL from the OOM killer. Even if there is a way that SIGSEGV
could be generated, it is confusing to applications that this only
happens for secondary heaps, not for large mmap-based allocations,
and not for the main arena.
Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
Conflicts:
malloc/arena.c
(huge page support was added upstream)
diff --git a/malloc/arena.c b/malloc/arena.c
index 667484630ed0afa5..2852783355d3d869 100644
--- a/malloc/arena.c
+++ b/malloc/arena.c
@@ -466,8 +466,7 @@ new_heap (size_t size, size_t top_pad)
p2 = MAP_FAILED;
if (aligned_heap_area)
{
- p2 = (char *) MMAP (aligned_heap_area, HEAP_MAX_SIZE, PROT_NONE,
- MAP_NORESERVE);
+ p2 = (char *) MMAP (aligned_heap_area, HEAP_MAX_SIZE, PROT_NONE, 0);
aligned_heap_area = NULL;
if (p2 != MAP_FAILED && ((unsigned long) p2 & (HEAP_MAX_SIZE - 1)))
{
@@ -477,7 +476,7 @@ new_heap (size_t size, size_t top_pad)
}
if (p2 == MAP_FAILED)
{
- p1 = (char *) MMAP (0, HEAP_MAX_SIZE << 1, PROT_NONE, MAP_NORESERVE);
+ p1 = (char *) MMAP (0, HEAP_MAX_SIZE << 1, PROT_NONE, 0);
if (p1 != MAP_FAILED)
{
p2 = (char *) (((unsigned long) p1 + (HEAP_MAX_SIZE - 1))
@@ -493,7 +492,7 @@ new_heap (size_t size, size_t top_pad)
{
/* Try to take the chance that an allocation of only HEAP_MAX_SIZE
is already aligned. */
- p2 = (char *) MMAP (0, HEAP_MAX_SIZE, PROT_NONE, MAP_NORESERVE);
+ p2 = (char *) MMAP (0, HEAP_MAX_SIZE, PROT_NONE, 0);
if (p2 == MAP_FAILED)
return 0;
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 375f50f5db13e234..fe80b8239756a7c9 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -1112,10 +1112,6 @@ static mchunkptr mremap_chunk(mchunkptr p, size_t new_size);
# define MAP_ANONYMOUS MAP_ANON
#endif
-#ifndef MAP_NORESERVE
-# define MAP_NORESERVE 0
-#endif
-
#define MMAP(addr, size, prot, flags) \
__mmap((addr), (size), (prot), (flags)|MAP_ANONYMOUS|MAP_PRIVATE, -1, 0)
commit f21962ddfc8bb23e92597da1f98e313dbde11cc1
Author: Florian Weimer <fweimer@redhat.com>
Date: Fri Aug 25 14:15:28 2023 +0200
manual: Document ld.so --list-diagnostics output
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
diff --git a/manual/dynlink.texi b/manual/dynlink.texi
index 45bf5a5b55..df41c56bfc 100644
--- a/manual/dynlink.texi
+++ b/manual/dynlink.texi
@@ -13,9 +13,216 @@ as plugins) later at run time.
Dynamic linkers are sometimes called @dfn{dynamic loaders}.
@menu
+* Dynamic Linker Invocation:: Explicit invocation of the dynamic linker.
* Dynamic Linker Introspection:: Interfaces for querying mapping information.
@end menu
+@node Dynamic Linker Invocation
+
+@cindex program interpreter
+When a dynamically linked program starts, the operating system
+automatically loads the dynamic linker along with the program.
+@Theglibc{} also supports invoking the dynamic linker explicitly to
+launch a program. This command uses the implied dynamic linker
+(also sometimes called the @dfn{program interpreter}):
+
+@smallexample
+sh -c 'echo "Hello, world!"'
+@end smallexample
+
+This command specifies the dynamic linker explicitly:
+
+@smallexample
+ld.so /bin/sh -c 'echo "Hello, world!"'
+@end smallexample
+
+Note that @command{ld.so} does not search the @env{PATH} environment
+variable, so the full file name of the executable needs to be specified.
+
+The @command{ld.so} program supports various options. Options start
+@samp{--} and need to come before the program that is being launched.
+Some of the supported options are listed below.
+
+@table @code
+@item --list-diagnostics
+Print system diagnostic information in a machine-readable format.
+@xref{Dynamic Linker Diagnostics}.
+@end table
+
+@menu
+* Dynamic Linker Diagnostics:: Obtaining system diagnostic information.
+@end menu
+
+@node Dynamic Linker Diagnostics
+@section Dynamic Linker Diagnostics
+@cindex diagnostics (dynamic linker)
+
+The @samp{ld.so --list-diagnostics} produces machine-readable
+diagnostics output. This output contains system data that affects the
+behavior of @theglibc{}, and potentially application behavior as well.
+
+The exact set of diagnostic items can change between releases of
+@theglibc{}. The output format itself is not expected to change
+radically.
+
+The following table shows some example lines that can be written by the
+diagnostics command.
+
+@table @code
+@item dl_pagesize=0x1000
+The system page size is 4096 bytes.
+
+@item env[0x14]="LANG=en_US.UTF-8"
+This item indicates that the 21st environment variable at process
+startup contains a setting for @code{LANG}.
+
+@item env_filtered[0x22]="DISPLAY"
+The 35th environment variable is @code{DISPLAY}. Its value is not
+included in the output for privacy reasons because it is not recognized
+as harmless by the diagnostics code.
+
+@item path.prefix="/usr"
+This means that @theglibc{} was configured with @code{--prefix=/usr}.
+
+@item path.system_dirs[0x0]="/lib64/"
+@itemx path.system_dirs[0x1]="/usr/lib64/"
+The built-in dynamic linker search path contains two directories,
+@code{/lib64} and @code{/usr/lib64}.
+@end table
+
+@subsection Dynamic Linker Diagnostics Output Format
+
+As seen above, diagnostic lines assign values (integers or strings) to a
+sequence of labeled subscripts, separated by @samp{.}. Some subscripts
+have integer indices associated with them. The subscript indices are
+not necessarily contiguous or small, so an associative array should be
+used to store them. Currently, all integers fit into the 64-bit
+unsigned integer range. Every access path to a value has a fixed type
+(string or integer) independent of subscript index values. Likewise,
+whether a subscript is indexed does not depend on previous indices (but
+may depend on previous subscript labels).
+
+A syntax description in ABNF (RFC 5234) follows. Note that
+@code{%x30-39} denotes the range of decimal digits. Diagnostic output
+lines are expected to match the @code{line} production.
+
+@c ABNF-START
+@smallexample
+HEXDIG = %x30-39 / %x61-6f ; lowercase a-f only
+ALPHA = %x41-5a / %x61-7a / %x7f ; letters and underscore
+ALPHA-NUMERIC = ALPHA / %x30-39 / "_"
+DQUOTE = %x22 ; "
+
+; Numbers are always hexadecimal and use a 0x prefix.
+hex-value-prefix = %x30 %x78
+hex-value = hex-value-prefix 1*HEXDIG
+
+; Strings use octal escape sequences and \\, \".
+string-char = %x20-21 / %x23-5c / %x5d-7e ; printable but not "\
+string-quoted-octal = %x30-33 2*2%x30-37
+string-quoted = "\" ("\" / DQUOTE / string-quoted-octal)
+string-value = DQUOTE *(string-char / string-quoted) DQUOTE
+
+value = hex-value / string-value
+
+label = ALPHA *ALPHA-NUMERIC
+index = "[" hex-value "]"
+subscript = label [index]
+
+line = subscript *("." subscript) "=" value
+@end smallexample
+
+@subsection Dynamic Linker Diagnostics Values
+
+As mentioned above, the set of diagnostics may change between
+@theglibc{} releases. Nevertheless, the following table documents a few
+common diagnostic items. All numbers are in hexadecimal, with a
+@samp{0x} prefix.
+
+@table @code
+@item dl_dst_lib=@var{string}
+The @code{$LIB} dynamic string token expands to @var{string}.
+
+@cindex HWCAP (diagnostics)
+@item dl_hwcap=@var{integer}
+@itemx dl_hwcap2=@var{integer}
+The HWCAP and HWCAP2 values, as returned for @code{getauxval}, and as
+used in other places depending on the architecture.
+
+@cindex page size (diagnostics)
+@item dl_pagesize=@var{integer}
+The system page size is @var{integer} bytes.
+
+@item dl_platform=@var{string}
+The @code{$PLATFORM} dynamic string token expands to @var{string}.
+
+@item dso.libc=@var{string}
+This is the soname of the shared @code{libc} object that is part of
+@theglibc{}. On most architectures, this is @code{libc.so.6}.
+
+@item env[@var{index}]=@var{string}
+@itemx env_filtered[@var{index}]=@var{string}
+An environment variable from the process environment. The integer
+@var{index} is the array index in the environment array. Variables
+under @code{env} include the variable value after the @samp{=} (assuming
+that it was present), variables under @code{env_filtered} do not.
+
+@item path.prefix=@var{string}
+This indicates that @theglibc{} was configured using
+@samp{--prefix=@var{string}}.
+
+@item path.sysconfdir=@var{string}
+@Theglibc{} was configured (perhaps implicitly) with
+@samp{--sysconfdir=@var{string}} (typically @code{/etc}).
+
+@item path.system_dirs[@var{index}]=@var{string}
+These items list the elements of the built-in array that describes the
+default library search path. The value @var{string} is a directory file
+name with a trailing @samp{/}.
+
+@item path.rtld=@var{string}
+This string indicates the application binary interface (ABI) file name
+of the run-time dynamic linker.
+
+@item version.release="stable"
+@itemx version.release="development"
+The value @code{"stable"} indicates that this build of @theglibc{} is
+from a release branch. Releases labeled as @code{"development"} are
+unreleased development versions.
+
+@cindex version (diagnostics)
+@item version.version="@var{major}.@var{minor}"
+@itemx version.version="@var{major}.@var{minor}.9000"
+@Theglibc{} version. Development releases end in @samp{.9000}.
+
+@cindex auxiliary vector (diagnostics)
+@item auxv[@var{index}].a_type=@var{type}
+@itemx auxv[@var{index}].a_val=@var{integer}
+@itemx auxv[@var{index}].a_val_string=@var{string}
+An entry in the auxiliary vector (specific to Linux). The values
+@var{type} (an integer) and @var{integer} correspond to the members of
+@code{struct auxv}. If the value is a string, @code{a_val_string} is
+used instead of @code{a_val}, so that values have consistent types.
+
+The @code{AT_HWCAP} and @code{AT_HWCAP2} values in this output do not
+reflect adjustment by @theglibc{}.
+
+@item uname.sysname=@var{string}
+@itemx uname.nodename=@var{string}
+@itemx uname.release=@var{string}
+@itemx uname.version=@var{string}
+@itemx uname.machine=@var{string}
+@itemx uname.domain=@var{string}
+These Linux-specific items show the values of @code{struct utsname}, as
+reported by the @code{uname} function. @xref{Platform Type}.
+
+@cindex CPUID (diagnostics)
+@item x86.cpu_features.@dots{}
+These items are specific to the i386 and x86-64 architectures. They
+reflect supported CPU features and information on cache geometry, mostly
+collected using the @code{CPUID} instruction.
+@end table
+
@node Dynamic Linker Introspection
@section Dynamic Linker Introspection
commit d99609a3eb8bc96c3af841fd35294a679e0fea7f
Author: Florian Weimer <fweimer@redhat.com>
Date: Wed Sep 6 18:37:21 2023 +0200
manual: Fix ld.so diagnostics menu/section structure
And shorten the section/node names a bit, so that the menu
entries become easier to read.
Texinfo 6.5 fails to process the previous structure:
./dynlink.texi:56: warning: node `Dynamic Linker Introspection' is
next for `Dynamic Linker Diagnostics' in sectioning but not in menu
./dynlink.texi:56: warning: node up `Dynamic Linker Diagnostics'
in menu `Dynamic Linker Invocation' and
in sectioning `Dynamic Linker' differ
./dynlink.texi:1: node `Dynamic Linker' lacks menu item for
`Dynamic Linker Diagnostics' despite being its Up target
./dynlink.texi:226: warning: node prev `Dynamic Linker Introspection' in menu `Dynamic Linker Invocation'
and in sectioning `Dynamic Linker Diagnostics' differ
Texinfo 7.0.2 does not report an error.
This fixes commit f21962ddfc8bb23e92597da1f98e313dbde11cc1
("manual: Document ld.so --list-diagnostics output").
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
diff --git a/manual/dynlink.texi b/manual/dynlink.texi
index df41c56bfc..06a6c15533 100644
--- a/manual/dynlink.texi
+++ b/manual/dynlink.texi
@@ -18,6 +18,7 @@ Dynamic linkers are sometimes called @dfn{dynamic loaders}.
@end menu
@node Dynamic Linker Invocation
+@section Dynamic Linker Invocation
@cindex program interpreter
When a dynamically linked program starts, the operating system
@@ -54,7 +55,7 @@ Print system diagnostic information in a machine-readable format.
@end menu
@node Dynamic Linker Diagnostics
-@section Dynamic Linker Diagnostics
+@subsection Dynamic Linker Diagnostics
@cindex diagnostics (dynamic linker)
The @samp{ld.so --list-diagnostics} produces machine-readable
@@ -90,7 +91,13 @@ The built-in dynamic linker search path contains two directories,
@code{/lib64} and @code{/usr/lib64}.
@end table
-@subsection Dynamic Linker Diagnostics Output Format
+@menu
+* Dynamic Linker Diagnostics Format:: Format of ld.so output.
+* Dynamic Linker Diagnostics Values:: Data contain in ld.so output.
+@end menu
+
+@node Dynamic Linker Diagnostics Format
+@subsubsection Dynamic Linker Diagnostics Format
As seen above, diagnostic lines assign values (integers or strings) to a
sequence of labeled subscripts, separated by @samp{.}. Some subscripts
@@ -132,7 +139,8 @@ subscript = label [index]
line = subscript *("." subscript) "=" value
@end smallexample
-@subsection Dynamic Linker Diagnostics Values
+@node Dynamic Linker Diagnostics Values
+@subsubsection Dynamic Linker Diagnostics Values
As mentioned above, the set of diagnostics may change between
@theglibc{} releases. Nevertheless, the following table documents a few
commit f8d8b1b1e6d3b8b93f224efc796b7ea083fdb83f
Author: Florian Weimer <fweimer@redhat.com>
Date: Mon Apr 8 16:48:55 2024 +0200
aarch64: Enhanced CPU diagnostics for ld.so
This prints some information from struct cpu_features, and the midr_el1
and dczid_el0 system register contents on every CPU.
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
Modified for RHEL by: Patsy Griffin <patsy@redhat.com>
Diagnostics for the cpu_features mops and prefer_sve_ifuncs are not
currently supported on aarch64.
diff -Nrup a/manual/dynlink.texi b/manual/dynlink.texi
--- a/manual/dynlink.texi 2024-05-31 20:55:08.238959456 -0400
+++ b/manual/dynlink.texi 2024-05-31 20:55:41.298121623 -0400
@@ -224,6 +224,40 @@ reflect adjustment by @theglibc{}.
These Linux-specific items show the values of @code{struct utsname}, as
reported by the @code{uname} function. @xref{Platform Type}.
+@item aarch64.cpu_features.@dots{}
+These items are specific to the AArch64 architectures. They report data
+@theglibc{} uses to activate conditionally supported features such as
+BTI and MTE, and to select alternative function implementations.
+
+@item aarch64.processor[@var{index}].@dots{}
+These are additional items for the AArch64 architecture and are
+described below.
+
+@item aarch64.processor[@var{index}].requested=@var{kernel-cpu}
+The kernel is told to run the subsequent probing on the CPU numbered
+@var{kernel-cpu}. The values @var{kernel-cpu} and @var{index} can be
+distinct if there are gaps in the process CPU affinity mask. This line
+is not included if CPU affinity mask information is not available.
+
+@item aarch64.processor[@var{index}].observed=@var{kernel-cpu}
+This line reports the kernel CPU number @var{kernel-cpu} on which the
+probing code initially ran. If the CPU number cannot be obtained,
+this line is not printed.
+
+@item aarch64.processor[@var{index}].observed_node=@var{node}
+This reports the observed NUMA node number, as reported by the
+@code{getcpu} system call. If this information cannot be obtained, this
+line is not printed.
+
+@item aarch64.processor[@var{index}].midr_el1=@var{value}
+The value of the @code{midr_el1} system register on the processor
+@var{index}. This line is only printed if the kernel indicates that
+this system register is supported.
+
+@item aarch64.processor[@var{index}].dczid_el0=@var{value}
+The value of the @code{dczid_el0} system register on the processor
+@var{index}.
+
@cindex CPUID (diagnostics)
@item x86.cpu_features.@dots{}
These items are specific to the i386 and x86-64 architectures. They
diff -Nrup a/sysdeps/aarch64/dl-diagnostics-cpu.c b/sysdeps/aarch64/dl-diagnostics-cpu.c
--- a/sysdeps/aarch64/dl-diagnostics-cpu.c 1969-12-31 19:00:00.000000000 -0500
+++ b/sysdeps/aarch64/dl-diagnostics-cpu.c 2024-05-31 20:57:23.536623129 -0400
@@ -0,0 +1,81 @@
+/* Print CPU diagnostics data in ld.so. AArch64 version.
+ Copyright (C) 2021-2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <dl-diagnostics.h>
+
+#include <cpu-features.h>
+#include <dl-iterate_cpu.h>
+#include <ldsodefs.h>
+#include <sys/auxv.h>
+
+static void
+print_cpu_features_value (const char *label, uint64_t value)
+{
+ _dl_printf ("aarch64.cpu_features.");
+ _dl_diagnostics_print_labeled_value (label, value);
+}
+
+static void
+print_per_cpu_value (const struct dl_iterate_cpu *dic,
+ const char *label, uint64_t value)
+{
+ _dl_printf ("aarch64.processor[0x%x].", dic->processor_index);
+ _dl_diagnostics_print_labeled_value (label, value);
+}
+
+void
+_dl_diagnostics_cpu (void)
+{
+ print_cpu_features_value ("bti", GLRO (dl_aarch64_cpu_features).bti);
+ print_cpu_features_value ("midr_el1",
+ GLRO (dl_aarch64_cpu_features).midr_el1);
+ print_cpu_features_value ("mte_state",
+ GLRO (dl_aarch64_cpu_features).mte_state);
+ print_cpu_features_value ("sve", GLRO (dl_aarch64_cpu_features).sve);
+ print_cpu_features_value ("zva_size",
+ GLRO (dl_aarch64_cpu_features).zva_size);
+
+ struct dl_iterate_cpu dic;
+ _dl_iterate_cpu_init (&dic);
+
+ while (_dl_iterate_cpu_next (&dic))
+ {
+ if (dic.requested_cpu >= 0)
+ _dl_printf ("aarch64.processor[0x%x].requested=0x%x\n",
+ dic.processor_index, dic.requested_cpu);
+ if (dic.actual_cpu >= 0)
+ _dl_printf ("aarch64.processor[0x%x].observed=0x%x\n",
+ dic.processor_index, dic.actual_cpu);
+ if (dic.actual_node >= 0)
+ _dl_printf ("aarch64.processor[0x%x].observed_node=0x%x\n",
+ dic.processor_index, dic.actual_node);
+
+ if (GLRO (dl_hwcap) & HWCAP_CPUID)
+ {
+ uint64_t midr_el1;
+ asm ("mrs %0, midr_el1" : "=r" (midr_el1));
+ print_per_cpu_value (&dic, "midr_el1", midr_el1);
+ }
+
+ {
+ uint64_t dczid_el0;
+ asm ("mrs %0, dczid_el0" : "=r" (dczid_el0));
+ print_per_cpu_value (&dic, "dczid_el0", dczid_el0);
+ }
+ }
+}
This diff is collapsed.
commit 5653ccd847f0cd3a98906e44c97c71d68652d326
Author: Florian Weimer <fweimer@redhat.com>
Date: Mon Apr 8 16:48:55 2024 +0200
elf: Add CPU iteration support for future use in ld.so diagnostics
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
diff --git a/elf/dl-iterate_cpu.h b/elf/dl-iterate_cpu.h
new file mode 100644
index 0000000000..60db167b13
--- /dev/null
+++ b/elf/dl-iterate_cpu.h
@@ -0,0 +1,136 @@
+/* Iterate over all CPUs, for CPU-specific diagnostics.
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef DL_ITERATE_CPU_H
+#define DL_ITERATE_CPU_H
+
+#include <dl-affinity.h>
+#include <stdbool.h>
+
+struct dl_iterate_cpu
+{
+ /* Sequential iteration count, starting at 0. */
+ unsigned int processor_index;
+
+ /* Requested CPU. Can be -1 if affinity could not be set. */
+ int requested_cpu;
+
+ /* Observed current CPU. -1 if unavailable. */
+ int actual_cpu;
+
+ /* Observed node ID for the CPU. -1 if unavailable. */
+ int actual_node;
+
+ /* Internal fields to implement the iteration. */
+
+ /* Affinity as obtained by _dl_iterate_cpu_init, using
+ _dl_getaffinity. Space for 8,192 CPUs. */
+ unsigned long int mask_reference[8192 / sizeof (unsigned long int) / 8];
+
+ /* This array is used by _dl_setaffinity calls. */
+ unsigned long int mask_request[8192 / sizeof (unsigned long int) / 8];
+
+ /* Return value from the initial _dl_getaffinity call. */
+ int length_reference;
+};
+
+static void
+_dl_iterate_cpu_init (struct dl_iterate_cpu *dic)
+{
+ dic->length_reference
+ = _dl_getaffinity (dic->mask_reference, sizeof (dic->mask_reference));
+ /* Prepare for the first _dl_iterate_cpu_next call. */
+ dic->processor_index = -1;
+ dic->requested_cpu = -1;
+}
+
+static bool
+_dl_iterate_cpu_next (struct dl_iterate_cpu *dic)
+{
+ ++dic->processor_index;
+
+ if (dic->length_reference > 0)
+ {
+ /* Search for the next CPU to switch to. */
+ while (true)
+ {
+ ++dic->requested_cpu;
+
+ /* Array index and bit number within the array. */
+ unsigned int long_index
+ = dic->requested_cpu / sizeof (unsigned long int) / 8;
+ unsigned int bit_index
+ = dic->requested_cpu % (sizeof (unsigned long int) * 8);
+
+ if (long_index * sizeof (unsigned long int) >= dic->length_reference)
+ /* All possible CPUs have been covered. */
+ return false;
+
+ unsigned long int bit = 1UL << bit_index;
+ if (dic->mask_reference[long_index] & bit)
+ {
+ /* The CPU is available. Try to select it. */
+ dic->mask_request[long_index] = bit;
+ if (_dl_setaffinity (dic->mask_request,
+ (long_index + 1)
+ * sizeof (unsigned long int)) < 0)
+ {
+ /* Record that we could not perform a CPU request. */
+ dic->length_reference = -1;
+
+ if (dic->processor_index > 0)
+ /* We already reported something. There is no need to
+ continue because the new data is probably not useful. */
+ return false;
+ }
+
+ /* Clear the bit in case the next iteration switches to the
+ next long value. */
+ dic->mask_request[long_index] = 0;
+
+ /* We found a CPU to run on. */
+ break;
+ }
+ }
+ }
+ else
+ {
+ /* No way to set CPU affinity. Iterate just once. */
+ if (dic->processor_index > 0)
+ return false;
+ }
+
+ /* Fill in the actual CPU information. CPU pinning may not actually
+ be effective, depending on the container host. */
+ unsigned int cpu, node;
+ if (_dl_getcpu (&cpu, &node) < 0)
+ {
+ /* No CPU information available. */
+ dic->actual_cpu = -1;
+ dic->actual_node = -1;
+ }
+ else
+ {
+ dic->actual_cpu = cpu;
+ dic->actual_node = node;
+ }
+
+ return true;
+}
+
+#endif /* DL_ITERATE_CPU_H */
diff --git a/sysdeps/generic/dl-affinity.h b/sysdeps/generic/dl-affinity.h
new file mode 100644
index 0000000000..d117f737e9
--- /dev/null
+++ b/sysdeps/generic/dl-affinity.h
@@ -0,0 +1,54 @@
+/* CPU affinity handling for the dynamic linker. Stub version.
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef DL_AFFINITY_H
+#define DL_AFFINITY_H
+
+#include <errno.h>
+#include <stddef.h>
+
+/* On success, write the current CPU ID to *CPU, and the current node
+ ID to *NODE, and return 0. Return a negative error code on
+ failure. */
+static inline int
+_dl_getcpu (unsigned int *cpu, unsigned int *node)
+{
+ return -ENOSYS;
+}
+
+/* On success, write CPU ID affinity bits for the current thread to
+ *BITS, which must be SIZE bytes long, and return the number of
+ bytes updated, a multiple of sizeof (unsigned long int). On
+ failure, return a negative error code. */
+static int
+_dl_getaffinity (unsigned long int *bits, size_t size)
+{
+ return -ENOSYS;
+}
+
+/* Set the CPU affinity mask for the current thread to *BITS, using
+ the SIZE bytes from that array, which should be a multiple of
+ sizeof (unsigned long int). Return 0 on success, and a negative
+ error code on failure. */
+static int
+_dl_setaffinity (const unsigned long int *bits, size_t size)
+{
+ return -ENOSYS;
+}
+
+#endif /* DL_AFFINITY_H */
diff --git a/sysdeps/unix/sysv/linux/dl-affinity.h b/sysdeps/unix/sysv/linux/dl-affinity.h
new file mode 100644
index 0000000000..bbfede7750
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/dl-affinity.h
@@ -0,0 +1,46 @@
+/* CPU affinity handling for the dynamic linker. Linux version.
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* See sysdeps/generic/dl-affinity.h for documentation of these interfaces. */
+
+#ifndef DL_AFFINITY_H
+#define DL_AFFINITY_H
+
+#include <sysdep.h>
+#include <stddef.h>
+#include <unistd.h>
+
+static inline int
+_dl_getcpu (unsigned int *cpu, unsigned int *node)
+{
+ return INTERNAL_SYSCALL_CALL (getcpu, cpu, node);
+}
+
+static int
+_dl_getaffinity (unsigned long int *bits, size_t size)
+{
+ return INTERNAL_SYSCALL_CALL (sched_getaffinity, /* TID */ 0, size, bits);
+}
+
+static int
+_dl_setaffinity (const unsigned long int *bits, size_t size)
+{
+ return INTERNAL_SYSCALL_CALL (sched_setaffinity, /* TID */ 0, size, bits);
+}
+
+#endif /* DL_AFFINITY_H */
commit 6a04404521ac4119ae36827eeb288ea84eee7cf6
Author: Florian Weimer <fweimer@redhat.com>
Date: Sat Feb 17 09:17:04 2024 +0100
Linux: Switch back to assembly syscall wrapper for prctl (bug 29770)
Commit ff026950e280bc3e9487b41b460fb31bc5b57721 ("Add a C wrapper for
prctl [BZ #25896]") replaced the assembler wrapper with a C function.
However, on powerpc64le-linux-gnu, the C variadic function
implementation requires extra work in the caller to set up the
parameter save area. Calling a function that needs a parameter save
area without one (because the prototype used indicates the function is
not variadic) corrupts the caller's stack. The Linux manual pages
project documents prctl as a non-variadic function. This has resulted
in various projects over the years using non-variadic prototypes,
including the sanitizer libraries in LLVm and GCC (GCC PR 113728).
This commit switches back to the assembler implementation on most
targets and only keeps the C implementation for x86-64 x32.
Also add the __prctl_time64 alias from commit
b39ffab860cd743a82c91946619f1b8158b0b65e ("Linux: Add time64 alias for
prctl") to sysdeps/unix/sysv/linux/syscalls.list; it was not yet
present in commit ff026950e280bc3e9487b41b460fb31bc5b57721.
This restores the old ABI on powerpc64le-linux-gnu, thus fixing
bug 29770.
Reviewed-By: Simon Chopin <simon.chopin@canonical.com>
Resolved conflicts:
sysdeps/unix/sysv/linux/syscalls.list
sysdeps/unix/sysv/linux/x86_64/x32/prctl.c
diff -Nrup a/sysdeps/unix/sysv/linux/prctl.c b/sysdeps/unix/sysv/linux/prctl.c
--- a/sysdeps/unix/sysv/linux/prctl.c 2021-08-01 21:33:43.000000000 -0400
+++ b/sysdeps/unix/sysv/linux/prctl.c 1969-12-31 19:00:00.000000000 -0500
@@ -1,45 +0,0 @@
-/* prctl - Linux specific syscall.
- Copyright (C) 2020-2021 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <stdarg.h>
-#include <sys/prctl.h>
-
-/* Unconditionally read all potential arguments. This may pass
- garbage values to the kernel, but avoids the need for teaching
- glibc the argument counts of individual options (including ones
- that are added to the kernel in the future). */
-
-int
-__prctl (int option, ...)
-{
- va_list arg;
- va_start (arg, option);
- unsigned long int arg2 = va_arg (arg, unsigned long int);
- unsigned long int arg3 = va_arg (arg, unsigned long int);
- unsigned long int arg4 = va_arg (arg, unsigned long int);
- unsigned long int arg5 = va_arg (arg, unsigned long int);
- va_end (arg);
- return INLINE_SYSCALL_CALL (prctl, option, arg2, arg3, arg4, arg5);
-}
-
-libc_hidden_def (__prctl)
-weak_alias (__prctl, prctl)
-#if __TIMESIZE != 64
-weak_alias (__prctl, __prctl_time64)
-#endif
diff -Nrup a/sysdeps/unix/sysv/linux/syscalls.list b/sysdeps/unix/sysv/linux/syscalls.list
--- a/sysdeps/unix/sysv/linux/syscalls.list 2021-08-01 21:33:43.000000000 -0400
+++ b/sysdeps/unix/sysv/linux/syscalls.list 2024-02-27 14:33:01.594782897 -0500
@@ -41,6 +41,7 @@ munlockall - munlockall i: munlockall
nfsservctl EXTRA nfsservctl i:ipp __compat_nfsservctl nfsservctl@GLIBC_2.0:GLIBC_2.28
pipe - pipe i:f __pipe pipe
pipe2 - pipe2 i:fi __pipe2 pipe2
+prctl EXTRA prctl i:iiiii __prctl prctl __prctl_time64
pivot_root EXTRA pivot_root i:ss pivot_root
query_module EXTRA query_module i:sipip __compat_query_module query_module@GLIBC_2.0:GLIBC_2.23
quotactl EXTRA quotactl i:isip quotactl
diff -Nrup a/sysdeps/unix/sysv/linux/x86_64/x32/prctl.c b/sysdeps/unix/sysv/linux/x86_64/x32/prctl.c
--- a/sysdeps/unix/sysv/linux/x86_64/x32/prctl.c 1969-12-31 19:00:00.000000000 -0500
+++ b/sysdeps/unix/sysv/linux/x86_64/x32/prctl.c 2024-02-27 14:35:03.388602623 -0500
@@ -0,0 +1,42 @@
+/* prctl - Linux specific syscall. x86-64 x32 version.
+ Copyright (C) 2020-2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <stdarg.h>
+#include <sys/prctl.h>
+
+/* Unconditionally read all potential arguments. This may pass
+ garbage values to the kernel, but avoids the need for teaching
+ glibc the argument counts of individual options (including ones
+ that are added to the kernel in the future). */
+
+int
+__prctl (int option, ...)
+{
+ va_list arg;
+ va_start (arg, option);
+ unsigned long int arg2 = va_arg (arg, unsigned long int);
+ unsigned long int arg3 = va_arg (arg, unsigned long int);
+ unsigned long int arg4 = va_arg (arg, unsigned long int);
+ unsigned long int arg5 = va_arg (arg, unsigned long int);
+ va_end (arg);
+ return INLINE_SYSCALL_CALL (prctl, option, arg2, arg3, arg4, arg5);
+}
+
+libc_hidden_def (__prctl)
+weak_alias (__prctl, prctl)
From dfb05f8e704edac70db38c4c8ee700769d91a413 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 16 Feb 2024 07:17:10 -0800
Subject: [PATCH] x86-64: Save APX registers in ld.so trampoline
Content-type: text/plain; charset=UTF-8
Add APX registers to STATE_SAVE_MASK so that APX registers are saved in
ld.so trampoline. This fixes BZ #31371.
Also update STATE_SAVE_OFFSET and STATE_SAVE_MASK for i386 which will
be used by i386 _dl_tlsdesc_dynamic.
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
Conflicts:
sysdeps/x86/sysdep.h
(rebased for context and line numbers)
---
sysdeps/x86/sysdep.h | 52 +++++++++++++++++++++++++++++++++++++++-----
1 file changed, 46 insertions(+), 6 deletions(-)
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index 85d0a8c943..837fd28734 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -48,14 +48,54 @@
# define SHSTK_ENABLED 0
#endif
+/* The extended state feature IDs in the state component bitmap. */
+#define X86_XSTATE_X87_ID 0
+#define X86_XSTATE_SSE_ID 1
+#define X86_XSTATE_AVX_ID 2
+#define X86_XSTATE_BNDREGS_ID 3
+#define X86_XSTATE_BNDCFG_ID 4
+#define X86_XSTATE_K_ID 5
+#define X86_XSTATE_ZMM_H_ID 6
+#define X86_XSTATE_ZMM_ID 7
+#define X86_XSTATE_PKRU_ID 9
+#define X86_XSTATE_TILECFG_ID 17
+#define X86_XSTATE_TILEDATA_ID 18
+#define X86_XSTATE_APX_F_ID 19
+
+#ifdef __x86_64__
/* Offset for fxsave/xsave area used by _dl_runtime_resolve. Also need
space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX. It must be
- aligned to 16 bytes for fxsave and 64 bytes for xsave. */
-#define STATE_SAVE_OFFSET (8 * 7 + 8)
-
-/* Save SSE, AVX, AVX512, mask and bound registers. */
-#define STATE_SAVE_MASK \
- ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
+ aligned to 16 bytes for fxsave and 64 bytes for xsave.
+
+ NB: Is is non-zero because of the 128-byte red-zone. Some registers
+ are saved on stack without adjusting stack pointer first. When we
+ update stack pointer to allocate more space, we need to take the
+ red-zone into account. */
+# define STATE_SAVE_OFFSET (8 * 7 + 8)
+
+/* Save SSE, AVX, AVX512, mask, bound and APX registers. Bound and APX
+ registers are mutually exclusive. */
+# define STATE_SAVE_MASK \
+ ((1 << X86_XSTATE_SSE_ID) \
+ | (1 << X86_XSTATE_AVX_ID) \
+ | (1 << X86_XSTATE_BNDREGS_ID) \
+ | (1 << X86_XSTATE_K_ID) \
+ | (1 << X86_XSTATE_ZMM_H_ID) \
+ | (1 << X86_XSTATE_ZMM_ID) \
+ | (1 << X86_XSTATE_APX_F_ID))
+#else
+/* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic. Since i386
+ doesn't have red-zone, use 0 here. */
+# define STATE_SAVE_OFFSET 0
+
+/* Save SSE, AVX, AXV512, mask and bound registers. */
+# define STATE_SAVE_MASK \
+ ((1 << X86_XSTATE_SSE_ID) \
+ | (1 << X86_XSTATE_AVX_ID) \
+ | (1 << X86_XSTATE_BNDREGS_ID) \
+ | (1 << X86_XSTATE_K_ID) \
+ | (1 << X86_XSTATE_ZMM_H_ID))
+#endif
/* Constants for bits in __x86_string_control: */
--
2.39.3
commit 127fc56152347d73cb7c1c283e60e1cb1f15e9f9
Author: sayan paul <saypaul@redhat.com>
Date: Wed May 29 15:31:04 2024 +0530
malloc: New test to check malloc alternate path using memory obstruction
The test aims to ensure that malloc uses the alternate path to
allocate memory when sbrk() or brk() fails.To achieve this,
the test first creates an obstruction at current program break,
tests that obstruction with a failing sbrk(), then checks if malloc
is still returning a valid ptr thus inferring that malloc() used
mmap() instead of brk() or sbrk() to allocate the memory.
Reviewed-by: Arjun Shankar <arjun@redhat.com>
Reviewed-by: Zack Weinberg <zack@owlfolio.org>
Conflicts:
malloc/Makefile
(usual tests conflict)
diff --git a/malloc/Makefile b/malloc/Makefile
index 9b70831d383cb522..cb4e027d28b179f0 100644
--- a/malloc/Makefile
+++ b/malloc/Makefile
@@ -43,6 +43,7 @@ tests := mallocbug tst-malloc tst-valloc tst-calloc tst-obstack \
tst-tcfree1 tst-tcfree2 tst-tcfree3 \
tst-safe-linking \
tst-mallocalign1 \
+ tst-malloc-alternate-path \
tests-static := \
tst-interpose-static-nothread \
diff --git a/malloc/tst-malloc-alternate-path.c b/malloc/tst-malloc-alternate-path.c
new file mode 100644
index 0000000000000000..43ae916815d6ff47
--- /dev/null
+++ b/malloc/tst-malloc-alternate-path.c
@@ -0,0 +1,72 @@
+/* Test that malloc uses mmap when sbrk or brk fails.
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* This test sets up an obstruction to ensure that brk/sbrk fails to
+ grow the heap, then verifies that malloc uses mmap for allocations
+ instead. */
+
+#include <unistd.h>
+#include <sys/mman.h>
+#include <stdlib.h>
+#include <libc-pointer-arith.h>
+#include <support/check.h>
+#include <stddef.h>
+#include <stdalign.h>
+
+#define LARGE_SIZE (10 * (1 << 20)) // 10 MB
+static long page_size;
+
+static int
+do_test (void)
+{
+ /* Get current program break. */
+ void *current_brk = sbrk (0);
+
+ page_size = sysconf (_SC_PAGESIZE);
+
+ /* Round up to the next page boundary. */
+ void *next_page_boundary = PTR_ALIGN_UP (current_brk, page_size);
+
+ /* Place a mapping using mmap at the next page boundary. */
+ void *obstruction_addr
+ = mmap (next_page_boundary, page_size, PROT_READ,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+
+ /* Check if memory obstruction is set up correctly. */
+ TEST_VERIFY_EXIT (obstruction_addr == next_page_boundary);
+
+ /* Try to extend the heap beyond the obstruction using sbrk */
+ int *ptr = sbrk (page_size);
+ TEST_VERIFY_EXIT (ptr == (void *) -1);
+
+ /* Attempt multiple small allocations using malloc. */
+ for (size_t i = 0; i < page_size / alignof (max_align_t); i++)
+ {
+ TEST_VERIFY (malloc (alignof (max_align_t)));
+ }
+
+ /* Attempt to allocate a large block of memory using malloc. */
+ TEST_VERIFY_EXIT (malloc (LARGE_SIZE) != NULL);
+
+ /* Check if malloc changed current program break. */
+ TEST_VERIFY_EXIT (current_brk == sbrk (0));
+
+ return 0;
+}
+
+#include <support/test-driver.c>
From a07e000e82cb71238259e674529c37c12dc7d423 Mon Sep 17 00:00:00 2001
From: DJ Delorie <dj@redhat.com>
Date: Fri, 10 May 2024 17:34:29 -0400
Subject: manual: add dup3
Reviewed-by: Florian Weimer <fweimer@redhat.com>
diff --git a/manual/llio.texi b/manual/llio.texi
index fae49d1433..fe1807a849 100644
--- a/manual/llio.texi
+++ b/manual/llio.texi
@@ -3415,6 +3415,14 @@ middle of calling @code{dup2} at which @var{new} is closed and not yet a
duplicate of @var{old}.
@end deftypefun
+@deftypefun int dup3 (int @var{old}, int @var{new}, int @var{flags})
+@standards{Linux, unistd.h}
+@safety{@prelim{}@mtsafe{}@assafe{}@acsafe{}}
+This function is the same as @code{dup2} but creates the new
+descriptor as if it had been opened with flags @var{flags}. The only
+allowed flag is @code{O_CLOEXEC}.
+@end deftypefun
+
@deftypevr Macro int F_DUPFD
@standards{POSIX.1, fcntl.h}
This macro is used as the @var{command} argument to @code{fcntl}, to
From 6c0be74305745c8f78bcfb69442c8c379459d99b Mon Sep 17 00:00:00 2001
From: DJ Delorie <dj@redhat.com>
Date: Mon, 8 Jul 2024 17:52:15 -0400
Subject: manual: add syscalls
The purpose of this patch is to add some system calls that (1) aren't
otherwise documented, and (2) are merely redirected to the kernel, so
can refer to their documentation; and define a standard way of doing
so in the future. A more detailed explaination of how system calls
are wrapped is added along with reference to the Linux Man-Pages
project.
Default version of man-pages is in configure.ac but can be overridden
by --with-man-pages=X.Y
Reviewed-by: Alejandro Colomar <alx@kernel.org>
diff --git a/config.make.in b/config.make.in
index 55e8b7563b..36096881b7 100644
--- a/config.make.in
+++ b/config.make.in
@@ -101,6 +101,7 @@ build-hardcoded-path-in-tests= @hardcode
build-pt-chown = @build_pt_chown@
have-tunables = @have_tunables@
pthread-in-libc = @pthread_in_libc@
+man-pages-version = @man_pages_version@
# Build tools.
CC = @CC@
diff --git a/configure b/configure
index 55e8b7563b..36096881b7 100644
--- a/configure 2024-07-17 15:45:55.033649362 -0400
+++ b/configure 2024-07-17 15:47:44.281886629 -0400
@@ -681,6 +681,7 @@ force_install
bindnow
hardcoded_path_in_tests
enable_timezone_tools
+man_pages_version
rtld_early_cflags
extra_nonshared_cflags
use_default_link
@@ -763,6 +764,7 @@ with_headers
with_default_link
with_nonshared_cflags
with_rtld_early_cflags
+with_man_pages
enable_sanity_checks
enable_shared
enable_profile
@@ -1483,6 +1485,8 @@ Optional Packages:
build nonshared libraries with additional CFLAGS
--with-rtld-early-cflags=CFLAGS
build early initialization with additional CFLAGS
+ --with-man-pages=VERSION
+ tie manual to a specific man-pages version
--with-cpu=CPU select code for CPU variant
Some influential environment variables:
@@ -3396,6 +3400,15 @@ fi
+man_pages_version=6.9.1
+
+
+# Check whether --with-man-pages was given.
+if test "${with_man_pages+set}" = set; then :
+ withval=$with_man_pages; man_pages_version=$withval
+fi
+
+
# Check whether --enable-sanity-checks was given.
if test "${enable_sanity_checks+set}" = set; then :
diff --git a/configure.ac b/configure.ac
index e48957f318..9cbc0bf68f 100644
--- a/configure.ac 2024-07-17 15:45:52.181538742 -0400
+++ b/configure.ac 2024-07-17 15:46:29.885001095 -0400
@@ -169,6 +169,15 @@ AC_ARG_WITH([rtld-early-cflags],
[rtld_early_cflags=])
AC_SUBST(rtld_early_cflags)
+man_pages_version=6.9.1
+
+AC_ARG_WITH([man-pages],
+ AS_HELP_STRING([--with-man-pages=VERSION],
+ [tie manual to a specific man-pages version]),
+ [man_pages_version=$withval],
+ [])
+AC_SUBST(man_pages_version)
+
AC_ARG_ENABLE([sanity-checks],
AS_HELP_STRING([--disable-sanity-checks],
[really do not use threads (should not be used except in special situations) @<:@default=yes@:>@]),
diff --git a/manual/Makefile b/manual/Makefile
index b5fda4a7ae..a6c05db540 100644
--- a/manual/Makefile
+++ b/manual/Makefile
@@ -117,6 +117,7 @@ $(objpfx)stamp-pkgvers: $(common-objpfx)config.make
echo "@set PKGVERSION_DEFAULT" >> $(objpfx)pkgvers-tmp; \
fi
echo "@set REPORT_BUGS_TO $(REPORT_BUGS_TEXI)" >> $(objpfx)pkgvers-tmp
+ echo "@set man_pages_version $(man-pages-version)" >> $(objpfx)pkgvers-tmp; \
echo "@end ifclear" >> $(objpfx)pkgvers-tmp
$(move-if-change) $(objpfx)pkgvers-tmp $(objpfx)pkgvers.texi
touch $@
diff --git a/manual/intro.texi b/manual/intro.texi
index ff43c5a7fb..879c1b38d9 100644
--- a/manual/intro.texi
+++ b/manual/intro.texi
@@ -85,6 +85,7 @@ standards each function or symbol comes from.
* Berkeley Unix:: BSD and SunOS.
* SVID:: The System V Interface Description.
* XPG:: The X/Open Portability Guide.
+* Linux Kernel:: The Linux kernel.
@end menu
@node ISO C, POSIX, , Standards and Portability
@@ -941,7 +942,7 @@ inter-process communication and shared memory, the @code{hsearch} and
@code{drand48} families of functions, @code{fmtmsg} and several of the
mathematical functions.
-@node XPG, , SVID, Standards and Portability
+@node XPG, Linux Kernel, SVID, Standards and Portability
@subsection XPG (The X/Open Portability Guide)
The X/Open Portability Guide, published by the X/Open Company, Ltd., is
@@ -960,6 +961,20 @@ fulfilling the XPG standard with the Unix extensions is a
precondition for getting the Unix brand chances are good that the
functionality is available on commercial systems.
+@node Linux Kernel, , XPG, Standards and Portability
+@subsection Linux (The Linux Kernel)
+
+@Theglibc{} includes by reference the Linux man-pages
+@value{man_pages_version} documentation to document the listed
+syscalls for the Linux kernel. For reference purposes only the latest
+@uref{https://www.kernel.org/doc/man-pages/,Linux man-pages Project}
+documentation can be accessed from the
+@uref{https://www.kernel.org,Linux kernel} website. Where the syscall
+has more specific documentation in this manual that more specific
+documentation is considered authoritative.
+
+Additional details on the Linux system call interface can be found in
+@xref{System Calls}.
@node Using the Library, Roadmap to the Manual, Standards and Portability, Introduction
@section Using the Library
diff --git a/manual/llio.texi b/manual/llio.texi
index 78c7c79913..6f0a48609b 100644
--- a/manual/llio.texi
+++ b/manual/llio.texi
@@ -65,6 +65,7 @@ directly.)
* Interrupt Input:: Getting an asynchronous signal when
input arrives.
* IOCTLs:: Generic I/O Control operations.
+* Other Low-Level I/O APIs:: Other low-level-I/O-related functions.
@end menu
@@ -2324,6 +2325,8 @@ file descriptor, or until the timeout period expires.
There is another example showing the use of @code{select} to multiplex
input from multiple sockets in @ref{Server Example}.
+For an alternate interface to this functionality, see @code{poll}
+(@pxref{Other Low-Level I/O APIs}).
@node Synchronizing I/O
@section Synchronizing I/O operations
@@ -3407,7 +3410,9 @@ require additional arguments to be supplied. These additional arguments
and the return value and error conditions are given in the detailed
descriptions of the individual commands.
-Briefly, here is a list of what the various commands are.
+Briefly, here is a list of what the various commands are. For an
+exhaustive list of kernel-specific options, please see @xref{System
+Calls}.
@vtable @code
@item F_DUPFD
@@ -4743,5 +4748,28 @@ Most IOCTLs are OS-specific and/or only used in special system utilities,
and are thus beyond the scope of this document. For an example of the use
of an IOCTL, see @ref{Out-of-Band Data}.
-@c FIXME this is undocumented:
-@c dup3
+@node Other Low-Level I/O APIs
+@section Other low-level-I/O-related functions
+
+@deftp {Data Type} {struct pollfd}
+@standards{POSIX.1,poll.h}
+@end deftp
+
+@deftp {Data Type} {struct epoll_event}
+@standards{Linux,sys/epoll.h}
+@end deftp
+
+@deftypefun int poll (struct pollfd *@var{fds}, nfds_t @var{nfds}, int @var{timeout})
+
+@manpagefunctionstub{poll,2}
+@end deftypefun
+
+@deftypefun int epoll_create(int @var{size})
+
+@manpagefunctionstub{epoll_create,2}
+@end deftypefun
+
+@deftypefun int epoll_wait(int @var{epfd}, struct epoll_event *@var{events}, int @var{maxevents}, int @var{timeout})
+
+@manpagefunctionstub{epoll_wait,2}
+@end deftypefun
diff --git a/manual/macros.texi b/manual/macros.texi
index 4a2e22f473..579da3fb81 100644
--- a/manual/macros.texi
+++ b/manual/macros.texi
@@ -282,4 +282,11 @@ cwd\comments\
@macro standardsx {element, standard, header}
@end macro
+@macro manpagefunctionstub {func,sec}
+This documentation is a stub. For additional information on this
+function, consult the manual page
+@url{https://man7.org/linux/man-pages/man\sec\/\func\.\sec\.html}.
+@xref{Linux Kernel}.
+@end macro
+
@end ifclear
diff --git a/manual/socket.texi b/manual/socket.texi
index f0e35d9e13..8708cbb07c 100644
--- a/manual/socket.texi
+++ b/manual/socket.texi
@@ -41,6 +41,7 @@ aren't documented either so far.
is to make it work with Inetd.
* Socket Options:: Miscellaneous low-level socket options.
* Networks Database:: Accessing the database of network names.
+* Other Socket APIs:: Other socket-related functions.
@end menu
@node Socket Concepts
@@ -3134,38 +3135,8 @@ You can use plain @code{recv} (@pxref{Receiving Data}) instead of
treat all possible senders alike). Even @code{read} can be used if
you don't want to specify @var{flags} (@pxref{I/O Primitives}).
-@ignore
-@c sendmsg and recvmsg are like readv and writev in that they
-@c use a series of buffers. It's not clear this is worth
-@c supporting or that we support them.
-@c !!! they can do more; it is hairy
-
-@deftp {Data Type} {struct msghdr}
-@standards{BSD, sys/socket.h}
-@end deftp
-
-@deftypefun ssize_t sendmsg (int @var{socket}, const struct msghdr *@var{message}, int @var{flags})
-@standards{BSD, sys/socket.h}
-@safety{@prelim{}@mtsafe{}@assafe{}@acsafe{}}
-
-This function is defined as a cancellation point in multi-threaded
-programs, so one has to be prepared for this and make sure that
-allocated resources (like memory, files descriptors, semaphores or
-whatever) are freed even if the thread is cancel.
-@c @xref{pthread_cleanup_push}, for a method how to do this.
-@end deftypefun
-
-@deftypefun ssize_t recvmsg (int @var{socket}, struct msghdr *@var{message}, int @var{flags})
-@standards{BSD, sys/socket.h}
-@safety{@prelim{}@mtsafe{}@assafe{}@acsafe{}}
-
-This function is defined as a cancellation point in multi-threaded
-programs, so one has to be prepared for this and make sure that
-allocated resources (like memory, files descriptors, semaphores or
-whatever) are freed even if the thread is canceled.
-@c @xref{pthread_cleanup_push}, for a method how to do this.
-@end deftypefun
-@end ignore
+If you need more flexibility and/or control over sending and receiving
+packets, see @code{sendmsg} and @code{recvmsg} (@pxref{Other Socket APIs}).
@node Datagram Example
@subsection Datagram Socket Example
@@ -3664,3 +3635,20 @@ returns a null pointer if there are no more entries.
@c libc_lock_unlock @aculock
This function closes the networks database.
@end deftypefun
+
+@node Other Socket APIs
+@section Other Socket APIs
+
+@deftp {Data Type} {struct msghdr}
+@standards{BSD, sys/socket.h}
+@end deftp
+
+@deftypefun ssize_t sendmsg (int @var{socket}, const struct msghdr *@var{message}, int @var{flags})
+
+@manpagefunctionstub{sendmsg,2}
+@end deftypefun
+
+@deftypefun ssize_t recvmsg (int @var{socket}, struct msghdr *@var{message}, int @var{flags})
+
+@manpagefunctionstub{recvmsg,2}
+@end deftypefun
diff --git a/manual/startup.texi b/manual/startup.texi
index 224dd98c1e..747beed4d9 100644
--- a/manual/startup.texi
+++ b/manual/startup.texi
@@ -689,7 +689,25 @@ you don't need to know about it because you can just use @theglibc{}'s
@code{chmod} function.
@cindex kernel call
-System calls are sometimes called kernel calls.
+System calls are sometimes called syscalls or kernel calls, and this
+interface is mostly a purely mechanical translation from the kernel's
+ABI to the C ABI. For the set of syscalls where we do not guarantee
+POSIX Thread cancellation the wrappers only organize the incoming
+arguments from the C calling convention to the calling convention of
+the target kernel. For the set of syscalls where we provided POSIX
+Thread cancellation the wrappers set some internal state in the
+library to support cancellation, but this does not impact the
+behaviour of the syscall provided by the kernel.
+
+In some cases, if @theglibc{} detects that a system call has been
+superseded by a more capable one, the wrapper may map the old call to
+the new one. For example, @code{dup2} is implemented via @code{dup3}
+by passing an additional empty flags argument, and @code{open} calls
+@code{openat} passing the additional @code{AT_FDCWD}. Sometimes even
+more is done, such as converting between 32-bit and 64-bit time
+values. In general, though, such processing is only to make the
+system call better match the C ABI, rather than change its
+functionality.
However, there are times when you want to make a system call explicitly,
and for that, @theglibc{} provides the @code{syscall} function.
@@ -711,6 +729,8 @@ we won't describe it here either because anyone who is coding
library source code as a specification of the interface between them
anyway.
+@code{syscall} does not provide cancellation logic, even if the system
+call you're calling is listed as cancellable above.
@code{syscall} is declared in @file{unistd.h}.
From a4c3f5f46e850c977cda81c251036475aab8313c Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Thu, 23 Nov 2023 14:29:14 -0300
Subject: [PATCH] elf: Add a way to check if tunable is set (BZ 27069)
Content-type: text/plain; charset=UTF-8
The patch adds two new macros, TUNABLE_GET_DEFAULT and TUNABLE_IS_INITIALIZED,
here the former get the default value with a signature similar to
TUNABLE_GET, while the later returns whether the tunable was set by
the environment variable.
Checked on x86_64-linux-gnu.
Reviewed-by: DJ Delorie <dj@redhat.com>
Tested-by: Zhangfei Gao <zhangfei.gao@linaro.org>
Conflicts:
elf/Versions
(removed to preserve ABI)
elf/dl-tunable-types.h
(line numbers)
scripts/gen-tunables.awk
(account for missing TUNABLE_SECLEVEL patch)
---
elf/dl-tunable-types.h | 1 +
elf/dl-tunables.c | 40 ++++++++++++++++++++++++++++++++++++++++
elf/dl-tunables.h | 28 ++++++++++++++++++++++++++++
elf/dl-tunables.list | 1 +
scripts/gen-tunables.awk | 4 ++--
6 files changed, 73 insertions(+), 2 deletions(-)
diff -rup a/elf/dl-tunable-types.h b/elf/dl-tunable-types.h
--- a/elf/dl-tunable-types.h 2021-08-01 21:33:43.000000000 -0400
+++ b/elf/dl-tunable-types.h 2024-03-26 18:23:22.211504813 -0400
@@ -61,6 +61,7 @@ struct _tunable
{
const char name[TUNABLE_NAME_MAX]; /* Internal name of the tunable. */
tunable_type_t type; /* Data type of the tunable. */
+ const tunable_val_t def; /* The value. */
tunable_val_t val; /* The value. */
bool initialized; /* Flag to indicate that the tunable is
initialized. */
diff -rup a/elf/dl-tunables.c b/elf/dl-tunables.c
--- a/elf/dl-tunables.c 2024-03-26 18:21:10.090681748 -0400
+++ b/elf/dl-tunables.c 2024-03-26 18:23:22.214504923 -0400
@@ -152,6 +152,13 @@ tunable_initialize (tunable_t *cur, cons
do_tunable_update_val (cur, &val, NULL, NULL);
}
+bool
+__tunable_is_initialized (tunable_id_t id)
+{
+ return tunable_list[id].initialized;
+}
+rtld_hidden_def (__tunable_is_initialized)
+
void
__tunable_set_val (tunable_id_t id, tunable_val_t *valp, tunable_num_t *minp,
tunable_num_t *maxp)
@@ -399,6 +406,39 @@ __tunables_print (void)
}
}
+void
+__tunable_get_default (tunable_id_t id, void *valp)
+{
+ tunable_t *cur = &tunable_list[id];
+
+ switch (cur->type.type_code)
+ {
+ case TUNABLE_TYPE_UINT_64:
+ {
+ *((uint64_t *) valp) = (uint64_t) cur->def.numval;
+ break;
+ }
+ case TUNABLE_TYPE_INT_32:
+ {
+ *((int32_t *) valp) = (int32_t) cur->def.numval;
+ break;
+ }
+ case TUNABLE_TYPE_SIZE_T:
+ {
+ *((size_t *) valp) = (size_t) cur->def.numval;
+ break;
+ }
+ case TUNABLE_TYPE_STRING:
+ {
+ *((const char **)valp) = cur->def.strval;
+ break;
+ }
+ default:
+ __builtin_unreachable ();
+ }
+}
+rtld_hidden_def (__tunable_get_default)
+
/* Set the tunable value. This is called by the module that the tunable exists
in. */
void
diff -rup a/elf/dl-tunables.h b/elf/dl-tunables.h
--- a/elf/dl-tunables.h 2021-08-01 21:33:43.000000000 -0400
+++ b/elf/dl-tunables.h 2024-03-26 18:23:22.217505032 -0400
@@ -53,18 +53,26 @@ typedef void (*tunable_callback_t) (tuna
extern void __tunables_init (char **);
extern void __tunables_print (void);
+extern bool __tunable_is_initialized (tunable_id_t);
extern void __tunable_get_val (tunable_id_t, void *, tunable_callback_t);
extern void __tunable_set_val (tunable_id_t, tunable_val_t *, tunable_num_t *,
tunable_num_t *);
+extern void __tunable_get_default (tunable_id_t id, void *valp);
rtld_hidden_proto (__tunables_init)
rtld_hidden_proto (__tunables_print)
+rtld_hidden_proto (__tunable_is_initialized)
rtld_hidden_proto (__tunable_get_val)
rtld_hidden_proto (__tunable_set_val)
+rtld_hidden_proto (__tunable_get_default)
/* Define TUNABLE_GET and TUNABLE_SET in short form if TOP_NAMESPACE and
TUNABLE_NAMESPACE are defined. This is useful shorthand to get and set
tunables within a module. */
#if defined TOP_NAMESPACE && defined TUNABLE_NAMESPACE
+# define TUNABLE_IS_INITIALIZED(__id) \
+ TUNABLE_IS_INITIALIZED_FULL(TOP_NAMESPACE, TUNABLE_NAMESPACE, __id)
+# define TUNABLE_GET_DEFAULT(__id, __type) \
+ TUNABLE_GET_DEFAULT_FULL(TOP_NAMESPACE, TUNABLE_NAMESPACE,__id, __type)
# define TUNABLE_GET(__id, __type, __cb) \
TUNABLE_GET_FULL (TOP_NAMESPACE, TUNABLE_NAMESPACE, __id, __type, __cb)
# define TUNABLE_SET(__id, __val) \
@@ -73,6 +81,10 @@ rtld_hidden_proto (__tunable_set_val)
TUNABLE_SET_WITH_BOUNDS_FULL (TOP_NAMESPACE, TUNABLE_NAMESPACE, __id, \
__val, __min, __max)
#else
+# define TUNABLE_IS_INITIALIZED(__top, __ns, __id) \
+ TUNABLE_IS_INITIALIZED_FULL(__top, __ns, __id)
+# define TUNABLE_GET_DEFAULT(__top, __ns, __type) \
+ TUNABLE_GET_DEFAULT_FULL(__top, __ns, __id, __type)
# define TUNABLE_GET(__top, __ns, __id, __type, __cb) \
TUNABLE_GET_FULL (__top, __ns, __id, __type, __cb)
# define TUNABLE_SET(__top, __ns, __id, __val) \
@@ -81,6 +93,22 @@ rtld_hidden_proto (__tunable_set_val)
TUNABLE_SET_WITH_BOUNDS_FULL (__top, __ns, __id, __val, __min, __max)
#endif
+/* Return whether the tunable was initialized by the environment variable. */
+#define TUNABLE_IS_INITIALIZED_FULL(__top, __ns, __id) \
+({ \
+ tunable_id_t id = TUNABLE_ENUM_NAME (__top, __ns, __id); \
+ __tunable_is_initialized (id); \
+})
+
+/* Return the default value of the tunable. */
+#define TUNABLE_GET_DEFAULT_FULL(__top, __ns, __id, __type) \
+({ \
+ tunable_id_t id = TUNABLE_ENUM_NAME (__top, __ns, __id); \
+ __type __ret; \
+ __tunable_get_default (id, &__ret); \
+ __ret; \
+})
+
/* Get and return a tunable value. If the tunable was set externally and __CB
is defined then call __CB before returning the value. */
# define TUNABLE_GET_FULL(__top, __ns, __id, __type, __cb) \
diff -rup a/elf/dl-tunables.list b/elf/dl-tunables.list
--- a/elf/dl-tunables.list 2024-03-26 18:21:09.664666196 -0400
+++ b/elf/dl-tunables.list 2024-03-26 18:23:22.220505142 -0400
@@ -20,6 +20,7 @@
# type: Defaults to STRING
# minval: Optional minimum acceptable value
# maxval: Optional maximum acceptable value
+# default: Optional default value (if not specified it will be 0 or "")
# env_alias: An alias environment variable
# security_level: Specify security level of the tunable for AT_SECURE binaries.
# Valid values are:
diff -rup a/scripts/gen-tunables.awk b/scripts/gen-tunables.awk
--- a/scripts/gen-tunables.awk 2024-03-26 18:21:09.523661049 -0400
+++ b/scripts/gen-tunables.awk 2024-03-26 18:34:45.385462341 -0400
@@ -236,8 +236,8 @@ END {
n = indices[2];
m = indices[3];
printf (" {TUNABLE_NAME_S(%s, %s, %s)", t, n, m)
- printf (", {TUNABLE_TYPE_%s, %s, %s}, {%s}, NULL, TUNABLE_SECLEVEL_%s, %s},\n",
- types[t,n,m], minvals[t,n,m], maxvals[t,n,m],
+ printf (", {TUNABLE_TYPE_%s, %s, %s}, {%s}, {%s}, NULL, TUNABLE_SECLEVEL_%s, %s},\n",
+ types[t,n,m], minvals[t,n,m], maxvals[t,n,m],default_val[t,n,m],
default_val[t,n,m], security_level[t,n,m], env_alias[t,n,m]);
}
print "};"
From 0c0d39fe4aeb0f69b26e76337c5dfd5530d5d44e Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Thu, 8 Feb 2024 10:08:38 -0300
Subject: [PATCH] x86: Fix Zen3/Zen4 ERMS selection (BZ 30994)
Content-type: text/plain; charset=UTF-8
The REP MOVSB usage on memcpy/memmove does not show much performance
improvement on Zen3/Zen4 cores compared to the vectorized loops. Also,
as from BZ 30994, if the source is aligned and the destination is not
the performance can be 20x slower.
The performance difference is noticeable with small buffer sizes, closer
to the lower bounds limits when memcpy/memmove starts to use ERMS. The
performance of REP MOVSB is similar to vectorized instruction on the
size limit (the L2 cache). Also, there is no drawback to multiple cores
sharing the cache.
Checked on x86_64-linux-gnu on Zen3.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Conflicts:
sysdeps/x86/dl-cacheinfo.h
(tweaked for changed context)
---
sysdeps/x86/dl-cacheinfo.h | 38 ++++++++++++++++++--------------------
1 file changed, 18 insertions(+), 20 deletions(-)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index d5101615e3..f34d12846c 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -791,7 +791,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
long int data = -1;
long int shared = -1;
long int shared_per_thread = -1;
- long int core = -1;
unsigned int threads = 0;
unsigned long int level1_icache_size = -1;
unsigned long int level1_icache_linesize = -1;
@@ -809,7 +808,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (cpu_features->basic.kind == arch_kind_intel)
{
data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
- core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
shared_per_thread = shared;
@@ -822,7 +820,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
= handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
level1_dcache_linesize
= handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
- level2_cache_size = core;
+ level2_cache_size
+ = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
level2_cache_assoc
= handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
level2_cache_linesize
@@ -835,12 +834,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
level4_cache_size
= handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+ get_common_cache_info (&shared, &shared_per_thread, &threads,
+ level2_cache_size);
}
else if (cpu_features->basic.kind == arch_kind_zhaoxin)
{
data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
- core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
shared_per_thread = shared;
@@ -849,19 +848,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
level1_dcache_size = data;
level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
- level2_cache_size = core;
+ level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
level3_cache_size = shared;
level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+ get_common_cache_info (&shared, &shared_per_thread, &threads,
+ level2_cache_size);
}
else if (cpu_features->basic.kind == arch_kind_amd)
{
data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
- core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
@@ -869,7 +868,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
level1_dcache_size = data;
level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
- level2_cache_size = core;
+ level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);;
level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
level3_cache_size = shared;
@@ -880,12 +879,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (shared <= 0)
{
/* No shared L3 cache. All we have is the L2 cache. */
- shared = core;
+ shared = level2_cache_size;
}
else if (cpu_features->basic.family < 0x17)
{
/* Account for exclusive L2 and L3 caches. */
- shared += core;
+ shared += level2_cache_size;
}
shared_per_thread = shared;
@@ -987,6 +986,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
+ cases slower than the vectorized path (and for some alignments,
+ it is really slow, check BZ #30994). */
+ if (cpu_features->basic.kind == arch_kind_amd)
+ rep_movsb_threshold = non_temporal_threshold;
+
/* The default threshold to use Enhanced REP STOSB. */
unsigned long int rep_stosb_threshold = 2048;
@@ -1028,15 +1033,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
#endif
unsigned long int rep_movsb_stop_threshold;
- /* ERMS feature is implemented from AMD Zen3 architecture and it is
- performing poorly for data above L2 cache size. Henceforth, adding
- an upper bound threshold parameter to limit the usage of Enhanced
- REP MOVSB operations and setting its value to L2 cache size. */
- if (cpu_features->basic.kind == arch_kind_amd)
- rep_movsb_stop_threshold = core;
/* Setting the upper bound of ERMS to the computed value of
- non-temporal threshold for architectures other than AMD. */
- else
- rep_movsb_stop_threshold = non_temporal_threshold;
+ non-temporal threshold for all architectures. */
+ rep_movsb_stop_threshold = non_temporal_threshold;
cpu_features->data_cache_size = data;
cpu_features->shared_cache_size = shared;
--
2.39.3
From 272708884cb750f12f5c74a00e6620c19dc6d567 Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Thu, 8 Feb 2024 10:08:39 -0300
Subject: [PATCH] x86: Do not prefer ERMS for memset on Zen3+
Content-type: text/plain; charset=UTF-8
For AMD Zen3+ architecture, the performance of the vectorized loop is
slightly better than ERMS.
Checked on x86_64-linux-gnu on Zen3.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/dl-cacheinfo.h | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index f34d12846c..5a98f70364 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1021,6 +1021,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
minimum value is fixed. */
rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
long int, NULL);
+ if (cpu_features->basic.kind == arch_kind_amd
+ && !TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold))
+ /* For AMD Zen3+ architecture, the performance of the vectorized loop is
+ slightly better than ERMS. */
+ rep_stosb_threshold = SIZE_MAX;
TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
--
2.39.3
From 491e55beab7457ed310a4a47496f4a333c5d1032 Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Thu, 8 Feb 2024 10:08:40 -0300
Subject: [PATCH] x86: Expand the comment on when REP STOSB is used on memset
Content-type: text/plain; charset=UTF-8
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 9984c3ca0f..97839a2248 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -21,7 +21,9 @@
2. If size is less than VEC, use integer register stores.
3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
- 5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
+ 5. On machines ERMS feature, if size is greater or equal than
+ __x86_rep_stosb_threshold then REP STOSB will be used.
+ 6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
4 VEC stores and store 4 * VEC at a time until done. */
#include <sysdep.h>
--
2.39.3
From dce754b1553b86fc6352636f1fa490a85b7cf0ff Mon Sep 17 00:00:00 2001
From: DJ Delorie <dj@redhat.com>
Date: Fri, 10 May 2024 14:52:09 -0400
Subject: Update mmap() flags and errors lists
Extend the list of MAP_* macros to include all macros available
to the average program (gcc -E -dM | grep MAP_*)
Extend the list of errno codes.
Reviewed-by: Florian Weimer <fweimer@redhat.com>
diff --git a/manual/llio.texi b/manual/llio.texi
index fe1807a849..78c7c79913 100644
--- a/manual/llio.texi
+++ b/manual/llio.texi
@@ -1573,10 +1573,15 @@ permitted. They include @code{PROT_READ}, @code{PROT_WRITE}, and
of address space for future use. The @code{mprotect} function can be
used to change the protection flags. @xref{Memory Protection}.
-@var{flags} contains flags that control the nature of the map.
-One of @code{MAP_SHARED} or @code{MAP_PRIVATE} must be specified.
+The @var{flags} parameter contains flags that control the nature of
+the map. One of @code{MAP_SHARED}, @code{MAP_SHARED_VALIDATE}, or
+@code{MAP_PRIVATE} must be specified. Additional flags may be bitwise
+OR'd to further define the mapping.
-They include:
+Note that, aside from @code{MAP_PRIVATE} and @code{MAP_SHARED}, not
+all flags are supported on all versions of all operating systems.
+Consult the kernel-specific documentation for details. The flags
+include:
@vtable @code
@item MAP_PRIVATE
@@ -1598,9 +1603,19 @@ Note that actual writing may take place at any time. You need to use
@code{msync}, described below, if it is important that other processes
using conventional I/O get a consistent view of the file.
+@item MAP_SHARED_VALIDATE
+Similar to @code{MAP_SHARED} except that additional flags will be
+validated by the kernel, and the call will fail if an unrecognized
+flag is provided. With @code{MAP_SHARED} using a flag on a kernel
+that doesn't support it causes the flag to be ignored.
+@code{MAP_SHARED_VALIDATE} should be used when the behavior of all
+flags is required.
+
@item MAP_FIXED
This forces the system to use the exact mapping address specified in
-@var{address} and fail if it can't.
+@var{address} and fail if it can't. Note that if the new mapping
+would overlap an existing mapping, the overlapping portion of the
+existing map is unmapped.
@c One of these is official - the other is obviously an obsolete synonym
@c Which is which?
@@ -1641,10 +1656,73 @@ The @code{MAP_HUGETLB} flag is specific to Linux.
@c There is a mechanism to select different hugepage sizes; see
@c include/uapi/asm-generic/hugetlb_encode.h in the kernel sources.
-@c Linux has some other MAP_ options, which I have not discussed here.
-@c MAP_DENYWRITE, MAP_EXECUTABLE and MAP_GROWSDOWN don't seem applicable to
-@c user programs (and I don't understand the last two). MAP_LOCKED does
-@c not appear to be implemented.
+@item MAP_32BIT
+Require addresses that can be accessed with a signed 32 bit pointer,
+i.e., within the first 2 GiB. Ignored if MAP_FIXED is specified.
+
+@item MAP_DENYWRITE
+@itemx MAP_EXECUTABLE
+@itemx MAP_FILE
+
+Provided for compatibility. Ignored by the Linux kernel.
+
+@item MAP_FIXED_NOREPLACE
+Similar to @code{MAP_FIXED} except the call will fail with
+@code{EEXIST} if the new mapping would overwrite an existing mapping.
+To test for support for this flag, specify MAP_FIXED_NOREPLACE without
+MAP_FIXED, and (if the call was successful) check the actual address
+returned. If it does not match the address passed, then this flag is
+not supported.
+
+@item MAP_GROWSDOWN
+This flag is used to make stacks, and is typically only needed inside
+the program loader to set up the main stack for the running process.
+The mapping is created according to the other flags, except an
+additional page just prior to the mapping is marked as a ``guard
+page''. If a write is attempted inside this guard page, that page is
+mapped, the mapping is extended, and a new guard page is created.
+Thus, the mapping continues to grow towards lower addresses until it
+encounters some other mapping.
+
+Note that accessing memory beyond the guard page will not trigger this
+feature. In gcc, use @code{-fstack-clash-protection} to ensure the
+guard page is always touched.
+
+@item MAP_LOCKED
+A hint that requests that mapped pages are locked in memory (i.e. not
+paged out). Note that this is a request and not a requirement; use
+@code{mlock} if locking is required.
+
+@item MAP_POPULATE
+@itemx MAP_NONBLOCK
+@code{MAP_POPULATE} is a hint that requests that the kernel read-ahead
+a file-backed mapping, causing pages to be mapped before they're
+needed. @code{MAP_NONBLOCK} is a hint that requests that the kernel
+@emph{not} attempt such except for pages are already in memory. Note
+that neither of these hints affects future paging activity, use
+@code{mlock} if such needs to be controlled.
+
+@item MAP_NORESERVE
+Asks the kernel to not reserve physical backing (i.e. space in a swap
+device) for a mapping. This would be useful for, for example, a very
+large but sparsely used mapping which need not be limited in total
+length by available RAM, but with very few mapped pages. Note that
+writes to such a mapping may cause a @code{SIGSEGV} if the system is
+unable to map a page due to lack of resources.
+
+On Linux, this flag's behavior may be overwridden by
+@file{/proc/sys/vm/overcommit_memory} as documented in the proc(5) man
+page.
+
+@item MAP_STACK
+Ensures that the resulting mapping is suitable for use as a program
+stack. For example, the use of huge pages might be precluded.
+
+@item MAP_SYNC
+This is a special flag for DAX devices, which tells the kernel to
+write dirty metadata out whenever dirty data is written out. Unlike
+most other flags, this one will fail unless @code{MAP_SHARED_VALIDATE}
+is also given.
@end vtable
@@ -1655,6 +1733,24 @@ Possible errors include:
@table @code
+@item EACCES
+
+@var{filedes} was not open for the type of access specified in @var{protect}.
+
+@item EAGAIN
+
+The system has temporarily run out of resources.
+
+@item EBADF
+
+The @var{fd} passed is invalid, and a valid file descriptor is
+required (i.e. MAP_ANONYMOUS was not specified).
+
+@item EEXIST
+
+@code{MAP_FIXED_NOREPLACE} was specified and an existing mapping was
+found overlapping the requested address range.
+
@item EINVAL
Either @var{address} was unusable (because it is not a multiple of the
@@ -1663,23 +1759,37 @@ applicable page size), or inconsistent @var{flags} were given.
If @code{MAP_HUGETLB} was specified, the file or system does not support
large page sizes.
-@item EACCES
+@item ENODEV
-@var{filedes} was not open for the type of access specified in @var{protect}.
+This file is of a type that doesn't support mapping, the process has
+exceeded its data space limit, or the map request would exceed the
+process's virtual address space.
@item ENOMEM
-Either there is not enough memory for the operation, or the process is
-out of address space.
-
-@item ENODEV
-
-This file is of a type that doesn't support mapping.
+There is not enough memory for the operation, the process is out of
+address space, or there are too many mappings. On Linux, the maximum
+number of mappings can be controlled via
+@file{/proc/sys/vm/max_map_count} or, if your OS supports it, via
+the @code{vm.max_map_count} @code{sysctl} setting.
@item ENOEXEC
The file is on a filesystem that doesn't support mapping.
+@item EPERM
+
+@code{PROT_EXEC} was requested but the file is on a filesystem that
+was mounted with execution denied, a file seal prevented the mapping,
+or the caller set MAP_HUDETLB but does not have the required
+priviledges.
+
+@item EOVERFLOW
+
+Either the offset into the file plus the length of the mapping causes
+internal page counts to overflow, or the offset requested exceeds the
+length of the file.
+
@c On Linux, EAGAIN will appear if the file has a conflicting mandatory lock.
@c However mandatory locks are not discussed in this manual.
@c
Author: Charles Fol <folcharles@gmail.com>
Date: Thu Mar 28 12:25:38 2024 -0300
iconv: ISO-2022-CN-EXT: fix out-of-bound writes when writing escape sequence (CVE-2024-2961)
ISO-2022-CN-EXT uses escape sequences to indicate character set changes
(as specified by RFC 1922). While the SOdesignation has the expected
bounds checks, neither SS2designation nor SS3designation have its;
allowing a write overflow of 1, 2, or 3 bytes with fixed values:
'$+I', '$+J', '$+K', '$+L', '$+M', or '$*H'.
Checked on aarch64-linux-gnu.
Co-authored-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Tested-by: Carlos O'Donell <carlos@redhat.com>
Conflicts:
iconvdata/Makefile
(usual tests conflict)
diff --git a/iconvdata/Makefile b/iconvdata/Makefile
index d5507a048c6a6508..25bd004e7f92a994 100644
--- a/iconvdata/Makefile
+++ b/iconvdata/Makefile
@@ -75,7 +75,7 @@ ifeq (yes,$(build-shared))
tests = bug-iconv1 bug-iconv2 tst-loading tst-e2big tst-iconv4 bug-iconv4 \
tst-iconv6 bug-iconv5 bug-iconv6 tst-iconv7 bug-iconv8 bug-iconv9 \
bug-iconv10 bug-iconv11 bug-iconv12 tst-iconv-big5-hkscs-to-2ucs4 \
- bug-iconv13 bug-iconv14 bug-iconv15
+ bug-iconv13 bug-iconv14 bug-iconv15 tst-iconv-iso-2022-cn-ext
ifeq ($(have-thread-library),yes)
tests += bug-iconv3
endif
@@ -330,6 +330,8 @@ $(objpfx)bug-iconv14.out: $(addprefix $(objpfx), $(gconv-modules)) \
$(addprefix $(objpfx),$(modules.so))
$(objpfx)bug-iconv15.out: $(addprefix $(objpfx), $(gconv-modules)) \
$(addprefix $(objpfx),$(modules.so))
+$(objpfx)tst-iconv-iso-2022-cn-ext.out: $(addprefix $(objpfx), $(gconv-modules)) \
+ $(addprefix $(objpfx),$(modules.so))
$(objpfx)iconv-test.out: run-iconv-test.sh \
$(addprefix $(objpfx), $(gconv-modules)) \
diff --git a/iconvdata/iso-2022-cn-ext.c b/iconvdata/iso-2022-cn-ext.c
index 2aca91c021f21ba0..c1339fe933d9d1c4 100644
--- a/iconvdata/iso-2022-cn-ext.c
+++ b/iconvdata/iso-2022-cn-ext.c
@@ -575,6 +575,12 @@ DIAG_IGNORE_Os_NEEDS_COMMENT (5, "-Wmaybe-uninitialized");
{ \
const char *escseq; \
\
+ if (outptr + 4 > outend) \
+ { \
+ result = __GCONV_FULL_OUTPUT; \
+ break; \
+ } \
+ \
assert (used == CNS11643_2_set); /* XXX */ \
escseq = "*H"; \
*outptr++ = ESC; \
@@ -588,6 +594,12 @@ DIAG_IGNORE_Os_NEEDS_COMMENT (5, "-Wmaybe-uninitialized");
{ \
const char *escseq; \
\
+ if (outptr + 4 > outend) \
+ { \
+ result = __GCONV_FULL_OUTPUT; \
+ break; \
+ } \
+ \
assert ((used >> 5) >= 3 && (used >> 5) <= 7); \
escseq = "+I+J+K+L+M" + ((used >> 5) - 3) * 2; \
*outptr++ = ESC; \
diff --git a/iconvdata/tst-iconv-iso-2022-cn-ext.c b/iconvdata/tst-iconv-iso-2022-cn-ext.c
new file mode 100644
index 0000000000000000..96a8765fd5369681
--- /dev/null
+++ b/iconvdata/tst-iconv-iso-2022-cn-ext.c
@@ -0,0 +1,128 @@
+/* Verify ISO-2022-CN-EXT does not write out of the bounds.
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <errno.h>
+#include <iconv.h>
+#include <sys/mman.h>
+
+#include <support/xunistd.h>
+#include <support/check.h>
+#include <support/support.h>
+
+/* The test sets up a two memory page buffer with the second page marked
+ PROT_NONE to trigger a fault if the conversion writes beyond the exact
+ expected amount. Then we carry out various conversions and precisely
+ place the start of the output buffer in order to trigger a SIGSEGV if the
+ process writes anywhere between 1 and page sized bytes more (only one
+ PROT_NONE page is setup as a canary) than expected. These tests exercise
+ all three of the cases in ISO-2022-CN-EXT where the converter must switch
+ character sets and may run out of buffer space while doing the
+ operation. */
+
+static int
+do_test (void)
+{
+ iconv_t cd = iconv_open ("ISO-2022-CN-EXT", "UTF-8");
+ TEST_VERIFY_EXIT (cd != (iconv_t) -1);
+
+ char *ntf;
+ size_t ntfsize;
+ char *outbufbase;
+ {
+ int pgz = getpagesize ();
+ TEST_VERIFY_EXIT (pgz > 0);
+ ntfsize = 2 * pgz;
+
+ ntf = xmmap (NULL, ntfsize, PROT_READ | PROT_WRITE, MAP_PRIVATE
+ | MAP_ANONYMOUS, -1);
+ xmprotect (ntf + pgz, pgz, PROT_NONE);
+
+ outbufbase = ntf + pgz;
+ }
+
+ /* Check if SOdesignation escape sequence does not trigger an OOB write. */
+ {
+ char inbuf[] = "\xe4\xba\xa4\xe6\x8d\xa2";
+
+ for (int i = 0; i < 9; i++)
+ {
+ char *inp = inbuf;
+ size_t inleft = sizeof (inbuf) - 1;
+
+ char *outp = outbufbase - i;
+ size_t outleft = i;
+
+ TEST_VERIFY_EXIT (iconv (cd, &inp, &inleft, &outp, &outleft)
+ == (size_t) -1);
+ TEST_COMPARE (errno, E2BIG);
+
+ TEST_VERIFY_EXIT (iconv (cd, NULL, NULL, NULL, NULL) == 0);
+ }
+ }
+
+ /* Same as before for SS2designation. */
+ {
+ char inbuf[] = "㴽 \xe3\xb4\xbd";
+
+ for (int i = 0; i < 14; i++)
+ {
+ char *inp = inbuf;
+ size_t inleft = sizeof (inbuf) - 1;
+
+ char *outp = outbufbase - i;
+ size_t outleft = i;
+
+ TEST_VERIFY_EXIT (iconv (cd, &inp, &inleft, &outp, &outleft)
+ == (size_t) -1);
+ TEST_COMPARE (errno, E2BIG);
+
+ TEST_VERIFY_EXIT (iconv (cd, NULL, NULL, NULL, NULL) == 0);
+ }
+ }
+
+ /* Same as before for SS3designation. */
+ {
+ char inbuf[] = "劄 \xe5\x8a\x84";
+
+ for (int i = 0; i < 14; i++)
+ {
+ char *inp = inbuf;
+ size_t inleft = sizeof (inbuf) - 1;
+
+ char *outp = outbufbase - i;
+ size_t outleft = i;
+
+ TEST_VERIFY_EXIT (iconv (cd, &inp, &inleft, &outp, &outleft)
+ == (size_t) -1);
+ TEST_COMPARE (errno, E2BIG);
+
+ TEST_VERIFY_EXIT (iconv (cd, NULL, NULL, NULL, NULL) == 0);
+ }
+ }
+
+ TEST_VERIFY_EXIT (iconv_close (cd) != -1);
+
+ xmunmap (ntf, ntfsize);
+
+ return 0;
+}
+
+#include <support/test-driver.c>
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment