From 4b2bd20c350a66d4f8a7c2da48b115fd4e06d15e Mon Sep 17 00:00:00 2001 From: Sumera Priyadarsini Date: Wed, 5 Aug 2020 14:27:40 +0530 Subject: [PATCH 001/168] scripts: coccicheck: Add chain mode to list of modes This patch adds chain mode to the list of available modes in coccicheck. Signed-off-by: Sumera Priyadarsini Signed-off-by: Julia Lawall --- scripts/coccicheck | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/coccicheck b/scripts/coccicheck index e04d328210ac..6e37cf36caae 100755 --- a/scripts/coccicheck +++ b/scripts/coccicheck @@ -99,7 +99,7 @@ fi if [ "$MODE" = "" ] ; then if [ "$ONLINE" = "0" ] ; then echo 'You have not explicitly specified the mode to use. Using default "report" mode.' - echo 'Available modes are the following: patch, report, context, org' + echo 'Available modes are the following: patch, report, context, org, chain' echo 'You can specify the mode with "make coccicheck MODE="' echo 'Note however that some modes are not implemented by some semantic patches.' fi From 7a2624e6de03050c9f15af21c216818d5508b5e9 Mon Sep 17 00:00:00 2001 From: Alex Dewar Date: Fri, 21 Aug 2020 00:55:57 +0100 Subject: [PATCH 002/168] coccinelle: add patch rule for dma_alloc_coherent Commit dfd32cad146e ("dma-mapping: remove dma_zalloc_coherent()") removed the definition of dma_zalloc_coherent() and also removed the corresponding patch rule for replacing instances of dma_alloc_coherent + memset in zalloc-simple.cocci (though left the report rule). Add a new patch rule to remove unnecessary calls to memset after allocating with dma_alloc_coherent. While we're at it, fix a couple of typos. Fixes: dfd32cad146e ("dma-mapping: remove dma_zalloc_coherent()") Signed-off-by: Alex Dewar Signed-off-by: Julia Lawall --- scripts/coccinelle/api/alloc/zalloc-simple.cocci | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/scripts/coccinelle/api/alloc/zalloc-simple.cocci b/scripts/coccinelle/api/alloc/zalloc-simple.cocci index 26cda3f48f01..b3d0c3c230c1 100644 --- a/scripts/coccinelle/api/alloc/zalloc-simple.cocci +++ b/scripts/coccinelle/api/alloc/zalloc-simple.cocci @@ -127,6 +127,16 @@ statement S; if ((x==NULL) || ...) S - memset((T2)x,0,E1); +@depends on patch@ +type T, T2; +expression x; +expression E1,E2,E3,E4; +statement S; +@@ + x = (T)dma_alloc_coherent(E1, E2, E3, E4); + if ((x==NULL) || ...) S +- memset((T2)x, 0, E2); + //---------------------------------------------------------- // For org mode //---------------------------------------------------------- @@ -199,9 +209,9 @@ statement S; position p; @@ - x = (T)dma_alloc_coherent@p(E2,E1,E3,E4); + x = (T)dma_alloc_coherent@p(E1,E2,E3,E4); if ((x==NULL) || ...) S - memset((T2)x,0,E1); + memset((T2)x,0,E2); @script:python depends on org@ p << r2.p; @@ -217,7 +227,7 @@ p << r2.p; x << r2.x; @@ -msg="WARNING: dma_alloc_coherent use in %s already zeroes out memory, so memset is not needed" % (x) +msg="WARNING: dma_alloc_coherent used in %s already zeroes out memory, so memset is not needed" % (x) coccilib.report.print_report(p[0], msg) //----------------------------------------------------------------- From a2fc3718bc22e85378085568ecc5765fb28cabce Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 21 Aug 2020 23:11:37 +0300 Subject: [PATCH 003/168] coccinelle: api: add kobj_to_dev.cocci script Use kobj_to_dev() instead of container_of(). Signed-off-by: Denis Efremov Signed-off-by: Julia Lawall --- scripts/coccinelle/api/kobj_to_dev.cocci | 45 ++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 scripts/coccinelle/api/kobj_to_dev.cocci diff --git a/scripts/coccinelle/api/kobj_to_dev.cocci b/scripts/coccinelle/api/kobj_to_dev.cocci new file mode 100644 index 000000000000..cd5d31c6fe76 --- /dev/null +++ b/scripts/coccinelle/api/kobj_to_dev.cocci @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-only +/// +/// Use kobj_to_dev() instead of container_of() +/// +// Confidence: High +// Copyright: (C) 2020 Denis Efremov ISPRAS +// Options: --no-includes --include-headers +// +// Keywords: kobj_to_dev, container_of +// + +virtual context +virtual report +virtual org +virtual patch + + +@r depends on !patch@ +expression ptr; +symbol kobj; +position p; +@@ + +* container_of(ptr, struct device, kobj)@p + + +@depends on patch@ +expression ptr; +@@ + +- container_of(ptr, struct device, kobj) ++ kobj_to_dev(ptr) + + +@script:python depends on report@ +p << r.p; +@@ + +coccilib.report.print_report(p[0], "WARNING opportunity for kobj_to_dev()") + +@script:python depends on org@ +p << r.p; +@@ + +coccilib.org.print_todo(p[0], "WARNING opportunity for kobj_to_dev()") From 160c7ba34605d9b59ee406a1b4a61b0f942b1ae9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jul 2020 16:25:43 -0700 Subject: [PATCH 004/168] lib: Add backtrace_idle parameter to force backtrace of idle CPUs Currently, the nmi_cpu_backtrace() declines to produce backtraces for idle CPUs. This is a good choice in the common case in which problems are caused only by non-idle CPUs. However, there are occasionally situations in which idle CPUs are helping to cause problems. This commit therefore adds an nmi_backtrace.backtrace_idle kernel boot parameter that causes nmi_cpu_backtrace() to dump stacks even of idle CPUs. Signed-off-by: Paul E. McKenney Cc: Jonathan Corbet Cc: Thomas Gleixner Cc: Andrew Morton Cc: Greg Kroah-Hartman Cc: --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ lib/nmi_backtrace.c | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bdc1f33fd3d1..5e6d19182c5f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3073,6 +3073,10 @@ and gids from such clients. This is intended to ease migration from NFSv2/v3. + nmi_backtrace.backtrace_idle [KNL] + Dump stacks even of idle CPUs in response to an + NMI stack-backtrace request. + nmi_debug= [KNL,SH] Specify one or more actions to take when a NMI is triggered. Format: [state][,regs][,debounce][,die] diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 15ca78e1c7d4..8abe1870dba4 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -85,12 +85,16 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask, put_cpu(); } +// Dump stacks even for idle CPUs. +static bool backtrace_idle; +module_param(backtrace_idle, bool, 0644); + bool nmi_cpu_backtrace(struct pt_regs *regs) { int cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - if (regs && cpu_in_idle(instruction_pointer(regs))) { + if (!READ_ONCE(backtrace_idle) && regs && cpu_in_idle(instruction_pointer(regs))) { pr_warn("NMI backtrace for cpu %d skipped: idling at %pS\n", cpu, (void *)instruction_pointer(regs)); } else { From 77f808607a62c3685381a5732a88b30bad8893b5 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 2 Jul 2020 18:28:10 +0200 Subject: [PATCH 005/168] docs: Fix typo in synchronize_rcu() function name s/sychronize_rcu/synchronize_rcu/ Signed-off-by: Tobias Klauser Signed-off-by: Paul E. McKenney --- Documentation/RCU/whatisRCU.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index c7f147b8034f..fb3ff76c3e73 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -360,7 +360,7 @@ order to amortize their overhead over many uses of the corresponding APIs. There are at least three flavors of RCU usage in the Linux kernel. The diagram above shows the most common one. On the updater side, the rcu_assign_pointer(), -sychronize_rcu() and call_rcu() primitives used are the same for all three +synchronize_rcu() and call_rcu() primitives used are the same for all three flavors. However for protection (on the reader side), the primitives used vary depending on the flavor: From 1b98b7c5eb2f94eddad541d6fc91f1d1995d644b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 3 Jul 2020 14:33:41 -0700 Subject: [PATCH 006/168] doc: Drop doubled words from RCU Data-Structures.rst Drop the doubled word "the". Signed-off-by: Randy Dunlap Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: rcu@vger.kernel.org Signed-off-by: Paul E. McKenney --- Documentation/RCU/Design/Data-Structures/Data-Structures.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst index 4a48e20a46f2..f4efd6897b09 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst @@ -963,7 +963,7 @@ exit and perhaps also vice versa. Therefore, whenever the ``->dynticks_nesting`` field is incremented up from zero, the ``->dynticks_nmi_nesting`` field is set to a large positive number, and whenever the ``->dynticks_nesting`` field is decremented down to zero, -the the ``->dynticks_nmi_nesting`` field is set to zero. Assuming that +the ``->dynticks_nmi_nesting`` field is set to zero. Assuming that the number of misnested interrupts is not sufficient to overflow the counter, this approach corrects the ``->dynticks_nmi_nesting`` field every time the corresponding CPU enters the idle loop from process From 7f45d6f8ae383ed01070883b3c74ee51c9740065 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 3 Jul 2020 14:33:42 -0700 Subject: [PATCH 007/168] doc: Drop doubled words from RCU requirements documentation Drop the doubled words "to" and "for". Signed-off-by: Randy Dunlap Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: rcu@vger.kernel.org Signed-off-by: Paul E. McKenney --- Documentation/RCU/Design/Requirements/Requirements.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst index 8f41ad0aa753..1ae79a10a8de 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.rst +++ b/Documentation/RCU/Design/Requirements/Requirements.rst @@ -2162,7 +2162,7 @@ scheduling-clock interrupt be enabled when RCU needs it to be: this sort of thing. #. If a CPU is in a portion of the kernel that is absolutely positively no-joking guaranteed to never execute any RCU read-side critical - sections, and RCU believes this CPU to to be idle, no problem. This + sections, and RCU believes this CPU to be idle, no problem. This sort of thing is used by some architectures for light-weight exception handlers, which can then avoid the overhead of ``rcu_irq_enter()`` and ``rcu_irq_exit()`` at exception entry and @@ -2431,7 +2431,7 @@ However, there are legitimate preemptible-RCU implementations that do not have this property, given that any point in the code outside of an RCU read-side critical section can be a quiescent state. Therefore, *RCU-sched* was created, which follows “classic” RCU in that an -RCU-sched grace period waits for for pre-existing interrupt and NMI +RCU-sched grace period waits for pre-existing interrupt and NMI handlers. In kernels built with ``CONFIG_PREEMPT=n``, the RCU and RCU-sched APIs have identical implementations, while kernels built with ``CONFIG_PREEMPT=y`` provide a separate implementation for each. From ebc3505d507cf0aafdc31e4b2359c9b22b3927c8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Jun 2020 13:25:26 -0700 Subject: [PATCH 008/168] rcu: Remove KCSAN stubs KCSAN is now in mainline, so this commit removes the stubs for the data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8ce77d9ac716..eb36779697ca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -70,19 +70,6 @@ #endif #define MODULE_PARAM_PREFIX "rcutree." -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - /* Data structures. */ /* From beb27bd649a08655b6e15b71265fccad9c00bd2c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Jun 2020 13:26:20 -0700 Subject: [PATCH 009/168] rcu: Remove KCSAN stubs from update.c KCSAN is now in mainline, so this commit removes the stubs for the data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros. Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 2de49b5d8dd2..5f7713a27dbb 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -53,19 +53,6 @@ #endif #define MODULE_PARAM_PREFIX "rcupdate." -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - #ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); From d9b60741318f6f8bcb2adc4beaef724c923fcb93 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Jun 2020 13:24:04 -0700 Subject: [PATCH 010/168] srcu: Remove KCSAN stubs KCSAN is now in mainline, so this commit removes the stubs for the data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c100acf332ed..c13348ee80a5 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -29,19 +29,6 @@ #include "rcu.h" #include "rcu_segcblist.h" -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - /* Holdoff in nanoseconds for auto-expediting. */ #define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; From 7487ea07dfa9bd782a13469cab18973ea0ab8c57 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 18 Jun 2020 09:51:12 -0700 Subject: [PATCH 011/168] rcu: Initialize at declaration time in rcu_exp_handler() This commit moves the initialization of the CONFIG_PREEMPT=n version of the rcu_exp_handler() function's rdp and rnp local variables into their respective declarations to save a couple lines of code. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1888c0eb1216..8760b6ead770 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -732,11 +732,9 @@ static void rcu_exp_need_qs(void) /* Invoked on each online non-idle CPU for expedited quiescent state. */ static void rcu_exp_handler(void *unused) { - struct rcu_data *rdp; - struct rcu_node *rnp; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; - rdp = this_cpu_ptr(&rcu_data); - rnp = rdp->mynode; if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) return; From c30068f41a0e899f870e0158a2c69c68d738bf96 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 18 Jun 2020 21:36:39 -0400 Subject: [PATCH 012/168] rcu/trace: Print negative GP numbers correctly GP numbers start from -300 and gp_seq numbers start of -1200 (for a shift of 2). These negative numbers are printed as unsigned long which not only takes up more text space, but is rather confusing to the reader as they have to constantly expend energy to truncate the number. Just print the negative numbering directly. Cc: Uladzislau Rezki (Sony) Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 54 +++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index ced71237b7e4..155b5cb43cfd 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -74,17 +74,17 @@ TRACE_EVENT_RCU(rcu_grace_period, TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gp_seq) + __field(long, gp_seq) __field(const char *, gpevent) ), TP_fast_assign( __entry->rcuname = rcuname; - __entry->gp_seq = gp_seq; + __entry->gp_seq = (long)gp_seq; __entry->gpevent = gpevent; ), - TP_printk("%s %lu %s", + TP_printk("%s %ld %s", __entry->rcuname, __entry->gp_seq, __entry->gpevent) ); @@ -114,8 +114,8 @@ TRACE_EVENT_RCU(rcu_future_grace_period, TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gp_seq) - __field(unsigned long, gp_seq_req) + __field(long, gp_seq) + __field(long, gp_seq_req) __field(u8, level) __field(int, grplo) __field(int, grphi) @@ -124,16 +124,16 @@ TRACE_EVENT_RCU(rcu_future_grace_period, TP_fast_assign( __entry->rcuname = rcuname; - __entry->gp_seq = gp_seq; - __entry->gp_seq_req = gp_seq_req; + __entry->gp_seq = (long)gp_seq; + __entry->gp_seq_req = (long)gp_seq_req; __entry->level = level; __entry->grplo = grplo; __entry->grphi = grphi; __entry->gpevent = gpevent; ), - TP_printk("%s %lu %lu %u %d %d %s", - __entry->rcuname, __entry->gp_seq, __entry->gp_seq_req, __entry->level, + TP_printk("%s %ld %ld %u %d %d %s", + __entry->rcuname, (long)__entry->gp_seq, (long)__entry->gp_seq_req, __entry->level, __entry->grplo, __entry->grphi, __entry->gpevent) ); @@ -153,7 +153,7 @@ TRACE_EVENT_RCU(rcu_grace_period_init, TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gp_seq) + __field(long, gp_seq) __field(u8, level) __field(int, grplo) __field(int, grphi) @@ -162,14 +162,14 @@ TRACE_EVENT_RCU(rcu_grace_period_init, TP_fast_assign( __entry->rcuname = rcuname; - __entry->gp_seq = gp_seq; + __entry->gp_seq = (long)gp_seq; __entry->level = level; __entry->grplo = grplo; __entry->grphi = grphi; __entry->qsmask = qsmask; ), - TP_printk("%s %lu %u %d %d %lx", + TP_printk("%s %ld %u %d %d %lx", __entry->rcuname, __entry->gp_seq, __entry->level, __entry->grplo, __entry->grphi, __entry->qsmask) ); @@ -197,17 +197,17 @@ TRACE_EVENT_RCU(rcu_exp_grace_period, TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gpseq) + __field(long, gpseq) __field(const char *, gpevent) ), TP_fast_assign( __entry->rcuname = rcuname; - __entry->gpseq = gpseq; + __entry->gpseq = (long)gpseq; __entry->gpevent = gpevent; ), - TP_printk("%s %lu %s", + TP_printk("%s %ld %s", __entry->rcuname, __entry->gpseq, __entry->gpevent) ); @@ -316,17 +316,17 @@ TRACE_EVENT_RCU(rcu_preempt_task, TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gp_seq) + __field(long, gp_seq) __field(int, pid) ), TP_fast_assign( __entry->rcuname = rcuname; - __entry->gp_seq = gp_seq; + __entry->gp_seq = (long)gp_seq; __entry->pid = pid; ), - TP_printk("%s %lu %d", + TP_printk("%s %ld %d", __entry->rcuname, __entry->gp_seq, __entry->pid) ); @@ -343,17 +343,17 @@ TRACE_EVENT_RCU(rcu_unlock_preempted_task, TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gp_seq) + __field(long, gp_seq) __field(int, pid) ), TP_fast_assign( __entry->rcuname = rcuname; - __entry->gp_seq = gp_seq; + __entry->gp_seq = (long)gp_seq; __entry->pid = pid; ), - TP_printk("%s %lu %d", __entry->rcuname, __entry->gp_seq, __entry->pid) + TP_printk("%s %ld %d", __entry->rcuname, __entry->gp_seq, __entry->pid) ); /* @@ -374,7 +374,7 @@ TRACE_EVENT_RCU(rcu_quiescent_state_report, TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gp_seq) + __field(long, gp_seq) __field(unsigned long, mask) __field(unsigned long, qsmask) __field(u8, level) @@ -385,7 +385,7 @@ TRACE_EVENT_RCU(rcu_quiescent_state_report, TP_fast_assign( __entry->rcuname = rcuname; - __entry->gp_seq = gp_seq; + __entry->gp_seq = (long)gp_seq; __entry->mask = mask; __entry->qsmask = qsmask; __entry->level = level; @@ -394,7 +394,7 @@ TRACE_EVENT_RCU(rcu_quiescent_state_report, __entry->gp_tasks = gp_tasks; ), - TP_printk("%s %lu %lx>%lx %u %d %d %u", + TP_printk("%s %ld %lx>%lx %u %d %d %u", __entry->rcuname, __entry->gp_seq, __entry->mask, __entry->qsmask, __entry->level, __entry->grplo, __entry->grphi, __entry->gp_tasks) @@ -415,19 +415,19 @@ TRACE_EVENT_RCU(rcu_fqs, TP_STRUCT__entry( __field(const char *, rcuname) - __field(unsigned long, gp_seq) + __field(long, gp_seq) __field(int, cpu) __field(const char *, qsevent) ), TP_fast_assign( __entry->rcuname = rcuname; - __entry->gp_seq = gp_seq; + __entry->gp_seq = (long)gp_seq; __entry->cpu = cpu; __entry->qsevent = qsevent; ), - TP_printk("%s %lu %d %s", + TP_printk("%s %ld %d %s", __entry->rcuname, __entry->gp_seq, __entry->cpu, __entry->qsevent) ); From a7886e899fd8334a03d37e66ad10295d175725ea Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 18 Jun 2020 21:36:40 -0400 Subject: [PATCH 013/168] rcu/trace: Use gp_seq_req in acceleration's rcu_grace_period tracepoint During acceleration of CB, the rsp's gp_seq is rcu_seq_snap'd. This is the value used for acceleration - it is the value of gp_seq at which it is safe the execute all callbacks in the callback list. The rdp's gp_seq is not very useful for this scenario. Make rcu_grace_period report the gp_seq_req instead as it allows one to reason about how the acceleration works. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index eb36779697ca..896912034982 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1483,9 +1483,10 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) - trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccWaitCB")); + trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccWaitCB")); else - trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccReadyCB")); + trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB")); + return ret; } From e082c7b38185af0f59e55efff840939c35391f85 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Jun 2020 09:25:34 -0700 Subject: [PATCH 014/168] nocb: Clarify RCU nocb CPU error message A message of the form "rcu: !!! lDTs ." can be tracked down, but doing so is not trivial. This commit therefore eases this process by adding text so that this error message now reads as follows: "rcu: nocb GP activity on CB-only CPU!!! lDTs ." Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 982fc5be5269..bbc0c07ce56e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2417,7 +2417,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) !waslocked && !wastimer && !wassleep) return; /* Nothing untowards. */ - pr_info(" !!! %c%c%c%c %c\n", + pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c%c %c\n", "lL"[waslocked], "dD"[!!rdp->nocb_defer_wakeup], "tT"[wastimer], From 9c39245382de4d52a122641952900709d4a9950b Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Mon, 22 Jun 2020 00:07:27 +0530 Subject: [PATCH 015/168] rcu/tree: Force quiescent state on callback overload On callback overload, it is necessary to quickly detect idle CPUs, and rcu_gp_fqs_check_wake() checks for this condition. Unfortunately, the code following the call to this function does not repeat this check, which means that in reality no actual quiescent-state forcing, instead only a couple of quick and pointless wakeups at the beginning of the grace period. This commit therefore adds a check for the RCU_GP_FLAG_OVLD flag in the post-wakeup "if" statement in rcu_gp_fqs_loop(). Fixes: 1fca4d12f4637 ("rcu: Expedite first two FQS scans under callback-overload conditions") Reviewed-by: Joel Fernandes (Google) Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 896912034982..4770d7709dc2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1884,7 +1884,7 @@ static void rcu_gp_fqs_loop(void) break; /* If time for quiescent-state forcing, do it. */ if (!time_after(rcu_state.jiffies_force_qs, jiffies) || - (gf & RCU_GP_FLAG_FQS)) { + (gf & (RCU_GP_FLAG_FQS | RCU_GP_FLAG_OVLD))) { trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); rcu_gp_fqs(first_gp_fqs); From 9b1ce0acb5e65e9ea1e6b322562d072f9f7d1f6e Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Mon, 22 Jun 2020 23:37:03 +0530 Subject: [PATCH 016/168] rcu/tree: Remove CONFIG_PREMPT_RCU check in force_qs_rnp() Originally, the call to rcu_preempt_blocked_readers_cgp() from force_qs_rnp() had to be conditioned on CONFIG_PREEMPT_RCU=y, as in commit a77da14ce9af ("rcu: Yet another fix for preemption and CPU hotplug"). However, there is now a CONFIG_PREEMPT_RCU=n definition of rcu_preempt_blocked_readers_cgp() that unconditionally returns zero, so invoking it is now safe. In addition, the CONFIG_PREEMPT_RCU=n definition of rcu_initiate_boost() simply releases the rcu_node structure's ->lock, which is what happens when the "if" condition evaluates to false. This commit therefore drops the IS_ENABLED(CONFIG_PREEMPT_RCU) check, so that rcu_initiate_boost() is called only in CONFIG_PREEMPT_RCU=y kernels when there are readers blocking the current grace period. This does not change the behavior, but reduces code-reader confusion by eliminating non-CONFIG_PREEMPT_RCU=y calls to rcu_initiate_boost(). Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4770d7709dc2..acc926f07dc1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2533,8 +2533,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) raw_spin_lock_irqsave_rcu_node(rnp, flags); rcu_state.cbovldnext |= !!rnp->cbovldmask; if (rnp->qsmask == 0) { - if (!IS_ENABLED(CONFIG_PREEMPT_RCU) || - rcu_preempt_blocked_readers_cgp(rnp)) { + if (rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they * are all zero. But we might need to From 2130c6b4f610ea65e9df71dfa79ee08f2fc17743 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Jun 2020 16:46:43 -0700 Subject: [PATCH 017/168] nocb: Remove show_rcu_nocb_state() false positive printout The rcu_data structure's ->nocb_timer field is used to defer wakeups of the corresponding no-CBs CPU's grace-period kthread ("rcuog*"), and that structure's ->nocb_defer_wakeup field is used to track such deferral. This means that the show_rcu_nocb_state() printing an error when those fields are set for a CPU not corresponding to a no-CBs grace-period kthread is erroneous. This commit therefore switches the check from ->nocb_timer to ->nocb_bypass_timer and removes the check of ->nocb_defer_wakeup. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index bbc0c07ce56e..4d63ee3de7a9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2411,10 +2411,9 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) return; waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); - wastimer = timer_pending(&rdp->nocb_timer); + wastimer = timer_pending(&rdp->nocb_bypass_timer); wassleep = swait_active(&rdp->nocb_gp_wq); - if (!rdp->nocb_defer_wakeup && !rdp->nocb_gp_sleep && - !waslocked && !wastimer && !wassleep) + if (!rdp->nocb_gp_sleep && !waslocked && !wastimer && !wassleep) return; /* Nothing untowards. */ pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c%c %c\n", From b5374b2df0ac1c78895b8eb8d9582a7bdc67257d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Jun 2020 17:09:27 -0700 Subject: [PATCH 018/168] rcu: Add READ_ONCE() to rcu_do_batch() access to rcu_divisor Given that sysfs can change the value of rcu_divisor at any time, this commit adds a READ_ONCE to the sole access to that variable. While in the area, this commit also adds bounds checking, clamping the value to a shift that makes sense for a signed long. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index acc926f07dc1..1dca14cf66f9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2362,6 +2362,7 @@ int rcutree_dead_cpu(unsigned int cpu) */ static void rcu_do_batch(struct rcu_data *rdp) { + int div; unsigned long flags; const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rcu_segcblist_is_offloaded(&rdp->cblist); @@ -2390,7 +2391,9 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_lock(rdp); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); pending = rcu_segcblist_n_cbs(&rdp->cblist); - bl = max(rdp->blimit, pending >> rcu_divisor); + div = READ_ONCE(rcu_divisor); + div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; + bl = max(rdp->blimit, pending >> div); if (unlikely(bl > 100)) tlimit = local_clock() + rcu_resched_ns; trace_rcu_batch_start(rcu_state.name, From a2b354b9950bb859d8d959f951dda26725b041fb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Jun 2020 17:49:40 -0700 Subject: [PATCH 019/168] rcu: Add READ_ONCE() to rcu_do_batch() access to rcu_resched_ns Given that sysfs can change the value of rcu_resched_ns at any time, this commit adds a READ_ONCE() to the sole access to that variable. While in the area, this commit also adds bounds checking, clamping the value to at least a millisecond, but no longer than a second. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1dca14cf66f9..da05afc53493 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2394,8 +2394,12 @@ static void rcu_do_batch(struct rcu_data *rdp) div = READ_ONCE(rcu_divisor); div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; bl = max(rdp->blimit, pending >> div); - if (unlikely(bl > 100)) - tlimit = local_clock() + rcu_resched_ns; + if (unlikely(bl > 100)) { + long rrn = READ_ONCE(rcu_resched_ns); + + rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn; + tlimit = local_clock() + rrn; + } trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); From fe63b723cc7ca3a91ea91274e0f2cba29452b3fa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Jun 2020 18:04:45 -0700 Subject: [PATCH 020/168] rcu: Add READ_ONCE() to rcu_do_batch() access to rcu_kick_kthreads Given that sysfs can change the value of rcu_kick_kthreads at any time, this commit adds a READ_ONCE() to the sole access to that variable. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index b5d3b4794db4..a1780a621b5e 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -158,7 +158,7 @@ static void rcu_stall_kick_kthreads(void) { unsigned long j; - if (!rcu_kick_kthreads) + if (!READ_ONCE(rcu_kick_kthreads)) return; j = READ_ONCE(rcu_state.jiffies_kick_kthreads); if (time_after(jiffies, j) && rcu_state.gp_kthread && @@ -580,7 +580,7 @@ static void check_cpu_stall(struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; - if ((rcu_stall_is_suppressed() && !rcu_kick_kthreads) || + if ((rcu_stall_is_suppressed() && !READ_ONCE(rcu_kick_kthreads)) || !rcu_gp_in_progress()) return; rcu_stall_kick_kthreads(); From 1ef5a442a113d140580b3b8bbd6f50c9f7746397 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Jun 2020 20:57:59 -0700 Subject: [PATCH 021/168] rcu: Add READ_ONCE() to rcu_do_batch() access to rcu_cpu_stall_ftrace_dump Given that sysfs can change the value of rcu_cpu_stall_ftrace_dump at any time, this commit adds a READ_ONCE() to the accesses to that variable. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index a1780a621b5e..0fde39b8daab 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -623,7 +623,7 @@ static void check_cpu_stall(struct rcu_data *rdp) /* We haven't checked in, so go dump stack. */ print_cpu_stall(gps); - if (rcu_cpu_stall_ftrace_dump) + if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); } else if (rcu_gp_in_progress() && @@ -632,7 +632,7 @@ static void check_cpu_stall(struct rcu_data *rdp) /* They had a few time units to dump stack, so complain. */ print_other_cpu_stall(gs2, gps); - if (rcu_cpu_stall_ftrace_dump) + if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); } } From 000601bb62330f18dc8f5d2d0b82e9aec3e207c4 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 9 Jul 2020 15:05:59 +0200 Subject: [PATCH 022/168] rcu: Fix kerneldoc comments in rcupdate.h This commit fixes the kerneldoc comments for rcu_read_unlock_bh(), rcu_read_unlock_sched() and rcu_head_after_call_rcu() so they e.g. get properly linked in the API documentation. Also add parenthesis after function names to match the notation used in other kerneldoc comments in the same file. Signed-off-by: Tobias Klauser Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d15d46db61f7..b47d6b66665e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -709,8 +709,8 @@ static inline void rcu_read_lock_bh(void) "rcu_read_lock_bh() used illegally while idle"); } -/* - * rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section +/** + * rcu_read_unlock_bh() - marks the end of a softirq-only RCU critical section * * See rcu_read_lock_bh() for more information. */ @@ -751,10 +751,10 @@ static inline notrace void rcu_read_lock_sched_notrace(void) __acquire(RCU_SCHED); } -/* - * rcu_read_unlock_sched - marks the end of a RCU-classic critical section +/** + * rcu_read_unlock_sched() - marks the end of a RCU-classic critical section * - * See rcu_read_lock_sched for more information. + * See rcu_read_lock_sched() for more information. */ static inline void rcu_read_unlock_sched(void) { @@ -945,7 +945,7 @@ static inline void rcu_head_init(struct rcu_head *rhp) } /** - * rcu_head_after_call_rcu - Has this rcu_head been passed to call_rcu()? + * rcu_head_after_call_rcu() - Has this rcu_head been passed to call_rcu()? * @rhp: The rcu_head structure to test. * @f: The function passed to call_rcu() along with @rhp. * From ae2212a7216b674633bdc3bd2e24947a0665efb8 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Sun, 12 Jul 2020 18:40:02 +0530 Subject: [PATCH 023/168] rculist: Introduce list/hlist_for_each_entry_srcu() macros list/hlist_for_each_entry_rcu() provides an optional cond argument to specify the lock held in the updater side. However for SRCU read side, not providing the cond argument results into false positive as whether srcu_read_lock is held or not is not checked implicitly. Therefore, on read side the lockdep expression srcu_read_lock_held(srcu struct) can solve this issue. However, the function still fails to check the cases where srcu protected list is traversed with rcu_read_lock() instead of srcu_read_lock(). Therefore, to remove the false negative, this patch introduces two new list traversal primitives : list_for_each_entry_srcu() and hlist_for_each_entry_srcu(). Both of the functions have non-optional cond argument as it is required for both read and update side, and simply checks if the cond is true. For regular read side the lockdep expression srcu_read_lock_head() can be passed as the cond argument to list/hlist_for_each_entry_srcu(). Suggested-by: Paolo Bonzini Tested-by: Suraj Upadhyay Tested-by: Naresh Kamboju [ paulmck: Add "true" per kbuild test robot feedback. ] Signed-off-by: Madhuparna Bhowmik Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 48 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 7a6fc9956510..f8633d37e358 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -63,9 +63,17 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) RCU_LOCKDEP_WARN(!(cond) && !rcu_read_lock_any_held(), \ "RCU-list traversed in non-reader section!"); \ }) + +#define __list_check_srcu(cond) \ + ({ \ + RCU_LOCKDEP_WARN(!(cond), \ + "RCU-list traversed without holding the required lock!");\ + }) #else #define __list_check_rcu(dummy, cond, extra...) \ ({ check_arg_count_one(extra); }) + +#define __list_check_srcu(cond) ({ }) #endif /* @@ -385,6 +393,25 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) +/** + * list_for_each_entry_srcu - iterate over rcu list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * @cond: lockdep expression for the lock required to traverse the list. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as list_add_rcu() + * as long as the traversal is guarded by srcu_read_lock(). + * The lockdep expression srcu_read_lock_held() can be passed as the + * cond argument from read side. + */ +#define list_for_each_entry_srcu(pos, head, member, cond) \ + for (__list_check_srcu(cond), \ + pos = list_entry_rcu((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) + /** * list_entry_lockless - get the struct for this entry * @ptr: the &struct list_head pointer. @@ -683,6 +710,27 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) +/** + * hlist_for_each_entry_srcu - iterate over rcu list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + * @cond: lockdep expression for the lock required to traverse the list. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as hlist_add_head_rcu() + * as long as the traversal is guarded by srcu_read_lock(). + * The lockdep expression srcu_read_lock_held() can be passed as the + * cond argument from read side. + */ +#define hlist_for_each_entry_srcu(pos, head, member, cond) \ + for (__list_check_srcu(cond), \ + pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\ + typeof(*(pos)), member); \ + pos; \ + pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ + &(pos)->member)), typeof(*(pos)), member)) + /** * hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing) * @pos: the type * to use as a loop cursor. From df9a30fd1f70a757df193acd7396622eee23e527 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Sun, 12 Jul 2020 18:40:03 +0530 Subject: [PATCH 024/168] kvm: mmu: page_track: Fix RCU list API usage Use hlist_for_each_entry_srcu() instead of hlist_for_each_entry_rcu() as it also checkes if the right lock is held. Using hlist_for_each_entry_rcu() with a condition argument will not report the cases where a SRCU protected list is traversed using rcu_read_lock(). Hence, use hlist_for_each_entry_srcu(). Signed-off-by: Madhuparna Bhowmik Signed-off-by: Paul E. McKenney Acked-by: Paolo Bonzini Cc: --- arch/x86/kvm/mmu/page_track.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index a84a141a2ad2..8443a675715b 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -229,7 +229,8 @@ void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, return; idx = srcu_read_lock(&head->track_srcu); - hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) + hlist_for_each_entry_srcu(n, &head->track_notifier_list, node, + srcu_read_lock_held(&head->track_srcu)) if (n->track_write) n->track_write(vcpu, gpa, new, bytes, n); srcu_read_unlock(&head->track_srcu, idx); @@ -254,7 +255,8 @@ void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot) return; idx = srcu_read_lock(&head->track_srcu); - hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) + hlist_for_each_entry_srcu(n, &head->track_notifier_list, node, + srcu_read_lock_held(&head->track_srcu)) if (n->track_flush_slot) n->track_flush_slot(kvm, slot, n); srcu_read_unlock(&head->track_srcu, idx); From c0f97f20e5d97a1358ade650fcf6a322c0c9bc72 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Jul 2020 20:22:05 -0700 Subject: [PATCH 025/168] rcu: Move rcu_cpu_started per-CPU variable to rcu_data When the rcu_cpu_started per-CPU variable was added by commit f64c6013a202 ("rcu/x86: Provide early rcu_cpu_starting() callback"), there were multiple sets of per-CPU rcu_data structures. Therefore, the rcu_cpu_started flag was added as a separate per-CPU variable. But now there is only one set of per-CPU rcu_data structures, so this commit moves rcu_cpu_started to a new ->cpu_started field in that structure. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 +++++-------- kernel/rcu/tree.h | 1 + 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index da05afc53493..52108dd92169 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3967,8 +3967,6 @@ int rcutree_offline_cpu(unsigned int cpu) return 0; } -static DEFINE_PER_CPU(int, rcu_cpu_started); - /* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. Note that this means that @@ -3988,12 +3986,11 @@ void rcu_cpu_starting(unsigned int cpu) struct rcu_node *rnp; bool newcpu; - if (per_cpu(rcu_cpu_started, cpu)) - return; - - per_cpu(rcu_cpu_started, cpu) = 1; - rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->cpu_started) + return; + rdp->cpu_started = true; + rnp = rdp->mynode; mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -4053,7 +4050,7 @@ void rcu_report_dead(unsigned int cpu) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); raw_spin_unlock(&rcu_state.ofl_lock); - per_cpu(rcu_cpu_started, cpu) = 0; + rdp->cpu_started = false; } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index c96ae351688b..309bc7f41d35 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -156,6 +156,7 @@ struct rcu_data { bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */ + bool cpu_started; /* RCU watching this onlining CPU. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ unsigned long ticks_this_gp; /* The number of scheduling-clock */ From 4569c5ee95d5695bfd794ae968c2d59b3e69129a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Aug 2020 10:35:16 -0700 Subject: [PATCH 026/168] rcu/nocb: Add a warning for non-GP kthread running GP code This commit increases RCU's ability to defend itself by emitting a warning if one of the nocb CB kthreads invokes the GP kthread's wait function. This warning augments a similar check that is carried out at the end of rcutorture testing and when RCU CPU stall warnings are emitted. The problem with those checks is that the miscreants have long since departed and disposed of any and all evidence. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4d63ee3de7a9..cb1e8c8befb9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1926,6 +1926,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) * nearest grace period (if any) to wait for next. The CB kthreads * and the global grace-period kthread are awakened if needed. */ + WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); rcu_nocb_lock_irqsave(rdp, flags); From f37599e6f06da47e49c3408afe66c5b6e83a90bd Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 7 Aug 2020 13:07:19 -0400 Subject: [PATCH 027/168] rcu: Clarify comments about FQS loop reporting quiescent states Since at least v4.19, the FQS loop no longer reports quiescent states for offline CPUs except in emergency situations. This commit therefore fixes the comment in rcu_gp_init() to match the current code. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 52108dd92169..2c7afe491c45 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1706,10 +1706,13 @@ static bool rcu_gp_init(void) raw_spin_unlock_irq_rcu_node(rnp); /* - * Apply per-leaf buffered online and offline operations to the - * rcu_node tree. Note that this new grace period need not wait - * for subsequent online CPUs, and that quiescent-state forcing - * will handle subsequent offline CPUs. + * Apply per-leaf buffered online and offline operations to + * the rcu_node tree. Note that this new grace period need not + * wait for subsequent online CPUs, and that RCU hooks in the CPU + * offlining path, when combined with checks in this function, + * will handle CPUs that are currently going offline or that will + * go offline later. Please also refer to "Hotplug CPU" section + * of RCU's Requirements documentation. */ rcu_state.gp_state = RCU_GP_ONOFF; rcu_for_each_leaf_node(rnp) { From 666ca2907e6b75960ce2f0fe50afc5d8a46f296d Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 7 Aug 2020 13:07:20 -0400 Subject: [PATCH 028/168] rcu: Make FQS more aggressive in complaining about offline CPUs The RCU grace-period kthread's force-quiescent state (FQS) loop should never see an offline CPU that has not yet reported a quiescent state. After all, the offline CPU should have reported a quiescent state during the CPU-offline process, or, failing that, by rcu_gp_init() if it ran concurrently with either the CPU going offline or the last task on a leaf rcu_node structure exiting its RCU read-side critical section while all CPUs corresponding to that structure are offline. The FQS loop should therefore complain if it does see an offline CPU that has not yet reported a quiescent state. And it does, but only once the grace period has been in force for a full second. This commit therefore makes this warning more aggressive, so that it will trigger as soon as the condition makes its appearance. Light testing with TREE03 and hotplug shows no warnings. This commit also converts the warning to WARN_ON_ONCE() in order to stave off possible log spam. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2c7afe491c45..396abe0e0d01 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1214,13 +1214,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 1; } - /* If waiting too long on an offline CPU, complain. */ - if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) && - time_after(jiffies, rcu_state.gp_start + HZ)) { + /* + * Complain if a CPU that is considered to be offline from RCU's + * perspective has not yet reported a quiescent state. After all, + * the offline CPU should have reported a quiescent state during + * the CPU-offline process, or, failing that, by rcu_gp_init() + * if it ran concurrently with either the CPU going offline or the + * last task on a leaf rcu_node structure exiting its RCU read-side + * critical section while all CPUs corresponding to that structure + * are offline. This added warning detects bugs in any of these + * code paths. + * + * The rcu_node structure's ->lock is held here, which excludes + * the relevant portions the CPU-hotplug code, the grace-period + * initialization code, and the rcu_read_unlock() code paths. + * + * For more detail, please refer to the "Hotplug CPU" section + * of RCU's Requirements documentation. + */ + if (WARN_ON_ONCE(!(rdp->grpmask & rcu_rnp_online_cpus(rnp)))) { bool onl; struct rcu_node *rnp1; - WARN_ON(1); /* Offline CPUs are supposed to report QS! */ pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", __func__, rnp->grplo, rnp->grphi, rnp->level, (long)rnp->gp_seq, (long)rnp->completedqs); From 7f2a53c231fe5d9522c3b695ab454203904031ac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 17 Aug 2020 10:37:22 -0700 Subject: [PATCH 029/168] rcu: Remove unused __rcu_is_watching() function The x86/entry work removed all uses of __rcu_is_watching(), therefore this commit removes it entirely. Cc: Andy Lutomirski Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 1 - include/linux/rcutree.h | 1 - kernel/entry/common.c | 2 +- kernel/rcu/tree.c | 5 ----- 4 files changed, 1 insertion(+), 8 deletions(-) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 5cc9637cac16..7c1ecdb356d8 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -103,7 +103,6 @@ static inline void rcu_scheduler_starting(void) { } static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } -static inline bool __rcu_is_watching(void) { return true; } static inline void rcu_momentary_dyntick_idle(void) { } static inline void kfree_rcu_scheduler_running(void) { } static inline bool rcu_gp_might_be_stalled(void) { return false; } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index d2f4064ebd1d..59eb5cd567d7 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -64,7 +64,6 @@ extern int rcu_scheduler_active __read_mostly; void rcu_end_inkernel_boot(void); bool rcu_inkernel_boot_has_ended(void); bool rcu_is_watching(void); -bool __rcu_is_watching(void); #ifndef CONFIG_PREEMPTION void rcu_all_qs(void); #endif diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 9852e0d62d95..ad794a10fa80 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -278,7 +278,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * terminate a grace period, if and only if the timer interrupt is * not nested into another interrupt. * - * Checking for __rcu_is_watching() here would prevent the nesting + * Checking for rcu_is_watching() here would prevent the nesting * interrupt to invoke rcu_irq_enter(). If that nested interrupt is * the tick then rcu_flavor_sched_clock_irq() would wrongfully * assume that it is the first interupt and eventually claim diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 396abe0e0d01..232362293678 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1077,11 +1077,6 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) } } -noinstr bool __rcu_is_watching(void) -{ - return !rcu_dynticks_curr_cpu_in_eqs(); -} - /** * rcu_is_watching - see if RCU thinks that the current CPU is not idle * From e9d338a0b1799c988b678e8ccb66a442272e6aa3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 24 Jun 2020 15:59:59 -0700 Subject: [PATCH 030/168] scftorture: Add smp_call_function() torture test This commit adds an smp_call_function() torture test that repeatedly invokes this function and complains if things go badly awry. Signed-off-by: Paul E. McKenney --- .../admin-guide/kernel-parameters.txt | 92 +++++ kernel/Makefile | 2 + kernel/scftorture.c | 350 ++++++++++++++++++ lib/Kconfig.debug | 10 + 4 files changed, 454 insertions(+) create mode 100644 kernel/scftorture.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bdc1f33fd3d1..91a56382ae56 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4637,6 +4637,98 @@ Format: integer between 0 and 10 Default is 0. + scftorture.holdoff= [KNL] + Number of seconds to hold off before starting + test. Defaults to zero for module insertion and + to 10 seconds for built-in smp_call_function() + tests. + + scftorture.longwait= [KNL] + Request ridiculously long waits randomly selected + up to the chosen limit in seconds. Zero (the + default) disables this feature. Please note + that requesting even small non-zero numbers of + seconds can result in RCU CPU stall warnings, + softlockup complaints, and so on. + + scftorture.nthreads= [KNL] + Number of kthreads to spawn to invoke the + smp_call_function() family of functions. + The default of -1 specifies a number of kthreads + equal to the number of CPUs. + + scftorture.onoff_holdoff= [KNL] + Number seconds to wait after the start of the + test before initiating CPU-hotplug operations. + + scftorture.onoff_interval= [KNL] + Number seconds to wait between successive + CPU-hotplug operations. Specifying zero (which + is the default) disables CPU-hotplug operations. + + scftorture.shutdown_secs= [KNL] + The number of seconds following the start of the + test after which to shut down the system. The + default of zero avoids shutting down the system. + Non-zero values are useful for automated tests. + + scftorture.stat_interval= [KNL] + The number of seconds between outputting the + current test statistics to the console. A value + of zero disables statistics output. + + scftorture.stutter_cpus= [KNL] + The number of jiffies to wait between each change + to the set of CPUs under test. + + scftorture.use_cpus_read_lock= [KNL] + Use use_cpus_read_lock() instead of the default + preempt_disable() to disable CPU hotplug + while invoking one of the smp_call_function*() + functions. + + scftorture.verbose= [KNL] + Enable additional printk() statements. + + scftorture.weight_single= [KNL] + The probability weighting to use for the + smp_call_function_single() function with a zero + "wait" parameter. A value of -1 selects the + default if all other weights are -1. However, + if at least one weight has some other value, a + value of -1 will instead select a weight of zero. + + scftorture.weight_single_wait= [KNL] + The probability weighting to use for the + smp_call_function_single() function with a + non-zero "wait" parameter. See weight_single. + + scftorture.weight_many= [KNL] + The probability weighting to use for the + smp_call_function_many() function with a zero + "wait" parameter. See weight_single. + Note well that setting a high probability for + this weighting can place serious IPI load + on the system. + + scftorture.weight_many_wait= [KNL] + The probability weighting to use for the + smp_call_function_many() function with a + non-zero "wait" parameter. See weight_single + and weight_many. + + scftorture.weight_all= [KNL] + The probability weighting to use for the + smp_call_function_all() function with a zero + "wait" parameter. See weight_single and + weight_many. + + scftorture.weight_all_wait= [KNL] + The probability weighting to use for the + smp_call_function_all() function with a + non-zero "wait" parameter. See weight_single + and weight_many. + skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate xtime_lock contention on larger systems, and/or RCU lock contention on all systems with CONFIG_MAXSMP set. diff --git a/kernel/Makefile b/kernel/Makefile index 9a20016d4900..c45f551deaaa 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -133,6 +133,8 @@ KASAN_SANITIZE_stackleak.o := n KCSAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_stackleak.o := n +obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o + $(obj)/configs.o: $(obj)/config_data.gz targets += config_data.gz diff --git a/kernel/scftorture.c b/kernel/scftorture.c new file mode 100644 index 000000000000..44f1e49ba6e9 --- /dev/null +++ b/kernel/scftorture.c @@ -0,0 +1,350 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Torture test for smp_call_function() and friends. +// +// Copyright (C) Facebook, 2020. +// +// Author: Paul E. McKenney + +#define pr_fmt(fmt) fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SCFTORT_STRING "scftorture" +#define SCFTORT_FLAG SCFTORT_STRING ": " + +#define SCFTORTOUT(s, x...) \ + pr_alert(SCFTORT_FLAG s, ## x) + +#define VERBOSE_SCFTORTOUT(s, x...) \ + do { if (verbose) pr_alert(SCFTORT_FLAG s, ## x); } while (0) + +#define VERBOSE_SCFTORTOUT_ERRSTRING(s, x...) \ + do { if (verbose) pr_alert(SCFTORT_FLAG "!!! " s, ## x); } while (0) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney "); + +// Wait until there are multiple CPUs before starting test. +torture_param(int, holdoff, IS_BUILTIN(CONFIG_SCF_TORTURE_TEST) ? 10 : 0, + "Holdoff time before test start (s)"); +torture_param(int, longwait, 0, "Include ridiculously long waits? (seconds)"); +torture_param(int, nthreads, -1, "# threads, defaults to -1 for all CPUs."); +torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); +torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shutdown_secs, 0, "Shutdown time (ms), <= zero to disable."); +torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s."); +torture_param(int, stutter_cpus, 5, "Number of jiffies to change CPUs under test, 0=disable"); +torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU hotplug."); +torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); +torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations."); +torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations."); +torture_param(int, weight_mult, -1, "Testing weight for multi-CPU no-wait operations."); +torture_param(int, weight_mult_wait, -1, "Testing weight for multi-CPU operations."); +torture_param(int, weight_all, -1, "Testing weight for all-CPU no-wait operations."); +torture_param(int, weight_all_wait, -1, "Testing weight for all-CPU operations."); + +char *torture_type = ""; + +#ifdef MODULE +# define SCFTORT_SHUTDOWN 0 +#else +# define SCFTORT_SHUTDOWN 1 +#endif + +torture_param(bool, shutdown, SCFTORT_SHUTDOWN, "Shutdown at end of torture test."); + +struct scf_statistics { + struct task_struct *task; + int cpu; + long long n_single; + long long n_single_wait; + long long n_multi; + long long n_multi_wait; + long long n_all; + long long n_all_wait; +}; + +static struct scf_statistics *scf_stats_p; +static struct task_struct *scf_torture_stats_task; +static DEFINE_PER_CPU(long long, scf_invoked_count); + +// Use to wait for all threads to start. +static atomic_t n_started; +static atomic_t n_errs; +static bool scfdone; + +DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); + +// Print torture statistics. Caller must ensure serialization. +static void scf_torture_stats_print(void) +{ + int cpu; + long long invoked_count = 0; + bool isdone = READ_ONCE(scfdone); + + for_each_possible_cpu(cpu) + invoked_count += data_race(per_cpu(scf_invoked_count, cpu)); + pr_alert("%s scf_invoked_count %s: %lld ", + SCFTORT_FLAG, isdone ? "VER" : "ver", invoked_count); + torture_onoff_stats(); + pr_cont("\n"); +} + +// Periodically prints torture statistics, if periodic statistics printing +// was specified via the stat_interval module parameter. +static int +scf_torture_stats(void *arg) +{ + VERBOSE_TOROUT_STRING("scf_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + scf_torture_stats_print(); + torture_shutdown_absorb("scf_torture_stats"); + } while (!torture_must_stop()); + torture_kthread_stopping("scf_torture_stats"); + return 0; +} + +// Update statistics and occasionally burn up mass quantities of CPU time, +// if told to do so via scftorture.longwait. Otherwise, occasionally burn +// a little bit. +static void scf_handler(void *unused) +{ + int i; + int j; + unsigned long r = torture_random(this_cpu_ptr(&scf_torture_rand)); + + this_cpu_inc(scf_invoked_count); + if (longwait <= 0) { + if (!(r & 0xffc0)) + udelay(r & 0x3f); + return; + } + if (r & 0xfff) + return; + r = (r >> 12); + if (longwait <= 0) { + udelay((r & 0xff) + 1); + return; + } + r = r % longwait + 1; + for (i = 0; i < r; i++) { + for (j = 0; j < 1000; j++) { + udelay(1000); + cpu_relax(); + } + } +} + +// Randomly do an smp_call_function*() invocation. +static void scftorture_invoke_one(struct scf_statistics *scfp,struct torture_random_state *trsp) +{ + if (use_cpus_read_lock) + cpus_read_lock(); + else + preempt_disable(); + scfp->n_all++; + smp_call_function(scf_handler, NULL, 0); + if (use_cpus_read_lock) + cpus_read_unlock(); + else + preempt_enable(); + if (!(torture_random(trsp) & 0xfff)) + schedule_timeout_uninterruptible(1); +} + +// SCF test kthread. Repeatedly does calls to members of the +// smp_call_function() family of functions. +static int scftorture_invoker(void *arg) +{ + DEFINE_TORTURE_RANDOM(rand); + struct scf_statistics *scfp = (struct scf_statistics *)arg; + + VERBOSE_SCFTORTOUT("scftorture_invoker %d: task started", scfp->cpu); + set_cpus_allowed_ptr(current, cpumask_of(scfp->cpu % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); + + VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, smp_processor_id()); + + // Make sure that the CPU is affinitized appropriately during testing. + WARN_ON_ONCE(smp_processor_id() != scfp->cpu); + + if (!atomic_dec_return(&n_started)) + while (atomic_read_acquire(&n_started)) { + if (torture_must_stop()) { + VERBOSE_SCFTORTOUT("scftorture_invoker %d ended before starting", scfp->cpu); + goto end; + } + schedule_timeout_uninterruptible(1); + } + + VERBOSE_SCFTORTOUT("scftorture_invoker %d started", scfp->cpu); + + do { + scftorture_invoke_one(scfp, &rand); + } while (!torture_must_stop()); + + VERBOSE_SCFTORTOUT("scftorture_invoker %d ended", scfp->cpu); +end: + torture_kthread_stopping("scftorture_invoker"); + return 0; +} + +static void +scftorture_print_module_parms(const char *tag) +{ + pr_alert(SCFTORT_FLAG + "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter_cpus=%d use_cpus_read_lock=%d, weight_single=%d, weight_single_wait=%d, weight_mult=%d, weight_mult_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, + verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter_cpus, use_cpus_read_lock, weight_single, weight_single_wait, weight_mult, weight_mult_wait, weight_all, weight_all_wait); +} + +static void scf_cleanup_handler(void *unused) +{ +} + +static void scf_torture_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + WRITE_ONCE(scfdone, true); + if (nthreads) + for (i = 0; i < nthreads; i++) + torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task); + else + goto end; + kfree(scf_stats_p); + scf_stats_p = NULL; + smp_call_function(scf_cleanup_handler, NULL, 0); + torture_stop_kthread(scf_torture_stats, scf_torture_stats_task); + scf_torture_stats_print(); // -After- the stats thread is stopped! + + if (atomic_read(&n_errs)) + scftorture_print_module_parms("End of test: FAILURE"); + else if (torture_onoff_failures()) + scftorture_print_module_parms("End of test: LOCK_HOTPLUG"); + else + scftorture_print_module_parms("End of test: SUCCESS"); + +end: + torture_cleanup_end(); +} + +static int __init scf_torture_init(void) +{ + long i; + int firsterr = 0; + + if (!torture_init_begin(SCFTORT_STRING, verbose)) + return -EBUSY; + + scftorture_print_module_parms("Start of test"); + + if (weight_single == -1 && weight_single_wait == -1 && + weight_mult == -1 && weight_mult_wait == -1 && + weight_all == -1 && weight_all_wait == -1) { + weight_single = 1; + weight_single_wait = 1; + weight_mult = 1; + weight_mult_wait = 1; + weight_all = 1; + weight_all_wait = 1; + } else { + if (weight_single == -1) + weight_single = 0; + if (weight_single_wait == -1) + weight_single_wait = 0; + if (weight_mult == -1) + weight_mult = 0; + if (weight_mult_wait == -1) + weight_mult_wait = 0; + if (weight_all == -1) + weight_all = 0; + if (weight_all_wait == -1) + weight_all_wait = 0; + } + if (weight_single == 0 && weight_single_wait == 0 && + weight_mult == 0 && weight_mult_wait == 0 && + weight_all == 0 && weight_all_wait == 0) { + firsterr = -EINVAL; + goto unwind; + } + + if (onoff_interval > 0) { + firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, NULL); + if (firsterr) + goto unwind; + } + if (shutdown_secs > 0) { + firsterr = torture_shutdown_init(shutdown_secs, scf_torture_cleanup); + if (firsterr) + goto unwind; + } + + // Worker tasks invoking smp_call_function(). + if (nthreads < 0) + nthreads = num_online_cpus(); + scf_stats_p = kcalloc(nthreads, sizeof(scf_stats_p[0]), GFP_KERNEL); + if (!scf_stats_p) { + VERBOSE_SCFTORTOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + + VERBOSE_SCFTORTOUT("Starting %d smp_call_function() threads\n", nthreads); + + atomic_set(&n_started, nthreads); + for (i = 0; i < nthreads; i++) { + scf_stats_p[i].cpu = i; + firsterr = torture_create_kthread(scftorture_invoker, (void *)&scf_stats_p[i], + scf_stats_p[i].task); + if (firsterr) + goto unwind; + } + if (stat_interval > 0) { + firsterr = torture_create_kthread(scf_torture_stats, NULL, scf_torture_stats_task); + if (firsterr) + goto unwind; + } + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + scf_torture_cleanup(); + return firsterr; +} + +module_init(scf_torture_init); +module_exit(scf_torture_cleanup); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e068c3c7189a..0c3a6c752ede 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1367,6 +1367,16 @@ config WW_MUTEX_SELFTEST Say M if you want these self tests to build as a module. Say N if you are unsure. +config SCF_TORTURE_TEST + tristate "torture tests for smp_call_function*()" + depends on DEBUG_KERNEL + select TORTURE_TEST + help + This option provides a kernel module that runs torture tests + on the smp_call_function() family of primitives. The kernel + module may be built after the fact on the running kernel to + be tested, if desired. + endmenu # lock debugging config TRACE_IRQFLAGS From 687d4775db56d24c81b4704056186d6c506de30d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Jun 2020 13:37:22 -0700 Subject: [PATCH 031/168] torture: Declare parse-console.sh independence from rcutorture Currently, parse-torture.sh looks at the fifth field of torture-test console output for the version number. This works fine for rcutorture, but not for scftorture, which lacks the pointer field. This commit therefore adjusts matching lines so that the parse-console.sh awk script always sees the version number as the first field in the lines passed to it. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/parse-console.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index 71a9f43a3918..4e081a25761e 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -67,6 +67,7 @@ then grep --binary-files=text 'torture:.*ver:' $file | egrep --binary-files=text -v '\(null\)|rtc: 000000000* ' | sed -e 's/^(initramfs)[^]]*] //' -e 's/^\[[^]]*] //' | + sed -e 's/^.*ver: //' | awk ' BEGIN { ver = 0; @@ -74,13 +75,13 @@ then } { - if (!badseq && ($5 + 0 != $5 || $5 <= ver)) { + if (!badseq && ($1 + 0 != $1 || $1 <= ver)) { badseqno1 = ver; - badseqno2 = $5; + badseqno2 = $1; badseqnr = NR; badseq = 1; } - ver = $5 + ver = $1 } END { From 80c9476e683ec37ba45fd8e6a5c5081bea207e1a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 24 Jun 2020 17:57:07 -0700 Subject: [PATCH 032/168] torture: Add scftorture to the rcutorture scripting This commit updates the rcutorture scripting to include the new scftorture torture-test module. Signed-off-by: Paul E. McKenney --- .../rcutorture/bin/kvm-recheck-scf.sh | 38 +++++++++++++++++++ tools/testing/selftests/rcutorture/bin/kvm.sh | 2 +- .../selftests/rcutorture/configs/scf/CFLIST | 2 + .../selftests/rcutorture/configs/scf/CFcommon | 2 + .../rcutorture/configs/scf/NOPREEMPT | 9 +++++ .../rcutorture/configs/scf/NOPREEMPT.boot | 1 + .../selftests/rcutorture/configs/scf/PREEMPT | 9 +++++ .../rcutorture/configs/scf/ver_functions.sh | 30 +++++++++++++++ 8 files changed, 92 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-recheck-scf.sh create mode 100644 tools/testing/selftests/rcutorture/configs/scf/CFLIST create mode 100644 tools/testing/selftests/rcutorture/configs/scf/CFcommon create mode 100644 tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT create mode 100644 tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT.boot create mode 100644 tools/testing/selftests/rcutorture/configs/scf/PREEMPT create mode 100644 tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-scf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-scf.sh new file mode 100755 index 000000000000..671bfee4fcef --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-scf.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Analyze a given results directory for rcutorture progress. +# +# Usage: kvm-recheck-rcu.sh resdir +# +# Copyright (C) Facebook, 2020 +# +# Authors: Paul E. McKenney + +i="$1" +if test -d "$i" -a -r "$i" +then + : +else + echo Unreadable results directory: $i + exit 1 +fi +. functions.sh + +configfile=`echo $i | sed -e 's/^.*\///'` +nscfs="`grep 'scf_invoked_count ver:' $i/console.log 2> /dev/null | tail -1 | sed -e 's/^.* scf_invoked_count ver: //' -e 's/ .*$//' | tr -d '\015'`" +if test -z "$nscfs" +then + echo "$configfile ------- " +else + dur="`sed -e 's/^.* scftorture.shutdown_secs=//' -e 's/ .*$//' < $i/qemu-cmd 2> /dev/null`" + if test -z "$dur" + then + rate="" + else + nscfss=`awk -v nscfs=$nscfs -v dur=$dur ' + BEGIN { print nscfs / dur }' < /dev/null` + rate=" ($nscfss/s)" + fi + echo "${configfile} ------- ${nscfs} SCF handler invocations$rate" +fi diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index e655983b7429..44dfdd9be67e 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -184,7 +184,7 @@ do shift ;; --torture) - checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\|refscale\)$' '^--' + checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\|refscale\|scf\)$' '^--' TORTURE_SUITE=$2 shift if test "$TORTURE_SUITE" = rcuperf || test "$TORTURE_SUITE" = refscale diff --git a/tools/testing/selftests/rcutorture/configs/scf/CFLIST b/tools/testing/selftests/rcutorture/configs/scf/CFLIST new file mode 100644 index 000000000000..4d62eb4a39f9 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/scf/CFLIST @@ -0,0 +1,2 @@ +NOPREEMPT +PREEMPT diff --git a/tools/testing/selftests/rcutorture/configs/scf/CFcommon b/tools/testing/selftests/rcutorture/configs/scf/CFcommon new file mode 100644 index 000000000000..c11ab91f49f5 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/scf/CFcommon @@ -0,0 +1,2 @@ +CONFIG_SCF_TORTURE_TEST=y +CONFIG_PRINTK_TIME=y diff --git a/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT new file mode 100644 index 000000000000..b8429d6c6ebc --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT @@ -0,0 +1,9 @@ +CONFIG_SMP=y +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=n +CONFIG_NO_HZ_FULL=y +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n diff --git a/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT.boot b/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT.boot new file mode 100644 index 000000000000..d6a7fa097c2e --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT.boot @@ -0,0 +1 @@ +nohz_full=1 diff --git a/tools/testing/selftests/rcutorture/configs/scf/PREEMPT b/tools/testing/selftests/rcutorture/configs/scf/PREEMPT new file mode 100644 index 000000000000..ae4992b141b0 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/scf/PREEMPT @@ -0,0 +1,9 @@ +CONFIG_SMP=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_DEBUG_LOCK_ALLOC=y +CONFIG_PROVE_LOCKING=y diff --git a/tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh new file mode 100644 index 000000000000..d3d9e35d3d55 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Torture-suite-dependent shell functions for the rest of the scripts. +# +# Copyright (C) Facebook, 2020 +# +# Authors: Paul E. McKenney + +# scftorture_param_onoff bootparam-string config-file +# +# Adds onoff scftorture module parameters to kernels having it. +scftorture_param_onoff () { + if ! bootparam_hotplug_cpu "$1" && configfrag_hotplug_cpu "$2" + then + echo CPU-hotplug kernel, adding scftorture onoff. 1>&2 + echo scftorture.onoff_interval=1000 scftorture.onoff_holdoff=30 + fi +} + +# per_version_boot_params bootparam-string config-file seconds +# +# Adds per-version torture-module parameters to kernels supporting them. +per_version_boot_params () { + echo $1 `scftorture_param_onoff "$1" "$2"` \ + scftorture.stat_interval=15 \ + scftorture.shutdown_secs=$3 \ + scftorture.verbose=1 \ + scf +} From 5022b8ac608f8b80b042a8041fe2738c4b9ea8cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Jun 2020 17:05:58 -0700 Subject: [PATCH 033/168] scftorture: Implement weighted primitive selection This commit uses the scftorture.weight* kernel parameters to randomly chooses between smp_call_function_single(), smp_call_function_many(), and smp_call_function(). For each variant, it also randomly chooses whether to invoke it synchronously (wait=1) or asynchronously (wait=0). The percentage weighting for each option are dumped to the console log (search for "scf_sel_dump"). This accumulates statistics, which a later commit will dump out at the end of the run. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 184 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 156 insertions(+), 28 deletions(-) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 44f1e49ba6e9..5f1984522ae4 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -64,8 +64,8 @@ torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations."); torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations."); -torture_param(int, weight_mult, -1, "Testing weight for multi-CPU no-wait operations."); -torture_param(int, weight_mult_wait, -1, "Testing weight for multi-CPU operations."); +torture_param(int, weight_many, -1, "Testing weight for multi-CPU no-wait operations."); +torture_param(int, weight_many_wait, -1, "Testing weight for multi-CPU operations."); torture_param(int, weight_all, -1, "Testing weight for all-CPU no-wait operations."); torture_param(int, weight_all_wait, -1, "Testing weight for all-CPU operations."); @@ -83,9 +83,11 @@ struct scf_statistics { struct task_struct *task; int cpu; long long n_single; + long long n_single_ofl; long long n_single_wait; - long long n_multi; - long long n_multi_wait; + long long n_single_wait_ofl; + long long n_many; + long long n_many_wait; long long n_all; long long n_all_wait; }; @@ -94,6 +96,27 @@ static struct scf_statistics *scf_stats_p; static struct task_struct *scf_torture_stats_task; static DEFINE_PER_CPU(long long, scf_invoked_count); +// Data for random primitive selection +#define SCF_PRIM_SINGLE 0 +#define SCF_PRIM_MANY 1 +#define SCF_PRIM_ALL 2 +#define SCF_NPRIMS (2 * 3) // Need wait and no-wait versions of each. + +static char *scf_prim_name[] = { + "smp_call_function_single", + "smp_call_function_many", + "smp_call_function", +}; + +struct scf_selector { + unsigned long scfs_weight; + int scfs_prim; + bool scfs_wait; +}; +static struct scf_selector scf_sel_array[SCF_NPRIMS]; +static int scf_sel_array_len; +static unsigned long scf_sel_totweight; + // Use to wait for all threads to start. static atomic_t n_started; static atomic_t n_errs; @@ -131,6 +154,57 @@ scf_torture_stats(void *arg) return 0; } +// Add a primitive to the scf_sel_array[]. +static void scf_sel_add(unsigned long weight, int prim, bool wait) +{ + struct scf_selector *scfsp = &scf_sel_array[scf_sel_array_len]; + + // If no weight, if array would overflow, if computing three-place + // percentages would overflow, or if the scf_prim_name[] array would + // overflow, don't bother. In the last three two cases, complain. + if (!weight || + WARN_ON_ONCE(scf_sel_array_len >= ARRAY_SIZE(scf_sel_array)) || + WARN_ON_ONCE(0 - 100000 * weight <= 100000 * scf_sel_totweight) || + WARN_ON_ONCE(prim >= ARRAY_SIZE(scf_prim_name))) + return; + scf_sel_totweight += weight; + scfsp->scfs_weight = scf_sel_totweight; + scfsp->scfs_prim = prim; + scfsp->scfs_wait = wait; + scf_sel_array_len++; +} + +// Dump out weighting percentages for scf_prim_name[] array. +static void scf_sel_dump(void) +{ + int i; + unsigned long oldw = 0; + struct scf_selector *scfsp; + unsigned long w; + + for (i = 0; i < scf_sel_array_len; i++) { + scfsp = &scf_sel_array[i]; + w = (scfsp->scfs_weight - oldw) * 100000 / scf_sel_totweight; + pr_info("%s: %3lu.%03lu %s(%s)\n", __func__, w / 1000, w % 1000, + scf_prim_name[scfsp->scfs_prim], + scfsp->scfs_wait ? "wait" : "nowait"); + oldw = scfsp->scfs_weight; + } +} + +// Randomly pick a primitive and wait/nowait, based on weightings. +static struct scf_selector *scf_sel_rand(struct torture_random_state *trsp) +{ + int i; + unsigned long w = torture_random(trsp) % (scf_sel_totweight + 1); + + for (i = 0; i < scf_sel_array_len; i++) + if (scf_sel_array[i].scfs_weight >= w) + return &scf_sel_array[i]; + WARN_ON_ONCE(1); + return &scf_sel_array[0]; +} + // Update statistics and occasionally burn up mass quantities of CPU time, // if told to do so via scftorture.longwait. Otherwise, occasionally burn // a little bit. @@ -162,15 +236,55 @@ static void scf_handler(void *unused) } } -// Randomly do an smp_call_function*() invocation. -static void scftorture_invoke_one(struct scf_statistics *scfp,struct torture_random_state *trsp) +// As above, but check for correct CPU. +static void scf_handler_1(void *me) { + if (WARN_ON_ONCE(smp_processor_id() != (uintptr_t)me)) + atomic_inc(&n_errs); + scf_handler(NULL); +} + +// Randomly do an smp_call_function*() invocation. +static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_random_state *trsp) +{ + uintptr_t cpu; + int ret; + struct scf_selector *scfsp = scf_sel_rand(trsp); + if (use_cpus_read_lock) cpus_read_lock(); else preempt_disable(); - scfp->n_all++; - smp_call_function(scf_handler, NULL, 0); + switch (scfsp->scfs_prim) { + case SCF_PRIM_SINGLE: + cpu = torture_random(trsp) % nr_cpu_ids; + if (scfsp->scfs_wait) + scfp->n_single_wait++; + else + scfp->n_single++; + ret = smp_call_function_single(cpu, scf_handler_1, (void *)cpu, scfsp->scfs_wait); + if (ret) { + if (scfsp->scfs_wait) + scfp->n_single_wait_ofl++; + else + scfp->n_single_ofl++; + } + break; + case SCF_PRIM_MANY: + if (scfsp->scfs_wait) + scfp->n_many_wait++; + else + scfp->n_many++; + smp_call_function_many(cpu_online_mask, scf_handler, NULL, scfsp->scfs_wait); + break; + case SCF_PRIM_ALL: + if (scfsp->scfs_wait) + scfp->n_all_wait++; + else + scfp->n_all++; + smp_call_function(scf_handler, NULL, scfsp->scfs_wait); + break; + } if (use_cpus_read_lock) cpus_read_unlock(); else @@ -222,8 +336,8 @@ static void scftorture_print_module_parms(const char *tag) { pr_alert(SCFTORT_FLAG - "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter_cpus=%d use_cpus_read_lock=%d, weight_single=%d, weight_single_wait=%d, weight_mult=%d, weight_mult_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, - verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter_cpus, use_cpus_read_lock, weight_single, weight_single_wait, weight_mult, weight_mult_wait, weight_all, weight_all_wait); + "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter_cpus=%d use_cpus_read_lock=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, + verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter_cpus, use_cpus_read_lock, weight_single, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait); } static void scf_cleanup_handler(void *unused) @@ -264,6 +378,12 @@ static int __init scf_torture_init(void) { long i; int firsterr = 0; + unsigned long weight_single1 = weight_single; + unsigned long weight_single_wait1 = weight_single_wait; + unsigned long weight_many1 = weight_many; + unsigned long weight_many_wait1 = weight_many_wait; + unsigned long weight_all1 = weight_all; + unsigned long weight_all_wait1 = weight_all_wait; if (!torture_init_begin(SCFTORT_STRING, verbose)) return -EBUSY; @@ -271,34 +391,42 @@ static int __init scf_torture_init(void) scftorture_print_module_parms("Start of test"); if (weight_single == -1 && weight_single_wait == -1 && - weight_mult == -1 && weight_mult_wait == -1 && + weight_many == -1 && weight_many_wait == -1 && weight_all == -1 && weight_all_wait == -1) { - weight_single = 1; - weight_single_wait = 1; - weight_mult = 1; - weight_mult_wait = 1; - weight_all = 1; - weight_all_wait = 1; + weight_single1 = 2 * nr_cpu_ids; + weight_single_wait1 = 2 * nr_cpu_ids; + weight_many1 = 2; + weight_many_wait1 = 2; + weight_all1 = 1; + weight_all_wait1 = 1; } else { if (weight_single == -1) - weight_single = 0; + weight_single1 = 0; if (weight_single_wait == -1) - weight_single_wait = 0; - if (weight_mult == -1) - weight_mult = 0; - if (weight_mult_wait == -1) - weight_mult_wait = 0; + weight_single_wait1 = 0; + if (weight_many == -1) + weight_many1 = 0; + if (weight_many_wait == -1) + weight_many_wait1 = 0; if (weight_all == -1) - weight_all = 0; + weight_all1 = 0; if (weight_all_wait == -1) - weight_all_wait = 0; + weight_all_wait1 = 0; } - if (weight_single == 0 && weight_single_wait == 0 && - weight_mult == 0 && weight_mult_wait == 0 && - weight_all == 0 && weight_all_wait == 0) { + if (weight_single1 == 0 && weight_single_wait1 == 0 && + weight_many1 == 0 && weight_many_wait1 == 0 && + weight_all1 == 0 && weight_all_wait1 == 0) { + VERBOSE_SCFTORTOUT_ERRSTRING("all zero weights makes no sense"); firsterr = -EINVAL; goto unwind; } + scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false); + scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true); + scf_sel_add(weight_many1, SCF_PRIM_MANY, false); + scf_sel_add(weight_many_wait1, SCF_PRIM_MANY, true); + scf_sel_add(weight_all1, SCF_PRIM_ALL, false); + scf_sel_add(weight_all_wait1, SCF_PRIM_ALL, true); + scf_sel_dump(); if (onoff_interval > 0) { firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, NULL); From bca37119c57bdc2c68c84b313a5118005e8693cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 26 Jun 2020 13:39:41 -0700 Subject: [PATCH 034/168] tick-sched: Clarify "NOHZ: local_softirq_pending" warning Currently, can_stop_idle_tick() prints "NOHZ: local_softirq_pending HH" (where "HH" is the hexadecimal softirq vector number) when one or more non-RCU softirq handlers are still enabled when checking to stop the scheduler-tick interrupt. This message is not as enlightening as one might hope, so this commit changes it to "NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #HH". Reported-by: Andy Lutomirski Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f0199a4ba1ad..81632cd5e3b7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -927,7 +927,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (ratelimit < 10 && (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { - pr_warn("NOHZ: local_softirq_pending %02x\n", + pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n", (unsigned int) local_softirq_pending()); ratelimit++; } From dba3142b37f343734bf61dbce2914acb76e69fb6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Jun 2020 16:13:37 -0700 Subject: [PATCH 035/168] scftorture: Summarize per-thread statistics This commit summarizes the per-thread statistics, providing counts of the number of single, many, and all calls, both no-wait and wait, and, for the single case, the number where the target CPU was offline. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 5f1984522ae4..09a62424bb8c 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -128,13 +128,27 @@ DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); static void scf_torture_stats_print(void) { int cpu; + int i; long long invoked_count = 0; bool isdone = READ_ONCE(scfdone); + struct scf_statistics scfs = {}; for_each_possible_cpu(cpu) invoked_count += data_race(per_cpu(scf_invoked_count, cpu)); - pr_alert("%s scf_invoked_count %s: %lld ", - SCFTORT_FLAG, isdone ? "VER" : "ver", invoked_count); + for (i = 0; i < nthreads; i++) { + scfs.n_single += scf_stats_p[i].n_single; + scfs.n_single_ofl += scf_stats_p[i].n_single_ofl; + scfs.n_single_wait += scf_stats_p[i].n_single_wait; + scfs.n_single_wait_ofl += scf_stats_p[i].n_single_wait_ofl; + scfs.n_many += scf_stats_p[i].n_many; + scfs.n_many_wait += scf_stats_p[i].n_many_wait; + scfs.n_all += scf_stats_p[i].n_all; + scfs.n_all_wait += scf_stats_p[i].n_all_wait; + } + pr_alert("%s scf_invoked_count %s: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ", + SCFTORT_FLAG, isdone ? "VER" : "ver", invoked_count, + scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl, + scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait); torture_onoff_stats(); pr_cont("\n"); } @@ -357,11 +371,11 @@ static void scf_torture_cleanup(void) torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task); else goto end; - kfree(scf_stats_p); - scf_stats_p = NULL; smp_call_function(scf_cleanup_handler, NULL, 0); torture_stop_kthread(scf_torture_stats, scf_torture_stats_task); scf_torture_stats_print(); // -After- the stats thread is stopped! + kfree(scf_stats_p); // -After- the last stats print has completed! + scf_stats_p = NULL; if (atomic_read(&n_errs)) scftorture_print_module_parms("End of test: FAILURE"); From b93e21a51e1c8ed3816da888d34f88193ad1b917 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Jun 2020 20:49:50 -0700 Subject: [PATCH 036/168] scftorture: Add smp_call_function_single() memory-ordering checks This commit adds checks for memory misordering across calls to smp_call_function_single() and also across returns in the case where the caller waits. Misordering results in a splat. [ paulmck: s/GFP_KERNEL/GFP_ATOMIC/ per kernel test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 56 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 09a62424bb8c..9b42271d64f1 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -117,9 +117,20 @@ static struct scf_selector scf_sel_array[SCF_NPRIMS]; static int scf_sel_array_len; static unsigned long scf_sel_totweight; +// Communicate between caller and handler. +struct scf_check { + bool scfc_in; + bool scfc_out; + int scfc_cpu; // -1 for not _single(). + bool scfc_wait; +}; + // Use to wait for all threads to start. static atomic_t n_started; static atomic_t n_errs; +static atomic_t n_mb_in_errs; +static atomic_t n_mb_out_errs; +static atomic_t n_alloc_errs; static bool scfdone; DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); @@ -222,24 +233,27 @@ static struct scf_selector *scf_sel_rand(struct torture_random_state *trsp) // Update statistics and occasionally burn up mass quantities of CPU time, // if told to do so via scftorture.longwait. Otherwise, occasionally burn // a little bit. -static void scf_handler(void *unused) +static void scf_handler(void *scfc_in) { int i; int j; unsigned long r = torture_random(this_cpu_ptr(&scf_torture_rand)); + struct scf_check *scfcp = scfc_in; + if (likely(scfcp) && WARN_ON_ONCE(unlikely(!READ_ONCE(scfcp->scfc_in)))) + atomic_inc(&n_mb_in_errs); this_cpu_inc(scf_invoked_count); if (longwait <= 0) { if (!(r & 0xffc0)) udelay(r & 0x3f); - return; + goto out; } if (r & 0xfff) - return; + goto out; r = (r >> 12); if (longwait <= 0) { udelay((r & 0xff) + 1); - return; + goto out; } r = r % longwait + 1; for (i = 0; i < r; i++) { @@ -248,14 +262,24 @@ static void scf_handler(void *unused) cpu_relax(); } } +out: + if (unlikely(!scfcp)) + return; + if (scfcp->scfc_wait) + WRITE_ONCE(scfcp->scfc_out, true); + else + kfree(scfcp); } // As above, but check for correct CPU. -static void scf_handler_1(void *me) +static void scf_handler_1(void *scfc_in) { - if (WARN_ON_ONCE(smp_processor_id() != (uintptr_t)me)) + struct scf_check *scfcp = scfc_in; + + if (likely(scfcp) && WARN_ONCE(smp_processor_id() != scfcp->scfc_cpu, "%s: Wanted CPU %d got CPU %d\n", __func__, scfcp->scfc_cpu, smp_processor_id())) { atomic_inc(&n_errs); - scf_handler(NULL); + } + scf_handler(scfcp); } // Randomly do an smp_call_function*() invocation. @@ -263,6 +287,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra { uintptr_t cpu; int ret; + struct scf_check *scfcp = NULL; struct scf_selector *scfsp = scf_sel_rand(trsp); if (use_cpus_read_lock) @@ -271,17 +296,32 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra preempt_disable(); switch (scfsp->scfs_prim) { case SCF_PRIM_SINGLE: + scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); + if (WARN_ON_ONCE(!scfcp)) + atomic_inc(&n_alloc_errs); cpu = torture_random(trsp) % nr_cpu_ids; if (scfsp->scfs_wait) scfp->n_single_wait++; else scfp->n_single++; - ret = smp_call_function_single(cpu, scf_handler_1, (void *)cpu, scfsp->scfs_wait); + if (scfcp) { + scfcp->scfc_cpu = cpu; + scfcp->scfc_wait = scfsp->scfs_wait; + scfcp->scfc_out = false; + scfcp->scfc_in = true; + } + ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, scfsp->scfs_wait); if (ret) { if (scfsp->scfs_wait) scfp->n_single_wait_ofl++; else scfp->n_single_ofl++; + kfree(scfcp); + } else if (scfcp && scfsp->scfs_wait) { + if (WARN_ON_ONCE(!scfcp->scfc_out)) + atomic_inc(&n_mb_out_errs); // Leak rather than trash! + else + kfree(scfcp); } break; case SCF_PRIM_MANY: From 980205ee8489d53c4380f7762debac87312b0fb3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Jul 2020 12:30:02 -0700 Subject: [PATCH 037/168] scftorture: Add smp_call_function_many() memory-ordering checks This commit adds checks for memory misordering across calls to and returns from smp_call_function_many() in the case where the caller waits. Misordering results in a splat. Note that in contrast to smp_call_function_single(), this code does not test memory ordering into the handler in the no-wait case because none of the handlers would be able to free the scf_check structure without introducing heavy synchronization to work out which was last. [ paulmck: s/GFP_KERNEL/GFP_ATOMIC/ per kernel test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 9b42271d64f1..3519ad1b3278 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -240,8 +240,11 @@ static void scf_handler(void *scfc_in) unsigned long r = torture_random(this_cpu_ptr(&scf_torture_rand)); struct scf_check *scfcp = scfc_in; - if (likely(scfcp) && WARN_ON_ONCE(unlikely(!READ_ONCE(scfcp->scfc_in)))) - atomic_inc(&n_mb_in_errs); + if (likely(scfcp)) { + WRITE_ONCE(scfcp->scfc_out, false); // For multiple receivers. + if (WARN_ON_ONCE(unlikely(!READ_ONCE(scfcp->scfc_in)))) + atomic_inc(&n_mb_in_errs); + } this_cpu_inc(scf_invoked_count); if (longwait <= 0) { if (!(r & 0xffc0)) @@ -325,11 +328,28 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra } break; case SCF_PRIM_MANY: + if (scfsp->scfs_wait) { + scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); + if (WARN_ON_ONCE(!scfcp)) + atomic_inc(&n_alloc_errs); + } if (scfsp->scfs_wait) scfp->n_many_wait++; else scfp->n_many++; - smp_call_function_many(cpu_online_mask, scf_handler, NULL, scfsp->scfs_wait); + if (scfcp) { + scfcp->scfc_cpu = -1; + scfcp->scfc_wait = true; + scfcp->scfc_out = false; + scfcp->scfc_in = true; + } + smp_call_function_many(cpu_online_mask, scf_handler, scfcp, scfsp->scfs_wait); + if (scfcp) { + if (WARN_ON_ONCE(!scfcp->scfc_out)) + atomic_inc(&n_mb_out_errs); // Leak rather than trash! + else + kfree(scfcp); + } break; case SCF_PRIM_ALL: if (scfsp->scfs_wait) From 34e8c4837adb579962e528a4f7dd1f75cb120be4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Jul 2020 13:49:06 -0700 Subject: [PATCH 038/168] scftorture: Add smp_call_function() memory-ordering checks This commit adds checks for memory misordering across calls to and returns from smp_call_function() in the case where the caller waits. Misordering results in a splat. Note that in contrast to smp_call_function_single(), this code does not test memory ordering into the handler in the no-wait case because none of the handlers would be able to free the scf_check structure without introducing heavy synchronization to work out which was last. [ paulmck: s/GFP_KERNEL/GFP_ATOMIC/ per kernel test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 3519ad1b3278..0d7299d32dd0 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -297,11 +297,13 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra cpus_read_lock(); else preempt_disable(); - switch (scfsp->scfs_prim) { - case SCF_PRIM_SINGLE: + if (scfsp->scfs_prim == SCF_PRIM_SINGLE || scfsp->scfs_wait) { scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); if (WARN_ON_ONCE(!scfcp)) atomic_inc(&n_alloc_errs); + } + switch (scfsp->scfs_prim) { + case SCF_PRIM_SINGLE: cpu = torture_random(trsp) % nr_cpu_ids; if (scfsp->scfs_wait) scfp->n_single_wait++; @@ -328,11 +330,6 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra } break; case SCF_PRIM_MANY: - if (scfsp->scfs_wait) { - scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); - if (WARN_ON_ONCE(!scfcp)) - atomic_inc(&n_alloc_errs); - } if (scfsp->scfs_wait) scfp->n_many_wait++; else @@ -356,7 +353,19 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_all_wait++; else scfp->n_all++; - smp_call_function(scf_handler, NULL, scfsp->scfs_wait); + if (scfcp) { + scfcp->scfc_cpu = -1; + scfcp->scfc_wait = true; + scfcp->scfc_out = false; + scfcp->scfc_in = true; + } + smp_call_function(scf_handler, scfcp, scfsp->scfs_wait); + if (scfcp) { + if (WARN_ON_ONCE(!scfcp->scfc_out)) + atomic_inc(&n_mb_out_errs); // Leak rather than trash! + else + kfree(scfcp); + } break; } if (use_cpus_read_lock) From 676e5469643e716df7f39ef77ba8f09c85b0c4f8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Jul 2020 14:13:02 -0700 Subject: [PATCH 039/168] scftorture: Consolidate scftorture_invoke_one() check and kfree() This commit moves checking of the ->scfc_out field and the freeing of the scf_check structure down below the end of switch statement, thus saving a few lines of code. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 0d7299d32dd0..f220cd364e23 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -289,7 +289,7 @@ static void scf_handler_1(void *scfc_in) static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_random_state *trsp) { uintptr_t cpu; - int ret; + int ret = 0; struct scf_check *scfcp = NULL; struct scf_selector *scfsp = scf_sel_rand(trsp); @@ -322,11 +322,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra else scfp->n_single_ofl++; kfree(scfcp); - } else if (scfcp && scfsp->scfs_wait) { - if (WARN_ON_ONCE(!scfcp->scfc_out)) - atomic_inc(&n_mb_out_errs); // Leak rather than trash! - else - kfree(scfcp); + scfcp = NULL; } break; case SCF_PRIM_MANY: @@ -341,12 +337,6 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfcp->scfc_in = true; } smp_call_function_many(cpu_online_mask, scf_handler, scfcp, scfsp->scfs_wait); - if (scfcp) { - if (WARN_ON_ONCE(!scfcp->scfc_out)) - atomic_inc(&n_mb_out_errs); // Leak rather than trash! - else - kfree(scfcp); - } break; case SCF_PRIM_ALL: if (scfsp->scfs_wait) @@ -360,14 +350,14 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfcp->scfc_in = true; } smp_call_function(scf_handler, scfcp, scfsp->scfs_wait); - if (scfcp) { - if (WARN_ON_ONCE(!scfcp->scfc_out)) - atomic_inc(&n_mb_out_errs); // Leak rather than trash! - else - kfree(scfcp); - } break; } + if (scfcp && scfsp->scfs_wait) { + if (WARN_ON_ONCE(!scfcp->scfc_out)) + atomic_inc(&n_mb_out_errs); // Leak rather than trash! + else + kfree(scfcp); + } if (use_cpus_read_lock) cpus_read_unlock(); else From 4df55bddc1a360e94c86e227fe417ac9422cb615 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 9 Jul 2020 13:58:32 -0700 Subject: [PATCH 040/168] scftorture: Consolidate scftorture_invoke_one() scf_check initialization This commit hoists much of the initialization of the scf_check structure out of the switch statement, thus saving a few lines of code. The initialization of the ->scfc_in field remains in each leg of the switch statement in order to more heavily stress memory ordering. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index f220cd364e23..8ab72e545a61 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -299,8 +299,13 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra preempt_disable(); if (scfsp->scfs_prim == SCF_PRIM_SINGLE || scfsp->scfs_wait) { scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); - if (WARN_ON_ONCE(!scfcp)) + if (WARN_ON_ONCE(!scfcp)) { atomic_inc(&n_alloc_errs); + } else { + scfcp->scfc_cpu = -1; + scfcp->scfc_wait = scfsp->scfs_wait; + scfcp->scfc_out = false; + } } switch (scfsp->scfs_prim) { case SCF_PRIM_SINGLE: @@ -311,8 +316,6 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_single++; if (scfcp) { scfcp->scfc_cpu = cpu; - scfcp->scfc_wait = scfsp->scfs_wait; - scfcp->scfc_out = false; scfcp->scfc_in = true; } ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, scfsp->scfs_wait); @@ -330,12 +333,8 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_many_wait++; else scfp->n_many++; - if (scfcp) { - scfcp->scfc_cpu = -1; - scfcp->scfc_wait = true; - scfcp->scfc_out = false; + if (scfcp) scfcp->scfc_in = true; - } smp_call_function_many(cpu_online_mask, scf_handler, scfcp, scfsp->scfs_wait); break; case SCF_PRIM_ALL: @@ -343,12 +342,8 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_all_wait++; else scfp->n_all++; - if (scfcp) { - scfcp->scfc_cpu = -1; - scfcp->scfc_wait = true; - scfcp->scfc_out = false; + if (scfcp) scfcp->scfc_in = true; - } smp_call_function(scf_handler, scfcp, scfsp->scfs_wait); break; } From dbf83b655a7853bc430af10e9a3e7eb1f4c90f86 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Jul 2020 16:06:22 -0700 Subject: [PATCH 041/168] scftorture: Flag errors in torture-compatible manner This commit prints error counts on the statistics line and also adds a "!!!" if any of the counters are non-zero. Allocation failures are (somewhat) forgiven, but all other errors result in a "FAILURE" print at the end of the test. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 8ab72e545a61..880c2cef13e7 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -132,6 +132,7 @@ static atomic_t n_mb_in_errs; static atomic_t n_mb_out_errs; static atomic_t n_alloc_errs; static bool scfdone; +static char *bangstr = ""; DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); @@ -156,12 +157,17 @@ static void scf_torture_stats_print(void) scfs.n_all += scf_stats_p[i].n_all; scfs.n_all_wait += scf_stats_p[i].n_all_wait; } - pr_alert("%s scf_invoked_count %s: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ", - SCFTORT_FLAG, isdone ? "VER" : "ver", invoked_count, + if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || + atomic_read(&n_mb_out_errs) || atomic_read(&n_alloc_errs)) + bangstr = "!!! "; + pr_alert("%s %sscf_invoked_count %s: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ", + SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl, scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait); torture_onoff_stats(); - pr_cont("\n"); + pr_cont("ste: %d stnmie: %d stnmoe: %d staf: %d\n", atomic_read(&n_errs), + atomic_read(&n_mb_in_errs), atomic_read(&n_mb_out_errs), + atomic_read(&n_alloc_errs)); } // Periodically prints torture statistics, if periodic statistics printing @@ -431,7 +437,7 @@ static void scf_torture_cleanup(void) kfree(scf_stats_p); // -After- the last stats print has completed! scf_stats_p = NULL; - if (atomic_read(&n_errs)) + if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || atomic_read(&n_mb_out_errs)) scftorture_print_module_parms("End of test: FAILURE"); else if (torture_onoff_failures()) scftorture_print_module_parms("End of test: LOCK_HOTPLUG"); From ee7035d29576dcb59b1191e5f609517cacab1e56 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Jul 2020 16:38:16 -0700 Subject: [PATCH 042/168] scftorture: Prevent compiler from reducing race probabilities Detecting smp_call_function() memory misordering requires close timing, so it is necessary to have the checks immediately before and after the call to the smp_call_function*() function under test. This commit therefore inserts barrier() calls to prevent the compiler from optimizing memory-misordering detection down into the zone of extreme improbability. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 880c2cef13e7..83496810fc48 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -322,6 +322,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_single++; if (scfcp) { scfcp->scfc_cpu = cpu; + barrier(); // Prevent race-reduction compiler optimizations. scfcp->scfc_in = true; } ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, scfsp->scfs_wait); @@ -339,8 +340,10 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_many_wait++; else scfp->n_many++; - if (scfcp) + if (scfcp) { + barrier(); // Prevent race-reduction compiler optimizations. scfcp->scfc_in = true; + } smp_call_function_many(cpu_online_mask, scf_handler, scfcp, scfsp->scfs_wait); break; case SCF_PRIM_ALL: @@ -348,8 +351,10 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_all_wait++; else scfp->n_all++; - if (scfcp) + if (scfcp) { + barrier(); // Prevent race-reduction compiler optimizations. scfcp->scfc_in = true; + } smp_call_function(scf_handler, scfcp, scfsp->scfs_wait); break; } @@ -358,6 +363,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra atomic_inc(&n_mb_out_errs); // Leak rather than trash! else kfree(scfcp); + barrier(); // Prevent race-reduction compiler optimizations. } if (use_cpus_read_lock) cpus_read_unlock(); From 9a52a574676f8d4aa55f69319231ce6c343b00bb Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 2 Jul 2020 09:56:50 -0700 Subject: [PATCH 043/168] scftorture: Make symbol 'scf_torture_rand' static The sparse tool complains as follows kernel/scftorture.c:124:1: warning: symbol '__pcpu_scope_scf_torture_rand' was not declared. Should it be static? And this per-CPU variable is not used outside of scftorture.c, so this commit marks it static. Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 83496810fc48..9180de73e4e8 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -134,7 +134,7 @@ static atomic_t n_alloc_errs; static bool scfdone; static char *bangstr = ""; -DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); +static DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); // Print torture statistics. Caller must ensure serialization. static void scf_torture_stats_print(void) From de77d4da54d10df97d265e7e99112bfc2fef7d4a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Jul 2020 12:15:37 -0700 Subject: [PATCH 044/168] scftorture: Check unexpected "switch" statement value This commit adds a "default" case to the switch statement in scftorture_invoke_one() which contains a WARN_ON_ONCE() and an assignment to ->scfc_out to suppress knock-on warnings. These knock-on warnings could otherwise cause the user to think that there was a memory-ordering problem in smp_call_function() instead of a bug in scftorture.c itself. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 9180de73e4e8..d9c01c722e2a 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -357,6 +357,10 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra } smp_call_function(scf_handler, scfcp, scfsp->scfs_wait); break; + default: + WARN_ON_ONCE(1); + if (scfcp) + scfcp->scfc_out = true; } if (scfcp && scfsp->scfs_wait) { if (WARN_ON_ONCE(!scfcp->scfc_out)) From a7c072ef26644b632241d549869f10f8d2dd3b5c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Jul 2020 14:15:33 -0700 Subject: [PATCH 045/168] scftorture: Block scftorture_invoker() kthreads for offline CPUs Currently, CPU-hotplug operations might result in all but two of (say) 100 CPUs being offline, which in turn might result in false-positive diagnostics due to overload. This commit therefore causes scftorture_invoker() kthreads for offline CPUs to loop blocking for 200 milliseconds at a time, thus continuously adjusting the number of threads to match the number of online CPUs. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index d9c01c722e2a..04d3a4279413 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -381,11 +381,14 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra // smp_call_function() family of functions. static int scftorture_invoker(void *arg) { + int cpu; DEFINE_TORTURE_RANDOM(rand); struct scf_statistics *scfp = (struct scf_statistics *)arg; + bool was_offline = false; VERBOSE_SCFTORTOUT("scftorture_invoker %d: task started", scfp->cpu); - set_cpus_allowed_ptr(current, cpumask_of(scfp->cpu % nr_cpu_ids)); + cpu = scfp->cpu % nr_cpu_ids; + set_cpus_allowed_ptr(current, cpumask_of(cpu)); set_user_nice(current, MAX_NICE); if (holdoff) schedule_timeout_interruptible(holdoff * HZ); @@ -408,6 +411,14 @@ static int scftorture_invoker(void *arg) do { scftorture_invoke_one(scfp, &rand); + while (cpu_is_offline(cpu) && !torture_must_stop()) { + schedule_timeout_interruptible(HZ / 5); + was_offline = true; + } + if (was_offline) { + set_cpus_allowed_ptr(current, cpumask_of(cpu)); + was_offline = false; + } } while (!torture_must_stop()); VERBOSE_SCFTORTOUT("scftorture_invoker %d ended", scfp->cpu); From 9e66bf03f9c538863e614a72c5799bcd9579630e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Jul 2020 15:23:19 -0700 Subject: [PATCH 046/168] scftorture: Adapt memory-ordering test to UP operation On uniprocessor systems, smp_call_function() does nothing. This commit therefore avoids complaining about the lack of handler accesses in the single-CPU case where there is no handler. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 04d3a4279413..fc22bcc9a589 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -363,7 +363,8 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfcp->scfc_out = true; } if (scfcp && scfsp->scfs_wait) { - if (WARN_ON_ONCE(!scfcp->scfc_out)) + if (WARN_ON_ONCE((num_online_cpus() > 1 || scfsp->scfs_prim == SCF_PRIM_SINGLE) && + !scfcp->scfc_out)) atomic_inc(&n_mb_out_errs); // Leak rather than trash! else kfree(scfcp); From 65bd77f554336407f5fd7ced7a6df686767fba21 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 23 Jul 2020 15:53:02 -0700 Subject: [PATCH 047/168] scftorture: Add cond_resched() to test loop Although the test loop does randomly delay, which would provide quiescent states and so forth, it is possible for there to be a series of long smp_call_function*() handler runtimes with no delays, which results in softlockup and RCU CPU stall warning messages. This commit therefore inserts a cond_resched() into the main test loop. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index fc22bcc9a589..554a521ee235 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -420,6 +420,7 @@ static int scftorture_invoker(void *arg) set_cpus_allowed_ptr(current, cpumask_of(cpu)); was_offline = false; } + cond_resched(); } while (!torture_must_stop()); VERBOSE_SCFTORTOUT("scftorture_invoker %d ended", scfp->cpu); From 4e88ec4a9eb17527e640b063f79e5b875733eb53 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Aug 2020 21:18:12 -0700 Subject: [PATCH 048/168] rcuperf: Change rcuperf to rcuscale This commit further avoids conflation of rcuperf with the kernel's perf feature by renaming kernel/rcu/rcuperf.c to kernel/rcu/rcuscale.c, and also by similarly renaming the functions and variables inside this file. This has the side effect of changing the names of the kernel boot parameters, so kernel-parameters.txt and ver_functions.sh are also updated. The rcutorture --torture type was also updated from rcuperf to rcuscale. [ paulmck: Fix bugs located by Stephen Rothwell. ] Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- .../admin-guide/kernel-parameters.txt | 36 +- MAINTAINERS | 3 +- kernel/rcu/Kconfig.debug | 2 +- kernel/rcu/Makefile | 2 +- kernel/rcu/{rcuperf.c => rcuscale.c} | 330 +++++++++--------- ...race.sh => kvm-recheck-rcuscale-ftrace.sh} | 6 +- ...eck-rcuperf.sh => kvm-recheck-rcuscale.sh} | 14 +- tools/testing/selftests/rcutorture/bin/kvm.sh | 8 +- .../selftests/rcutorture/bin/parse-console.sh | 4 +- .../rcutorture/configs/rcuperf/CFcommon | 2 - .../configs/{rcuperf => rcuscale}/CFLIST | 0 .../rcutorture/configs/rcuscale/CFcommon | 2 + .../configs/{rcuperf => rcuscale}/TINY | 0 .../configs/{rcuperf => rcuscale}/TREE | 0 .../configs/{rcuperf => rcuscale}/TREE54 | 0 .../{rcuperf => rcuscale}/ver_functions.sh | 4 +- 16 files changed, 207 insertions(+), 206 deletions(-) rename kernel/rcu/{rcuperf.c => rcuscale.c} (64%) rename tools/testing/selftests/rcutorture/bin/{kvm-recheck-rcuperf-ftrace.sh => kvm-recheck-rcuscale-ftrace.sh} (92%) rename tools/testing/selftests/rcutorture/bin/{kvm-recheck-rcuperf.sh => kvm-recheck-rcuscale.sh} (84%) delete mode 100644 tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon rename tools/testing/selftests/rcutorture/configs/{rcuperf => rcuscale}/CFLIST (100%) create mode 100644 tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon rename tools/testing/selftests/rcutorture/configs/{rcuperf => rcuscale}/TINY (100%) rename tools/testing/selftests/rcutorture/configs/{rcuperf => rcuscale}/TREE (100%) rename tools/testing/selftests/rcutorture/configs/{rcuperf => rcuscale}/TREE54 (100%) rename tools/testing/selftests/rcutorture/configs/{rcuperf => rcuscale}/ver_functions.sh (88%) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 91a56382ae56..c27bbe95e7cb 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4157,41 +4157,41 @@ rcu_node tree with an eye towards determining why a new grace period has not yet started. - rcuperf.gp_async= [KNL] + rcuscale.gp_async= [KNL] Measure performance of asynchronous grace-period primitives such as call_rcu(). - rcuperf.gp_async_max= [KNL] + rcuscale.gp_async_max= [KNL] Specify the maximum number of outstanding callbacks per writer thread. When a writer thread exceeds this limit, it invokes the corresponding flavor of rcu_barrier() to allow previously posted callbacks to drain. - rcuperf.gp_exp= [KNL] + rcuscale.gp_exp= [KNL] Measure performance of expedited synchronous grace-period primitives. - rcuperf.holdoff= [KNL] + rcuscale.holdoff= [KNL] Set test-start holdoff period. The purpose of this parameter is to delay the start of the test until boot completes in order to avoid interference. - rcuperf.kfree_rcu_test= [KNL] + rcuscale.kfree_rcu_test= [KNL] Set to measure performance of kfree_rcu() flooding. - rcuperf.kfree_nthreads= [KNL] + rcuscale.kfree_nthreads= [KNL] The number of threads running loops of kfree_rcu(). - rcuperf.kfree_alloc_num= [KNL] + rcuscale.kfree_alloc_num= [KNL] Number of allocations and frees done in an iteration. - rcuperf.kfree_loops= [KNL] - Number of loops doing rcuperf.kfree_alloc_num number + rcuscale.kfree_loops= [KNL] + Number of loops doing rcuscale.kfree_alloc_num number of allocations and frees. - rcuperf.nreaders= [KNL] + rcuscale.nreaders= [KNL] Set number of RCU readers. The value -1 selects N, where N is the number of CPUs. A value "n" less than -1 selects N-n+1, where N is again @@ -4200,23 +4200,23 @@ A value of "n" less than or equal to -N selects a single reader. - rcuperf.nwriters= [KNL] + rcuscale.nwriters= [KNL] Set number of RCU writers. The values operate - the same as for rcuperf.nreaders. + the same as for rcuscale.nreaders. N, where N is the number of CPUs - rcuperf.perf_type= [KNL] + rcuscale.perf_type= [KNL] Specify the RCU implementation to test. - rcuperf.shutdown= [KNL] + rcuscale.shutdown= [KNL] Shut the system down after performance tests complete. This is useful for hands-off automated testing. - rcuperf.verbose= [KNL] + rcuscale.verbose= [KNL] Enable additional printk() statements. - rcuperf.writer_holdoff= [KNL] + rcuscale.writer_holdoff= [KNL] Write-side holdoff between grace periods, in microseconds. The default of zero says no holdoff. @@ -4490,8 +4490,8 @@ refscale.shutdown= [KNL] Shut down the system at the end of the performance test. This defaults to 1 (shut it down) when - rcuperf is built into the kernel and to 0 (leave - it running) when rcuperf is built as a module. + refscale is built into the kernel and to 0 (leave + it running) when refscale is built as a module. refscale.verbose= [KNL] Enable additional printk() statements. diff --git a/MAINTAINERS b/MAINTAINERS index deaafb617361..d299e3bb10ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17510,8 +17510,9 @@ S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev F: Documentation/RCU/torture.rst F: kernel/locking/locktorture.c -F: kernel/rcu/rcuperf.c +F: kernel/rcu/rcuscale.c F: kernel/rcu/rcutorture.c +F: kernel/rcu/refscale.c F: kernel/torture.c TOSHIBA ACPI EXTRAS DRIVER diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 3cf6132a4bb9..5cb175df6ece 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -23,7 +23,7 @@ config TORTURE_TEST tristate default n -config RCU_PERF_TEST +config RCU_SCALE_TEST tristate "performance tests for RCU" depends on DEBUG_KERNEL select TORTURE_TEST diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 95f5117ef8da..0cfb009a99b9 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -11,7 +11,7 @@ obj-y += update.o sync.o obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o -obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o +obj-$(CONFIG_RCU_SCALE_TEST) += rcuscale.o obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuscale.c similarity index 64% rename from kernel/rcu/rcuperf.c rename to kernel/rcu/rcuscale.c index 21448d3374e2..2819b95479af 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuscale.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * Read-Copy Update module-based performance-test facility + * Read-Copy Update module-based scalability-test facility * * Copyright (C) IBM Corporation, 2015 * @@ -44,13 +44,13 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); -#define PERF_FLAG "-perf:" -#define PERFOUT_STRING(s) \ - pr_alert("%s" PERF_FLAG " %s\n", perf_type, s) -#define VERBOSE_PERFOUT_STRING(s) \ - do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) -#define VERBOSE_PERFOUT_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) +#define SCALE_FLAG "-scale:" +#define SCALEOUT_STRING(s) \ + pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s) +#define VERBOSE_SCALEOUT_STRING(s) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s); } while (0) +#define VERBOSE_SCALEOUT_ERRSTRING(s) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s); } while (0) /* * The intended use cases for the nreaders and nwriters module parameters @@ -61,25 +61,25 @@ MODULE_AUTHOR("Paul E. McKenney "); * nr_cpus for a mixed reader/writer test. * * 2. Specify the nr_cpus kernel boot parameter, but set - * rcuperf.nreaders to zero. This will set nwriters to the + * rcuscale.nreaders to zero. This will set nwriters to the * value specified by nr_cpus for an update-only test. * * 3. Specify the nr_cpus kernel boot parameter, but set - * rcuperf.nwriters to zero. This will set nreaders to the + * rcuscale.nwriters to zero. This will set nreaders to the * value specified by nr_cpus for a read-only test. * * Various other use cases may of course be specified. * * Note that this test's readers are intended only as a test load for - * the writers. The reader performance statistics will be overly + * the writers. The reader scalability statistics will be overly * pessimistic due to the per-critical-section interrupt disabling, * test-end checks, and the pair of calls through pointers. */ #ifdef MODULE -# define RCUPERF_SHUTDOWN 0 +# define RCUSCALE_SHUTDOWN 0 #else -# define RCUPERF_SHUTDOWN 1 +# define RCUSCALE_SHUTDOWN 1 #endif torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); @@ -88,16 +88,16 @@ torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); -torture_param(bool, shutdown, RCUPERF_SHUTDOWN, - "Shutdown at end of performance tests."); +torture_param(bool, shutdown, RCUSCALE_SHUTDOWN, + "Shutdown at end of scalability tests."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); -torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?"); +torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?"); torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); -static char *perf_type = "rcu"; -module_param(perf_type, charp, 0444); -MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, srcu, ...)"); +static char *scale_type = "rcu"; +module_param(scale_type, charp, 0444); +MODULE_PARM_DESC(scale_type, "Type of RCU to scalability-test (rcu, srcu, ...)"); static int nrealreaders; static int nrealwriters; @@ -107,12 +107,12 @@ static struct task_struct *shutdown_task; static u64 **writer_durations; static int *writer_n_durations; -static atomic_t n_rcu_perf_reader_started; -static atomic_t n_rcu_perf_writer_started; -static atomic_t n_rcu_perf_writer_finished; +static atomic_t n_rcu_scale_reader_started; +static atomic_t n_rcu_scale_writer_started; +static atomic_t n_rcu_scale_writer_finished; static wait_queue_head_t shutdown_wq; -static u64 t_rcu_perf_writer_started; -static u64 t_rcu_perf_writer_finished; +static u64 t_rcu_scale_writer_started; +static u64 t_rcu_scale_writer_finished; static unsigned long b_rcu_gp_test_started; static unsigned long b_rcu_gp_test_finished; static DEFINE_PER_CPU(atomic_t, n_async_inflight); @@ -124,7 +124,7 @@ static DEFINE_PER_CPU(atomic_t, n_async_inflight); * Operations vector for selecting different types of tests. */ -struct rcu_perf_ops { +struct rcu_scale_ops { int ptype; void (*init)(void); void (*cleanup)(void); @@ -140,19 +140,19 @@ struct rcu_perf_ops { const char *name; }; -static struct rcu_perf_ops *cur_ops; +static struct rcu_scale_ops *cur_ops; /* - * Definitions for rcu perf testing. + * Definitions for rcu scalability testing. */ -static int rcu_perf_read_lock(void) __acquires(RCU) +static int rcu_scale_read_lock(void) __acquires(RCU) { rcu_read_lock(); return 0; } -static void rcu_perf_read_unlock(int idx) __releases(RCU) +static void rcu_scale_read_unlock(int idx) __releases(RCU) { rcu_read_unlock(); } @@ -162,15 +162,15 @@ static unsigned long __maybe_unused rcu_no_completed(void) return 0; } -static void rcu_sync_perf_init(void) +static void rcu_sync_scale_init(void) { } -static struct rcu_perf_ops rcu_ops = { +static struct rcu_scale_ops rcu_ops = { .ptype = RCU_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = rcu_perf_read_lock, - .readunlock = rcu_perf_read_unlock, + .init = rcu_sync_scale_init, + .readlock = rcu_scale_read_lock, + .readunlock = rcu_scale_read_unlock, .get_gp_seq = rcu_get_gp_seq, .gp_diff = rcu_seq_diff, .exp_completed = rcu_exp_batches_completed, @@ -182,23 +182,23 @@ static struct rcu_perf_ops rcu_ops = { }; /* - * Definitions for srcu perf testing. + * Definitions for srcu scalability testing. */ -DEFINE_STATIC_SRCU(srcu_ctl_perf); -static struct srcu_struct *srcu_ctlp = &srcu_ctl_perf; +DEFINE_STATIC_SRCU(srcu_ctl_scale); +static struct srcu_struct *srcu_ctlp = &srcu_ctl_scale; -static int srcu_perf_read_lock(void) __acquires(srcu_ctlp) +static int srcu_scale_read_lock(void) __acquires(srcu_ctlp) { return srcu_read_lock(srcu_ctlp); } -static void srcu_perf_read_unlock(int idx) __releases(srcu_ctlp) +static void srcu_scale_read_unlock(int idx) __releases(srcu_ctlp) { srcu_read_unlock(srcu_ctlp, idx); } -static unsigned long srcu_perf_completed(void) +static unsigned long srcu_scale_completed(void) { return srcu_batches_completed(srcu_ctlp); } @@ -213,78 +213,78 @@ static void srcu_rcu_barrier(void) srcu_barrier(srcu_ctlp); } -static void srcu_perf_synchronize(void) +static void srcu_scale_synchronize(void) { synchronize_srcu(srcu_ctlp); } -static void srcu_perf_synchronize_expedited(void) +static void srcu_scale_synchronize_expedited(void) { synchronize_srcu_expedited(srcu_ctlp); } -static struct rcu_perf_ops srcu_ops = { +static struct rcu_scale_ops srcu_ops = { .ptype = SRCU_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = srcu_perf_read_lock, - .readunlock = srcu_perf_read_unlock, - .get_gp_seq = srcu_perf_completed, + .init = rcu_sync_scale_init, + .readlock = srcu_scale_read_lock, + .readunlock = srcu_scale_read_unlock, + .get_gp_seq = srcu_scale_completed, .gp_diff = rcu_seq_diff, - .exp_completed = srcu_perf_completed, + .exp_completed = srcu_scale_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, - .sync = srcu_perf_synchronize, - .exp_sync = srcu_perf_synchronize_expedited, + .sync = srcu_scale_synchronize, + .exp_sync = srcu_scale_synchronize_expedited, .name = "srcu" }; static struct srcu_struct srcud; -static void srcu_sync_perf_init(void) +static void srcu_sync_scale_init(void) { srcu_ctlp = &srcud; init_srcu_struct(srcu_ctlp); } -static void srcu_sync_perf_cleanup(void) +static void srcu_sync_scale_cleanup(void) { cleanup_srcu_struct(srcu_ctlp); } -static struct rcu_perf_ops srcud_ops = { +static struct rcu_scale_ops srcud_ops = { .ptype = SRCU_FLAVOR, - .init = srcu_sync_perf_init, - .cleanup = srcu_sync_perf_cleanup, - .readlock = srcu_perf_read_lock, - .readunlock = srcu_perf_read_unlock, - .get_gp_seq = srcu_perf_completed, + .init = srcu_sync_scale_init, + .cleanup = srcu_sync_scale_cleanup, + .readlock = srcu_scale_read_lock, + .readunlock = srcu_scale_read_unlock, + .get_gp_seq = srcu_scale_completed, .gp_diff = rcu_seq_diff, - .exp_completed = srcu_perf_completed, + .exp_completed = srcu_scale_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, - .sync = srcu_perf_synchronize, - .exp_sync = srcu_perf_synchronize_expedited, + .sync = srcu_scale_synchronize, + .exp_sync = srcu_scale_synchronize_expedited, .name = "srcud" }; /* - * Definitions for RCU-tasks perf testing. + * Definitions for RCU-tasks scalability testing. */ -static int tasks_perf_read_lock(void) +static int tasks_scale_read_lock(void) { return 0; } -static void tasks_perf_read_unlock(int idx) +static void tasks_scale_read_unlock(int idx) { } -static struct rcu_perf_ops tasks_ops = { +static struct rcu_scale_ops tasks_ops = { .ptype = RCU_TASKS_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = tasks_perf_read_lock, - .readunlock = tasks_perf_read_unlock, + .init = rcu_sync_scale_init, + .readlock = tasks_scale_read_lock, + .readunlock = tasks_scale_read_unlock, .get_gp_seq = rcu_no_completed, .gp_diff = rcu_seq_diff, .async = call_rcu_tasks, @@ -294,7 +294,7 @@ static struct rcu_perf_ops tasks_ops = { .name = "tasks" }; -static unsigned long rcuperf_seq_diff(unsigned long new, unsigned long old) +static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) return new - old; @@ -302,60 +302,60 @@ static unsigned long rcuperf_seq_diff(unsigned long new, unsigned long old) } /* - * If performance tests complete, wait for shutdown to commence. + * If scalability tests complete, wait for shutdown to commence. */ -static void rcu_perf_wait_shutdown(void) +static void rcu_scale_wait_shutdown(void) { cond_resched_tasks_rcu_qs(); - if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters) + if (atomic_read(&n_rcu_scale_writer_finished) < nrealwriters) return; while (!torture_must_stop()) schedule_timeout_uninterruptible(1); } /* - * RCU perf reader kthread. Repeatedly does empty RCU read-side critical - * section, minimizing update-side interference. However, the point of - * this test is not to evaluate reader performance, but instead to serve - * as a test load for update-side performance testing. + * RCU scalability reader kthread. Repeatedly does empty RCU read-side + * critical section, minimizing update-side interference. However, the + * point of this test is not to evaluate reader scalability, but instead + * to serve as a test load for update-side scalability testing. */ static int -rcu_perf_reader(void *arg) +rcu_scale_reader(void *arg) { unsigned long flags; int idx; long me = (long)arg; - VERBOSE_PERFOUT_STRING("rcu_perf_reader task started"); + VERBOSE_SCALEOUT_STRING("rcu_scale_reader task started"); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); - atomic_inc(&n_rcu_perf_reader_started); + atomic_inc(&n_rcu_scale_reader_started); do { local_irq_save(flags); idx = cur_ops->readlock(); cur_ops->readunlock(idx); local_irq_restore(flags); - rcu_perf_wait_shutdown(); + rcu_scale_wait_shutdown(); } while (!torture_must_stop()); - torture_kthread_stopping("rcu_perf_reader"); + torture_kthread_stopping("rcu_scale_reader"); return 0; } /* - * Callback function for asynchronous grace periods from rcu_perf_writer(). + * Callback function for asynchronous grace periods from rcu_scale_writer(). */ -static void rcu_perf_async_cb(struct rcu_head *rhp) +static void rcu_scale_async_cb(struct rcu_head *rhp) { atomic_dec(this_cpu_ptr(&n_async_inflight)); kfree(rhp); } /* - * RCU perf writer kthread. Repeatedly does a grace period. + * RCU scale writer kthread. Repeatedly does a grace period. */ static int -rcu_perf_writer(void *arg) +rcu_scale_writer(void *arg) { int i = 0; int i_max; @@ -366,7 +366,7 @@ rcu_perf_writer(void *arg) u64 *wdp; u64 *wdpp = writer_durations[me]; - VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); + VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started"); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); sched_set_fifo_low(current); @@ -383,8 +383,8 @@ rcu_perf_writer(void *arg) schedule_timeout_uninterruptible(1); t = ktime_get_mono_fast_ns(); - if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { - t_rcu_perf_writer_started = t; + if (atomic_inc_return(&n_rcu_scale_writer_started) >= nrealwriters) { + t_rcu_scale_writer_started = t; if (gp_exp) { b_rcu_gp_test_started = cur_ops->exp_completed() / 2; @@ -404,7 +404,7 @@ retry: rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { atomic_inc(this_cpu_ptr(&n_async_inflight)); - cur_ops->async(rhp, rcu_perf_async_cb); + cur_ops->async(rhp, rcu_scale_async_cb); rhp = NULL; } else if (!kthread_should_stop()) { cur_ops->gp_barrier(); @@ -421,19 +421,19 @@ retry: *wdp = t - *wdp; i_max = i; if (!started && - atomic_read(&n_rcu_perf_writer_started) >= nrealwriters) + atomic_read(&n_rcu_scale_writer_started) >= nrealwriters) started = true; if (!done && i >= MIN_MEAS) { done = true; sched_set_normal(current, 0); - pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n", - perf_type, PERF_FLAG, me, MIN_MEAS); - if (atomic_inc_return(&n_rcu_perf_writer_finished) >= + pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n", + scale_type, SCALE_FLAG, me, MIN_MEAS); + if (atomic_inc_return(&n_rcu_scale_writer_finished) >= nrealwriters) { schedule_timeout_interruptible(10); rcu_ftrace_dump(DUMP_ALL); - PERFOUT_STRING("Test complete"); - t_rcu_perf_writer_finished = t; + SCALEOUT_STRING("Test complete"); + t_rcu_scale_writer_finished = t; if (gp_exp) { b_rcu_gp_test_finished = cur_ops->exp_completed() / 2; @@ -448,30 +448,30 @@ retry: } } if (done && !alldone && - atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters) + atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters) alldone = true; if (started && !alldone && i < MAX_MEAS - 1) i++; - rcu_perf_wait_shutdown(); + rcu_scale_wait_shutdown(); } while (!torture_must_stop()); if (gp_async) { cur_ops->gp_barrier(); } writer_n_durations[me] = i_max; - torture_kthread_stopping("rcu_perf_writer"); + torture_kthread_stopping("rcu_scale_writer"); return 0; } static void -rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag) +rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag) { - pr_alert("%s" PERF_FLAG + pr_alert("%s" SCALE_FLAG "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", - perf_type, tag, nrealreaders, nrealwriters, verbose, shutdown); + scale_type, tag, nrealreaders, nrealwriters, verbose, shutdown); } static void -rcu_perf_cleanup(void) +rcu_scale_cleanup(void) { int i; int j; @@ -484,11 +484,11 @@ rcu_perf_cleanup(void) * during the mid-boot phase, so have to wait till the end. */ if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) - VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + VERBOSE_SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); if (rcu_gp_is_normal() && gp_exp) - VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + VERBOSE_SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); if (gp_exp && gp_async) - VERBOSE_PERFOUT_ERRSTRING("No expedited async GPs, so went with async!"); + VERBOSE_SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); if (torture_cleanup_begin()) return; @@ -499,30 +499,30 @@ rcu_perf_cleanup(void) if (reader_tasks) { for (i = 0; i < nrealreaders; i++) - torture_stop_kthread(rcu_perf_reader, + torture_stop_kthread(rcu_scale_reader, reader_tasks[i]); kfree(reader_tasks); } if (writer_tasks) { for (i = 0; i < nrealwriters; i++) { - torture_stop_kthread(rcu_perf_writer, + torture_stop_kthread(rcu_scale_writer, writer_tasks[i]); if (!writer_n_durations) continue; j = writer_n_durations[i]; pr_alert("%s%s writer %d gps: %d\n", - perf_type, PERF_FLAG, i, j); + scale_type, SCALE_FLAG, i, j); ngps += j; } pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", - perf_type, PERF_FLAG, - t_rcu_perf_writer_started, t_rcu_perf_writer_finished, - t_rcu_perf_writer_finished - - t_rcu_perf_writer_started, + scale_type, SCALE_FLAG, + t_rcu_scale_writer_started, t_rcu_scale_writer_finished, + t_rcu_scale_writer_finished - + t_rcu_scale_writer_started, ngps, - rcuperf_seq_diff(b_rcu_gp_test_finished, - b_rcu_gp_test_started)); + rcuscale_seq_diff(b_rcu_gp_test_finished, + b_rcu_gp_test_started)); for (i = 0; i < nrealwriters; i++) { if (!writer_durations) break; @@ -534,7 +534,7 @@ rcu_perf_cleanup(void) for (j = 0; j <= writer_n_durations[i]; j++) { wdp = &wdpp[j]; pr_alert("%s%s %4d writer-duration: %5d %llu\n", - perf_type, PERF_FLAG, + scale_type, SCALE_FLAG, i, j, *wdp); if (j % 100 == 0) schedule_timeout_uninterruptible(1); @@ -573,22 +573,22 @@ static int compute_real(int n) } /* - * RCU perf shutdown kthread. Just waits to be awakened, then shuts + * RCU scalability shutdown kthread. Just waits to be awakened, then shuts * down system. */ static int -rcu_perf_shutdown(void *arg) +rcu_scale_shutdown(void *arg) { wait_event(shutdown_wq, - atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters); + atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); smp_mb(); /* Wake before output. */ - rcu_perf_cleanup(); + rcu_scale_cleanup(); kernel_power_off(); return -EINVAL; } /* - * kfree_rcu() performance tests: Start a kfree_rcu() loop on all CPUs for number + * kfree_rcu() scalability tests: Start a kfree_rcu() loop on all CPUs for number * of iterations and measure total time and number of GP for all iterations to complete. */ @@ -598,8 +598,8 @@ torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num alloc static struct task_struct **kfree_reader_tasks; static int kfree_nrealthreads; -static atomic_t n_kfree_perf_thread_started; -static atomic_t n_kfree_perf_thread_ended; +static atomic_t n_kfree_scale_thread_started; +static atomic_t n_kfree_scale_thread_ended; struct kfree_obj { char kfree_obj[8]; @@ -607,7 +607,7 @@ struct kfree_obj { }; static int -kfree_perf_thread(void *arg) +kfree_scale_thread(void *arg) { int i, loop = 0; long me = (long)arg; @@ -615,13 +615,13 @@ kfree_perf_thread(void *arg) u64 start_time, end_time; long long mem_begin, mem_during = 0; - VERBOSE_PERFOUT_STRING("kfree_perf_thread task started"); + VERBOSE_SCALEOUT_STRING("kfree_scale_thread task started"); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); start_time = ktime_get_mono_fast_ns(); - if (atomic_inc_return(&n_kfree_perf_thread_started) >= kfree_nrealthreads) { + if (atomic_inc_return(&n_kfree_scale_thread_started) >= kfree_nrealthreads) { if (gp_exp) b_rcu_gp_test_started = cur_ops->exp_completed() / 2; else @@ -646,7 +646,7 @@ kfree_perf_thread(void *arg) cond_resched(); } while (!torture_must_stop() && ++loop < kfree_loops); - if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) { + if (atomic_inc_return(&n_kfree_scale_thread_ended) >= kfree_nrealthreads) { end_time = ktime_get_mono_fast_ns(); if (gp_exp) @@ -656,7 +656,7 @@ kfree_perf_thread(void *arg) pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n", (unsigned long long)(end_time - start_time), kfree_loops, - rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), + rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), (mem_begin - mem_during) >> (20 - PAGE_SHIFT)); if (shutdown) { @@ -665,12 +665,12 @@ kfree_perf_thread(void *arg) } } - torture_kthread_stopping("kfree_perf_thread"); + torture_kthread_stopping("kfree_scale_thread"); return 0; } static void -kfree_perf_cleanup(void) +kfree_scale_cleanup(void) { int i; @@ -679,7 +679,7 @@ kfree_perf_cleanup(void) if (kfree_reader_tasks) { for (i = 0; i < kfree_nrealthreads; i++) - torture_stop_kthread(kfree_perf_thread, + torture_stop_kthread(kfree_scale_thread, kfree_reader_tasks[i]); kfree(kfree_reader_tasks); } @@ -691,20 +691,20 @@ kfree_perf_cleanup(void) * shutdown kthread. Just waits to be awakened, then shuts down system. */ static int -kfree_perf_shutdown(void *arg) +kfree_scale_shutdown(void *arg) { wait_event(shutdown_wq, - atomic_read(&n_kfree_perf_thread_ended) >= kfree_nrealthreads); + atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads); smp_mb(); /* Wake before output. */ - kfree_perf_cleanup(); + kfree_scale_cleanup(); kernel_power_off(); return -EINVAL; } static int __init -kfree_perf_init(void) +kfree_scale_init(void) { long i; int firsterr = 0; @@ -713,7 +713,7 @@ kfree_perf_init(void) /* Start up the kthreads. */ if (shutdown) { init_waitqueue_head(&shutdown_wq); - firsterr = torture_create_kthread(kfree_perf_shutdown, NULL, + firsterr = torture_create_kthread(kfree_scale_shutdown, NULL, shutdown_task); if (firsterr) goto unwind; @@ -730,13 +730,13 @@ kfree_perf_init(void) } for (i = 0; i < kfree_nrealthreads; i++) { - firsterr = torture_create_kthread(kfree_perf_thread, (void *)i, + firsterr = torture_create_kthread(kfree_scale_thread, (void *)i, kfree_reader_tasks[i]); if (firsterr) goto unwind; } - while (atomic_read(&n_kfree_perf_thread_started) < kfree_nrealthreads) + while (atomic_read(&n_kfree_scale_thread_started) < kfree_nrealthreads) schedule_timeout_uninterruptible(1); torture_init_end(); @@ -744,35 +744,35 @@ kfree_perf_init(void) unwind: torture_init_end(); - kfree_perf_cleanup(); + kfree_scale_cleanup(); return firsterr; } static int __init -rcu_perf_init(void) +rcu_scale_init(void) { long i; int firsterr = 0; - static struct rcu_perf_ops *perf_ops[] = { + static struct rcu_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, }; - if (!torture_init_begin(perf_type, verbose)) + if (!torture_init_begin(scale_type, verbose)) return -EBUSY; - /* Process args and tell the world that the perf'er is on the job. */ - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { - cur_ops = perf_ops[i]; - if (strcmp(perf_type, cur_ops->name) == 0) + /* Process args and announce that the scalability'er is on the job. */ + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) { + cur_ops = scale_ops[i]; + if (strcmp(scale_type, cur_ops->name) == 0) break; } - if (i == ARRAY_SIZE(perf_ops)) { - pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type); - pr_alert("rcu-perf types:"); - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) - pr_cont(" %s", perf_ops[i]->name); + if (i == ARRAY_SIZE(scale_ops)) { + pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type); + pr_alert("rcu-scale types:"); + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) + pr_cont(" %s", scale_ops[i]->name); pr_cont("\n"); - WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST)); + WARN_ON(!IS_MODULE(CONFIG_RCU_SCALE_TEST)); firsterr = -EINVAL; cur_ops = NULL; goto unwind; @@ -781,20 +781,20 @@ rcu_perf_init(void) cur_ops->init(); if (kfree_rcu_test) - return kfree_perf_init(); + return kfree_scale_init(); nrealwriters = compute_real(nwriters); nrealreaders = compute_real(nreaders); - atomic_set(&n_rcu_perf_reader_started, 0); - atomic_set(&n_rcu_perf_writer_started, 0); - atomic_set(&n_rcu_perf_writer_finished, 0); - rcu_perf_print_module_parms(cur_ops, "Start of test"); + atomic_set(&n_rcu_scale_reader_started, 0); + atomic_set(&n_rcu_scale_writer_started, 0); + atomic_set(&n_rcu_scale_writer_finished, 0); + rcu_scale_print_module_parms(cur_ops, "Start of test"); /* Start up the kthreads. */ if (shutdown) { init_waitqueue_head(&shutdown_wq); - firsterr = torture_create_kthread(rcu_perf_shutdown, NULL, + firsterr = torture_create_kthread(rcu_scale_shutdown, NULL, shutdown_task); if (firsterr) goto unwind; @@ -803,17 +803,17 @@ rcu_perf_init(void) reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } for (i = 0; i < nrealreaders; i++) { - firsterr = torture_create_kthread(rcu_perf_reader, (void *)i, + firsterr = torture_create_kthread(rcu_scale_reader, (void *)i, reader_tasks[i]); if (firsterr) goto unwind; } - while (atomic_read(&n_rcu_perf_reader_started) < nrealreaders) + while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders) schedule_timeout_uninterruptible(1); writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), GFP_KERNEL); @@ -823,7 +823,7 @@ rcu_perf_init(void) kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL); if (!writer_tasks || !writer_durations || !writer_n_durations) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } @@ -835,7 +835,7 @@ rcu_perf_init(void) firsterr = -ENOMEM; goto unwind; } - firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, + firsterr = torture_create_kthread(rcu_scale_writer, (void *)i, writer_tasks[i]); if (firsterr) goto unwind; @@ -845,9 +845,9 @@ rcu_perf_init(void) unwind: torture_init_end(); - rcu_perf_cleanup(); + rcu_scale_cleanup(); return firsterr; } -module_init(rcu_perf_init); -module_exit(rcu_perf_cleanup); +module_init(rcu_scale_init); +module_exit(rcu_scale_cleanup); diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale-ftrace.sh similarity index 92% rename from tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh rename to tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale-ftrace.sh index 7d3c2be66c64..d4bec538086d 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale-ftrace.sh @@ -1,12 +1,12 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0+ # -# Analyze a given results directory for rcuperf performance measurements, +# Analyze a given results directory for rcuscale performance measurements, # looking for ftrace data. Exits with 0 if data was found, analyzed, and -# printed. Intended to be invoked from kvm-recheck-rcuperf.sh after +# printed. Intended to be invoked from kvm-recheck-rcuscale.sh after # argument checking. # -# Usage: kvm-recheck-rcuperf-ftrace.sh resdir +# Usage: kvm-recheck-rcuscale-ftrace.sh resdir # # Copyright (C) IBM Corporation, 2016 # diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh similarity index 84% rename from tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh rename to tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh index db0375a57f28..aa745152a525 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh @@ -1,9 +1,9 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0+ # -# Analyze a given results directory for rcuperf performance measurements. +# Analyze a given results directory for rcuscale scalability measurements. # -# Usage: kvm-recheck-rcuperf.sh resdir +# Usage: kvm-recheck-rcuscale.sh resdir # # Copyright (C) IBM Corporation, 2016 # @@ -20,7 +20,7 @@ fi PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH . functions.sh -if kvm-recheck-rcuperf-ftrace.sh $i +if kvm-recheck-rcuscale-ftrace.sh $i then # ftrace data was successfully analyzed, call it good! exit 0 @@ -30,12 +30,12 @@ configfile=`echo $i | sed -e 's/^.*\///'` sed -e 's/^\[[^]]*]//' < $i/console.log | awk ' -/-perf: .* gps: .* batches:/ { +/-scale: .* gps: .* batches:/ { ngps = $9; nbatches = $11; } -/-perf: .*writer-duration/ { +/-scale: .*writer-duration/ { gptimes[++n] = $5 / 1000.; sum += $5 / 1000.; } @@ -43,7 +43,7 @@ awk ' END { newNR = asort(gptimes); if (newNR <= 0) { - print "No rcuperf records found???" + print "No rcuscale records found???" exit; } pct50 = int(newNR * 50 / 100); @@ -79,5 +79,5 @@ END { print "99th percentile grace-period duration: " gptimes[pct99]; print "Maximum grace-period duration: " gptimes[newNR]; print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches; - print "Computed from rcuperf printk output."; + print "Computed from rcuscale printk output."; }' diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 44dfdd9be67e..0489c198a72a 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -65,7 +65,7 @@ usage () { echo " --qemu-args qemu-arguments" echo " --qemu-cmd qemu-system-..." echo " --results absolute-pathname" - echo " --torture rcu" + echo " --torture lock|rcu|rcuscale|refscale|scf" echo " --trust-make" exit 1 } @@ -184,13 +184,13 @@ do shift ;; --torture) - checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\|refscale\|scf\)$' '^--' + checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuscale\|refscale\|scf\)$' '^--' TORTURE_SUITE=$2 shift - if test "$TORTURE_SUITE" = rcuperf || test "$TORTURE_SUITE" = refscale + if test "$TORTURE_SUITE" = rcuscale || test "$TORTURE_SUITE" = refscale then # If you really want jitter for refscale or - # rcuperf, specify it after specifying the rcuperf + # rcuscale, specify it after specifying the rcuscale # or the refscale. (But why jitter in these cases?) jitter=0 fi diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index 4e081a25761e..e03338091a06 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -33,8 +33,8 @@ then fi cat /dev/null > $file.diags -# Check for proper termination, except for rcuperf and refscale. -if test "$TORTURE_SUITE" != rcuperf && test "$TORTURE_SUITE" != refscale +# Check for proper termination, except for rcuscale and refscale. +if test "$TORTURE_SUITE" != rcuscale && test "$TORTURE_SUITE" != refscale then # check for abject failure diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon deleted file mode 100644 index a09816b8c0f3..000000000000 --- a/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon +++ /dev/null @@ -1,2 +0,0 @@ -CONFIG_RCU_PERF_TEST=y -CONFIG_PRINTK_TIME=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST b/tools/testing/selftests/rcutorture/configs/rcuscale/CFLIST similarity index 100% rename from tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST rename to tools/testing/selftests/rcutorture/configs/rcuscale/CFLIST diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon new file mode 100644 index 000000000000..87caa0e932c7 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon @@ -0,0 +1,2 @@ +CONFIG_RCU_SCALE_TEST=y +CONFIG_PRINTK_TIME=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TINY b/tools/testing/selftests/rcutorture/configs/rcuscale/TINY similarity index 100% rename from tools/testing/selftests/rcutorture/configs/rcuperf/TINY rename to tools/testing/selftests/rcutorture/configs/rcuscale/TINY diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE similarity index 100% rename from tools/testing/selftests/rcutorture/configs/rcuperf/TREE rename to tools/testing/selftests/rcutorture/configs/rcuscale/TREE diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54 similarity index 100% rename from tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 rename to tools/testing/selftests/rcutorture/configs/rcuscale/TREE54 diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh similarity index 88% rename from tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh rename to tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh index 777d5b0c190f..0333e9b18522 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh @@ -11,6 +11,6 @@ # # Adds per-version torture-module parameters to kernels supporting them. per_version_boot_params () { - echo $1 rcuperf.shutdown=1 \ - rcuperf.verbose=1 + echo $1 rcuscale.shutdown=1 \ + rcuscale.verbose=1 } From 8cbd0e38a9f2de38e8991c5c1c6f9024b2731d17 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Aug 2020 15:51:20 -0700 Subject: [PATCH 049/168] rcu: Add Kconfig option for strict RCU grace periods People running automated tests have asked for a way to make RCU minimize grace-period duration in order to increase the probability of KASAN detecting a pointer being improperly leaked from an RCU read-side critical section, for example, like this: rcu_read_lock(); p = rcu_dereference(gp); do_something_with(p); // OK rcu_read_unlock(); do_something_else_with(p); // BUG!!! The rcupdate.rcu_expedited boot parameter is a start in this direction, given that it makes calls to synchronize_rcu() instead invoke the faster (and more wasteful) synchronize_rcu_expedited(). However, this does nothing to shorten RCU grace periods that are instead initiated by call_rcu(), and RCU pointer-leak bugs can involve call_rcu() just as surely as they can synchronize_rcu(). This commit therefore adds a RCU_STRICT_GRACE_PERIOD Kconfig option that will be used to shorten normal (non-expedited) RCU grace periods. This commit also dumps out a message when this option is in effect. Later commits will actually shorten grace periods. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 15 +++++++++++++++ kernel/rcu/tree_plugin.h | 2 ++ 2 files changed, 17 insertions(+) diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 3cf6132a4bb9..cab5a4bebe9c 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -114,4 +114,19 @@ config RCU_EQS_DEBUG Say N here if you need ultimate kernel/user switch latencies Say Y if you are unsure +config RCU_STRICT_GRACE_PERIOD + bool "Provide debug RCU implementation with short grace periods" + depends on DEBUG_KERNEL && RCU_EXPERT + default n + select PREEMPT_COUNT if PREEMPT=n + help + Select this option to build an RCU variant that is strict about + grace periods, making them as short as it can. This limits + scalability, destroys real-time response, degrades battery + lifetime and kills performance. Don't try this on large + machines, as in systems with more than about 10 or 20 CPUs. + But in conjunction with tools like KASAN, it can be helpful + when looking for certain types of RCU usage bugs, for example, + too-short RCU read-side critical sections. + endmenu # "RCU Debugging" diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 982fc5be5269..44cf77db7cae 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -36,6 +36,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + pr_info("\tRCU strict (and thus non-scalable) grace periods enabled.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) From dc1269186bed3afc5a2018527516be84fe55d3e0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Aug 2020 16:52:17 -0700 Subject: [PATCH 050/168] rcu: Reduce leaf fanout for strict RCU grace periods Because strict RCU grace periods will complete more quickly, they will experience greater lock contention on each leaf rcu_node structure's ->lock. This commit therefore reduces the leaf fanout in order to reduce this lock contention. Note that this also has the effect of reducing the number of CPUs supported to 16 in the case of CONFIG_RCU_FANOUT_LEAF=2 or 81 in the case of CONFIG_RCU_FANOUT_LEAF=3. However, greater numbers of CPUs are probably a bad idea when using CONFIG_RCU_STRICT_GRACE_PERIOD=y. Those wishing to live dangerously are free to edit their kernel/rcu/Kconfig files accordingly. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 0ebe15a84985..b71e21f73c40 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -135,10 +135,12 @@ config RCU_FANOUT config RCU_FANOUT_LEAF int "Tree-based hierarchical RCU leaf-level fanout value" - range 2 64 if 64BIT - range 2 32 if !64BIT + range 2 64 if 64BIT && !RCU_STRICT_GRACE_PERIOD + range 2 32 if !64BIT && !RCU_STRICT_GRACE_PERIOD + range 2 3 if RCU_STRICT_GRACE_PERIOD depends on TREE_RCU && RCU_EXPERT - default 16 + default 16 if !RCU_STRICT_GRACE_PERIOD + default 2 if RCU_STRICT_GRACE_PERIOD help This option controls the leaf-level fanout of hierarchical implementations of RCU, and allows trading off cache misses From aecd34b9765de3b58c98a1d75b982fc64becd1e9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Aug 2020 17:25:23 -0700 Subject: [PATCH 051/168] rcu: Restrict default jiffies_till_first_fqs for strict RCU GPs If there are idle CPUs, RCU's grace-period kthread will wait several jiffies before even thinking about polling them. This promotes efficiency, which is normally a good thing, but when the kernel has been built with CONFIG_RCU_STRICT_GRACE_PERIOD=y, we care more about short grace periods. This commit therefore restricts the default jiffies_till_first_fqs value to zero in kernels built with CONFIG_RCU_STRICT_GRACE_PERIOD=y, which causes RCU's grace-period kthread to poll for idle CPUs immediately after starting a grace period. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8ce77d9ac716..85511590fc38 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -485,7 +485,7 @@ module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); module_param(qovld, long, 0444); -static ulong jiffies_till_first_fqs = ULONG_MAX; +static ulong jiffies_till_first_fqs = IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 0 : ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; static bool rcu_kick_kthreads; static int rcu_divisor = 7; From 29fc5f93320cb447f83baedfe103ed784cadb073 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2020 06:39:30 -0700 Subject: [PATCH 052/168] rcu: Force DEFAULT_RCU_BLIMIT to 1000 for strict RCU GPs The value of DEFAULT_RCU_BLIMIT is normally set to 10, the idea being to avoid needless response-time degradation due to RCU callback invocation. However, when CONFIG_RCU_STRICT_GRACE_PERIOD=y it is better to avoid throttling callback execution in order to better detect pointer leaks from RCU read-side critical sections. This commit therefore sets the value of DEFAULT_RCU_BLIMIT to 1000 in kernels built with CONFIG_RCU_STRICT_GRACE_PERIOD=y. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 85511590fc38..443685704f5e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -468,17 +468,18 @@ static int rcu_is_cpu_rrupt_from_idle(void) return __this_cpu_read(rcu_data.dynticks_nesting) == 0; } -#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */ -#define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */ +#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10) + // Maximum callbacks per rcu_do_batch ... +#define DEFAULT_MAX_RCU_BLIMIT 10000 // ... even during callback flood. static long blimit = DEFAULT_RCU_BLIMIT; -#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ +#define DEFAULT_RCU_QHIMARK 10000 // If this many pending, ignore blimit. static long qhimark = DEFAULT_RCU_QHIMARK; -#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */ +#define DEFAULT_RCU_QLOMARK 100 // Once only this many pending, use blimit. static long qlowmark = DEFAULT_RCU_QLOMARK; #define DEFAULT_RCU_QOVLD_MULT 2 #define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK) -static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */ -static long qovld_calc = -1; /* No pre-initialization lock acquisitions! */ +static long qovld = DEFAULT_RCU_QOVLD; // If this many pending, hammer QS. +static long qovld_calc = -1; // No pre-initialization lock acquisitions! module_param(blimit, long, 0444); module_param(qhimark, long, 0444); From f19920e412fdeed1e15691bcee5b40e18b8e96ff Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2020 09:40:18 -0700 Subject: [PATCH 053/168] rcu: Always set .need_qs from __rcu_read_lock() for strict GPs The ->rcu_read_unlock_special.b.need_qs field in the task_struct structure indicates that the RCU core needs a quiscent state from the corresponding task. The __rcu_read_unlock() function checks this (via an eventual call to rcu_preempt_deferred_qs_irqrestore()), and if set reports a quiscent state immediately upon exit from the outermost RCU read-side critical section. Currently, this flag is only set when the scheduling-clock interrupt decides that the current RCU grace period is too old, as in about one full second too old. But if the kernel has been built with CONFIG_RCU_STRICT_GRACE_PERIOD=y, we clearly do not want to wait that long. This commit therefore sets the .need_qs field immediately at the start of the RCU read-side critical section from within __rcu_read_lock() in order to unconditionally enlist help from __rcu_read_unlock(). But note the additional check for rcu_state.gp_kthread, which prevents attempts to awaken RCU's grace-period kthread during early boot before there is a scheduler. Leaving off this check results in early boot hangs. So early that there is no console output. Thus, this additional check fails until such time as RCU's grace-period kthread has been created, avoiding these empty-console hangs. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 44cf77db7cae..668bbd2be807 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -376,6 +376,8 @@ void __rcu_read_lock(void) rcu_preempt_read_enter(); if (IS_ENABLED(CONFIG_PROVE_LOCKING)) WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && rcu_state.gp_kthread) + WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true); barrier(); /* critical section after entry code. */ } EXPORT_SYMBOL_GPL(__rcu_read_lock); From 44bad5b3cca2d452d17ef82841b20b42a2cf11a0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2020 15:12:50 -0700 Subject: [PATCH 054/168] rcu: Do full report for .need_qs for strict GPs The rcu_preempt_deferred_qs_irqrestore() function is invoked at the end of an RCU read-side critical section (for example, directly from rcu_read_unlock()) and, if .need_qs is set, invokes rcu_qs() to report the new quiescent state. This works, except that rcu_qs() only updates per-CPU state, leaving reporting of the actual quiescent state to a later call to rcu_report_qs_rdp(), for example from within a later RCU_SOFTIRQ instance. Although this approach is exactly what you want if you are more concerned about efficiency than about short grace periods, in CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, short grace periods are the name of the game. This commit therefore makes rcu_preempt_deferred_qs_irqrestore() directly invoke rcu_report_qs_rdp() in CONFIG_RCU_STRICT_GRACE_PERIOD=y, thus shortening grace periods. Historical note: To the best of my knowledge, causing rcu_read_unlock() to directly report a quiescent state first appeared in Jim Houston's and Joe Korty's JRCU. This is the second instance of a Linux-kernel RCU feature being inspired by JRCU, the first being RCU callback offloading (as in the RCU_NOCB_CPU Kconfig option). Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 668bbd2be807..dfdb9020f136 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -459,8 +459,12 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) return; } t->rcu_read_unlock_special.s = 0; - if (special.b.need_qs) - rcu_qs(); + if (special.b.need_qs) { + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + rcu_report_qs_rdp(rdp->cpu, rdp); + else + rcu_qs(); + } /* * Respond to a request by an expedited grace period for a From 1a2f5d57a33f7b9189b6b3e997eb858301482d79 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2020 16:35:08 -0700 Subject: [PATCH 055/168] rcu: Attempt QS when CPU discovers GP for strict GPs A given CPU normally notes a new grace period during one RCU_SOFTIRQ, but avoids reporting the corresponding quiescent state until some later RCU_SOFTIRQ. This leisurly approach improves efficiency by increasing the number of update requests served by each grace period, but is not what is needed for kernels built with CONFIG_RCU_STRICT_GRACE_PERIOD=y. This commit therefore adds a new rcu_strict_gp_check_qs() function which, in CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, simply enters and immediately exist an RCU read-side critical section. If the CPU is in a quiescent state, the rcu_read_unlock() will attempt to report an immediate quiescent state. This rcu_strict_gp_check_qs() function is invoked from note_gp_changes(), so that a CPU just noticing a new grace period might immediately report a quiescent state for that grace period. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 443685704f5e..36a860c4648b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1574,6 +1574,19 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, raw_spin_unlock_rcu_node(rnp); } +/* + * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, attempt to generate a + * quiescent state. This is intended to be invoked when the CPU notices + * a new grace period. + */ +static void rcu_strict_gp_check_qs(void) +{ + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { + rcu_read_lock(); + rcu_read_unlock(); + } +} + /* * Update CPU-local rcu_data state to record the beginnings and ends of * grace periods. The caller must hold the ->lock of the leaf rcu_node @@ -1644,6 +1657,7 @@ static void note_gp_changes(struct rcu_data *rdp) } needwake = __note_gp_changes(rnp, rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + rcu_strict_gp_check_qs(); if (needwake) rcu_gp_kthread_wake(); } From 933ada2c3310aa88807e65c8d498b74a2159a9a2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2020 19:21:48 -0700 Subject: [PATCH 056/168] rcu: IPI all CPUs at GP start for strict GPs Currently, each CPU discovers the beginning of a given grace period on its own time, which is again good for efficiency but bad for fast grace periods. This commit therefore uses on_each_cpu() to IPI each CPU after grace-period initialization in order to inform each CPU of the new grace period in a timely manner, but only in kernels build with CONFIG_RCU_STRICT_GRACE_PERIOD=y. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 36a860c4648b..88f4fa639964 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1695,6 +1695,15 @@ static void rcu_gp_torture_wait(void) } } +/* + * Handler for on_each_cpu() to invoke the target CPU's RCU core + * processing. + */ +static void rcu_strict_gp_boundary(void *unused) +{ + invoke_rcu_core(); +} + /* * Initialize a new grace period. Return false if no grace period required. */ @@ -1823,6 +1832,10 @@ static bool rcu_gp_init(void) WRITE_ONCE(rcu_state.gp_activity, jiffies); } + // If strict, make all CPUs aware of new grace period. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + on_each_cpu(rcu_strict_gp_boundary, NULL, 0); + return true; } From 4e025f52a1e0e8ff4e303fa0a80e2061ccfa27d6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2020 19:42:47 -0700 Subject: [PATCH 057/168] rcu: IPI all CPUs at GP end for strict GPs Currently, each CPU discovers the end of a given grace period on its own time, which is again good for efficiency but bad for fast grace periods, given that it is things like kfree() within the RCU callbacks that will cause trouble for pointers leaked from RCU read-side critical sections. This commit therefore uses on_each_cpu() to IPI each CPU after grace-period cleanup in order to inform each CPU of the end of the old grace period in a timely manner, but only in kernels build with CONFIG_RCU_STRICT_GRACE_PERIOD=y. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 88f4fa639964..4bbedfc0f79b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2052,6 +2052,10 @@ static void rcu_gp_cleanup(void) rcu_state.gp_flags & RCU_GP_FLAG_INIT); } raw_spin_unlock_irq_rcu_node(rnp); + + // If strict, make all CPUs aware of the end of the old grace period. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + on_each_cpu(rcu_strict_gp_boundary, NULL, 0); } /* From 3d29aaf1ef992b5b4612fe32b9e6f517f7bba904 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Aug 2020 13:44:10 -0700 Subject: [PATCH 058/168] rcu: Provide optional RCU-reader exit delay for strict GPs The goal of this series is to increase the probability of tools like KASAN detecting that an RCU-protected pointer was used outside of its RCU read-side critical section. Thus far, the approach has been to make grace periods and callback processing happen faster. Another approach is to delay the pointer leaker. This commit therefore allows a delay to be applied to exit from RCU read-side critical sections. This slowdown is specified by a new rcutree.rcu_unlock_delay kernel boot parameter that specifies this delay in microseconds, defaulting to zero. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 9 +++++++++ kernel/rcu/tree_plugin.h | 12 ++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bdc1f33fd3d1..cb9062440dda 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4152,6 +4152,15 @@ This wake_up() will be accompanied by a WARN_ONCE() splat and an ftrace_dump(). + rcutree.rcu_unlock_delay= [KNL] + In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, + this specifies an rcu_read_unlock()-time delay + in microseconds. This defaults to zero. + Larger delays increase the probability of + catching RCU pointer leaks, that is, buggy use + of RCU-protected pointers after the relevant + rcu_read_unlock() has completed. + rcutree.sysrq_rcu= [KNL] Commandeer a sysrq key to dump out Tree RCU's rcu_node tree with an eye towards determining diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index dfdb9020f136..3f3a4ffd4df2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -430,6 +430,12 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) return !list_empty(&rnp->blkd_tasks); } +// Add delay to rcu_read_unlock() for strict grace periods. +static int rcu_unlock_delay; +#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD +module_param(rcu_unlock_delay, int, 0444); +#endif + /* * Report deferred quiescent states. The deferral time can * be quite short, for example, in the case of the call from @@ -460,10 +466,12 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) } t->rcu_read_unlock_special.s = 0; if (special.b.need_qs) { - if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { rcu_report_qs_rdp(rdp->cpu, rdp); - else + udelay(rcu_unlock_delay); + } else { rcu_qs(); + } } /* From a657f2617010ae237db5693f875968c28e8f732f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 8 Aug 2020 07:56:31 -0700 Subject: [PATCH 059/168] rcu: Execute RCU reader shortly after rcu_core for strict GPs A kernel built with CONFIG_RCU_STRICT_GRACE_PERIOD=y needs a quiescent state to appear very shortly after a CPU has noticed a new grace period. Placing an RCU reader immediately after this point is ineffective because this normally happens in softirq context, which acts as a big RCU reader. This commit therefore introduces a new per-CPU work_struct, which is used at the end of rcu_core() processing to schedule an RCU read-side critical section from within a clean environment. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 +++++++++++++ kernel/rcu/tree.h | 1 + 2 files changed, 14 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4bbedfc0f79b..31995b3f0ed9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2646,6 +2646,14 @@ void rcu_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); +// Workqueue handler for an RCU reader for kernels enforcing struct RCU +// grace periods. +static void strict_work_handler(struct work_struct *work) +{ + rcu_read_lock(); + rcu_read_unlock(); +} + /* Perform RCU core processing work for the current CPU. */ static __latent_entropy void rcu_core(void) { @@ -2690,6 +2698,10 @@ static __latent_entropy void rcu_core(void) /* Do any needed deferred wakeups of rcuo kthreads. */ do_nocb_deferred_wakeup(rdp); trace_rcu_utilization(TPS("End RCU core")); + + // If strict GPs, schedule an RCU reader in a clean environment. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work); } static void rcu_core_si(struct softirq_action *h) @@ -3887,6 +3899,7 @@ rcu_boot_init_percpu_data(int cpu) /* Set up local state, ensuring consistent view of global state. */ rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); + INIT_WORK(&rdp->strict_work, strict_work_handler); WARN_ON_ONCE(rdp->dynticks_nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp))); rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index c96ae351688b..5831ac0b254f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -164,6 +164,7 @@ struct rcu_data { /* period it is aware of. */ struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ bool defer_qs_iw_pending; /* Scheduler attention pending? */ + struct work_struct strict_work; /* Schedule readers for strict GPs. */ /* 2) batch handling */ struct rcu_segcblist cblist; /* Segmented callback list, with */ From aa40c138cc8f36e2f5c721fd1bdb823a1ef1a237 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Aug 2020 09:58:03 -0700 Subject: [PATCH 060/168] rcu: Report QS for outermost PREEMPT=n rcu_read_unlock() for strict GPs The CONFIG_PREEMPT=n instance of rcu_read_unlock is even more aggressively than that of CONFIG_PREEMPT=y in deferring reporting quiescent states to the RCU core. This is just what is wanted in normal use because it reduces overhead, but the resulting delay is not what is wanted for kernels built with CONFIG_RCU_STRICT_GRACE_PERIOD=y. This commit therefore adds an rcu_read_unlock_strict() function that checks for exceptional conditions, and reports the newly started quiescent state if it is safe to do so, also doing a spin-delay if requested via rcutree.rcu_unlock_delay. This commit also adds a call to rcu_read_unlock_strict() from the CONFIG_PREEMPT=n instance of __rcu_read_unlock(). [ paulmck: Fixed bug located by kernel test robot ] Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 7 +++++++ kernel/rcu/tree.c | 6 ++++++ kernel/rcu/tree_plugin.h | 24 ++++++++++++++++++------ 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d15d46db61f7..522529a13786 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -55,6 +55,12 @@ void __rcu_read_unlock(void); #else /* #ifdef CONFIG_PREEMPT_RCU */ +#ifdef CONFIG_TINY_RCU +#define rcu_read_unlock_strict() do { } while (0) +#else +void rcu_read_unlock_strict(void); +#endif + static inline void __rcu_read_lock(void) { preempt_disable(); @@ -63,6 +69,7 @@ static inline void __rcu_read_lock(void) static inline void __rcu_read_unlock(void) { preempt_enable(); + rcu_read_unlock_strict(); } static inline int rcu_preempt_depth(void) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 31995b3f0ed9..a295cadf7c2f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -178,6 +178,12 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +// Add delay to rcu_read_unlock() for strict grace periods. +static int rcu_unlock_delay; +#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD +module_param(rcu_unlock_delay, int, 0444); +#endif + /* * This rcu parameter is runtime-read-only. It reflects * a minimum allowed number of objects which can be cached diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3f3a4ffd4df2..25a676dff5de 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -430,12 +430,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) return !list_empty(&rnp->blkd_tasks); } -// Add delay to rcu_read_unlock() for strict grace periods. -static int rcu_unlock_delay; -#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD -module_param(rcu_unlock_delay, int, 0444); -#endif - /* * Report deferred quiescent states. The deferral time can * be quite short, for example, in the case of the call from @@ -784,6 +778,24 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) #else /* #ifdef CONFIG_PREEMPT_RCU */ +/* + * If strict grace periods are enabled, and if the calling + * __rcu_read_unlock() marks the beginning of a quiescent state, immediately + * report that quiescent state and, if requested, spin for a bit. + */ +void rcu_read_unlock_strict(void) +{ + struct rcu_data *rdp; + + if (!IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) || + irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) + return; + rdp = this_cpu_ptr(&rcu_data); + rcu_report_qs_rdp(rdp->cpu, rdp); + udelay(rcu_unlock_delay); +} +EXPORT_SYMBOL_GPL(rcu_read_unlock_strict); + /* * Tell them what RCU they are running. */ From cfeac3977ab4b6222a01f79997739d2367a8cc94 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 Aug 2020 11:26:14 -0700 Subject: [PATCH 061/168] rcu: Remove unused "cpu" parameter from rcu_report_qs_rdp() The "cpu" parameter to rcu_report_qs_rdp() is not used, with rdp->cpu being used instead. Furtheremore, every call to rcu_report_qs_rdp() invokes it on rdp->cpu. This commit therefore removes this unused "cpu" parameter and converts a check of rdp->cpu against smp_processor_id() to a WARN_ON_ONCE(). Reported-by: Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 ++++---- kernel/rcu/tree_plugin.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a295cadf7c2f..c6127651efc6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2240,7 +2240,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) * structure. This must be called from the specified CPU. */ static void -rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) +rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; @@ -2249,6 +2249,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_node *rnp; + WARN_ON_ONCE(rdp->cpu != smp_processor_id()); rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || @@ -2265,8 +2266,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) return; } mask = rdp->grpmask; - if (rdp->cpu == smp_processor_id()) - rdp->core_needs_qs = false; + rdp->core_needs_qs = false; if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { @@ -2315,7 +2315,7 @@ rcu_check_quiescent_state(struct rcu_data *rdp) * Tell RCU we are done (but rcu_report_qs_rdp() will be the * judge of that). */ - rcu_report_qs_rdp(rdp->cpu, rdp); + rcu_report_qs_rdp(rdp); } /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 25a676dff5de..ca31be019f55 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -461,7 +461,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) t->rcu_read_unlock_special.s = 0; if (special.b.need_qs) { if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { - rcu_report_qs_rdp(rdp->cpu, rdp); + rcu_report_qs_rdp(rdp); udelay(rcu_unlock_delay); } else { rcu_qs(); @@ -791,7 +791,7 @@ void rcu_read_unlock_strict(void) irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) return; rdp = this_cpu_ptr(&rcu_data); - rcu_report_qs_rdp(rdp->cpu, rdp); + rcu_report_qs_rdp(rdp); udelay(rcu_unlock_delay); } EXPORT_SYMBOL_GPL(rcu_read_unlock_strict); From 83224afd11d71e0d6effb86fe1ab5725d5415251 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Jun 2020 13:22:17 -0700 Subject: [PATCH 062/168] rcutorture: Remove KCSAN stubs KCSAN is now in mainline, so this commit removes the stubs for the data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f453bf8d2f1e..db3786133644 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -52,19 +52,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - /* Bits for ->extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ #define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1) From 959954df0ca7da2111c3fb67a666681798d15b9d Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 18 Jun 2020 16:29:55 -0400 Subject: [PATCH 063/168] rcutorture: Output number of elapsed grace periods This commit adds code to print the grace-period number at the start of the test along with both the grace-period number and the number of elapsed grace periods at the end of the test. Note that variants of RCU)without the notion of a grace-period number (for example, Tiny RCU) just print zeroes. [ paulmck: Adjust commit log. ] Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index db3786133644..c8206ff6007f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -172,6 +172,7 @@ static long n_barrier_successes; /* did rcu_barrier test succeed? */ static unsigned long n_read_exits; static struct list_head rcu_torture_removed; static unsigned long shutdown_jiffies; +static unsigned long start_gp_seq; static int rcu_torture_writer_state; #define RTWS_FIXED_DELAY 0 @@ -2469,8 +2470,9 @@ rcu_torture_cleanup(void) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); - pr_alert("%s: End-test grace-period state: g%lu f%#x\n", - cur_ops->name, gp_seq, flags); + pr_alert("%s: End-test grace-period state: g%ld f%#x total-gps=%ld\n", + cur_ops->name, (long)gp_seq, flags, + rcutorture_seq_diff(gp_seq, start_gp_seq)); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); if (rcu_torture_can_boost()) @@ -2594,6 +2596,8 @@ rcu_torture_init(void) long i; int cpu; int firsterr = 0; + int flags = 0; + unsigned long gp_seq = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, @@ -2636,6 +2640,11 @@ rcu_torture_init(void) nrealreaders = 1; } rcu_torture_print_module_parms(cur_ops, "Start of test"); + rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); + start_gp_seq = gp_seq; + pr_alert("%s: Start-test grace-period state: g%ld f%#x\n", + cur_ops->name, (long)gp_seq, flags); /* Set up the freelist. */ From fbb9f8531a0d6693189783d295114db4c30624ca Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 2 Jul 2020 15:59:05 -0700 Subject: [PATCH 064/168] torture: document --allcpus argument added to the kvm.sh script Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index e655983b7429..0a0846389a6e 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -46,6 +46,7 @@ jitter="-1" usage () { echo "Usage: $scriptname optional arguments:" + echo " --allcpus" echo " --bootargs kernel-boot-arguments" echo " --bootimage relative-path-to-kernel-boot-image" echo " --buildonly" From d49bed9abc3454bd123cbe974ecbeae119701b92 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 3 Jul 2020 13:05:27 +0800 Subject: [PATCH 065/168] locktorture: Make function torture_percpu_rwsem_init() static The sparse tool complains as follows: kernel/locking/locktorture.c:569:6: warning: symbol 'torture_percpu_rwsem_init' was not declared. Should it be static? And this function is not used outside of locktorture.c, so this commit marks it static. Signed-off-by: Wei Yongjun Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 9cfa5e89cff7..62d215b2e39f 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -566,7 +566,7 @@ static struct lock_torture_ops rwsem_lock_ops = { #include static struct percpu_rw_semaphore pcpu_rwsem; -void torture_percpu_rwsem_init(void) +static void torture_percpu_rwsem_init(void) { BUG_ON(percpu_init_rwsem(&pcpu_rwsem)); } From afcdf2319d11e0d68e45babd5df65f79771074b5 Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Mon, 13 Jul 2020 21:37:06 +0200 Subject: [PATCH 066/168] rcutorture: Replace HTTP links with HTTPS ones Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. Deterministic algorithm: For each file: If not .svg: For each line: If doesn't contain `\bxmlns\b`: For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`: If both the HTTP and HTTPS versions return 200 OK and serve the same content: Replace HTTP with HTTPS. Signed-off-by: Alexander A. Klimov Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/doc/rcu-test-image.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/rcutorture/doc/rcu-test-image.txt b/tools/testing/selftests/rcutorture/doc/rcu-test-image.txt index 449cf579d6f9..cc280ba157a3 100644 --- a/tools/testing/selftests/rcutorture/doc/rcu-test-image.txt +++ b/tools/testing/selftests/rcutorture/doc/rcu-test-image.txt @@ -36,7 +36,7 @@ References: https://help.ubuntu.com/community/JeOSVMBuilder http://wiki.libvirt.org/page/UbuntuKVMWalkthrough http://www.moe.co.uk/2011/01/07/pci_add_option_rom-failed-to-find-romfile-pxe-rtl8139-bin/ -- "apt-get install kvm-pxe" - http://www.landley.net/writing/rootfs-howto.html - http://en.wikipedia.org/wiki/Initrd - http://en.wikipedia.org/wiki/Cpio + https://www.landley.net/writing/rootfs-howto.html + https://en.wikipedia.org/wiki/Initrd + https://en.wikipedia.org/wiki/Cpio http://wiki.libvirt.org/page/UbuntuKVMWalkthrough From 33595581f53011d1f0ba64a9a2f76d6fa5528f7f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Jul 2020 14:18:33 -0700 Subject: [PATCH 067/168] torture: Update initrd documentation The rcu-test-image.txt documentation covers a very uncommon case where a real userspace environment is required. However, someone reading this document might reasonably conclude that this is in fact a prerequisite. In addition, the initrd.txt file mentions dracut, which is no longer used. This commit therefore provides the needed updates. Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/doc/initrd.txt | 36 ++++--------------- .../rcutorture/doc/rcu-test-image.txt | 35 +++++++++++++++--- 2 files changed, 37 insertions(+), 34 deletions(-) diff --git a/tools/testing/selftests/rcutorture/doc/initrd.txt b/tools/testing/selftests/rcutorture/doc/initrd.txt index 933b4fd12327..41a4255865d4 100644 --- a/tools/testing/selftests/rcutorture/doc/initrd.txt +++ b/tools/testing/selftests/rcutorture/doc/initrd.txt @@ -1,12 +1,11 @@ -The rcutorture scripting tools automatically create the needed initrd -directory using dracut. Failing that, this tool will create an initrd -containing a single statically linked binary named "init" that loops -over a very long sleep() call. In both cases, this creation is done -by tools/testing/selftests/rcutorture/bin/mkinitrd.sh. +The rcutorture scripting tools automatically create an initrd containing +a single statically linked binary named "init" that loops over a +very long sleep() call. In both cases, this creation is done by +tools/testing/selftests/rcutorture/bin/mkinitrd.sh. -However, if you are attempting to run rcutorture on a system that does -not have dracut installed, and if you don't like the notion of static -linking, you might wish to press an existing initrd into service: +However, if you don't like the notion of statically linked bare-bones +userspace environments, you might wish to press an existing initrd +into service: ------------------------------------------------------------------------ cd tools/testing/selftests/rcutorture @@ -15,24 +14,3 @@ mkdir initrd cd initrd cpio -id < /tmp/initrd.img.zcat # Manually verify that initrd contains needed binaries and libraries. ------------------------------------------------------------------------- - -Interestingly enough, if you are running rcutorture, you don't really -need userspace in many cases. Running without userspace has the -advantage of allowing you to test your kernel independently of the -distro in place, the root-filesystem layout, and so on. To make this -happen, put the following script in the initrd's tree's "/init" file, -with 0755 mode. - ------------------------------------------------------------------------- -#!/bin/sh - -while : -do - sleep 10 -done ------------------------------------------------------------------------- - -This approach also allows most of the binaries and libraries in the -initrd filesystem to be dispensed with, which can save significant -space in rcutorture's "res" directory. diff --git a/tools/testing/selftests/rcutorture/doc/rcu-test-image.txt b/tools/testing/selftests/rcutorture/doc/rcu-test-image.txt index cc280ba157a3..b2fc247976b1 100644 --- a/tools/testing/selftests/rcutorture/doc/rcu-test-image.txt +++ b/tools/testing/selftests/rcutorture/doc/rcu-test-image.txt @@ -1,8 +1,33 @@ -This document describes one way to create the rcu-test-image file -that contains the filesystem used by the guest-OS kernel. There are -probably much better ways of doing this, and this filesystem could no -doubt be smaller. It is probably also possible to simply download -an appropriate image from any number of places. +Normally, a minimal initrd is created automatically by the rcutorture +scripting. But minimal really does mean "minimal", namely just a single +root directory with a single statically linked executable named "init": + +$ size tools/testing/selftests/rcutorture/initrd/init + text data bss dec hex filename + 328 0 8 336 150 tools/testing/selftests/rcutorture/initrd/init + +Suppose you need to run some scripts, perhaps to monitor or control +some aspect of the rcutorture testing. This will require a more fully +filled-out userspace, perhaps containing libraries, executables for +the shell and other utilities, and soforth. In that case, place your +desired filesystem here: + + tools/testing/selftests/rcutorture/initrd + +For example, your tools/testing/selftests/rcutorture/initrd/init might +be a script that does any needed mount operations and starts whatever +scripts need starting to properly monitor or control your testing. +The next rcutorture build will then incorporate this filesystem into +the kernel image that is passed to qemu. + +Or maybe you need a real root filesystem for some reason, in which case +please read on! + +The remainder of this document describes one way to create the +rcu-test-image file that contains the filesystem used by the guest-OS +kernel. There are probably much better ways of doing this, and this +filesystem could no doubt be smaller. It is probably also possible to +simply download an appropriate image from any number of places. That said, here are the commands: From fc848cf4face352dce663c1fcc73717fba2d4557 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Jul 2020 11:02:15 -0700 Subject: [PATCH 068/168] rcutorture: Add CONFIG_PROVE_RCU_LIST to TREE05 Currently, the CONFIG_PROVE_RCU_LIST=y case is untested. This commit therefore adds CONFIG_PROVE_RCU_LIST=y to rcutorture's TREE05 scenario. Cc: Madhuparna Bhowmik Cc: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/configs/rcu/TREE05 | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05 b/tools/testing/selftests/rcutorture/configs/rcu/TREE05 index 2dde0d9964e3..4f95f8544f3f 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE05 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05 @@ -16,5 +16,6 @@ CONFIG_RCU_NOCB_CPU=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y +CONFIG_PROVE_RCU_LIST=y CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y From 5461808889405de254ab3370aa7f07ac0b6cb938 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 19 Jul 2020 12:17:53 -0700 Subject: [PATCH 069/168] torture: Add kvm.sh --help and update help message This commit adds a --help argument (along with its synonym -h) to display the help text. While in the area, this commit also updates the help text. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 0a0846389a6e..fc15b527172f 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -56,17 +56,18 @@ usage () { echo " --defconfig string" echo " --dryrun sched|script" echo " --duration minutes" + echo " --help" echo " --interactive" echo " --jitter N [ maxsleep (us) [ maxspin (us) ] ]" echo " --kconfig Kconfig-options" echo " --kmake-arg kernel-make-arguments" echo " --mac nn:nn:nn:nn:nn:nn" - echo " --memory megabytes | nnnG" + echo " --memory megabytes|nnnG" echo " --no-initrd" echo " --qemu-args qemu-arguments" echo " --qemu-cmd qemu-system-..." echo " --results absolute-pathname" - echo " --torture rcu" + echo " --torture lock|rcu|rcuperf|refscale|scf" echo " --trust-make" exit 1 } @@ -127,6 +128,9 @@ do dur=$(($2*60)) shift ;; + --help|-h) + usage + ;; --interactive) TORTURE_QEMU_INTERACTIVE=1; export TORTURE_QEMU_INTERACTIVE ;; From c8fa63714763b7795a3f5fb7ed6d000763e6dccc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 19 Jul 2020 14:40:31 -0700 Subject: [PATCH 070/168] rcutorture: Properly set rcu_fwds for OOM handling The conversion of rcu_fwds to dynamic allocation failed to actually allocate the required structure. This commit therefore allocates it, frees it, and updates rcu_fwds accordingly. While in the area, it abstracts the cleanup actions into rcu_torture_fwd_prog_cleanup(). Fixes: 5155be9994e5 ("rcutorture: Dynamically allocate rcu_fwds structure") Reported-by: kernel test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index c8206ff6007f..7942be453a14 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2148,9 +2148,20 @@ static int __init rcu_torture_fwd_prog_init(void) return -ENOMEM; spin_lock_init(&rfp->rcu_fwd_lock); rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; + rcu_fwds = rfp; return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); } +static void rcu_torture_fwd_prog_cleanup(void) +{ + struct rcu_fwd *rfp; + + torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); + rfp = rcu_fwds; + rcu_fwds = NULL; + kfree(rfp); +} + /* Callback function for RCU barrier testing. */ static void rcu_torture_barrier_cbf(struct rcu_head *rcu) { @@ -2448,7 +2459,7 @@ rcu_torture_cleanup(void) show_rcu_gp_kthreads(); rcu_torture_read_exit_cleanup(); rcu_torture_barrier_cleanup(); - torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); + rcu_torture_fwd_prog_cleanup(); torture_stop_kthread(rcu_torture_stall, stall_task); torture_stop_kthread(rcu_torture_writer, writer_task); From 57f602022e82ee8fa6476d0e16ddbaf3eb86b245 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 20 Jul 2020 08:34:07 -0700 Subject: [PATCH 071/168] rcutorture: Properly synchronize with OOM notifier The current rcutorture forward-progress code assumes that it is the only cause of out-of-memory (OOM) events. For script-based rcutorture testing, this assumption is in fact correct. However, testing based on modprobe/rmmod might well encounter external OOM events, which could happen at any time. This commit therefore properly synchronizes the interaction between rcutorture's forward-progress testing and its OOM notifier by adding a global mutex. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7942be453a14..2b3f04e0af03 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1796,6 +1796,7 @@ struct rcu_fwd { unsigned long rcu_launder_gp_seq_start; }; +static DEFINE_MUTEX(rcu_fwd_mutex); static struct rcu_fwd *rcu_fwds; static bool rcu_fwd_emergency_stop; @@ -2062,8 +2063,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { - struct rcu_fwd *rfp = rcu_fwds; + struct rcu_fwd *rfp; + mutex_lock(&rcu_fwd_mutex); + rfp = rcu_fwds; + if (!rfp) { + mutex_unlock(&rcu_fwd_mutex); + return NOTIFY_OK; + } WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); rcu_torture_fwd_cb_hist(rfp); @@ -2081,6 +2088,7 @@ static int rcutorture_oom_notify(struct notifier_block *self, smp_mb(); /* Frees before return to avoid redoing OOM. */ (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */ pr_info("%s returning after OOM processing.\n", __func__); + mutex_unlock(&rcu_fwd_mutex); return NOTIFY_OK; } @@ -2148,7 +2156,9 @@ static int __init rcu_torture_fwd_prog_init(void) return -ENOMEM; spin_lock_init(&rfp->rcu_fwd_lock); rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; + mutex_lock(&rcu_fwd_mutex); rcu_fwds = rfp; + mutex_unlock(&rcu_fwd_mutex); return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); } @@ -2158,7 +2168,9 @@ static void rcu_torture_fwd_prog_cleanup(void) torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); rfp = rcu_fwds; + mutex_lock(&rcu_fwd_mutex); rcu_fwds = NULL; + mutex_unlock(&rcu_fwd_mutex); kfree(rfp); } From 58db5785b0d76be4582a32a7900acce88e691d36 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 16 Jul 2020 15:38:56 +0100 Subject: [PATCH 072/168] refperf: Avoid null pointer dereference when buf fails to allocate Currently in the unlikely event that buf fails to be allocated it is dereferenced a few times. Use the errexit flag to determine if buf should be written to to avoid the null pointer dereferences. Addresses-Coverity: ("Dereference after null check") Fixes: f518f154ecef ("refperf: Dynamically allocate experiment-summary output buffer") Signed-off-by: Colin Ian King Signed-off-by: Paul E. McKenney --- kernel/rcu/refscale.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index d9291f883b54..952595c678b3 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -546,9 +546,11 @@ static int main_func(void *arg) // Print the average of all experiments SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); - buf[0] = 0; - strcat(buf, "\n"); - strcat(buf, "Runs\tTime(ns)\n"); + if (!errexit) { + buf[0] = 0; + strcat(buf, "\n"); + strcat(buf, "Runs\tTime(ns)\n"); + } for (exp = 0; exp < nruns; exp++) { u64 avg; From 299c7d94f635ab93ffb0468aec6b6e2176ec5cbf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 22 Jul 2020 10:45:12 -0700 Subject: [PATCH 073/168] rcutorture: Hoist OOM registry up one level Currently, registering and unregistering the OOM notifier is done right before and after the test, respectively. This will not work well for multi-threaded tests, so this commit hoists this registering and unregistering up into the rcu_torture_fwd_prog_init() and rcu_torture_fwd_prog_cleanup() functions. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2b3f04e0af03..983f82fccb18 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2110,13 +2110,11 @@ static int rcu_torture_fwd_prog(void *args) do { schedule_timeout_interruptible(fwd_progress_holdoff * HZ); WRITE_ONCE(rcu_fwd_emergency_stop, false); - register_oom_notifier(&rcutorture_oom_nb); if (!IS_ENABLED(CONFIG_TINY_RCU) || rcu_inkernel_boot_has_ended()) rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); if (rcu_inkernel_boot_has_ended()) rcu_torture_fwd_prog_cr(rfp); - unregister_oom_notifier(&rcutorture_oom_nb); /* Avoid slow periods, better to test when busy. */ stutter_wait("rcu_torture_fwd_prog"); @@ -2159,6 +2157,7 @@ static int __init rcu_torture_fwd_prog_init(void) mutex_lock(&rcu_fwd_mutex); rcu_fwds = rfp; mutex_unlock(&rcu_fwd_mutex); + register_oom_notifier(&rcutorture_oom_nb); return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); } @@ -2171,6 +2170,7 @@ static void rcu_torture_fwd_prog_cleanup(void) mutex_lock(&rcu_fwd_mutex); rcu_fwds = NULL; mutex_unlock(&rcu_fwd_mutex); + unregister_oom_notifier(&rcutorture_oom_nb); kfree(rfp); } From d685514260e21aabd65a9aa8be045766bdaa0549 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Aug 2020 10:33:39 -0700 Subject: [PATCH 074/168] rcutorture: Allow pointer leaks to test diagnostic code This commit adds an rcutorture.leakpointer module parameter that intentionally leaks an RCU-protected pointer out of the RCU read-side critical section and checks to see if the corresponding grace period has elapsed, emitting a WARN_ON_ONCE() if so. This module parameter can be used to test facilities like CONFIG_RCU_STRICT_GRACE_PERIOD that end grace periods quickly. While in the area, also document rcutorture.irqreader, which was previously left out. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 12 ++++++++++++ kernel/rcu/rcutorture.c | 4 ++++ 2 files changed, 16 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bdc1f33fd3d1..6d984f153669 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4269,6 +4269,18 @@ are zero, rcutorture acts as if is interpreted they are all non-zero. + rcutorture.irqreader= [KNL] + Run RCU readers from irq handlers, or, more + accurately, from a timer handler. Not all RCU + flavors take kindly to this sort of thing. + + rcutorture.leakpointer= [KNL] + Leak an RCU-protected pointer out of the reader. + This can of course result in splats, and is + intended to test the ability of things like + CONFIG_RCU_STRICT_GRACE_PERIOD=y to detect + such leaks. + rcutorture.n_barrier_cbs= [KNL] Set callbacks/threads for rcu_barrier() testing. diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 983f82fccb18..916ea4f66e4b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -87,6 +87,7 @@ torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); +torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing"); torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads"); @@ -1401,6 +1402,9 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) preempt_enable(); rcutorture_one_extend(&readstate, 0, trsp, rtrsp); WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); + // This next splat is expected behavior if leakpointer, especially + // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. + WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1); /* If error or close call, record the sequence of reader protections. */ if ((pipe_count > 1 || completed > 1) && !xchg(&err_segs_recorded, 1)) { From b67a91703a29b93f5b114052b0b8e0d84e717ad3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 17 Aug 2020 16:44:48 -0700 Subject: [PATCH 075/168] torture: Add gdb support This commit adds a "--gdb" parameter to kvm.sh, which causes "CONFIG_DEBUG_INFO=y" to be added to the Kconfig options, "nokaslr" to be added to the boot parameters, and "-s -S" to be added to the qemu arguments. Furthermore, the scripting prints messages telling the user how to start up gdb for the run in question. Because of the interactive nature of gdb sessions, only one "--configs" scenario is permitted when "--gdb" is specified. For most torture types, this means that a "--configs" argument is required, and that argument must specify the single scenario of interest. The usual cautions about breakpoints and timing apply, for example, staring at your gdb prompt for too long will likely get you many complaints, including RCU CPU stall warnings. Omar Sandoval further suggests using gdb's "hbreak" command instead of the "break" command on systems supporting hardware breakpoints, and further using the "commands" option because the resulting non-interactive breakpoints are less likely to get you RCU CPU stall warnings. Signed-off-by: Paul E. McKenney --- .../rcutorture/bin/kvm-test-1-run.sh | 33 ++++++++++++++----- tools/testing/selftests/rcutorture/bin/kvm.sh | 21 ++++++++++++ 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index e07779a62634..6dc2b49b85ea 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -66,6 +66,7 @@ config_override_param () { echo > $T/KcList config_override_param "$config_dir/CFcommon" KcList "`cat $config_dir/CFcommon 2> /dev/null`" config_override_param "$config_template" KcList "`cat $config_template 2> /dev/null`" +config_override_param "--gdb options" KcList "$TORTURE_KCONFIG_GDB_ARG" config_override_param "--kasan options" KcList "$TORTURE_KCONFIG_KASAN_ARG" config_override_param "--kcsan options" KcList "$TORTURE_KCONFIG_KCSAN_ARG" config_override_param "--kconfig argument" KcList "$TORTURE_KCONFIG_ARG" @@ -152,7 +153,11 @@ qemu_append="`identify_qemu_append "$QEMU"`" boot_args="`configfrag_boot_params "$boot_args" "$config_template"`" # Generate kernel-version-specific boot parameters boot_args="`per_version_boot_params "$boot_args" $resdir/.config $seconds`" -echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd +if test -n "$TORTURE_BOOT_GDB_ARG" +then + boot_args="$boot_args $TORTURE_BOOT_GDB_ARG" +fi +echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_append $boot_args\" $TORTURE_QEMU_GDB_ARG > $resdir/qemu-cmd if test -n "$TORTURE_BUILDONLY" then @@ -171,14 +176,26 @@ echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log # Attempt to run qemu ( . $T/qemu-cmd; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & commandcompleted=0 -sleep 10 # Give qemu's pid a chance to reach the file -if test -s "$resdir/qemu_pid" +if test -z "$TORTURE_KCONFIG_GDB_ARG" then - qemu_pid=`cat "$resdir/qemu_pid"` - echo Monitoring qemu job at pid $qemu_pid -else - qemu_pid="" - echo Monitoring qemu job at yet-as-unknown pid + sleep 10 # Give qemu's pid a chance to reach the file + if test -s "$resdir/qemu_pid" + then + qemu_pid=`cat "$resdir/qemu_pid"` + echo Monitoring qemu job at pid $qemu_pid + else + qemu_pid="" + echo Monitoring qemu job at yet-as-unknown pid + fi +fi +if test -n "$TORTURE_KCONFIG_GDB_ARG" +then + echo Waiting for you to attach a debug session, for example: > /dev/tty + echo " gdb $base_resdir/vmlinux" > /dev/tty + echo 'After symbols load and the "(gdb)" prompt appears:' > /dev/tty + echo " target remote :1234" > /dev/tty + echo " continue" > /dev/tty + kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null` fi while : do diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index fc15b527172f..c30047e52b54 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -31,6 +31,9 @@ TORTURE_DEFCONFIG=defconfig TORTURE_BOOT_IMAGE="" TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD TORTURE_KCONFIG_ARG="" +TORTURE_KCONFIG_GDB_ARG="" +TORTURE_BOOT_GDB_ARG="" +TORTURE_QEMU_GDB_ARG="" TORTURE_KCONFIG_KASAN_ARG="" TORTURE_KCONFIG_KCSAN_ARG="" TORTURE_KMAKE_ARG="" @@ -56,6 +59,7 @@ usage () { echo " --defconfig string" echo " --dryrun sched|script" echo " --duration minutes" + echo " --gdb" echo " --help" echo " --interactive" echo " --jitter N [ maxsleep (us) [ maxspin (us) ] ]" @@ -128,6 +132,11 @@ do dur=$(($2*60)) shift ;; + --gdb) + TORTURE_KCONFIG_GDB_ARG="CONFIG_DEBUG_INFO=y"; export TORTURE_KCONFIG_GDB_ARG + TORTURE_BOOT_GDB_ARG="nokaslr"; export TORTURE_BOOT_GDB_ARG + TORTURE_QEMU_GDB_ARG="-s -S"; export TORTURE_QEMU_GDB_ARG + ;; --help|-h) usage ;; @@ -253,6 +262,15 @@ do done touch $T/cfgcpu configs_derep="`echo $configs_derep | sed -e "s/\/$defaultconfigs/g"`" +if test -n "$TORTURE_KCONFIG_GDB_ARG" +then + if test "`echo $configs_derep | wc -w`" -gt 1 + then + echo "The --config list is: $configs_derep." + echo "Only one --config permitted with --gdb, terminating." + exit 1 + fi +fi for CF1 in $configs_derep do if test -f "$CONFIGFRAG/$CF1" @@ -328,6 +346,9 @@ TORTURE_BUILDONLY="$TORTURE_BUILDONLY"; export TORTURE_BUILDONLY TORTURE_DEFCONFIG="$TORTURE_DEFCONFIG"; export TORTURE_DEFCONFIG TORTURE_INITRD="$TORTURE_INITRD"; export TORTURE_INITRD TORTURE_KCONFIG_ARG="$TORTURE_KCONFIG_ARG"; export TORTURE_KCONFIG_ARG +TORTURE_KCONFIG_GDB_ARG="$TORTURE_KCONFIG_GDB_ARG"; export TORTURE_KCONFIG_GDB_ARG +TORTURE_BOOT_GDB_ARG="$TORTURE_BOOT_GDB_ARG"; export TORTURE_BOOT_GDB_ARG +TORTURE_QEMU_GDB_ARG="$TORTURE_QEMU_GDB_ARG"; export TORTURE_QEMU_GDB_ARG TORTURE_KCONFIG_KASAN_ARG="$TORTURE_KCONFIG_KASAN_ARG"; export TORTURE_KCONFIG_KASAN_ARG TORTURE_KCONFIG_KCSAN_ARG="$TORTURE_KCONFIG_KCSAN_ARG"; export TORTURE_KCONFIG_KCSAN_ARG TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG From 5e0c074e5b4be02d57d1b60abc3391afe7edd088 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 2 Sep 2020 07:40:59 +0300 Subject: [PATCH 076/168] coccinelle: ifnullfree: add vfree(), kvfree*() functions Extend the list of free functions with kvfree(), kvfree_sensitive(), vfree(). Signed-off-by: Denis Efremov Signed-off-by: Julia Lawall --- scripts/coccinelle/free/ifnullfree.cocci | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/scripts/coccinelle/free/ifnullfree.cocci b/scripts/coccinelle/free/ifnullfree.cocci index 2045391e36a0..285b92d5c665 100644 --- a/scripts/coccinelle/free/ifnullfree.cocci +++ b/scripts/coccinelle/free/ifnullfree.cocci @@ -20,8 +20,14 @@ expression E; - if (E != NULL) ( kfree(E); +| + kvfree(E); | kfree_sensitive(E); +| + kvfree_sensitive(E, ...); +| + vfree(E); | debugfs_remove(E); | @@ -42,9 +48,10 @@ position p; @@ * if (E != NULL) -* \(kfree@p\|kfree_sensitive@p\|debugfs_remove@p\|debugfs_remove_recursive@p\| +* \(kfree@p\|kvfree@p\|kfree_sensitive@p\|kvfree_sensitive@p\|vfree@p\| +* debugfs_remove@p\|debugfs_remove_recursive@p\| * usb_free_urb@p\|kmem_cache_destroy@p\|mempool_destroy@p\| -* dma_pool_destroy@p\)(E); +* dma_pool_destroy@p\)(E, ...); @script:python depends on org@ p << r.p; From 53922270d21de707a1a0ffaf1e07644e77fcb8db Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 18 Jun 2020 16:29:49 -0400 Subject: [PATCH 077/168] rcu/segcblist: Prevent useless GP start if no CBs to accelerate The rcu_segcblist_accelerate() function returns true iff it is necessary to request another grace period. A tracing session showed that this function unnecessarily requests grace periods. For example, consider the following sequence of events: 1. Callbacks are queued only on the NEXT segment of CPU A's callback list. 2. CPU A runs RCU_SOFTIRQ, accelerating these callbacks from NEXT to WAIT. 3. Thus rcu_segcblist_accelerate() returns true, requesting grace period N. 4. RCU's grace-period kthread wakes up on CPU B and starts grace period N. 4. CPU A notices the new grace period and invokes RCU_SOFTIRQ. 5. CPU A's RCU_SOFTIRQ again invokes rcu_segcblist_accelerate(), but there are no new callbacks. However, rcu_segcblist_accelerate() nevertheless (uselessly) requests a new grace period N+1. This extra grace period results in additional lock contention and also additional wakeups, all for no good reason. This commit therefore adds a check to rcu_segcblist_accelerate() that prevents the return of true when there are no new callbacks. This change reduces the number of grace periods (GPs) and wakeups in each of eleven five-second rcutorture runs as follows: +----+-------------------+-------------------+ | # | Number of GPs | Number of Wakeups | +====+=========+=========+=========+=========+ | 1 | With | Without | With | Without | +----+---------+---------+---------+---------+ | 2 | 75 | 89 | 113 | 119 | +----+---------+---------+---------+---------+ | 3 | 62 | 91 | 105 | 123 | +----+---------+---------+---------+---------+ | 4 | 60 | 79 | 98 | 110 | +----+---------+---------+---------+---------+ | 5 | 63 | 79 | 99 | 112 | +----+---------+---------+---------+---------+ | 6 | 57 | 89 | 96 | 123 | +----+---------+---------+---------+---------+ | 7 | 64 | 85 | 97 | 118 | +----+---------+---------+---------+---------+ | 8 | 58 | 83 | 98 | 113 | +----+---------+---------+---------+---------+ | 9 | 57 | 77 | 89 | 104 | +----+---------+---------+---------+---------+ | 10 | 66 | 82 | 98 | 119 | +----+---------+---------+---------+---------+ | 11 | 52 | 82 | 83 | 117 | +----+---------+---------+---------+---------+ The reduction in the number of wakeups ranges from 5% to 40%. Cc: urezki@gmail.com [ paulmck: Rework commit log and comment. ] Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 9a0f66133b4b..2d2a6b6b9dfb 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -475,8 +475,16 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) * Also advance to the oldest segment of callbacks whose * ->gp_seq[] completion is at or after that passed in via "seq", * skipping any empty segments. + * + * Note that segment "i" (and any lower-numbered segments + * containing older callbacks) will be unaffected, and their + * grace-period numbers remain unchanged. For example, if i == + * WAIT_TAIL, then neither WAIT_TAIL nor DONE_TAIL will be touched. + * Instead, the CBs in NEXT_TAIL will be merged with those in + * NEXT_READY_TAIL and the grace-period number of NEXT_READY_TAIL + * would be updated. NEXT_TAIL would then be empty. */ - if (++i >= RCU_NEXT_TAIL) + if (rcu_segcblist_restempty(rsclp, i) || ++i >= RCU_NEXT_TAIL) return false; /* From 70060b8770d34f83e9fa4c3526db013dd2773611 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 14 Aug 2020 14:45:57 +0800 Subject: [PATCH 078/168] rcu: Shrink each possible cpu krcp CPUs can go offline shortly after kfree_call_rcu() has been invoked, which can leave memory stranded until those CPUs come back online. This commit therefore drains the kcrp of each CPU, not just the ones that happen to be online. Acked-by: Joel Fernandes Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 232362293678..92450642c4d8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3450,7 +3450,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) unsigned long count = 0; /* Snapshot count of all CPUs */ - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); count += READ_ONCE(krcp->count); @@ -3465,7 +3465,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) int cpu, freed = 0; unsigned long flags; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { int count; struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); @@ -3498,7 +3498,7 @@ void __init kfree_rcu_scheduler_running(void) int cpu; unsigned long flags; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); raw_spin_lock_irqsave(&krcp->lock, flags); From e48c15b796d412ede883bb2ef7779b2a142f7962 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Jun 2020 17:21:32 -0700 Subject: [PATCH 079/168] smp: Add source and destination CPUs to __call_single_data This commit adds a destination CPU to __call_single_data, and is inspired by an earlier commit by Peter Zijlstra. This version adds #ifdef to permit use by 32-bit systems and supplying the destination CPU for all smp_call_function*() requests, not just smp_call_function_single(). If need be, 32-bit systems could be accommodated by shrinking the flags field to 16 bits (the atomic_t variant is currently unused) and by providing only eight bits for CPU on such systems. It is not clear that the addition of the fields to __call_single_node are really needed. [ paulmck: Apply Boqun Feng feedback on 32-bit builds. ] Link: https://lore.kernel.org/lkml/20200615164048.GC2531@hirez.programming.kicks-ass.net/ Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/smp.h | 3 +++ include/linux/smp_types.h | 3 +++ kernel/smp.c | 6 ++++++ 3 files changed, 12 insertions(+) diff --git a/include/linux/smp.h b/include/linux/smp.h index 80d557ef8a11..9f13966d3d92 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -26,6 +26,9 @@ struct __call_single_data { struct { struct llist_node llist; unsigned int flags; +#ifdef CONFIG_64BIT + u16 src, dst; +#endif }; }; smp_call_func_t func; diff --git a/include/linux/smp_types.h b/include/linux/smp_types.h index 364b3ae3e41d..2e8461af8df6 100644 --- a/include/linux/smp_types.h +++ b/include/linux/smp_types.h @@ -61,6 +61,9 @@ struct __call_single_node { unsigned int u_flags; atomic_t a_flags; }; +#ifdef CONFIG_64BIT + u16 src, dst; +#endif }; #endif /* __LINUX_SMP_TYPES_H */ diff --git a/kernel/smp.c b/kernel/smp.c index d0ae8eb6bf8b..865a876f83ce 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -375,6 +375,9 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, csd->func = func; csd->info = info; +#ifdef CONFIG_64BIT + csd->dst = cpu; +#endif err = generic_exec_single(cpu, csd); @@ -540,6 +543,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd->flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; +#ifdef CONFIG_64BIT + csd->dst = cpu; +#endif if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) __cpumask_set_cpu(cpu, cfd->cpumask_ipi); } From 35feb60474bf4f7fa7840e14fc7fd344996b919d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Jun 2020 13:22:54 -0700 Subject: [PATCH 080/168] kernel/smp: Provide CSD lock timeout diagnostics This commit causes csd_lock_wait() to emit diagnostics when a CPU fails to respond quickly enough to one of the smp_call_function() family of function calls. These diagnostics are enabled by a new CSD_LOCK_WAIT_DEBUG Kconfig option that depends on DEBUG_KERNEL. This commit was inspired by an earlier patch by Josef Bacik. [ paulmck: Fix for syzbot+0f719294463916a3fc0e@syzkaller.appspotmail.com ] [ paulmck: Fix KASAN use-after-free issue reported by Qian Cai. ] [ paulmck: Fix botched nr_cpu_ids comparison per Dan Carpenter. ] [ paulmck: Apply Peter Zijlstra feedback. ] Link: https://lore.kernel.org/lkml/00000000000042f21905a991ecea@google.com Link: https://lore.kernel.org/lkml/0000000000002ef21705a9933cf3@google.com Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/smp.c | 132 +++++++++++++++++++++++++++++++++++++++++++++- lib/Kconfig.debug | 11 ++++ 2 files changed, 141 insertions(+), 2 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index 865a876f83ce..c5d31885bd30 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -20,6 +20,9 @@ #include #include #include +#include +#include +#include #include "smpboot.h" #include "sched/smp.h" @@ -96,6 +99,103 @@ void __init call_function_init(void) smpcfd_prepare_cpu(smp_processor_id()); } +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG + +static DEFINE_PER_CPU(call_single_data_t *, cur_csd); +static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); +static DEFINE_PER_CPU(void *, cur_csd_info); + +#define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC) +atomic_t csd_bug_count = ATOMIC_INIT(0); + +/* Record current CSD work for current CPU, NULL to erase. */ +static void csd_lock_record(call_single_data_t *csd) +{ + if (!csd) { + smp_mb(); /* NULL cur_csd after unlock. */ + __this_cpu_write(cur_csd, NULL); + return; + } + __this_cpu_write(cur_csd_func, csd->func); + __this_cpu_write(cur_csd_info, csd->info); + smp_wmb(); /* func and info before csd. */ + __this_cpu_write(cur_csd, csd); + smp_mb(); /* Update cur_csd before function call. */ + /* Or before unlock, as the case may be. */ +} + +static __always_inline int csd_lock_wait_getcpu(call_single_data_t *csd) +{ + unsigned int csd_type; + + csd_type = CSD_TYPE(csd); + if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC) + return csd->dst; /* Other CSD_TYPE_ values might not have ->dst. */ + return -1; +} + +/* + * Complain if too much time spent waiting. Note that only + * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, + * so waiting on other types gets much less information. + */ +static __always_inline bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id) +{ + int cpu = -1; + int cpux; + bool firsttime; + u64 ts2, ts_delta; + call_single_data_t *cpu_cur_csd; + unsigned int flags = READ_ONCE(csd->flags); + + if (!(flags & CSD_FLAG_LOCK)) { + if (!unlikely(*bug_id)) + return true; + cpu = csd_lock_wait_getcpu(csd); + pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n", + *bug_id, raw_smp_processor_id(), cpu); + return true; + } + + ts2 = sched_clock(); + ts_delta = ts2 - *ts1; + if (likely(ts_delta <= CSD_LOCK_TIMEOUT)) + return false; + + firsttime = !*bug_id; + if (firsttime) + *bug_id = atomic_inc_return(&csd_bug_count); + cpu = csd_lock_wait_getcpu(csd); + if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu)) + cpux = 0; + else + cpux = cpu; + cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */ + pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n", + firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0, + cpu, csd->func, csd->info); + if (cpu_cur_csd && csd != cpu_cur_csd) { + pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n", + *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)), + READ_ONCE(per_cpu(cur_csd_info, cpux))); + } else { + pr_alert("\tcsd: CSD lock (#%d) %s.\n", + *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request"); + } + if (cpu >= 0) { + if (!trigger_single_cpu_backtrace(cpu)) + dump_cpu_task(cpu); + if (!cpu_cur_csd) { + pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); + arch_send_call_function_single_ipi(cpu); + } + } + dump_stack(); + *ts1 = ts2; + + return false; +} + /* * csd_lock/csd_unlock used to serialize access to per-cpu csd resources * @@ -103,10 +203,30 @@ void __init call_function_init(void) * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ +static __always_inline void csd_lock_wait(call_single_data_t *csd) +{ + int bug_id = 0; + u64 ts0, ts1; + + ts1 = ts0 = sched_clock(); + for (;;) { + if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id)) + break; + cpu_relax(); + } + smp_acquire__after_ctrl_dep(); +} + +#else +static void csd_lock_record(call_single_data_t *csd) +{ +} + static __always_inline void csd_lock_wait(call_single_data_t *csd) { smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); } +#endif static __always_inline void csd_lock(call_single_data_t *csd) { @@ -166,9 +286,11 @@ static int generic_exec_single(int cpu, call_single_data_t *csd) * We can unlock early even for the synchronous on-stack case, * since we're doing this from the same CPU.. */ + csd_lock_record(csd); csd_unlock(csd); local_irq_save(flags); func(info); + csd_lock_record(NULL); local_irq_restore(flags); return 0; } @@ -268,8 +390,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) entry = &csd_next->llist; } + csd_lock_record(csd); func(info); csd_unlock(csd); + csd_lock_record(NULL); } else { prev = &csd->llist; } @@ -296,8 +420,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) smp_call_func_t func = csd->func; void *info = csd->info; + csd_lock_record(csd); csd_unlock(csd); func(info); + csd_lock_record(NULL); } else if (type == CSD_TYPE_IRQ_WORK) { irq_work_single(csd); } @@ -375,7 +501,8 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, csd->func = func; csd->info = info; -#ifdef CONFIG_64BIT +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG + csd->src = smp_processor_id(); csd->dst = cpu; #endif @@ -543,7 +670,8 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd->flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; -#ifdef CONFIG_64BIT +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG + csd->src = smp_processor_id(); csd->dst = cpu; #endif if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e068c3c7189a..86a35fdfe021 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1367,6 +1367,17 @@ config WW_MUTEX_SELFTEST Say M if you want these self tests to build as a module. Say N if you are unsure. +config CSD_LOCK_WAIT_DEBUG + bool "Debugging for csd_lock_wait(), called from smp_call_function*()" + depends on DEBUG_KERNEL + depends on 64BIT + default n + help + This option enables debug prints when CPUs are slow to respond + to the smp_call_function*() IPI wrappers. These debug prints + include the IPI handler function currently executing (if any) + and relevant stack traces. + endmenu # lock debugging config TRACE_IRQFLAGS From 2b722160f1a7929f38dfb648c7bbb45f96e65a5b Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 6 Jul 2020 21:49:41 +0800 Subject: [PATCH 081/168] smp: Make symbol 'csd_bug_count' static The sparse tool complains as follows: kernel/smp.c:107:10: warning: symbol 'csd_bug_count' was not declared. Should it be static? Because variable is not used outside of smp.c, this commit marks it static. Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/smp.c b/kernel/smp.c index c5d31885bd30..b25383d16e8e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -106,7 +106,7 @@ static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); #define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC) -atomic_t csd_bug_count = ATOMIC_INIT(0); +static atomic_t csd_bug_count = ATOMIC_INIT(0); /* Record current CSD work for current CPU, NULL to erase. */ static void csd_lock_record(call_single_data_t *csd) From 7c9dc603d55617edcc3190f9a6d7b5cf763feb57 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Tue, 1 Sep 2020 12:48:12 +0300 Subject: [PATCH 082/168] coccinelle: misc: add uninitialized_var.cocci script uninitialized_var() macro was removed from the sources [1] and other warning-silencing tricks were deprecated [2]. The purpose of this cocci script is to prevent new occurrences of uninitialized_var() open-coded variants. [1] commit 63a0895d960a ("compiler: Remove uninitialized_var() macro") [2] commit 4b19bec97c88 ("docs: deprecated.rst: Add uninitialized_var()") Cc: Kees Cook Cc: Gustavo A. R. Silva Signed-off-by: Denis Efremov Signed-off-by: Julia Lawall --- .../coccinelle/misc/uninitialized_var.cocci | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 scripts/coccinelle/misc/uninitialized_var.cocci diff --git a/scripts/coccinelle/misc/uninitialized_var.cocci b/scripts/coccinelle/misc/uninitialized_var.cocci new file mode 100644 index 000000000000..8fa845cefe11 --- /dev/null +++ b/scripts/coccinelle/misc/uninitialized_var.cocci @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0-only +/// +/// Please, don't reintroduce uninitialized_var(). +/// From Documentation/process/deprecated.rst: +/// For any compiler warnings about uninitialized variables, just add +/// an initializer. Using warning-silencing tricks is dangerous as it +/// papers over real bugs (or can in the future), and suppresses unrelated +/// compiler warnings (e.g. "unused variable"). If the compiler thinks it +/// is uninitialized, either simply initialize the variable or make compiler +/// changes. Keep in mind that in most cases, if an initialization is +/// obviously redundant, the compiler's dead-store elimination pass will make +/// sure there are no needless variable writes. +/// +// Confidence: High +// Copyright: (C) 2020 Denis Efremov ISPRAS +// Options: --no-includes --include-headers +// + +virtual context +virtual report +virtual org + +@r@ +identifier var; +type T; +position p; +@@ + +( +* T var =@p var; +| +* T var =@p *(&(var)); +| +* var =@p var +| +* var =@p *(&(var)) +) + +@script:python depends on report@ +p << r.p; +@@ + +coccilib.report.print_report(p[0], + "WARNING this kind of initialization is deprecated (https://www.kernel.org/doc/html/latest/process/deprecated.html#uninitialized-var)") + +@script:python depends on org@ +p << r.p; +@@ + +coccilib.org.print_todo(p[0], + "WARNING this kind of initialization is deprecated (https://www.kernel.org/doc/html/latest/process/deprecated.html#uninitialized-var)") From 6519a5ab1a9ffe5cf8056f688a69960bf126e723 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Tue, 11 Aug 2020 10:49:53 +0300 Subject: [PATCH 083/168] coccinelle: api: update kzfree script to kfree_sensitive Commit 453431a54934 ("mm, treewide: rename kzfree() to kfree_sensitive()") renames kzfree to kfree_sensitive and uses memzero_explicit(...) instead of memset(..., 0, ...) internally. Update cocci script to reflect these changes. Signed-off-by: Denis Efremov Signed-off-by: Julia Lawall --- .../{kzfree.cocci => kfree_sensitive.cocci} | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) rename scripts/coccinelle/api/{kzfree.cocci => kfree_sensitive.cocci} (70%) diff --git a/scripts/coccinelle/api/kzfree.cocci b/scripts/coccinelle/api/kfree_sensitive.cocci similarity index 70% rename from scripts/coccinelle/api/kzfree.cocci rename to scripts/coccinelle/api/kfree_sensitive.cocci index 33625bd7cec9..e4a066a0b77d 100644 --- a/scripts/coccinelle/api/kzfree.cocci +++ b/scripts/coccinelle/api/kfree_sensitive.cocci @@ -1,13 +1,13 @@ // SPDX-License-Identifier: GPL-2.0-only /// -/// Use kzfree, kvfree_sensitive rather than memset or -/// memzero_explicit followed by kfree +/// Use kfree_sensitive, kvfree_sensitive rather than memset or +/// memzero_explicit followed by kfree. /// // Confidence: High // Copyright: (C) 2020 Denis Efremov ISPRAS // Options: --no-includes --include-headers // -// Keywords: kzfree, kvfree_sensitive +// Keywords: kfree_sensitive, kvfree_sensitive // virtual context @@ -18,7 +18,8 @@ virtual report @initialize:python@ @@ # kmalloc_oob_in_memset uses memset to explicitly trigger out-of-bounds access -filter = frozenset(['kmalloc_oob_in_memset', 'kzfree', 'kvfree_sensitive']) +filter = frozenset(['kmalloc_oob_in_memset', + 'kfree_sensitive', 'kvfree_sensitive']) def relevant(p): return not (filter & {el.current_element for el in p}) @@ -56,17 +57,13 @@ type T; - memzero_explicit@m((T)E, size); ... when != E when strict -// TODO: uncomment when kfree_sensitive will be merged. -// Only this case is commented out because developers -// may not like patches like this since kzfree uses memset -// internally (not memzero_explicit). -//( -//- kfree(E)@p; -//+ kfree_sensitive(E); -//| +( +- kfree(E)@p; ++ kfree_sensitive(E); +| - \(vfree\|kvfree\)(E)@p; + kvfree_sensitive(E, size); -//) +) @rp_memset depends on patch@ expression E, size; @@ -80,7 +77,7 @@ type T; when strict ( - kfree(E)@p; -+ kzfree(E); ++ kfree_sensitive(E); | - \(vfree\|kvfree\)(E)@p; + kvfree_sensitive(E, size); @@ -91,11 +88,11 @@ p << r.p; @@ coccilib.report.print_report(p[0], - "WARNING: opportunity for kzfree/kvfree_sensitive") + "WARNING: opportunity for kfree_sensitive/kvfree_sensitive") @script:python depends on org@ p << r.p; @@ coccilib.org.print_todo(p[0], - "WARNING: opportunity for kzfree/kvfree_sensitive") + "WARNING: opportunity for kfree_sensitive/kvfree_sensitive") From a19d1358345e040af9164ee7dd0f39ea0a99d565 Mon Sep 17 00:00:00 2001 From: Sumera Priyadarsini Date: Sun, 13 Sep 2020 17:35:48 +0530 Subject: [PATCH 084/168] scripts: coccicheck: Improve error feedback when coccicheck fails Currently, coccicheck fails with only the message "coccicheck failed" and the error code for the failure. To obtain the error logs, one needs to specify a debug file using the DEBUG_FILE option. Modify coccicheck to display error logs when it crashes unless DEBUG_FILE is set, in which case, the error logs are stored in the specified debug file. Signed-off-by: Sumera Priyadarsini Signed-off-by: Julia Lawall --- scripts/coccicheck | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/coccicheck b/scripts/coccicheck index 6e37cf36caae..85136f4fe970 100755 --- a/scripts/coccicheck +++ b/scripts/coccicheck @@ -126,8 +126,14 @@ run_cmd_parmap() { if [ $VERBOSE -ne 0 ] ; then echo "Running ($NPROC in parallel): $@" fi - echo $@ >>$DEBUG_FILE - $@ 2>>$DEBUG_FILE + if [ "$DEBUG_FILE" != "/dev/null" -a "$DEBUG_FILE" != "" ]; then + echo $@>>$DEBUG_FILE + $@ 2>>$DEBUG_FILE + else + echo $@ + $@ 2>&1 + fi + err=$? if [[ $err -ne 0 ]]; then echo "coccicheck failed" From 5498d5f93210ab4c55cd191473c8f4a59cc68e8e Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 11 Sep 2020 17:56:48 +0900 Subject: [PATCH 085/168] zonefs: introduce helper for zone management Introduce a helper function for sending zone management commands to the block device. As zone management commands can change a zone write pointer position reflected in the size of the zone file, this function expects the truncate mutex to be held. Signed-off-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Signed-off-by: Damien Le Moal --- fs/zonefs/super.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 8ec7c8f109d7..6e13a5127b01 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -24,6 +24,26 @@ #include "zonefs.h" +static inline int zonefs_zone_mgmt(struct inode *inode, + enum req_opf op) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + int ret; + + lockdep_assert_held(&zi->i_truncate_mutex); + + ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, + zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); + if (ret) { + zonefs_err(inode->i_sb, + "Zone management operation %s at %llu failed %d\n", + blk_op_str(op), zi->i_zsector, ret); + return ret; + } + + return 0; +} + static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) @@ -397,14 +417,9 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize) if (isize == old_isize) goto unlock; - ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, - zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); - if (ret) { - zonefs_err(inode->i_sb, - "Zone management operation at %llu failed %d", - zi->i_zsector, ret); + ret = zonefs_zone_mgmt(inode, op); + if (ret) goto unlock; - } zonefs_update_stats(inode, isize); truncate_setsize(inode, isize); From 48d546a8dad4c09745d464e12b95f21c773bff39 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 11 Sep 2020 17:56:49 +0900 Subject: [PATCH 086/168] zonefs: provide no-lock zonefs_io_error variant Subsequent patches need to call zonefs_io_error() with the i_truncate_mutex already held, so factor out the body of zonefs_io_error() into __zonefs_io_error() which can be called from with the i_truncate_mutex held. Signed-off-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Signed-off-by: Damien Le Moal --- fs/zonefs/super.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 6e13a5127b01..4309979eeb36 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -348,7 +348,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * eventually correct the file size and zonefs inode write pointer offset * (which can be out of sync with the drive due to partial write failures). */ -static void zonefs_io_error(struct inode *inode, bool write) +static void __zonefs_io_error(struct inode *inode, bool write) { struct zonefs_inode_info *zi = ZONEFS_I(inode); struct super_block *sb = inode->i_sb; @@ -362,8 +362,6 @@ static void zonefs_io_error(struct inode *inode, bool write) }; int ret; - mutex_lock(&zi->i_truncate_mutex); - /* * Memory allocations in blkdev_report_zones() can trigger a memory * reclaim which may in turn cause a recursion into zonefs as well as @@ -379,7 +377,14 @@ static void zonefs_io_error(struct inode *inode, bool write) zonefs_err(sb, "Get inode %lu zone information failed %d\n", inode->i_ino, ret); memalloc_noio_restore(noio_flag); +} +static void zonefs_io_error(struct inode *inode, bool write) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + mutex_lock(&zi->i_truncate_mutex); + __zonefs_io_error(inode, write); mutex_unlock(&zi->i_truncate_mutex); } From b5c00e975779c3d9e6d530c5481309257d5e4220 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 11 Sep 2020 17:56:50 +0900 Subject: [PATCH 087/168] zonefs: open/close zone on file open/close NVMe Zoned Namespace introduced the concept of active zones, which are zones in the implicit open, explicit open or closed condition. Drives may have a limit on the number of zones that can be simultaneously active. This potential limitation translate into a risk for applications to see write IO errors due to this limit if the zone of a file being written to is not already active when a write request is issued. To avoid these potential errors, the zone of a file can explicitly be made active using an open zone command when the file is open for the first time. If the zone open command succeeds, the application is then guaranteed that write requests can be processed. This indirect management of active zones relies on the maximum number of open zones of a drive, which is always lower or equal to the maximum number of active zones. On the first open of a sequential zone file, send a REQ_OP_ZONE_OPEN command to the block device. Conversely, on the last release of a zone file and send a REQ_OP_ZONE_CLOSE to the device if the zone is not full or empty. As truncating a zone file to 0 or max can deactivate a zone as well, we need to serialize against truncates and also be careful not to close a zone as the file may still be open for writing, e.g. the user called ftruncate(). If the zone file is not open and a process does a truncate(), then no close operation is needed. Signed-off-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Signed-off-by: Damien Le Moal --- fs/zonefs/super.c | 183 ++++++++++++++++++++++++++++++++++++++++++++- fs/zonefs/zonefs.h | 10 +++ 2 files changed, 189 insertions(+), 4 deletions(-) diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 4309979eeb36..64cc2a9c38c8 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -44,6 +44,19 @@ static inline int zonefs_zone_mgmt(struct inode *inode, return 0; } +static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + i_size_write(inode, isize); + /* + * A full zone is no longer open/active and does not need + * explicit closing. + */ + if (isize >= zi->i_max_size) + zi->i_flags &= ~ZONEFS_ZONE_OPEN; +} + static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) @@ -321,6 +334,17 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, } } + /* + * If the filesystem is mounted with the explicit-open mount option, we + * need to clear the ZONEFS_ZONE_OPEN flag if the zone transitioned to + * the read-only or offline condition, to avoid attempting an explicit + * close of the zone when the inode file is closed. + */ + if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) && + (zone->cond == BLK_ZONE_COND_OFFLINE || + zone->cond == BLK_ZONE_COND_READONLY)) + zi->i_flags &= ~ZONEFS_ZONE_OPEN; + /* * If error=remount-ro was specified, any error result in remounting * the volume as read-only. @@ -335,7 +359,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * invalid data. */ zonefs_update_stats(inode, data_size); - i_size_write(inode, data_size); + zonefs_i_size_write(inode, data_size); zi->i_wpoffset = data_size; return 0; @@ -426,6 +450,25 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize) if (ret) goto unlock; + /* + * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, + * take care of open zones. + */ + if (zi->i_flags & ZONEFS_ZONE_OPEN) { + /* + * Truncating a zone to EMPTY or FULL is the equivalent of + * closing the zone. For a truncation to 0, we need to + * re-open the zone to ensure new writes can be processed. + * For a truncation to the maximum file size, the zone is + * closed and writes cannot be accepted anymore, so clear + * the open flag. + */ + if (!isize) + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + else + zi->i_flags &= ~ZONEFS_ZONE_OPEN; + } + zonefs_update_stats(inode, isize); truncate_setsize(inode, isize); zi->i_wpoffset = isize; @@ -604,7 +647,7 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, mutex_lock(&zi->i_truncate_mutex); if (i_size_read(inode) < iocb->ki_pos + size) { zonefs_update_stats(inode, iocb->ki_pos + size); - i_size_write(inode, iocb->ki_pos + size); + zonefs_i_size_write(inode, iocb->ki_pos + size); } mutex_unlock(&zi->i_truncate_mutex); } @@ -885,8 +928,128 @@ inode_unlock: return ret; } +static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + + if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN)) + return false; + + if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + return false; + + if (!(file->f_mode & FMODE_WRITE)) + return false; + + return true; +} + +static int zonefs_open_zone(struct inode *inode) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + int ret = 0; + + mutex_lock(&zi->i_truncate_mutex); + + zi->i_wr_refcnt++; + if (zi->i_wr_refcnt == 1) { + + if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) { + atomic_dec(&sbi->s_open_zones); + ret = -EBUSY; + goto unlock; + } + + if (i_size_read(inode) < zi->i_max_size) { + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + if (ret) { + zi->i_wr_refcnt--; + atomic_dec(&sbi->s_open_zones); + goto unlock; + } + zi->i_flags |= ZONEFS_ZONE_OPEN; + } + } + +unlock: + mutex_unlock(&zi->i_truncate_mutex); + + return ret; +} + +static int zonefs_file_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = generic_file_open(inode, file); + if (ret) + return ret; + + if (zonefs_file_use_exp_open(inode, file)) + return zonefs_open_zone(inode); + + return 0; +} + +static void zonefs_close_zone(struct inode *inode) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + int ret = 0; + + mutex_lock(&zi->i_truncate_mutex); + zi->i_wr_refcnt--; + if (!zi->i_wr_refcnt) { + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + + /* + * If the file zone is full, it is not open anymore and we only + * need to decrement the open count. + */ + if (!(zi->i_flags & ZONEFS_ZONE_OPEN)) + goto dec; + + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); + if (ret) { + __zonefs_io_error(inode, false); + /* + * Leaving zones explicitly open may lead to a state + * where most zones cannot be written (zone resources + * exhausted). So take preventive action by remounting + * read-only. + */ + if (zi->i_flags & ZONEFS_ZONE_OPEN && + !(sb->s_flags & SB_RDONLY)) { + zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n"); + sb->s_flags |= SB_RDONLY; + } + } + zi->i_flags &= ~ZONEFS_ZONE_OPEN; +dec: + atomic_dec(&sbi->s_open_zones); + } + mutex_unlock(&zi->i_truncate_mutex); +} + +static int zonefs_file_release(struct inode *inode, struct file *file) +{ + /* + * If we explicitly open a zone we must close it again as well, but the + * zone management operation can fail (either due to an IO error or as + * the zone has gone offline or read-only). Make sure we don't fail the + * close(2) for user-space. + */ + if (zonefs_file_use_exp_open(inode, file)) + zonefs_close_zone(inode); + + return 0; +} + static const struct file_operations zonefs_file_operations = { - .open = generic_file_open, + .open = zonefs_file_open, + .release = zonefs_file_release, .fsync = zonefs_file_fsync, .mmap = zonefs_file_mmap, .llseek = zonefs_file_llseek, @@ -910,6 +1073,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) inode_init_once(&zi->i_vnode); mutex_init(&zi->i_truncate_mutex); init_rwsem(&zi->i_mmap_sem); + zi->i_wr_refcnt = 0; return &zi->i_vnode; } @@ -960,7 +1124,7 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) enum { Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair, - Opt_err, + Opt_explicit_open, Opt_err, }; static const match_table_t tokens = { @@ -968,6 +1132,7 @@ static const match_table_t tokens = { { Opt_errors_zro, "errors=zone-ro"}, { Opt_errors_zol, "errors=zone-offline"}, { Opt_errors_repair, "errors=repair"}, + { Opt_explicit_open, "explicit-open" }, { Opt_err, NULL} }; @@ -1004,6 +1169,9 @@ static int zonefs_parse_options(struct super_block *sb, char *options) sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR; break; + case Opt_explicit_open: + sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN; + break; default: return -EINVAL; } @@ -1423,6 +1591,13 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_gid = GLOBAL_ROOT_GID; sbi->s_perm = 0640; sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO; + sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev); + atomic_set(&sbi->s_open_zones, 0); + if (!sbi->s_max_open_zones && + sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { + zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n"); + sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; + } ret = zonefs_read_super(sb); if (ret) diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 55b39970acb2..51141907097c 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -38,6 +38,8 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) return ZONEFS_ZTYPE_SEQ; } +#define ZONEFS_ZONE_OPEN (1 << 0) + /* * In-memory inode data. */ @@ -74,6 +76,10 @@ struct zonefs_inode_info { */ struct mutex i_truncate_mutex; struct rw_semaphore i_mmap_sem; + + /* guarded by i_truncate_mutex */ + unsigned int i_wr_refcnt; + unsigned int i_flags; }; static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) @@ -154,6 +160,7 @@ enum zonefs_features { #define ZONEFS_MNTOPT_ERRORS_MASK \ (ZONEFS_MNTOPT_ERRORS_RO | ZONEFS_MNTOPT_ERRORS_ZRO | \ ZONEFS_MNTOPT_ERRORS_ZOL | ZONEFS_MNTOPT_ERRORS_REPAIR) +#define ZONEFS_MNTOPT_EXPLICIT_OPEN (1 << 4) /* Explicit open/close of zones on open/close */ /* * In-memory Super block information. @@ -175,6 +182,9 @@ struct zonefs_sb_info { loff_t s_blocks; loff_t s_used_blocks; + + unsigned int s_max_open_zones; + atomic_t s_open_zones; }; static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) From 48bfd5c6fac10e10b7066bf4aeb919ed9a4e87d3 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 11 Sep 2020 17:56:51 +0900 Subject: [PATCH 088/168] zonefs: document the explicit-open mount option Document the newly introduced explicit-open mount option. Signed-off-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Signed-off-by: Damien Le Moal --- Documentation/filesystems/zonefs.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Documentation/filesystems/zonefs.rst b/Documentation/filesystems/zonefs.rst index 6c18bc8ce332..6b213fe9a33e 100644 --- a/Documentation/filesystems/zonefs.rst +++ b/Documentation/filesystems/zonefs.rst @@ -326,6 +326,21 @@ discover the amount of data that has been written to the zone. In the case of a read-only zone discovered at run-time, as indicated in the previous section. The size of the zone file is left unchanged from its last updated value. +A zoned block device (e.g. an NVMe Zoned Namespace device) may have limits on +the number of zones that can be active, that is, zones that are in the +implicit open, explicit open or closed conditions. This potential limitation +translates into a risk for applications to see write IO errors due to this +limit being exceeded if the zone of a file is not already active when a write +request is issued by the user. + +To avoid these potential errors, the "explicit-open" mount option forces zones +to be made active using an open zone command when a file is opened for writing +for the first time. If the zone open command succeeds, the application is then +guaranteed that write requests can be processed. Conversely, the +"explicit-open" mount option will result in a zone close command being issued +to the device on the last close() of a zone file if the zone is not full nor +empty. + Zonefs User Space Tools ======================= From f2aae745b82c842221f4f233051f9ac641790959 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 1 Jun 2020 17:10:36 +0800 Subject: [PATCH 089/168] ubifs: xattr: Fix some potential memory leaks while iterating entries Fix some potential memory leaks in error handling branches while iterating xattr entries. For example, function ubifs_tnc_remove_ino() forgets to free pxent if it exists. Similar problems also exist in ubifs_purge_xattrs(), ubifs_add_orphan() and ubifs_jnl_write_inode(). Signed-off-by: Zhihao Cheng Cc: Fixes: 1e51764a3c2ac05a2 ("UBIFS: add new flash file system") Signed-off-by: Richard Weinberger --- fs/ubifs/journal.c | 2 ++ fs/ubifs/orphan.c | 2 ++ fs/ubifs/tnc.c | 3 +++ fs/ubifs/xattr.c | 2 ++ 4 files changed, 9 insertions(+) diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 4a5b06f8d812..ed935aefe3e0 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -894,6 +894,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) if (err == -ENOENT) break; + kfree(pxent); goto out_release; } @@ -906,6 +907,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) ubifs_err(c, "dead directory entry '%s', error %d", xent->name, err); ubifs_ro_mode(c, err); + kfree(pxent); kfree(xent); goto out_release; } diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 2c294085ffed..0fb61956146d 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -173,6 +173,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) err = PTR_ERR(xent); if (err == -ENOENT) break; + kfree(pxent); return err; } @@ -182,6 +183,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) xattr_orphan = orphan_add(c, xattr_inum, orphan); if (IS_ERR(xattr_orphan)) { + kfree(pxent); kfree(xent); return PTR_ERR(xattr_orphan); } diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index f609f6cdde70..b120a00773f8 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -2885,6 +2885,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) err = PTR_ERR(xent); if (err == -ENOENT) break; + kfree(pxent); return err; } @@ -2898,6 +2899,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) fname_len(&nm) = le16_to_cpu(xent->nlen); err = ubifs_tnc_remove_nm(c, &key1, &nm); if (err) { + kfree(pxent); kfree(xent); return err; } @@ -2906,6 +2908,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) highest_ino_key(c, &key2, xattr_inum); err = ubifs_tnc_remove_range(c, &key1, &key2); if (err) { + kfree(pxent); kfree(xent); return err; } diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 9aefbb60074f..a0b9b349efe6 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -522,6 +522,7 @@ int ubifs_purge_xattrs(struct inode *host) xent->name, err); ubifs_ro_mode(c, err); kfree(pxent); + kfree(xent); return err; } @@ -531,6 +532,7 @@ int ubifs_purge_xattrs(struct inode *host) err = remove_xattr(c, host, xino, &nm); if (err) { kfree(pxent); + kfree(xent); iput(xino); ubifs_err(c, "cannot remove xattr, error %d", err); return err; From 58f6e78a65f1fcbf732f60a7478ccc99873ff3ba Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 1 Jun 2020 17:10:37 +0800 Subject: [PATCH 090/168] ubifs: dent: Fix some potential memory leaks while iterating entries Fix some potential memory leaks in error handling branches while iterating dent entries. For example, function dbg_check_dir() forgets to free pdent if it exists. Signed-off-by: Zhihao Cheng Cc: Fixes: 1e51764a3c2ac05a2 ("UBIFS: add new flash file system") Signed-off-by: Richard Weinberger --- fs/ubifs/debug.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 31288d8fa2ce..ebff43f8009c 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -1123,6 +1123,7 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) err = PTR_ERR(dent); if (err == -ENOENT) break; + kfree(pdent); return err; } From d005f8c6588efcfbe88099b6edafc6f58c84a9c1 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 1 Jun 2020 17:12:31 +0800 Subject: [PATCH 091/168] ubi: check kthread_should_stop() after the setting of task state A detach hung is possible when a race occurs between the detach process and the ubi background thread. The following sequences outline the race: ubi thread: if (list_empty(&ubi->works)... ubi detach: set_bit(KTHREAD_SHOULD_STOP, &kthread->flags) => by kthread_stop() wake_up_process() => ubi thread is still running, so 0 is returned ubi thread: set_current_state(TASK_INTERRUPTIBLE) schedule() => ubi thread will never be scheduled again ubi detach: wait_for_completion() => hung task! To fix that, we need to check kthread_should_stop() after we set the task state, so the ubi thread will either see the stop bit and exit or the task state is reset to runnable such that it isn't scheduled out indefinitely. Signed-off-by: Zhihao Cheng Cc: Fixes: 801c135ce73d5df1ca ("UBI: Unsorted Block Images") Reported-by: syzbot+853639d0cb16c31c7a14@syzkaller.appspotmail.com Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/wl.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c index 42cac572f82d..7847de75a74c 100644 --- a/drivers/mtd/ubi/wl.c +++ b/drivers/mtd/ubi/wl.c @@ -1639,6 +1639,19 @@ int ubi_thread(void *u) !ubi->thread_enabled || ubi_dbg_is_bgt_disabled(ubi)) { set_current_state(TASK_INTERRUPTIBLE); spin_unlock(&ubi->wl_lock); + + /* + * Check kthread_should_stop() after we set the task + * state to guarantee that we either see the stop bit + * and exit or the task state is reset to runnable such + * that it's not scheduled out indefinitely and detects + * the stop bit at kthread_should_stop(). + */ + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); + break; + } + schedule(); continue; } From dd7db149bcd914db8fdeb19cd5597c0740121bc7 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 17 Aug 2020 22:29:09 +0800 Subject: [PATCH 092/168] ubifs: ubifs_jnl_change_xattr: Remove assertion 'nlink > 0' for host inode Changing xattr of a temp file will trigger following assertion failed and make ubifs turn into readonly filesystem: ubifs_assert_failed [ubifs]: UBIFS assert failed: host->i_nlink > 0, in fs/ubifs/journal.c:1801 Reproducer: 1. fd = open(__O_TMPFILE) 2. fsetxattr(fd, key, value2, XATTR_CREATE) 3. fsetxattr(fd, key, value2, XATTR_REPLACE) Fix this by removing assertion 'nlink > 0' for host inode. Reported-by: Chengsong Ke Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/journal.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index ed935aefe3e0..cb1fa0c37322 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1800,7 +1800,6 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, u8 hash[UBIFS_HASH_ARR_SZ]; dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino); - ubifs_assert(c, host->i_nlink > 0); ubifs_assert(c, inode->i_nlink > 0); ubifs_assert(c, mutex_is_locked(&host_ui->ui_mutex)); From 121b8fcbf98892d5f26d8326a00d09e4cdb76369 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 28 Aug 2020 11:32:50 +0800 Subject: [PATCH 093/168] ubifs: setflags: Don't show error message when vfs_ioc_setflags_prepare() fails Following process will trigger ubifs_err: 1. useradd -m freg (Under root) 2. cd /home/freg && mkdir mp (Under freg) 3. mount -t ubifs /dev/ubi0_0 /home/freg/mp (Under root) 4. cd /home/freg && echo 123 > mp/a (Under root) 5. cd mp && chown freg a && chgrp freg a && chmod 777 a (Under root) 6. chattr +i a (Under freg) UBIFS error (ubi0:0 pid 1723): ubifs_ioctl [ubifs]: can't modify inode 65 attributes chattr: Operation not permitted while setting flags on a This is not an UBIFS problem, it was caused by task priviliage checking on file operations. Remove error message printing from kernel just like other filesystems (eg. ext4), since we already have enough information from userspace tools. Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/ioctl.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 3df9be2c684c..4363d85a3fd4 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -134,7 +134,6 @@ static int setflags(struct inode *inode, int flags) return err; out_unlock: - ubifs_err(c, "can't modify inode %lu attributes", inode->i_ino); mutex_unlock(&ui->ui_mutex); ubifs_release_budget(c, &req); return err; From f39d9f4cb902fb42fce39a090304d428f9ab3c30 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Mon, 14 Sep 2020 21:56:51 +0800 Subject: [PATCH 094/168] ubifs: Fix 'hash' kernel-doc warning in auth.c Fixes the following W=1 kernel build warning(s): fs/ubifs/auth.c:66: warning: Excess function parameter 'hash' description in 'ubifs_prepare_auth_node' Rename hash to inhash. Reported-by: Hulk Robot Signed-off-by: Wang Hai Signed-off-by: Richard Weinberger --- fs/ubifs/auth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index cc5c0abfd536..b93b3cd10bfd 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -54,7 +54,7 @@ static int ubifs_hash_calc_hmac(const struct ubifs_info *c, const u8 *hash, * ubifs_prepare_auth_node - Prepare an authentication node * @c: UBIFS file-system description object * @node: the node to calculate a hash for - * @hash: input hash of previous nodes + * @inhash: input hash of previous nodes * * This function prepares an authentication node for writing onto flash. * It creates a HMAC from the given input hash and writes it to the node. From 7889042b658085248581bfd30e9b1abf415c3c0b Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Mon, 14 Sep 2020 21:56:52 +0800 Subject: [PATCH 095/168] ubifs: Fix some kernel-doc warnings in gc.c Fixes the following W=1 kernel build warning(s): fs/ubifs/gc.c:70: warning: Excess function parameter 'buf' description in 'switch_gc_head' fs/ubifs/gc.c:70: warning: Excess function parameter 'len' description in 'switch_gc_head' fs/ubifs/gc.c:70: warning: Excess function parameter 'lnum' description in 'switch_gc_head' fs/ubifs/gc.c:70: warning: Excess function parameter 'offs' description in 'switch_gc_head' They're not in use. Remove them. Reported-by: Hulk Robot Signed-off-by: Wang Hai Signed-off-by: Richard Weinberger --- fs/ubifs/gc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 62cb3db44e6e..a4aaeea63893 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -57,10 +57,6 @@ /** * switch_gc_head - switch the garbage collection journal head. * @c: UBIFS file-system description object - * @buf: buffer to write - * @len: length of the buffer to write - * @lnum: LEB number written is returned here - * @offs: offset written is returned here * * This function switch the GC head to the next LEB which is reserved in * @c->gc_lnum. Returns %0 in case of success, %-EAGAIN if commit is required, From f279e5a491fd96c1347c3268dacc771dc796778e Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Mon, 14 Sep 2020 21:56:53 +0800 Subject: [PATCH 096/168] ubifs: Fix some kernel-doc warnings in replay.c Fixes the following W=1 kernel build warning(s): fs/ubifs/replay.c:942: warning: Excess function parameter 'ref_lnum' description in 'validate_ref' fs/ubifs/replay.c:942: warning: Excess function parameter 'ref_offs' description in 'validate_ref' They're not in use. Remove them. Reported-by: Hulk Robot Signed-off-by: Wang Hai Signed-off-by: Richard Weinberger --- fs/ubifs/replay.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index b69ffac7e415..2f8d8f4f411a 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -931,8 +931,6 @@ out: * validate_ref - validate a reference node. * @c: UBIFS file-system description object * @ref: the reference node to validate - * @ref_lnum: LEB number of the reference node - * @ref_offs: reference node offset * * This function returns %1 if a bud reference already exists for the LEB. %0 is * returned if the reference node is new, otherwise %-EINVAL is returned if From b30e2238b7ff0c3cbe27f7bf024d73e17842f5fc Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Mon, 14 Sep 2020 21:56:54 +0800 Subject: [PATCH 097/168] ubifs: Fix some kernel-doc warnings in tnc.c Fixes the following W=1 kernel build warning(s): fs/ubifs/tnc.c:3479: warning: Excess function parameter 'inum' description in 'dbg_check_inode_size' fs/ubifs/tnc.c:366: warning: Excess function parameter 'node' description in 'lnc_free' @inum in 'dbg_check_inode_size' should be @inode, fix it. @node in 'lnc_free' is not in use, Remove it. Reported-by: Hulk Robot Signed-off-by: Wang Hai Signed-off-by: Richard Weinberger --- fs/ubifs/tnc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index f609f6cdde70..de21625804a9 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -360,7 +360,6 @@ static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr, /** * lnc_free - remove a leaf node from the leaf node cache. * @zbr: zbranch of leaf node - * @node: leaf node */ static void lnc_free(struct ubifs_zbranch *zbr) { @@ -3466,7 +3465,7 @@ out_unlock: /** * dbg_check_inode_size - check if inode size is correct. * @c: UBIFS file-system description object - * @inum: inode number + * @inode: inode to check * @size: inode size * * This function makes sure that the inode size (@size) is correct and it does From b76f0ea013125358d1b4ca147a6f9b6883dd2493 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Mon, 21 Sep 2020 18:28:50 +0300 Subject: [PATCH 098/168] coccinelle: misc: add excluded_middle.cocci script Check for !A || A && B condition. It's equivalent to !A || B. Signed-off-by: Denis Efremov Signed-off-by: Julia Lawall --- scripts/coccinelle/misc/excluded_middle.cocci | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 scripts/coccinelle/misc/excluded_middle.cocci diff --git a/scripts/coccinelle/misc/excluded_middle.cocci b/scripts/coccinelle/misc/excluded_middle.cocci new file mode 100644 index 000000000000..ab28393e4843 --- /dev/null +++ b/scripts/coccinelle/misc/excluded_middle.cocci @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-only +/// +/// Condition !A || A && B is equivalent to !A || B. +/// +// Confidence: High +// Copyright: (C) 2020 Denis Efremov ISPRAS +// Options: --no-includes --include-headers + +virtual patch +virtual context +virtual org +virtual report + +@r depends on !patch@ +expression A, B; +position p; +@@ + +* !A || (A &&@p B) + +@depends on patch@ +expression A, B; +@@ + + !A || +- (A && B) ++ B + +@script:python depends on report@ +p << r.p; +@@ + +coccilib.report.print_report(p[0], "WARNING !A || A && B is equivalent to !A || B") + +@script:python depends on org@ +p << r.p; +@@ + +coccilib.org.print_todo(p[0], "WARNING !A || A && B is equivalent to !A || B") From e16a7c47d56b4eeee82be662014c145bce2380e5 Mon Sep 17 00:00:00 2001 From: Sumera Priyadarsini Date: Thu, 24 Sep 2020 16:56:08 +0530 Subject: [PATCH 099/168] scripts: coccicheck: Change default value for parallelism By default, coccicheck utilizes all available threads to implement parallelisation. However, when all available threads are used, a decrease in performance is noted. The elapsed time is minimum when at most one thread per core is used. For example, on benchmarking the semantic patch kfree.cocci for usb/serial using hyperfine, the outputs obtained for J=5 and J=2 are 1.32 and 1.90 times faster than those for J=10 and J=9 respectively for two separate runs. For the larger drivers/staging directory, minimium elapsed time is obtained for J=3 which is 1.86 times faster than that for J=12. The optimal J value does not exceed 6 in any of the test runs. The benchmarks are run on a machine with 6 cores, with 2 threads per core, i.e, 12 hyperthreads in all. To improve performance, modify coccicheck to use at most only one thread per core by default. Signed-off-by: Sumera Priyadarsini Signed-off-by: Julia Lawall --- scripts/coccicheck | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/coccicheck b/scripts/coccicheck index 85136f4fe970..6789751607f5 100755 --- a/scripts/coccicheck +++ b/scripts/coccicheck @@ -75,8 +75,13 @@ else OPTIONS="--dir $KBUILD_EXTMOD $COCCIINCLUDE" fi + # Use only one thread per core by default if hyperthreading is enabled + THREADS_PER_CORE=$(lscpu | grep "Thread(s) per core: " | tr -cd [:digit:]) if [ -z "$J" ]; then NPROC=$(getconf _NPROCESSORS_ONLN) + if [ $THREADS_PER_CORE -gt 1 -a $NPROC -gt 2 ] ; then + NPROC=$((NPROC/2)) + fi else NPROC="$J" fi From 78c7d49f55d8631b67c09f9bfbe8155211a9ea06 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Mon, 28 Sep 2020 20:58:59 +0200 Subject: [PATCH 100/168] ubifs: journal: Make sure to not dirty twice for auth nodes When removing the last reference of an inode the size of an auth node is already part of write_len. So we must not call ubifs_add_auth_dirt(). Call it only when needed. Cc: Cc: Sascha Hauer Cc: Kristof Havasi Fixes: 6a98bc4614de ("ubifs: Add authentication nodes to journal") Reported-and-tested-by: Kristof Havasi Reviewed-by: Sascha Hauer Signed-off-by: Richard Weinberger --- fs/ubifs/journal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index cb1fa0c37322..091c2ad8f211 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -938,8 +938,6 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) inode->i_ino); release_head(c, BASEHD); - ubifs_add_auth_dirt(c, lnum); - if (last_reference) { err = ubifs_tnc_remove_ino(c, inode->i_ino); if (err) @@ -949,6 +947,8 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) } else { union ubifs_key key; + ubifs_add_auth_dirt(c, lnum); + ino_key_init(c, &key, inode->i_ino); err = ubifs_tnc_add(c, &key, lnum, offs, ilen, hash); } From 44d8870f21529cfa8f50b503b5d949c6d46e6fc1 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 30 Sep 2020 19:00:46 +0300 Subject: [PATCH 101/168] coccinelle: api: add kvmalloc script Suggest kvmalloc, kvfree instead of opencoded patterns. Signed-off-by: Denis Efremov Signed-off-by: Julia Lawall --- scripts/coccinelle/api/kvmalloc.cocci | 256 ++++++++++++++++++++++++++ 1 file changed, 256 insertions(+) create mode 100644 scripts/coccinelle/api/kvmalloc.cocci diff --git a/scripts/coccinelle/api/kvmalloc.cocci b/scripts/coccinelle/api/kvmalloc.cocci new file mode 100644 index 000000000000..c30dab718a49 --- /dev/null +++ b/scripts/coccinelle/api/kvmalloc.cocci @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0-only +/// +/// Find if/else condition with kmalloc/vmalloc calls. +/// Suggest to use kvmalloc instead. Same for kvfree. +/// +// Confidence: High +// Copyright: (C) 2020 Denis Efremov ISPRAS +// Options: --no-includes --include-headers +// + +virtual patch +virtual report +virtual org +virtual context + +@initialize:python@ +@@ +filter = frozenset(['kvfree']) + +def relevant(p): + return not (filter & {el.current_element for el in p}) + +@kvmalloc depends on !patch@ +expression E, E1, size; +identifier flags; +binary operator cmp = {<=, <, ==, >, >=}; +identifier x; +type T; +position p; +@@ + +( +* if (size cmp E1 || ...)@p { + ... +* E = \(kmalloc\|kzalloc\|kcalloc\|kmalloc_node\|kzalloc_node\| +* kmalloc_array\|kmalloc_array_node\|kcalloc_node\) +* (..., size, \(flags\|GFP_KERNEL\|\(GFP_KERNEL\|flags\)|__GFP_NOWARN\), ...) + ... + } else { + ... +* E = \(vmalloc\|vzalloc\|vmalloc_node\|vzalloc_node\)(..., size, ...) + ... + } +| +* E = \(kmalloc\|kzalloc\|kcalloc\|kmalloc_node\|kzalloc_node\| +* kmalloc_array\|kmalloc_array_node\|kcalloc_node\) +* (..., size, \(flags\|GFP_KERNEL\|\(GFP_KERNEL\|flags\)|__GFP_NOWARN\), ...) + ... when != E = E1 + when != size = E1 + when any +* if (E == NULL)@p { + ... +* E = \(vmalloc\|vzalloc\|vmalloc_node\|vzalloc_node\)(..., size, ...) + ... + } +| +* T x = \(kmalloc\|kzalloc\|kcalloc\|kmalloc_node\|kzalloc_node\| +* kmalloc_array\|kmalloc_array_node\|kcalloc_node\) +* (..., size, \(flags\|GFP_KERNEL\|\(GFP_KERNEL\|flags\)|__GFP_NOWARN\), ...); + ... when != x = E1 + when != size = E1 + when any +* if (x == NULL)@p { + ... +* x = \(vmalloc\|vzalloc\|vmalloc_node\|vzalloc_node\)(..., size, ...) + ... + } +) + +@kvfree depends on !patch@ +expression E; +position p : script:python() { relevant(p) }; +@@ + +* if (is_vmalloc_addr(E))@p { + ... +* vfree(E) + ... + } else { + ... when != krealloc(E, ...) + when any +* \(kfree\|kzfree\)(E) + ... + } + +@depends on patch@ +expression E, E1, size, node; +binary operator cmp = {<=, <, ==, >, >=}; +identifier flags, x; +type T; +@@ + +( +- if (size cmp E1) +- E = kmalloc(size, flags); +- else +- E = vmalloc(size); ++ E = kvmalloc(size, flags); +| +- if (size cmp E1) +- E = kmalloc(size, \(GFP_KERNEL\|GFP_KERNEL|__GFP_NOWARN\)); +- else +- E = vmalloc(size); ++ E = kvmalloc(size, GFP_KERNEL); +| +- E = kmalloc(size, flags | __GFP_NOWARN); +- if (E == NULL) +- E = vmalloc(size); ++ E = kvmalloc(size, flags); +| +- E = kmalloc(size, \(GFP_KERNEL\|GFP_KERNEL|__GFP_NOWARN\)); +- if (E == NULL) +- E = vmalloc(size); ++ E = kvmalloc(size, GFP_KERNEL); +| +- T x = kmalloc(size, flags | __GFP_NOWARN); +- if (x == NULL) +- x = vmalloc(size); ++ T x = kvmalloc(size, flags); +| +- T x = kmalloc(size, \(GFP_KERNEL\|GFP_KERNEL|__GFP_NOWARN\)); +- if (x == NULL) +- x = vmalloc(size); ++ T x = kvmalloc(size, GFP_KERNEL); +| +- if (size cmp E1) +- E = kzalloc(size, flags); +- else +- E = vzalloc(size); ++ E = kvzalloc(size, flags); +| +- if (size cmp E1) +- E = kzalloc(size, \(GFP_KERNEL\|GFP_KERNEL|__GFP_NOWARN\)); +- else +- E = vzalloc(size); ++ E = kvzalloc(size, GFP_KERNEL); +| +- E = kzalloc(size, flags | __GFP_NOWARN); +- if (E == NULL) +- E = vzalloc(size); ++ E = kvzalloc(size, flags); +| +- E = kzalloc(size, \(GFP_KERNEL\|GFP_KERNEL|__GFP_NOWARN\)); +- if (E == NULL) +- E = vzalloc(size); ++ E = kvzalloc(size, GFP_KERNEL); +| +- T x = kzalloc(size, flags | __GFP_NOWARN); +- if (x == NULL) +- x = vzalloc(size); ++ T x = kvzalloc(size, flags); +| +- T x = kzalloc(size, \(GFP_KERNEL\|GFP_KERNEL|__GFP_NOWARN\)); +- if (x == NULL) +- x = vzalloc(size); ++ T x = kvzalloc(size, GFP_KERNEL); +| +- if (size cmp E1) +- E = kmalloc_node(size, flags, node); +- else +- E = vmalloc_node(size, node); ++ E = kvmalloc_node(size, flags, node); +| +- if (size cmp E1) +- E = kmalloc_node(size, \(GFP_KERNEL\|GFP_KERNEL|__GFP_NOWARN\), node); +- else +- E = vmalloc_node(size, node); ++ E = kvmalloc_node(size, GFP_KERNEL, node); +| +- E = kmalloc_node(size, flags | __GFP_NOWARN, node); +- if (E == NULL) +- E = vmalloc_node(size, node); ++ E = kvmalloc_node(size, flags, node); +| +- E = kmalloc_node(size, \(GFP_KERNEL\|GFP_KERNEL|__GFP_NOWARN\), node); +- if (E == NULL) +- E = vmalloc_node(size, node); ++ E = kvmalloc_node(size, GFP_KERNEL, node); +| +- T x = kmalloc_node(size, flags | __GFP_NOWARN, node); +- if (x == NULL) +- x = vmalloc_node(size, node); ++ T x = kvmalloc_node(size, flags, node); +| +- T x = kmalloc_node(size, \(GFP_KERNEL\|GFP_KERNEL|__GFP_NOWARN\), node); +- if (x == NULL) +- x = vmalloc_node(size, node); ++ T x = kvmalloc_node(size, GFP_KERNEL, node); +| +- if (size cmp E1) +- E = kvzalloc_node(size, flags, node); +- else +- E = vzalloc_node(size, node); ++ E = kvzalloc_node(size, flags, node); +| +- if (size cmp E1) +- E = kvzalloc_node(size, \(GFP_KERNEL\|GFP_KERNEL|__GFP_NOWARN\), node); +- else +- E = vzalloc_node(size, node); ++ E = kvzalloc_node(size, GFP_KERNEL, node); +| +- E = kvzalloc_node(size, flags | __GFP_NOWARN, node); +- if (E == NULL) +- E = vzalloc_node(size, node); ++ E = kvzalloc_node(size, flags, node); +| +- E = kvzalloc_node(size, \(GFP_KERNEL\|GFP_KERNEL|__GFP_NOWARN\), node); +- if (E == NULL) +- E = vzalloc_node(size, node); ++ E = kvzalloc_node(size, GFP_KERNEL, node); +| +- T x = kvzalloc_node(size, flags | __GFP_NOWARN, node); +- if (x == NULL) +- x = vzalloc_node(size, node); ++ T x = kvzalloc_node(size, flags, node); +| +- T x = kvzalloc_node(size, \(GFP_KERNEL\|GFP_KERNEL|__GFP_NOWARN\), node); +- if (x == NULL) +- x = vzalloc_node(size, node); ++ T x = kvzalloc_node(size, GFP_KERNEL, node); +) + +@depends on patch@ +expression E; +position p : script:python() { relevant(p) }; +@@ + +- if (is_vmalloc_addr(E))@p +- vfree(E); +- else +- kfree(E); ++ kvfree(E); + +@script: python depends on report@ +p << kvmalloc.p; +@@ + +coccilib.report.print_report(p[0], "WARNING opportunity for kvmalloc") + +@script: python depends on org@ +p << kvmalloc.p; +@@ + +coccilib.org.print_todo(p[0], "WARNING opportunity for kvmalloc") + +@script: python depends on report@ +p << kvfree.p; +@@ + +coccilib.report.print_report(p[0], "WARNING opportunity for kvfree") + +@script: python depends on org@ +p << kvfree.p; +@@ + +coccilib.org.print_todo(p[0], "WARNING opportunity for kvfree") From 7b36c1398fb63f9c38cc83dc75f143d2e5995062 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Mon, 21 Sep 2020 20:49:20 +0300 Subject: [PATCH 102/168] coccinelle: misc: add flexible_array.cocci script One-element and zero-length arrays are deprecated [1]. Kernel code should always use "flexible array members" instead, except for existing uapi definitions. The script warns about one-element and zero-length arrays in structs. [1] commit 68e4cd17e218 ("docs: deprecated.rst: Add zero-length and one-element arrays") Cc: Kees Cook Cc: Gustavo A. R. Silva Signed-off-by: Denis Efremov Signed-off-by: Julia Lawall --- scripts/coccinelle/misc/flexible_array.cocci | 88 ++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 scripts/coccinelle/misc/flexible_array.cocci diff --git a/scripts/coccinelle/misc/flexible_array.cocci b/scripts/coccinelle/misc/flexible_array.cocci new file mode 100644 index 000000000000..947fbaff82a9 --- /dev/null +++ b/scripts/coccinelle/misc/flexible_array.cocci @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0-only +/// +/// Zero-length and one-element arrays are deprecated, see +/// Documentation/process/deprecated.rst +/// Flexible-array members should be used instead. +/// +// +// Confidence: High +// Copyright: (C) 2020 Denis Efremov ISPRAS. +// Comments: +// Options: --no-includes --include-headers + +virtual context +virtual report +virtual org +virtual patch + +@initialize:python@ +@@ +def relevant(positions): + for p in positions: + if "uapi" in p.file: + return False + return True + +@r depends on !patch@ +identifier name, array; +type T; +position p : script:python() { relevant(p) }; +@@ + +( + struct name { + ... +* T array@p[\(0\|1\)]; + }; +| + struct { + ... +* T array@p[\(0\|1\)]; + }; +| + union name { + ... +* T array@p[\(0\|1\)]; + }; +| + union { + ... +* T array@p[\(0\|1\)]; + }; +) + +@depends on patch@ +identifier name, array; +type T; +position p : script:python() { relevant(p) }; +@@ + +( + struct name { + ... + T array@p[ +- 0 + ]; + }; +| + struct { + ... + T array@p[ +- 0 + ]; + }; +) + +@script: python depends on report@ +p << r.p; +@@ + +msg = "WARNING use flexible-array member instead (https://www.kernel.org/doc/html/latest/process/deprecated.html#zero-length-and-one-element-arrays)" +coccilib.report.print_report(p[0], msg) + +@script: python depends on org@ +p << r.p; +@@ + +msg = "WARNING use flexible-array member instead (https://www.kernel.org/doc/html/latest/process/deprecated.html#zero-length-and-one-element-arrays)" +coccilib.org.print_todo(p[0], msg) From 2a41fc52c21b6ece49921716bd289bfebaadcc04 Mon Sep 17 00:00:00 2001 From: David Gow Date: Thu, 10 Sep 2020 21:24:04 -0700 Subject: [PATCH 103/168] Documentation: kunit: Add naming guidelines As discussed in [1], KUnit tests have hitherto not had a particularly consistent naming scheme. This adds documentation outlining how tests and test suites should be named, including how those names should be used in Kconfig entries and filenames. [1]: https://lore.kernel.org/linux-kselftest/202006141005.BA19A9D3@keescook/t/#u Signed-off-by: David Gow Reviewed-by: Kees Cook Reviewed-by: Brendan Higgins Reviewed-by: Marco Elver Reviewed-by: Tim Bird Signed-off-by: Shuah Khan --- Documentation/dev-tools/kunit/index.rst | 1 + Documentation/dev-tools/kunit/style.rst | 205 ++++++++++++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 Documentation/dev-tools/kunit/style.rst diff --git a/Documentation/dev-tools/kunit/index.rst b/Documentation/dev-tools/kunit/index.rst index e93606ecfb01..c234a3ab3c34 100644 --- a/Documentation/dev-tools/kunit/index.rst +++ b/Documentation/dev-tools/kunit/index.rst @@ -11,6 +11,7 @@ KUnit - Unit Testing for the Linux Kernel usage kunit-tool api/index + style faq What is KUnit? diff --git a/Documentation/dev-tools/kunit/style.rst b/Documentation/dev-tools/kunit/style.rst new file mode 100644 index 000000000000..da1d6f0ed6bc --- /dev/null +++ b/Documentation/dev-tools/kunit/style.rst @@ -0,0 +1,205 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================== +Test Style and Nomenclature +=========================== + +To make finding, writing, and using KUnit tests as simple as possible, it's +strongly encouraged that they are named and written according to the guidelines +below. While it's possible to write KUnit tests which do not follow these rules, +they may break some tooling, may conflict with other tests, and may not be run +automatically by testing systems. + +It's recommended that you only deviate from these guidelines when: + +1. Porting tests to KUnit which are already known with an existing name, or +2. Writing tests which would cause serious problems if automatically run (e.g., + non-deterministically producing false positives or negatives, or taking an + extremely long time to run). + +Subsystems, Suites, and Tests +============================= + +In order to make tests as easy to find as possible, they're grouped into suites +and subsystems. A test suite is a group of tests which test a related area of +the kernel, and a subsystem is a set of test suites which test different parts +of the same kernel subsystem or driver. + +Subsystems +---------- + +Every test suite must belong to a subsystem. A subsystem is a collection of one +or more KUnit test suites which test the same driver or part of the kernel. A +rule of thumb is that a test subsystem should match a single kernel module. If +the code being tested can't be compiled as a module, in many cases the subsystem +should correspond to a directory in the source tree or an entry in the +MAINTAINERS file. If unsure, follow the conventions set by tests in similar +areas. + +Test subsystems should be named after the code being tested, either after the +module (wherever possible), or after the directory or files being tested. Test +subsystems should be named to avoid ambiguity where necessary. + +If a test subsystem name has multiple components, they should be separated by +underscores. *Do not* include "test" or "kunit" directly in the subsystem name +unless you are actually testing other tests or the kunit framework itself. + +Example subsystems could be: + +``ext4`` + Matches the module and filesystem name. +``apparmor`` + Matches the module name and LSM name. +``kasan`` + Common name for the tool, prominent part of the path ``mm/kasan`` +``snd_hda_codec_hdmi`` + Has several components (``snd``, ``hda``, ``codec``, ``hdmi``) separated by + underscores. Matches the module name. + +Avoid names like these: + +``linear-ranges`` + Names should use underscores, not dashes, to separate words. Prefer + ``linear_ranges``. +``qos-kunit-test`` + As well as using underscores, this name should not have "kunit-test" as a + suffix, and ``qos`` is ambiguous as a subsystem name. ``power_qos`` would be a + better name. +``pc_parallel_port`` + The corresponding module name is ``parport_pc``, so this subsystem should also + be named ``parport_pc``. + +.. note:: + The KUnit API and tools do not explicitly know about subsystems. They're + simply a way of categorising test suites and naming modules which + provides a simple, consistent way for humans to find and run tests. This + may change in the future, though. + +Suites +------ + +KUnit tests are grouped into test suites, which cover a specific area of +functionality being tested. Test suites can have shared initialisation and +shutdown code which is run for all tests in the suite. +Not all subsystems will need to be split into multiple test suites (e.g. simple drivers). + +Test suites are named after the subsystem they are part of. If a subsystem +contains several suites, the specific area under test should be appended to the +subsystem name, separated by an underscore. + +In the event that there are multiple types of test using KUnit within a +subsystem (e.g., both unit tests and integration tests), they should be put into +separate suites, with the type of test as the last element in the suite name. +Unless these tests are actually present, avoid using ``_test``, ``_unittest`` or +similar in the suite name. + +The full test suite name (including the subsystem name) should be specified as +the ``.name`` member of the ``kunit_suite`` struct, and forms the base for the +module name (see below). + +Example test suites could include: + +``ext4_inode`` + Part of the ``ext4`` subsystem, testing the ``inode`` area. +``kunit_try_catch`` + Part of the ``kunit`` implementation itself, testing the ``try_catch`` area. +``apparmor_property_entry`` + Part of the ``apparmor`` subsystem, testing the ``property_entry`` area. +``kasan`` + The ``kasan`` subsystem has only one suite, so the suite name is the same as + the subsystem name. + +Avoid names like: + +``ext4_ext4_inode`` + There's no reason to state the subsystem twice. +``property_entry`` + The suite name is ambiguous without the subsystem name. +``kasan_integration_test`` + Because there is only one suite in the ``kasan`` subsystem, the suite should + just be called ``kasan``. There's no need to redundantly add + ``integration_test``. Should a separate test suite with, for example, unit + tests be added, then that suite could be named ``kasan_unittest`` or similar. + +Test Cases +---------- + +Individual tests consist of a single function which tests a constrained +codepath, property, or function. In the test output, individual tests' results +will show up as subtests of the suite's results. + +Tests should be named after what they're testing. This is often the name of the +function being tested, with a description of the input or codepath being tested. +As tests are C functions, they should be named and written in accordance with +the kernel coding style. + +.. note:: + As tests are themselves functions, their names cannot conflict with + other C identifiers in the kernel. This may require some creative + naming. It's a good idea to make your test functions `static` to avoid + polluting the global namespace. + +Example test names include: + +``unpack_u32_with_null_name`` + Tests the ``unpack_u32`` function when a NULL name is passed in. +``test_list_splice`` + Tests the ``list_splice`` macro. It has the prefix ``test_`` to avoid a + name conflict with the macro itself. + + +Should it be necessary to refer to a test outside the context of its test suite, +the *fully-qualified* name of a test should be the suite name followed by the +test name, separated by a colon (i.e. ``suite:test``). + +Test Kconfig Entries +==================== + +Every test suite should be tied to a Kconfig entry. + +This Kconfig entry must: + +* be named ``CONFIG__KUNIT_TEST``: where is the name of the test + suite. +* be listed either alongside the config entries for the driver/subsystem being + tested, or be under [Kernel Hacking]→[Kernel Testing and Coverage] +* depend on ``CONFIG_KUNIT`` +* be visible only if ``CONFIG_KUNIT_ALL_TESTS`` is not enabled. +* have a default value of ``CONFIG_KUNIT_ALL_TESTS``. +* have a brief description of KUnit in the help text + +Unless there's a specific reason not to (e.g. the test is unable to be built as +a module), Kconfig entries for tests should be tristate. + +An example Kconfig entry: + +.. code-block:: none + + config FOO_KUNIT_TEST + tristate "KUnit test for foo" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This builds unit tests for foo. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation in Documentation/dev-tools/kunit + + If unsure, say N + + +Test File and Module Names +========================== + +KUnit tests can often be compiled as a module. These modules should be named +after the test suite, followed by ``_test``. If this is likely to conflict with +non-KUnit tests, the suffix ``_kunit`` can also be used. + +The easiest way of achieving this is to name the file containing the test suite +``_test.c`` (or, as above, ``_kunit.c``). This file should be +placed next to the code under test. + +If the suite name contains some or all of the name of the test's parent +directory, it may make sense to modify the source filename to reduce redundancy. +For example, a ``foo_firmware`` suite could be in the ``foo/firmware_test.c`` +file. From 90a025a859a3ac4dfe3db62edf21070a90e98766 Mon Sep 17 00:00:00 2001 From: Brendan Higgins Date: Tue, 4 Aug 2020 13:47:41 -0700 Subject: [PATCH 104/168] vmlinux.lds.h: add linker section for KUnit test suites Add a linker section where KUnit can put references to its test suites. This patch is the first step in transitioning to dispatching all KUnit tests from a centralized executor rather than having each as its own separate late_initcall. Co-developed-by: Iurii Zaikin Signed-off-by: Iurii Zaikin Signed-off-by: Brendan Higgins Reviewed-by: Stephen Boyd Signed-off-by: Shuah Khan --- include/asm-generic/vmlinux.lds.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 5430febd34be..31e08674b542 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -717,7 +717,8 @@ THERMAL_TABLE(governor) \ EARLYCON_TABLE() \ LSM_TABLE() \ - EARLY_LSM_TABLE() + EARLY_LSM_TABLE() \ + KUNIT_TABLE() #define INIT_TEXT \ *(.init.text .init.text.*) \ @@ -909,6 +910,13 @@ KEEP(*(.con_initcall.init)) \ __con_initcall_end = .; +/* Alignment must be consistent with (kunit_suite *) in include/kunit/test.h */ +#define KUNIT_TABLE() \ + . = ALIGN(8); \ + __kunit_suites_start = .; \ + KEEP(*(.kunit_test_suites)) \ + __kunit_suites_end = .; + #ifdef CONFIG_BLK_DEV_INITRD #define INIT_RAM_FS \ . = ALIGN(4); \ From aac35468ca20a3a0e75a24c13c0e31610727f120 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Tue, 4 Aug 2020 13:47:42 -0700 Subject: [PATCH 105/168] kunit: test: create a single centralized executor for all tests Add a centralized executor to dispatch tests rather than relying on late_initcall to schedule each test suite separately. Centralized execution is for built-in tests only; modules will execute tests when loaded. Signed-off-by: Alan Maguire Co-developed-by: Iurii Zaikin Signed-off-by: Iurii Zaikin Co-developed-by: Brendan Higgins Signed-off-by: Brendan Higgins Reviewed-by: Stephen Boyd Reviewed-by: Kees Cook Signed-off-by: Shuah Khan --- include/kunit/test.h | 67 +++++++++++++++++++++++++++++--------------- lib/kunit/Makefile | 3 +- lib/kunit/executor.c | 28 ++++++++++++++++++ lib/kunit/test.c | 2 +- 4 files changed, 76 insertions(+), 24 deletions(-) create mode 100644 lib/kunit/executor.c diff --git a/include/kunit/test.h b/include/kunit/test.h index 59f3144f009a..d2d261f58259 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -233,7 +233,7 @@ size_t kunit_suite_num_test_cases(struct kunit_suite *suite); unsigned int kunit_test_case_num(struct kunit_suite *suite, struct kunit_case *test_case); -int __kunit_test_suites_init(struct kunit_suite **suites); +int __kunit_test_suites_init(struct kunit_suite * const * const suites); void __kunit_test_suites_exit(struct kunit_suite **suites); @@ -246,34 +246,57 @@ void __kunit_test_suites_exit(struct kunit_suite **suites); * Registers @suites_list with the test framework. See &struct kunit_suite for * more information. * - * When builtin, KUnit tests are all run as late_initcalls; this means - * that they cannot test anything where tests must run at a different init - * phase. One significant restriction resulting from this is that KUnit - * cannot reliably test anything that is initialize in the late_init phase; - * another is that KUnit is useless to test things that need to be run in - * an earlier init phase. - * - * An alternative is to build the tests as a module. Because modules - * do not support multiple late_initcall()s, we need to initialize an - * array of suites for a module. - * - * TODO(brendanhiggins@google.com): Don't run all KUnit tests as - * late_initcalls. I have some future work planned to dispatch all KUnit - * tests from the same place, and at the very least to do so after - * everything else is definitely initialized. + * If a test suite is built-in, module_init() gets translated into + * an initcall which we don't want as the idea is that for builtins + * the executor will manage execution. So ensure we do not define + * module_{init|exit} functions for the builtin case when registering + * suites via kunit_test_suites() below. */ -#define kunit_test_suites(suites_list...) \ - static struct kunit_suite *suites[] = {suites_list, NULL}; \ - static int kunit_test_suites_init(void) \ +#ifdef MODULE +#define kunit_test_suites_for_module(__suites) \ + static int __init kunit_test_suites_init(void) \ { \ - return __kunit_test_suites_init(suites); \ + return __kunit_test_suites_init(__suites); \ } \ - late_initcall(kunit_test_suites_init); \ + module_init(kunit_test_suites_init); \ + \ static void __exit kunit_test_suites_exit(void) \ { \ - return __kunit_test_suites_exit(suites); \ + return __kunit_test_suites_exit(__suites); \ } \ module_exit(kunit_test_suites_exit) +#else +#define kunit_test_suites_for_module(__suites) +#endif /* MODULE */ + +#define __kunit_test_suites(unique_array, unique_suites, ...) \ + static struct kunit_suite *unique_array[] = { __VA_ARGS__, NULL }; \ + kunit_test_suites_for_module(unique_array); \ + static struct kunit_suite **unique_suites \ + __used __section(.kunit_test_suites) = unique_array + +/** + * kunit_test_suites() - used to register one or more &struct kunit_suite + * with KUnit. + * + * @suites: a statically allocated list of &struct kunit_suite. + * + * Registers @suites with the test framework. See &struct kunit_suite for + * more information. + * + * When builtin, KUnit tests are all run via executor; this is done + * by placing the array of struct kunit_suite * in the .kunit_test_suites + * ELF section. + * + * An alternative is to build the tests as a module. Because modules do not + * support multiple initcall()s, we need to initialize an array of suites for a + * module. + * + */ +#define kunit_test_suites(...) \ + __kunit_test_suites(__UNIQUE_ID(array), \ + __UNIQUE_ID(suites), \ + __VA_ARGS__) #define kunit_test_suite(suite) kunit_test_suites(&suite) diff --git a/lib/kunit/Makefile b/lib/kunit/Makefile index 724b94311ca3..c49f4ffb6273 100644 --- a/lib/kunit/Makefile +++ b/lib/kunit/Makefile @@ -3,7 +3,8 @@ obj-$(CONFIG_KUNIT) += kunit.o kunit-objs += test.o \ string-stream.o \ assert.o \ - try-catch.o + try-catch.o \ + executor.o ifeq ($(CONFIG_KUNIT_DEBUGFS),y) kunit-objs += debugfs.o diff --git a/lib/kunit/executor.c b/lib/kunit/executor.c new file mode 100644 index 000000000000..7015e7328dce --- /dev/null +++ b/lib/kunit/executor.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +/* + * These symbols point to the .kunit_test_suites section and are defined in + * include/asm-generic/vmlinux.lds.h, and consequently must be extern. + */ +extern struct kunit_suite * const * const __kunit_suites_start[]; +extern struct kunit_suite * const * const __kunit_suites_end[]; + +#if IS_BUILTIN(CONFIG_KUNIT) + +static int kunit_run_all_tests(void) +{ + struct kunit_suite * const * const *suites; + + for (suites = __kunit_suites_start; + suites < __kunit_suites_end; + suites++) + __kunit_test_suites_init(*suites); + + return 0; +} + +late_initcall(kunit_run_all_tests); + +#endif /* IS_BUILTIN(CONFIG_KUNIT) */ diff --git a/lib/kunit/test.c b/lib/kunit/test.c index c36037200310..3fd89f91937f 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -381,7 +381,7 @@ static void kunit_init_suite(struct kunit_suite *suite) kunit_debugfs_create_suite(suite); } -int __kunit_test_suites_init(struct kunit_suite **suites) +int __kunit_test_suites_init(struct kunit_suite * const * const suites) { unsigned int i; From 8c0d884986ba22f1020be9c02e41c030890ee8f2 Mon Sep 17 00:00:00 2001 From: Brendan Higgins Date: Tue, 4 Aug 2020 13:47:43 -0700 Subject: [PATCH 106/168] init: main: add KUnit to kernel init Although we have not seen any actual examples where KUnit doesn't work because it runs in the late init phase of the kernel, it has been a concern for some time that this could potentially be an issue in the future. So, remove KUnit from init calls entirely, instead call directly from kernel_init() so that KUnit runs after late init. Co-developed-by: Alan Maguire Signed-off-by: Alan Maguire Signed-off-by: Brendan Higgins Reviewed-by: Stephen Boyd Reviewed-by: Kees Cook Reviewed-by: Luis Chamberlain Signed-off-by: Shuah Khan --- include/kunit/test.h | 9 +++++++++ init/main.c | 4 ++++ lib/kunit/executor.c | 4 +--- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/include/kunit/test.h b/include/kunit/test.h index d2d261f58259..7ea24466e49c 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -237,6 +237,15 @@ int __kunit_test_suites_init(struct kunit_suite * const * const suites); void __kunit_test_suites_exit(struct kunit_suite **suites); +#if IS_BUILTIN(CONFIG_KUNIT) +int kunit_run_all_tests(void); +#else +static inline int kunit_run_all_tests(void) +{ + return 0; +} +#endif /* IS_BUILTIN(CONFIG_KUNIT) */ + /** * kunit_test_suites() - used to register one or more &struct kunit_suite * with KUnit. diff --git a/init/main.c b/init/main.c index ae78fb68d231..232c8df465ee 100644 --- a/init/main.c +++ b/init/main.c @@ -107,6 +107,8 @@ #define CREATE_TRACE_POINTS #include +#include + static int kernel_init(void *); extern void init_IRQ(void); @@ -1511,6 +1513,8 @@ static noinline void __init kernel_init_freeable(void) do_basic_setup(); + kunit_run_all_tests(); + console_on_rootfs(); /* diff --git a/lib/kunit/executor.c b/lib/kunit/executor.c index 7015e7328dce..4aab7f70a88c 100644 --- a/lib/kunit/executor.c +++ b/lib/kunit/executor.c @@ -11,7 +11,7 @@ extern struct kunit_suite * const * const __kunit_suites_end[]; #if IS_BUILTIN(CONFIG_KUNIT) -static int kunit_run_all_tests(void) +int kunit_run_all_tests(void) { struct kunit_suite * const * const *suites; @@ -23,6 +23,4 @@ static int kunit_run_all_tests(void) return 0; } -late_initcall(kunit_run_all_tests); - #endif /* IS_BUILTIN(CONFIG_KUNIT) */ From 45dcbb6f5ef78b0a9c1b91bea2f6f227642a65aa Mon Sep 17 00:00:00 2001 From: Brendan Higgins Date: Tue, 4 Aug 2020 13:47:44 -0700 Subject: [PATCH 107/168] kunit: test: add test plan to KUnit TAP format TAP 14 allows an optional test plan to be emitted before the start of the start of testing[1]; this is valuable because it makes it possible for a test harness to detect whether the number of tests run matches the number of tests expected to be run, ensuring that no tests silently failed. Link[1]: https://github.com/isaacs/testanything.github.io/blob/tap14/tap-version-14-specification.md#the-plan Signed-off-by: Brendan Higgins Reviewed-by: Stephen Boyd Signed-off-by: Shuah Khan --- lib/kunit/executor.c | 17 +++++ lib/kunit/test.c | 11 --- tools/testing/kunit/kunit_parser.py | 76 +++++++++++++++---- .../test_is_test_passed-all_passed.log | 1 + .../test_data/test_is_test_passed-crash.log | 1 + .../test_data/test_is_test_passed-failure.log | 1 + 6 files changed, 82 insertions(+), 25 deletions(-) diff --git a/lib/kunit/executor.c b/lib/kunit/executor.c index 4aab7f70a88c..a95742a4ece7 100644 --- a/lib/kunit/executor.c +++ b/lib/kunit/executor.c @@ -11,10 +11,27 @@ extern struct kunit_suite * const * const __kunit_suites_end[]; #if IS_BUILTIN(CONFIG_KUNIT) +static void kunit_print_tap_header(void) +{ + struct kunit_suite * const * const *suites, * const *subsuite; + int num_of_suites = 0; + + for (suites = __kunit_suites_start; + suites < __kunit_suites_end; + suites++) + for (subsuite = *suites; *subsuite != NULL; subsuite++) + num_of_suites++; + + pr_info("TAP version 14\n"); + pr_info("1..%d\n", num_of_suites); +} + int kunit_run_all_tests(void) { struct kunit_suite * const * const *suites; + kunit_print_tap_header(); + for (suites = __kunit_suites_start; suites < __kunit_suites_end; suites++) diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 3fd89f91937f..de07876b6601 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -20,16 +20,6 @@ static void kunit_set_failure(struct kunit *test) WRITE_ONCE(test->success, false); } -static void kunit_print_tap_version(void) -{ - static bool kunit_has_printed_tap_version; - - if (!kunit_has_printed_tap_version) { - pr_info("TAP version 14\n"); - kunit_has_printed_tap_version = true; - } -} - /* * Append formatted message to log, size of which is limited to * KUNIT_LOG_SIZE bytes (including null terminating byte). @@ -69,7 +59,6 @@ EXPORT_SYMBOL_GPL(kunit_suite_num_test_cases); static void kunit_print_subtest_start(struct kunit_suite *suite) { - kunit_print_tap_version(); kunit_log(KERN_INFO, suite, KUNIT_SUBTEST_INDENT "# Subtest: %s", suite->name); kunit_log(KERN_INFO, suite, KUNIT_SUBTEST_INDENT "1..%zd", diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py index f13e0c0d6663..8019e3dd4c32 100644 --- a/tools/testing/kunit/kunit_parser.py +++ b/tools/testing/kunit/kunit_parser.py @@ -45,10 +45,11 @@ class TestStatus(Enum): FAILURE = auto() TEST_CRASHED = auto() NO_TESTS = auto() + FAILURE_TO_PARSE_TESTS = auto() kunit_start_re = re.compile(r'TAP version [0-9]+$') kunit_end_re = re.compile('(List of all partitions:|' - 'Kernel panic - not syncing: VFS:|reboot: System halted)') + 'Kernel panic - not syncing: VFS:)') def isolate_kunit_output(kernel_output): started = False @@ -109,7 +110,7 @@ OkNotOkResult = namedtuple('OkNotOkResult', ['is_ok','description', 'text']) OK_NOT_OK_SUBTEST = re.compile(r'^[\s]+(ok|not ok) [0-9]+ - (.*)$') -OK_NOT_OK_MODULE = re.compile(r'^(ok|not ok) [0-9]+ - (.*)$') +OK_NOT_OK_MODULE = re.compile(r'^(ok|not ok) ([0-9]+) - (.*)$') def parse_ok_not_ok_test_case(lines: List[str], test_case: TestCase) -> bool: save_non_diagnositic(lines, test_case) @@ -197,7 +198,9 @@ def max_status(left: TestStatus, right: TestStatus) -> TestStatus: else: return TestStatus.SUCCESS -def parse_ok_not_ok_test_suite(lines: List[str], test_suite: TestSuite) -> bool: +def parse_ok_not_ok_test_suite(lines: List[str], + test_suite: TestSuite, + expected_suite_index: int) -> bool: consume_non_diagnositic(lines) if not lines: test_suite.status = TestStatus.TEST_CRASHED @@ -210,6 +213,12 @@ def parse_ok_not_ok_test_suite(lines: List[str], test_suite: TestSuite) -> bool: test_suite.status = TestStatus.SUCCESS else: test_suite.status = TestStatus.FAILURE + suite_index = int(match.group(2)) + if suite_index != expected_suite_index: + print_with_timestamp( + red('[ERROR] ') + 'expected_suite_index ' + + str(expected_suite_index) + ', but got ' + + str(suite_index)) return True else: return False @@ -222,7 +231,7 @@ def bubble_up_test_case_errors(test_suite: TestSuite) -> TestStatus: max_test_case_status = bubble_up_errors(lambda x: x.status, test_suite.cases) return max_status(max_test_case_status, test_suite.status) -def parse_test_suite(lines: List[str]) -> TestSuite: +def parse_test_suite(lines: List[str], expected_suite_index: int) -> TestSuite: if not lines: return None consume_non_diagnositic(lines) @@ -241,7 +250,7 @@ def parse_test_suite(lines: List[str]) -> TestSuite: break test_suite.cases.append(test_case) expected_test_case_num -= 1 - if parse_ok_not_ok_test_suite(lines, test_suite): + if parse_ok_not_ok_test_suite(lines, test_suite, expected_suite_index): test_suite.status = bubble_up_test_case_errors(test_suite) return test_suite elif not lines: @@ -261,6 +270,17 @@ def parse_tap_header(lines: List[str]) -> bool: else: return False +TEST_PLAN = re.compile(r'[0-9]+\.\.([0-9]+)') + +def parse_test_plan(lines: List[str]) -> int: + consume_non_diagnositic(lines) + match = TEST_PLAN.match(lines[0]) + if match: + lines.pop(0) + return int(match.group(1)) + else: + return None + def bubble_up_suite_errors(test_suite_list: List[TestSuite]) -> TestStatus: return bubble_up_errors(lambda x: x.status, test_suite_list) @@ -268,20 +288,33 @@ def parse_test_result(lines: List[str]) -> TestResult: consume_non_diagnositic(lines) if not lines or not parse_tap_header(lines): return TestResult(TestStatus.NO_TESTS, [], lines) + expected_test_suite_num = parse_test_plan(lines) + if not expected_test_suite_num: + return TestResult(TestStatus.FAILURE_TO_PARSE_TESTS, [], lines) test_suites = [] - test_suite = parse_test_suite(lines) - while test_suite: - test_suites.append(test_suite) - test_suite = parse_test_suite(lines) - return TestResult(bubble_up_suite_errors(test_suites), test_suites, lines) + for i in range(1, expected_test_suite_num + 1): + test_suite = parse_test_suite(lines, i) + if test_suite: + test_suites.append(test_suite) + else: + print_with_timestamp( + red('[ERROR] ') + ' expected ' + + str(expected_test_suite_num) + + ' test suites, but got ' + str(i - 2)) + break + test_suite = parse_test_suite(lines, -1) + if test_suite: + print_with_timestamp(red('[ERROR] ') + + 'got unexpected test suite: ' + test_suite.name) + if test_suites: + return TestResult(bubble_up_suite_errors(test_suites), test_suites, lines) + else: + return TestResult(TestStatus.NO_TESTS, [], lines) -def parse_run_tests(kernel_output) -> TestResult: +def print_and_count_results(test_result: TestResult) -> None: total_tests = 0 failed_tests = 0 crashed_tests = 0 - test_result = parse_test_result(list(isolate_kunit_output(kernel_output))) - if test_result.status == TestStatus.NO_TESTS: - print_with_timestamp(red('[ERROR] ') + 'no kunit output detected') for test_suite in test_result.suites: if test_suite.status == TestStatus.SUCCESS: print_suite_divider(green('[PASSED] ') + test_suite.name) @@ -303,6 +336,21 @@ def parse_run_tests(kernel_output) -> TestResult: print_with_timestamp(red('[FAILED] ') + test_case.name) print_log(map(yellow, test_case.log)) print_with_timestamp('') + return total_tests, failed_tests, crashed_tests + +def parse_run_tests(kernel_output) -> TestResult: + total_tests = 0 + failed_tests = 0 + crashed_tests = 0 + test_result = parse_test_result(list(isolate_kunit_output(kernel_output))) + if test_result.status == TestStatus.NO_TESTS: + print(red('[ERROR] ') + yellow('no tests run!')) + elif test_result.status == TestStatus.FAILURE_TO_PARSE_TESTS: + print(red('[ERROR] ') + yellow('could not parse test results!')) + else: + (total_tests, + failed_tests, + crashed_tests) = print_and_count_results(test_result) print_with_timestamp(DIVIDER) fmt = green if test_result.status == TestStatus.SUCCESS else red print_with_timestamp( diff --git a/tools/testing/kunit/test_data/test_is_test_passed-all_passed.log b/tools/testing/kunit/test_data/test_is_test_passed-all_passed.log index 62ebc0288355..bc0dc8fe35b7 100644 --- a/tools/testing/kunit/test_data/test_is_test_passed-all_passed.log +++ b/tools/testing/kunit/test_data/test_is_test_passed-all_passed.log @@ -1,4 +1,5 @@ TAP version 14 +1..2 # Subtest: sysctl_test 1..8 # sysctl_test_dointvec_null_tbl_data: sysctl_test_dointvec_null_tbl_data passed diff --git a/tools/testing/kunit/test_data/test_is_test_passed-crash.log b/tools/testing/kunit/test_data/test_is_test_passed-crash.log index 0b249870c8be..4d97f6708c4a 100644 --- a/tools/testing/kunit/test_data/test_is_test_passed-crash.log +++ b/tools/testing/kunit/test_data/test_is_test_passed-crash.log @@ -1,6 +1,7 @@ printk: console [tty0] enabled printk: console [mc-1] enabled TAP version 14 +1..2 # Subtest: sysctl_test 1..8 # sysctl_test_dointvec_null_tbl_data: sysctl_test_dointvec_null_tbl_data passed diff --git a/tools/testing/kunit/test_data/test_is_test_passed-failure.log b/tools/testing/kunit/test_data/test_is_test_passed-failure.log index 9e89d32d5667..7a416497e3be 100644 --- a/tools/testing/kunit/test_data/test_is_test_passed-failure.log +++ b/tools/testing/kunit/test_data/test_is_test_passed-failure.log @@ -1,4 +1,5 @@ TAP version 14 +1..2 # Subtest: sysctl_test 1..8 # sysctl_test_dointvec_null_tbl_data: sysctl_test_dointvec_null_tbl_data passed From a82763e63ee78c9876211998d0bcc082f4ff63d8 Mon Sep 17 00:00:00 2001 From: Brendan Higgins Date: Tue, 4 Aug 2020 13:47:45 -0700 Subject: [PATCH 108/168] Documentation: kunit: add a brief blurb about kunit_test_suite Add a brief blurb saying how and when the kunit_test_suite() macro works to the usage documentation. Signed-off-by: Brendan Higgins Reviewed-by: Stephen Boyd Signed-off-by: Shuah Khan --- Documentation/dev-tools/kunit/usage.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/dev-tools/kunit/usage.rst b/Documentation/dev-tools/kunit/usage.rst index 3c3fe8b5fecc..961d3ea3ca19 100644 --- a/Documentation/dev-tools/kunit/usage.rst +++ b/Documentation/dev-tools/kunit/usage.rst @@ -211,6 +211,11 @@ KUnit test framework. .. note:: A test case will only be run if it is associated with a test suite. +``kunit_test_suite(...)`` is a macro which tells the linker to put the specified +test suite in a special linker section so that it can be run by KUnit either +after late_init, or when the test module is loaded (depending on whether the +test was built in or not). + For more information on these types of things see the :doc:`api/test`. Isolating Behavior From 28c185a88607bc361ec7f071b907285e2e75de1b Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 9 Oct 2020 15:54:53 +0300 Subject: [PATCH 109/168] coccinelle: api: kfree_sensitive: print memset position Print memset() call position in addition to the kfree() position to ease issues identification. Signed-off-by: Denis Efremov Signed-off-by: Julia Lawall --- scripts/coccinelle/api/kfree_sensitive.cocci | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/coccinelle/api/kfree_sensitive.cocci b/scripts/coccinelle/api/kfree_sensitive.cocci index e4a066a0b77d..8d980ebf3223 100644 --- a/scripts/coccinelle/api/kfree_sensitive.cocci +++ b/scripts/coccinelle/api/kfree_sensitive.cocci @@ -85,14 +85,16 @@ type T; @script:python depends on report@ p << r.p; +m << r.m; @@ -coccilib.report.print_report(p[0], - "WARNING: opportunity for kfree_sensitive/kvfree_sensitive") +msg = "WARNING opportunity for kfree_sensitive/kvfree_sensitive (memset at line %s)" +coccilib.report.print_report(p[0], msg % (m[0].line)) @script:python depends on org@ p << r.p; +m << r.m; @@ -coccilib.org.print_todo(p[0], - "WARNING: opportunity for kfree_sensitive/kvfree_sensitive") +msg = "WARNING opportunity for kfree_sensitive/kvfree_sensitive (memset at line %s)" +coccilib.org.print_todo(p[0], msg % (m[0].line)) From 47f6d9ce45b03a40c34b668a9884754c58122b39 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Tue, 29 Sep 2020 20:45:29 +0800 Subject: [PATCH 110/168] ubifs: Fix a memleak after dumping authentication mount options Fix a memory leak after dumping authentication mount options in error handling branch. Signed-off-by: Zhihao Cheng Cc: # 4.20+ Fixes: d8a22773a12c6d7 ("ubifs: Enable authentication support") Reviewed-by: Sascha Hauer Signed-off-by: Richard Weinberger --- fs/ubifs/super.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index a2420c900275..6f85cd618766 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1141,6 +1141,18 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, return 0; } +/* + * ubifs_release_options - release mount parameters which have been dumped. + * @c: UBIFS file-system description object + */ +static void ubifs_release_options(struct ubifs_info *c) +{ + kfree(c->auth_key_name); + c->auth_key_name = NULL; + kfree(c->auth_hash_name); + c->auth_hash_name = NULL; +} + /** * destroy_journal - destroy journal data structures. * @c: UBIFS file-system description object @@ -1650,8 +1662,7 @@ static void ubifs_umount(struct ubifs_info *c) ubifs_lpt_free(c, 0); ubifs_exit_authentication(c); - kfree(c->auth_key_name); - kfree(c->auth_hash_name); + ubifs_release_options(c); kfree(c->cbuf); kfree(c->rcvrd_mst_node); kfree(c->mst_node); @@ -2219,6 +2230,7 @@ out_umount: out_unlock: mutex_unlock(&c->umount_mutex); out_close: + ubifs_release_options(c); ubi_close_volume(c->ubi); out: return err; From bb674a4d4de1032837fcbf860a63939e66f0b7ad Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Tue, 29 Sep 2020 20:45:30 +0800 Subject: [PATCH 111/168] ubifs: Don't parse authentication mount options in remount process There is no need to dump authentication options while remounting, because authentication initialization can only be doing once in the first mount process. Dumping authentication mount options in remount process may cause memory leak if UBIFS has already been mounted with old authentication mount options. Signed-off-by: Zhihao Cheng Cc: # 4.20+ Fixes: d8a22773a12c6d7 ("ubifs: Enable authentication support") Reviewed-by: Sascha Hauer Signed-off-by: Richard Weinberger --- fs/ubifs/super.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 6f85cd618766..9796f5df2f7f 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1110,14 +1110,20 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, break; } case Opt_auth_key: - c->auth_key_name = kstrdup(args[0].from, GFP_KERNEL); - if (!c->auth_key_name) - return -ENOMEM; + if (!is_remount) { + c->auth_key_name = kstrdup(args[0].from, + GFP_KERNEL); + if (!c->auth_key_name) + return -ENOMEM; + } break; case Opt_auth_hash_name: - c->auth_hash_name = kstrdup(args[0].from, GFP_KERNEL); - if (!c->auth_hash_name) - return -ENOMEM; + if (!is_remount) { + c->auth_hash_name = kstrdup(args[0].from, + GFP_KERNEL); + if (!c->auth_hash_name) + return -ENOMEM; + } break; case Opt_ignore: break; From e2a05cc7f8229e150243cdae40f2af9021d67a4a Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Tue, 29 Sep 2020 20:45:31 +0800 Subject: [PATCH 112/168] ubifs: mount_ubifs: Release authentication resource in error handling path Release the authentication related resource in some error handling branches in mount_ubifs(). Signed-off-by: Zhihao Cheng Cc: # 4.20+ Fixes: d8a22773a12c6d7 ("ubifs: Enable authentication support") Reviewed-by: Sascha Hauer Signed-off-by: Richard Weinberger --- fs/ubifs/super.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 9796f5df2f7f..732218ef6656 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1331,7 +1331,7 @@ static int mount_ubifs(struct ubifs_info *c) err = ubifs_read_superblock(c); if (err) - goto out_free; + goto out_auth; c->probing = 0; @@ -1343,18 +1343,18 @@ static int mount_ubifs(struct ubifs_info *c) ubifs_err(c, "'compressor \"%s\" is not compiled in", ubifs_compr_name(c, c->default_compr)); err = -ENOTSUPP; - goto out_free; + goto out_auth; } err = init_constants_sb(c); if (err) - goto out_free; + goto out_auth; sz = ALIGN(c->max_idx_node_sz, c->min_io_size) * 2; c->cbuf = kmalloc(sz, GFP_NOFS); if (!c->cbuf) { err = -ENOMEM; - goto out_free; + goto out_auth; } err = alloc_wbufs(c); @@ -1629,6 +1629,8 @@ out_wbufs: free_wbufs(c); out_cbuf: kfree(c->cbuf); +out_auth: + ubifs_exit_authentication(c); out_free: kfree(c->write_reserve_buf); kfree(c->bu.buf); From bab991cf40f631d18d00cb8c2a97325c8fd4292e Mon Sep 17 00:00:00 2001 From: Gaurav Singh Date: Sat, 13 Jun 2020 21:19:40 -0400 Subject: [PATCH 113/168] um: Fix null pointer dereference in vector_user_bpf The bpf_prog is being checked for !NULL after uml_kmalloc but later its used directly for example: bpf_prog->filter = bpf and is also later returned upon success. Fix this, do a NULL check and return right away. Signed-off-by: Gaurav Singh Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/vector_user.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/um/drivers/vector_user.c b/arch/um/drivers/vector_user.c index c4a0f26b2824..0e6d6717bf73 100644 --- a/arch/um/drivers/vector_user.c +++ b/arch/um/drivers/vector_user.c @@ -789,10 +789,12 @@ void *uml_vector_user_bpf(char *filename) return false; } bpf_prog = uml_kmalloc(sizeof(struct sock_fprog), UM_GFP_KERNEL); - if (bpf_prog != NULL) { - bpf_prog->len = statbuf.st_size / sizeof(struct sock_filter); - bpf_prog->filter = NULL; + if (bpf_prog == NULL) { + printk(KERN_ERR "Failed to allocate bpf prog buffer"); + return NULL; } + bpf_prog->len = statbuf.st_size / sizeof(struct sock_filter); + bpf_prog->filter = NULL; ffd = os_open_file(filename, of_read(OPENFLAGS()), 0); if (ffd < 0) { printk(KERN_ERR "Error %d opening bpf file", -errno); From e4e721fe4ccb504a29d1e8d4047667557281d932 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 19 Jun 2020 13:20:07 +0800 Subject: [PATCH 114/168] um: vector: Use GFP_ATOMIC under spin lock Use GFP_ATOMIC instead of GFP_KERNEL under spin lock to fix possible sleep-in-atomic-context bugs. Fixes: 9807019a62dc ("um: Loadable BPF "Firmware" for vector drivers") Signed-off-by: Tiezhu Yang Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/vector_kern.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 8735c468230a..555203e3e7b4 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1403,7 +1403,7 @@ static int vector_net_load_bpf_flash(struct net_device *dev, kfree(vp->bpf->filter); vp->bpf->filter = NULL; } else { - vp->bpf = kmalloc(sizeof(struct sock_fprog), GFP_KERNEL); + vp->bpf = kmalloc(sizeof(struct sock_fprog), GFP_ATOMIC); if (vp->bpf == NULL) { netdev_err(dev, "failed to allocate memory for firmware\n"); goto flash_fail; @@ -1415,7 +1415,7 @@ static int vector_net_load_bpf_flash(struct net_device *dev, if (request_firmware(&fw, efl->data, &vdevice->pdev.dev)) goto flash_fail; - vp->bpf->filter = kmemdup(fw->data, fw->size, GFP_KERNEL); + vp->bpf->filter = kmemdup(fw->data, fw->size, GFP_ATOMIC); if (!vp->bpf->filter) goto free_buffer; From 5e1121cd43d4d3436140a462bfc230ff8aeb1693 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Sun, 19 Jul 2020 22:02:21 +0100 Subject: [PATCH 115/168] um: Some fixes to build UML with musl musl toolchain and headers are a bit more strict. These fixes enable building UML with musl as well as seem not to break on glibc. Signed-off-by: Ignat Korchagin Tested-by: Brendan Higgins Signed-off-by: Richard Weinberger --- arch/um/drivers/daemon_user.c | 1 + arch/um/drivers/pcap_user.c | 12 ++++++------ arch/um/drivers/slip_user.c | 2 +- arch/um/drivers/vector_user.c | 4 +--- arch/um/os-Linux/util.c | 2 +- arch/x86/um/user-offsets.c | 2 +- 6 files changed, 11 insertions(+), 12 deletions(-) diff --git a/arch/um/drivers/daemon_user.c b/arch/um/drivers/daemon_user.c index 3695821d06a2..785baedc3555 100644 --- a/arch/um/drivers/daemon_user.c +++ b/arch/um/drivers/daemon_user.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include diff --git a/arch/um/drivers/pcap_user.c b/arch/um/drivers/pcap_user.c index bbd20638788a..52ddda3e3b10 100644 --- a/arch/um/drivers/pcap_user.c +++ b/arch/um/drivers/pcap_user.c @@ -32,7 +32,7 @@ static int pcap_user_init(void *data, void *dev) return 0; } -static int pcap_open(void *data) +static int pcap_user_open(void *data) { struct pcap_data *pri = data; __u32 netmask; @@ -44,14 +44,14 @@ static int pcap_open(void *data) if (pri->filter != NULL) { err = dev_netmask(pri->dev, &netmask); if (err < 0) { - printk(UM_KERN_ERR "pcap_open : dev_netmask failed\n"); + printk(UM_KERN_ERR "pcap_user_open : dev_netmask failed\n"); return -EIO; } pri->compiled = uml_kmalloc(sizeof(struct bpf_program), UM_GFP_KERNEL); if (pri->compiled == NULL) { - printk(UM_KERN_ERR "pcap_open : kmalloc failed\n"); + printk(UM_KERN_ERR "pcap_user_open : kmalloc failed\n"); return -ENOMEM; } @@ -59,14 +59,14 @@ static int pcap_open(void *data) (struct bpf_program *) pri->compiled, pri->filter, pri->optimize, netmask); if (err < 0) { - printk(UM_KERN_ERR "pcap_open : pcap_compile failed - " + printk(UM_KERN_ERR "pcap_user_open : pcap_compile failed - " "'%s'\n", pcap_geterr(pri->pcap)); goto out; } err = pcap_setfilter(pri->pcap, pri->compiled); if (err < 0) { - printk(UM_KERN_ERR "pcap_open : pcap_setfilter " + printk(UM_KERN_ERR "pcap_user_open : pcap_setfilter " "failed - '%s'\n", pcap_geterr(pri->pcap)); goto out; } @@ -127,7 +127,7 @@ int pcap_user_read(int fd, void *buffer, int len, struct pcap_data *pri) const struct net_user_info pcap_user_info = { .init = pcap_user_init, - .open = pcap_open, + .open = pcap_user_open, .close = NULL, .remove = pcap_remove, .add_address = NULL, diff --git a/arch/um/drivers/slip_user.c b/arch/um/drivers/slip_user.c index 8016d32b6809..482a19c5105c 100644 --- a/arch/um/drivers/slip_user.c +++ b/arch/um/drivers/slip_user.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/um/drivers/vector_user.c b/arch/um/drivers/vector_user.c index 0e6d6717bf73..090ff93457c7 100644 --- a/arch/um/drivers/vector_user.c +++ b/arch/um/drivers/vector_user.c @@ -18,9 +18,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -332,7 +330,7 @@ static struct vector_fds *user_init_unix_fds(struct arglist *ifspec, int id) } switch (id) { case ID_BESS: - if (connect(fd, remote_addr, sizeof(struct sockaddr_un)) < 0) { + if (connect(fd, (const struct sockaddr *) remote_addr, sizeof(struct sockaddr_un)) < 0) { printk(UM_KERN_ERR "bess open:cannot connect to %s %i", remote_addr->sun_path, -errno); goto unix_cleanup; } diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c index ecf2f390fad2..07327425d06e 100644 --- a/arch/um/os-Linux/util.c +++ b/arch/um/os-Linux/util.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index c51dd8363d25..bae61554abcc 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #define __FRAME_OFFSETS From 730586ff7fada525943f1eee0681fa62a3fc6128 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Sun, 19 Jul 2020 22:02:22 +0100 Subject: [PATCH 116/168] um: Allow static linking for non-glibc implementations It is possible to produce a statically linked UML binary with UML_NET_VECTOR, UML_NET_VDE and UML_NET_PCAP options enabled using alternative libc implementations, which do not rely on NSS, such as musl. Allow static linking in this case. Signed-off-by: Ignat Korchagin Reviewed-by: Brendan Higgins Tested-by: Brendan Higgins Signed-off-by: Richard Weinberger --- arch/um/Kconfig | 6 +++--- arch/um/drivers/Kconfig | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/um/Kconfig b/arch/um/Kconfig index eb51fec75948..f59defeae7bf 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -62,12 +62,12 @@ config NR_CPUS source "arch/$(HEADER_ARCH)/um/Kconfig" -config FORBID_STATIC_LINK - bool +config MAY_HAVE_RUNTIME_DEPS + bool config STATIC_LINK bool "Force a static link" - depends on !FORBID_STATIC_LINK + depends on CC_CAN_LINK_STATIC_NO_RUNTIME_DEPS || !MAY_HAVE_RUNTIME_DEPS help This option gives you the ability to force a static link of UML. Normally, UML is linked as a shared binary. This is inconvenient for diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig index 9160ead56e33..2e7b8e0e7194 100644 --- a/arch/um/drivers/Kconfig +++ b/arch/um/drivers/Kconfig @@ -234,7 +234,7 @@ config UML_NET_DAEMON config UML_NET_VECTOR bool "Vector I/O high performance network devices" depends on UML_NET - select FORBID_STATIC_LINK + select MAY_HAVE_RUNTIME_DEPS help This User-Mode Linux network driver uses multi-message send and receive functions. The host running the UML guest must have @@ -246,7 +246,7 @@ config UML_NET_VECTOR config UML_NET_VDE bool "VDE transport (obsolete)" depends on UML_NET - select FORBID_STATIC_LINK + select MAY_HAVE_RUNTIME_DEPS help This User-Mode Linux network transport allows one or more running UMLs on a single host to communicate with each other and also @@ -294,7 +294,7 @@ config UML_NET_MCAST config UML_NET_PCAP bool "pcap transport (obsolete)" depends on UML_NET - select FORBID_STATIC_LINK + select MAY_HAVE_RUNTIME_DEPS help The pcap transport makes a pcap packet stream on the host look like an ethernet device inside UML. This is useful for making From ebef8ea2ba967026192a26f4529890893919bc57 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 10 Sep 2020 11:31:12 +0200 Subject: [PATCH 117/168] um: time-travel: Fix IRQ handling in time_travel_handle_message() As the comment here indicates, we need to do the polling in the idle loop without blocking interrupts, since interrupts can be vhost-user messages that we must process even while in our idle loop. I don't know why I explained one thing and implemented another, but we have indeed observed random hangs due to this, depending on the timing of the messages. Fixes: 88ce64249233 ("um: Implement time-travel=ext") Signed-off-by: Johannes Berg Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/kernel/time.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 25eaa6a0c658..c07436e89e59 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -70,13 +70,17 @@ static void time_travel_handle_message(struct um_timetravel_msg *msg, * read of the message and write of the ACK. */ if (mode != TTMH_READ) { + bool disabled = irqs_disabled(); + + BUG_ON(mode == TTMH_IDLE && !disabled); + + if (disabled) + local_irq_enable(); while (os_poll(1, &time_travel_ext_fd) != 0) { - if (mode == TTMH_IDLE) { - BUG_ON(!irqs_disabled()); - local_irq_enable(); - local_irq_disable(); - } + /* nothing */ } + if (disabled) + local_irq_disable(); } ret = os_read_file(time_travel_ext_fd, msg, sizeof(*msg)); From d0800609136d16418f49d01098b206c0c394d147 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 10 Sep 2020 11:31:13 +0200 Subject: [PATCH 118/168] um: time-travel: Return the sequence number in ACK messages For external time travel, the protocol says to return the incoming sequence number in the ACK message to aid debugging, so do that. Signed-off-by: Johannes Berg Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/kernel/time.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index c07436e89e59..3d109ff3309b 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -106,6 +106,7 @@ static void time_travel_handle_message(struct um_timetravel_msg *msg, break; } + resp.seq = msg->seq; os_write_file(time_travel_ext_fd, &resp, sizeof(resp)); } From f2d05059e15af3f70502074f4e3a504530af504a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 4 Jun 2020 13:23:17 +0200 Subject: [PATCH 119/168] um: change sigio_spinlock to a mutex Lockdep complains at boot: ============================= [ BUG: Invalid wait context ] 5.7.0-05093-g46d91ecd597b #98 Not tainted ----------------------------- swapper/1 is trying to lock: 0000000060931b98 (&desc[i].request_mutex){+.+.}-{3:3}, at: __setup_irq+0x11d/0x623 other info that might help us debug this: context-{4:4} 1 lock held by swapper/1: #0: 000000006074fed8 (sigio_spinlock){+.+.}-{2:2}, at: sigio_lock+0x1a/0x1c stack backtrace: CPU: 0 PID: 1 Comm: swapper Not tainted 5.7.0-05093-g46d91ecd597b #98 Stack: 7fa4fab0 6028dfd1 0000002a 6008bea5 7fa50700 7fa50040 7fa4fac0 6028e016 7fa4fb50 6007f6da 60959c18 00000000 Call Trace: [<60023a0e>] show_stack+0x13b/0x155 [<6028e016>] dump_stack+0x2a/0x2c [<6007f6da>] __lock_acquire+0x515/0x15f2 [<6007eb50>] lock_acquire+0x245/0x273 [<6050d9f1>] __mutex_lock+0xbd/0x325 [<6050dc76>] mutex_lock_nested+0x1d/0x1f [<6008e27e>] __setup_irq+0x11d/0x623 [<6008e8ed>] request_threaded_irq+0x169/0x1a6 [<60021eb0>] um_request_irq+0x1ee/0x24b [<600234ee>] write_sigio_irq+0x3b/0x76 [<600383ca>] sigio_broken+0x146/0x2e4 [<60020bd8>] do_one_initcall+0xde/0x281 Because we hold sigio_spinlock and then get into requesting an interrupt with a mutex. Change the spinlock to a mutex to avoid that. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/kernel/sigio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/um/kernel/sigio.c b/arch/um/kernel/sigio.c index 10c99e058fca..d1cffc2a7f21 100644 --- a/arch/um/kernel/sigio.c +++ b/arch/um/kernel/sigio.c @@ -35,14 +35,14 @@ int write_sigio_irq(int fd) } /* These are called from os-Linux/sigio.c to protect its pollfds arrays. */ -static DEFINE_SPINLOCK(sigio_spinlock); +static DEFINE_MUTEX(sigio_mutex); void sigio_lock(void) { - spin_lock(&sigio_spinlock); + mutex_lock(&sigio_mutex); } void sigio_unlock(void) { - spin_unlock(&sigio_spinlock); + mutex_unlock(&sigio_mutex); } From 9a10705b42dd541feadda4817582a7efc984c917 Mon Sep 17 00:00:00 2001 From: Li Heng Date: Thu, 23 Jul 2020 11:15:16 +0800 Subject: [PATCH 120/168] um: Remove redundant NULL check Fix below warnings reported by coccicheck: ./arch/um/drivers/vector_user.c:403:2-7: WARNING: NULL check before some freeing functions is not needed. Fixes: bc8f8e4e6e7a ("um: Add a generic "fd" vector transport") Signed-off-by: Li Heng Signed-off-by: Richard Weinberger --- arch/um/drivers/vector_user.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/um/drivers/vector_user.c b/arch/um/drivers/vector_user.c index 090ff93457c7..65b6a71a132f 100644 --- a/arch/um/drivers/vector_user.c +++ b/arch/um/drivers/vector_user.c @@ -397,8 +397,7 @@ static struct vector_fds *user_init_fd_fds(struct arglist *ifspec) fd_cleanup: if (fd >= 0) os_close_file(fd); - if (result != NULL) - kfree(result); + kfree(result); return NULL; } From 4687615d2ded1250923123e8966463827763432e Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Sun, 4 Oct 2020 01:04:36 -0400 Subject: [PATCH 121/168] um: Remove dead usage of TIF_IA32 This seems like a dead artifact since TIF_IA32 is not even defined as a TI flag for UM. Looking back in git history, it made sense in the old days, but it is apparently not used since UM was split out of the x86 arch/. It is also going away from the x86 tree soon. Also, I think the variable clean up it performs is not needed as 64-bit UML doesn't run 32-bit binaries as far as I can tell, and 32-bit UML has 32-bit ulong. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Richard Weinberger --- arch/x86/um/ptrace_64.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index 09a085bde0d4..1401899dee9b 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -52,14 +52,6 @@ static const int reg_offsets[] = int putreg(struct task_struct *child, int regno, unsigned long value) { -#ifdef TIF_IA32 - /* - * Some code in the 64bit emulation may not be 64bit clean. - * Don't take any chances. - */ - if (test_tsk_thread_flag(child, TIF_IA32)) - value &= 0xffffffff; -#endif switch (regno) { case R8: case R9: @@ -137,10 +129,7 @@ int poke_user(struct task_struct *child, long addr, long data) unsigned long getreg(struct task_struct *child, int regno) { unsigned long mask = ~0UL; -#ifdef TIF_IA32 - if (test_tsk_thread_flag(child, TIF_IA32)) - mask = 0xffffffff; -#endif + switch (regno) { case R8: case R9: From e8a58591afbc7dc279a11454c08fce1281958eef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Fri, 7 Aug 2020 12:32:26 -0700 Subject: [PATCH 122/168] um: Fix incorrect assumptions about max pid length MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pids are no longer limited to 16-bits, bump to 32-bits, ie. 9 decimal characters. Additionally sizeof("/") already returns 2 - ie. it already accounts for trailing zero. Cc: Jeff Dike Cc: Richard Weinberger Cc: Anton Ivanov Cc: Linux UM Mailing List Signed-off-by: Maciej Żenczykowski Signed-off-by: Richard Weinberger --- arch/um/os-Linux/umid.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c index 9e16078a4bf8..1d7558dac75f 100644 --- a/arch/um/os-Linux/umid.c +++ b/arch/um/os-Linux/umid.c @@ -97,7 +97,7 @@ static int remove_files_and_dir(char *dir) while ((ent = readdir(directory)) != NULL) { if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, "..")) continue; - len = strlen(dir) + sizeof("/") + strlen(ent->d_name) + 1; + len = strlen(dir) + strlen("/") + strlen(ent->d_name) + 1; if (len > sizeof(file)) { ret = -E2BIG; goto out; @@ -135,7 +135,7 @@ out: */ static inline int is_umdir_used(char *dir) { - char pid[sizeof("nnnnn\0")], *end, *file; + char pid[sizeof("nnnnnnnnn")], *end, *file; int dead, fd, p, n, err; size_t filelen; @@ -217,10 +217,10 @@ static int umdir_take_if_dead(char *dir) static void __init create_pid_file(void) { - char pid[sizeof("nnnnn\0")], *file; + char pid[sizeof("nnnnnnnnn")], *file; int fd, n; - n = strlen(uml_dir) + UMID_LEN + sizeof("/pid\0"); + n = strlen(uml_dir) + UMID_LEN + sizeof("/pid"); file = malloc(n); if (!file) return; From 273fe1b676cb59d41e177980a981e27806872954 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 21 Sep 2020 13:43:25 +0200 Subject: [PATCH 123/168] um: Clean up stacktrace dump We currently get a few stray newlines, due to the interaction between printk() and the code here. Remove a few explicit newline prints to neaten the output. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/kernel/sysrq.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index acbc879d2773..7452f70d50d0 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -47,12 +47,10 @@ void show_stack(struct task_struct *task, unsigned long *stack, if (kstack_end(stack)) break; if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk("%s\n", loglvl); + pr_cont("\n"); pr_cont(" %08lx", *stack++); } - printk("%s\n", loglvl); printk("%sCall Trace:\n", loglvl); dump_trace(current, &stackops, (void *)loglvl); - printk("%s\n", loglvl); } From f06885b3f3e3884f98351d7b72a4fc8400911cde Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Mon, 21 Sep 2020 14:58:27 +0100 Subject: [PATCH 124/168] um: vector: Add dynamic tap interfaces and scripting Provide functionality roughly compatible with the existing qemu ifup scripting: * invocation of an ifup script. The interface name is passed as the first and only argument * allocating tap interfaces on the fly if they are not explicitly specified Signed-off-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/vector_user.c | 56 +++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/arch/um/drivers/vector_user.c b/arch/um/drivers/vector_user.c index 65b6a71a132f..bae53220ce26 100644 --- a/arch/um/drivers/vector_user.c +++ b/arch/um/drivers/vector_user.c @@ -37,6 +37,7 @@ #define ID_MAX 2 #define TOKEN_IFNAME "ifname" +#define TOKEN_SCRIPT "ifup" #define TRANS_RAW "raw" #define TRANS_RAW_LEN strlen(TRANS_RAW) @@ -53,6 +54,9 @@ #define MAX_UN_LEN 107 +static const char padchar[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; +static const char *template = "tapXXXXXX"; + /* This is very ugly and brute force lookup, but it is done * only once at initialization so not worth doing hashes or * anything more intelligent @@ -189,16 +193,21 @@ raw_fd_cleanup: return err; } + static struct vector_fds *user_init_tap_fds(struct arglist *ifspec) { - int fd = -1; + int fd = -1, i; char *iface; struct vector_fds *result = NULL; + bool dynamic = false; + char dynamic_ifname[IFNAMSIZ]; + char *argv[] = {NULL, NULL, NULL, NULL}; iface = uml_vector_fetch_arg(ifspec, TOKEN_IFNAME); if (iface == NULL) { - printk(UM_KERN_ERR "uml_tap: failed to parse interface spec\n"); - goto tap_cleanup; + dynamic = true; + iface = dynamic_ifname; + srand(getpid()); } result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL); @@ -212,14 +221,30 @@ static struct vector_fds *user_init_tap_fds(struct arglist *ifspec) result->remote_addr_size = 0; /* TAP */ + do { + if (dynamic) { + strcpy(iface, template); + for (i = 0; i < strlen(iface); i++) { + if (iface[i] == 'X') { + iface[i] = padchar[rand() % strlen(padchar)]; + } + } + } + fd = create_tap_fd(iface); + if ((fd < 0) && (!dynamic)) { + printk(UM_KERN_ERR "uml_tap: failed to create tun interface\n"); + goto tap_cleanup; + } + result->tx_fd = fd; + result->rx_fd = fd; + } while (fd < 0); - fd = create_tap_fd(iface); - if (fd < 0) { - printk(UM_KERN_ERR "uml_tap: failed to create tun interface\n"); - goto tap_cleanup; + argv[0] = uml_vector_fetch_arg(ifspec, TOKEN_SCRIPT); + if (argv[0]) { + argv[1] = iface; + run_helper(NULL, NULL, argv); } - result->tx_fd = fd; - result->rx_fd = fd; + return result; tap_cleanup: printk(UM_KERN_ERR "user_init_tap: init failed, error %d", fd); @@ -231,6 +256,7 @@ static struct vector_fds *user_init_hybrid_fds(struct arglist *ifspec) { char *iface; struct vector_fds *result = NULL; + char *argv[] = {NULL, NULL, NULL, NULL}; iface = uml_vector_fetch_arg(ifspec, TOKEN_IFNAME); if (iface == NULL) { @@ -264,6 +290,12 @@ static struct vector_fds *user_init_hybrid_fds(struct arglist *ifspec) "uml_tap: failed to create paired raw socket: %i\n", result->rx_fd); goto hybrid_cleanup; } + + argv[0] = uml_vector_fetch_arg(ifspec, TOKEN_SCRIPT); + if (argv[0]) { + argv[1] = iface; + run_helper(NULL, NULL, argv); + } return result; hybrid_cleanup: printk(UM_KERN_ERR "user_init_hybrid: init failed"); @@ -407,6 +439,7 @@ static struct vector_fds *user_init_raw_fds(struct arglist *ifspec) int err = -ENOMEM; char *iface; struct vector_fds *result = NULL; + char *argv[] = {NULL, NULL, NULL, NULL}; iface = uml_vector_fetch_arg(ifspec, TOKEN_IFNAME); if (iface == NULL) @@ -429,6 +462,11 @@ static struct vector_fds *user_init_raw_fds(struct arglist *ifspec) result->remote_addr = NULL; result->remote_addr_size = 0; } + argv[0] = uml_vector_fetch_arg(ifspec, TOKEN_SCRIPT); + if (argv[0]) { + argv[1] = iface; + run_helper(NULL, NULL, argv); + } return result; raw_cleanup: printk(UM_KERN_ERR "user_init_raw: init failed, error %d", err); From 2f324dd8ab2e3d8b097f3eac6d380d1adbdbe6e6 Mon Sep 17 00:00:00 2001 From: Sumera Priyadarsini Date: Sun, 11 Oct 2020 15:58:43 +0530 Subject: [PATCH 125/168] scripts: coccicheck: Add quotes to improve portability While fetching the number of threads per core with lscpu, the [:digit:] set is used for translation of digits from 0-9. However, using [:digit:] instead of "[:digit:]" does not seem to work uniformly for some shell types and configurations (such as zsh). Therefore, modify coccicheck to use double quotes around the [:digit:] set for uniformity and better portability. Signed-off-by: Sumera Priyadarsini Signed-off-by: Julia Lawall --- scripts/coccicheck | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/coccicheck b/scripts/coccicheck index 6789751607f5..d67907b8a38b 100755 --- a/scripts/coccicheck +++ b/scripts/coccicheck @@ -76,7 +76,7 @@ else fi # Use only one thread per core by default if hyperthreading is enabled - THREADS_PER_CORE=$(lscpu | grep "Thread(s) per core: " | tr -cd [:digit:]) + THREADS_PER_CORE=$(lscpu | grep "Thread(s) per core: " | tr -cd "[:digit:]") if [ -z "$J" ]; then NPROC=$(getconf _NPROCESSORS_ONLN) if [ $THREADS_PER_CORE -gt 1 -a $NPROC -gt 2 ] ; then From c5864560d935db879cafa21feca0156d91eba842 Mon Sep 17 00:00:00 2001 From: Sumera Priyadarsini Date: Sun, 11 Oct 2020 15:59:42 +0530 Subject: [PATCH 126/168] scripts: coccicheck: Change default condition for parallelism Currently, Coccinelle uses at most one thread per core by default in machines with more than 2 hyperthreads. However, for systems with only 4 hyperthreads, this does not improve performance. Modify coccicheck to use all available threads in machines with upto 4 hyperthreads. Signed-off-by: Sumera Priyadarsini Signed-off-by: Julia Lawall --- scripts/coccicheck | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/coccicheck b/scripts/coccicheck index d67907b8a38b..209bb0427b43 100755 --- a/scripts/coccicheck +++ b/scripts/coccicheck @@ -79,7 +79,7 @@ else THREADS_PER_CORE=$(lscpu | grep "Thread(s) per core: " | tr -cd "[:digit:]") if [ -z "$J" ]; then NPROC=$(getconf _NPROCESSORS_ONLN) - if [ $THREADS_PER_CORE -gt 1 -a $NPROC -gt 2 ] ; then + if [ $THREADS_PER_CORE -gt 1 -a $NPROC -gt 4 ] ; then NPROC=$((NPROC/2)) fi else From 5b92d8e9e5fa8dac0e7e5dd5f04d9b445eb39c58 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Mon, 17 Aug 2020 14:46:08 +0530 Subject: [PATCH 127/168] mailbox: bcm: convert tasklets to use new tasklet_setup() API In preparation for unconditionally passing the struct tasklet_struct pointer to all tasklet callbacks, switch to using the new tasklet_setup() and from_tasklet() to pass the tasklet pointer explicitly. Signed-off-by: Romain Perier Signed-off-by: Allen Pais Signed-off-by: Jassi Brar --- drivers/mailbox/bcm-pdc-mailbox.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mailbox/bcm-pdc-mailbox.c b/drivers/mailbox/bcm-pdc-mailbox.c index 53945ca5d785..5b375985f7b8 100644 --- a/drivers/mailbox/bcm-pdc-mailbox.c +++ b/drivers/mailbox/bcm-pdc-mailbox.c @@ -962,9 +962,9 @@ static irqreturn_t pdc_irq_handler(int irq, void *data) * a DMA receive interrupt. Reenables the receive interrupt. * @data: PDC state structure */ -static void pdc_tasklet_cb(unsigned long data) +static void pdc_tasklet_cb(struct tasklet_struct *t) { - struct pdc_state *pdcs = (struct pdc_state *)data; + struct pdc_state *pdcs = from_tasklet(pdcs, t, rx_tasklet); pdc_receive(pdcs); @@ -1589,7 +1589,7 @@ static int pdc_probe(struct platform_device *pdev) pdc_hw_init(pdcs); /* Init tasklet for deferred DMA rx processing */ - tasklet_init(&pdcs->rx_tasklet, pdc_tasklet_cb, (unsigned long)pdcs); + tasklet_setup(&pdcs->rx_tasklet, pdc_tasklet_cb); err = pdc_interrupts_init(pdcs); if (err) From 9070f35d25ce1781e3b5a3c6f68287ce3b8f937d Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 8 Oct 2020 20:14:49 +0100 Subject: [PATCH 128/168] dt-bindings: mailbox : arm,mhu: Convert to Json-schema Convert the DT binding over to Json-schema. Reviewed-by: Rob Herring Signed-off-by: Viresh Kumar Signed-off-by: Sudeep Holla Signed-off-by: Jassi Brar --- .../devicetree/bindings/mailbox/arm,mhu.yaml | 87 +++++++++++++++++++ .../devicetree/bindings/mailbox/arm-mhu.txt | 43 --------- 2 files changed, 87 insertions(+), 43 deletions(-) create mode 100644 Documentation/devicetree/bindings/mailbox/arm,mhu.yaml delete mode 100644 Documentation/devicetree/bindings/mailbox/arm-mhu.txt diff --git a/Documentation/devicetree/bindings/mailbox/arm,mhu.yaml b/Documentation/devicetree/bindings/mailbox/arm,mhu.yaml new file mode 100644 index 000000000000..2c8df7979c22 --- /dev/null +++ b/Documentation/devicetree/bindings/mailbox/arm,mhu.yaml @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mailbox/arm,mhu.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: ARM MHU Mailbox Controller + +maintainers: + - Jassi Brar + +description: | + The ARM's Message-Handling-Unit (MHU) is a mailbox controller that has 3 + independent channels/links to communicate with remote processor(s). MHU links + are hardwired on a platform. A link raises interrupt for any received data. + However, there is no specified way of knowing if the sent data has been read + by the remote. This driver assumes the sender polls STAT register and the + remote clears it after having read the data. The last channel is specified to + be a 'Secure' resource, hence can't be used by Linux running NS. + +# We need a select here so we don't match all nodes with 'arm,primecell' +select: + properties: + compatible: + contains: + const: arm,mhu + required: + - compatible + +properties: + compatible: + items: + - const: arm,mhu + - const: arm,primecell + + reg: + maxItems: 1 + + interrupts: + items: + - description: low-priority non-secure + - description: high-priority non-secure + - description: Secure + maxItems: 3 + + clocks: + maxItems: 1 + + clock-names: + items: + - const: apb_pclk + + '#mbox-cells': + description: Index of the channel. + const: 1 + +required: + - compatible + - reg + - interrupts + - '#mbox-cells' + +additionalProperties: false + +examples: + - | + soc { + #address-cells = <2>; + #size-cells = <2>; + + mhuA: mailbox@2b1f0000 { + #mbox-cells = <1>; + compatible = "arm,mhu", "arm,primecell"; + reg = <0 0x2b1f0000 0 0x1000>; + interrupts = <0 36 4>, /* LP-NonSecure */ + <0 35 4>, /* HP-NonSecure */ + <0 37 4>; /* Secure */ + clocks = <&clock 0 2 1>; + clock-names = "apb_pclk"; + }; + + mhu_client_scb: scb@2e000000 { + compatible = "fujitsu,mb86s70-scb-1.0"; + reg = <0 0x2e000000 0 0x4000>; + mboxes = <&mhuA 1>; /* HP-NonSecure */ + }; + }; diff --git a/Documentation/devicetree/bindings/mailbox/arm-mhu.txt b/Documentation/devicetree/bindings/mailbox/arm-mhu.txt deleted file mode 100644 index 4971f03f0b33..000000000000 --- a/Documentation/devicetree/bindings/mailbox/arm-mhu.txt +++ /dev/null @@ -1,43 +0,0 @@ -ARM MHU Mailbox Driver -====================== - -The ARM's Message-Handling-Unit (MHU) is a mailbox controller that has -3 independent channels/links to communicate with remote processor(s). - MHU links are hardwired on a platform. A link raises interrupt for any -received data. However, there is no specified way of knowing if the sent -data has been read by the remote. This driver assumes the sender polls -STAT register and the remote clears it after having read the data. -The last channel is specified to be a 'Secure' resource, hence can't be -used by Linux running NS. - -Mailbox Device Node: -==================== - -Required properties: --------------------- -- compatible: Shall be "arm,mhu" & "arm,primecell" -- reg: Contains the mailbox register address range (base - address and length) -- #mbox-cells Shall be 1 - the index of the channel needed. -- interrupts: Contains the interrupt information corresponding to - each of the 3 links of MHU. - -Example: --------- - - mhu: mailbox@2b1f0000 { - #mbox-cells = <1>; - compatible = "arm,mhu", "arm,primecell"; - reg = <0 0x2b1f0000 0x1000>; - interrupts = <0 36 4>, /* LP-NonSecure */ - <0 35 4>, /* HP-NonSecure */ - <0 37 4>; /* Secure */ - clocks = <&clock 0 2 1>; - clock-names = "apb_pclk"; - }; - - mhu_client: scb@2e000000 { - compatible = "fujitsu,mb86s70-scb-1.0"; - reg = <0 0x2e000000 0x4000>; - mboxes = <&mhu 1>; /* HP-NonSecure */ - }; From 471de2c027422b372b1bb2578aa3123ee56b7125 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 8 Oct 2020 20:14:50 +0100 Subject: [PATCH 129/168] dt-bindings: mailbox: add doorbell support to ARM MHU The ARM MHU's reference manual states following: "The MHU drives the signal using a 32-bit register, with all 32 bits logically ORed together. The MHU provides a set of registers to enable software to set, clear, and check the status of each of the bits of this register independently. The use of 32 bits for each interrupt line enables software to provide more information about the source of the interrupt. For example, each bit of the register can be associated with a type of event that can contribute to raising the interrupt." This patch thus extends the MHU controller's DT binding to add support for doorbell mode. Though the same MHU hardware controller is used in the two modes, A new compatible string is added here to represent the combination of the MHU hardware and the firmware sitting on the other side (which expects each bit to represent a different signal now). Reviewed-by: Rob Herring Acked-by: Arnd Bergmann Co-developed-by: Viresh Kumar Signed-off-by: Viresh Kumar Signed-off-by: Sudeep Holla Signed-off-by: Jassi Brar --- .../devicetree/bindings/mailbox/arm,mhu.yaml | 60 +++++++++++++++++-- 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/Documentation/devicetree/bindings/mailbox/arm,mhu.yaml b/Documentation/devicetree/bindings/mailbox/arm,mhu.yaml index 2c8df7979c22..d43791a2dde7 100644 --- a/Documentation/devicetree/bindings/mailbox/arm,mhu.yaml +++ b/Documentation/devicetree/bindings/mailbox/arm,mhu.yaml @@ -18,20 +18,40 @@ description: | remote clears it after having read the data. The last channel is specified to be a 'Secure' resource, hence can't be used by Linux running NS. + The MHU hardware also allows operations in doorbell mode. The MHU drives the + interrupt signal using a 32-bit register, with all 32-bits logically ORed + together. It provides a set of registers to enable software to set, clear and + check the status of each of the bits of this register independently. The use + of 32 bits per interrupt line enables software to provide more information + about the source of the interrupt. For example, each bit of the register can + be associated with a type of event that can contribute to raising the + interrupt. Each of the 32-bits can be used as "doorbell" to alert the remote + processor. + # We need a select here so we don't match all nodes with 'arm,primecell' select: properties: compatible: contains: - const: arm,mhu + enum: + - arm,mhu + - arm,mhu-doorbell required: - compatible properties: compatible: - items: - - const: arm,mhu - - const: arm,primecell + oneOf: + - description: Data transfer mode + items: + - const: arm,mhu + - const: arm,primecell + + - description: Doorbell mode + items: + - const: arm,mhu-doorbell + - const: arm,primecell + reg: maxItems: 1 @@ -51,8 +71,11 @@ properties: - const: apb_pclk '#mbox-cells': - description: Index of the channel. - const: 1 + description: | + Set to 1 in data transfer mode and represents index of the channel. + Set to 2 in doorbell mode and represents index of the channel and doorbell + number. + enum: [ 1, 2 ] required: - compatible @@ -63,6 +86,7 @@ required: additionalProperties: false examples: + # Data transfer mode. - | soc { #address-cells = <2>; @@ -85,3 +109,27 @@ examples: mboxes = <&mhuA 1>; /* HP-NonSecure */ }; }; + + # Doorbell mode. + - | + soc { + #address-cells = <2>; + #size-cells = <2>; + + mhuB: mailbox@2b2f0000 { + #mbox-cells = <2>; + compatible = "arm,mhu-doorbell", "arm,primecell"; + reg = <0 0x2b2f0000 0 0x1000>; + interrupts = <0 36 4>, /* LP-NonSecure */ + <0 35 4>, /* HP-NonSecure */ + <0 37 4>; /* Secure */ + clocks = <&clock 0 2 1>; + clock-names = "apb_pclk"; + }; + + mhu_client_scpi: scpi@2f000000 { + compatible = "arm,scpi"; + reg = <0 0x2f000000 0 0x200>; + mboxes = <&mhuB 1 4>; /* HP-NonSecure, 5th doorbell */ + }; + }; From ab99e237a48298c458d49b8d8bd850e1aa58e457 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 8 Oct 2020 20:14:51 +0100 Subject: [PATCH 130/168] mailbox: arm_mhu: Match only if compatible is "arm,mhu" Since we will be soon adding a separate driver based on this ARM MHU driver to support doorbell mode, let us add explicit check to match the default compatible for this driver. This is needed as the probe and match reuses the AMBA device ids currently and don't have any explicit compatible check. Signed-off-by: Sudeep Holla Signed-off-by: Jassi Brar --- drivers/mailbox/arm_mhu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/mailbox/arm_mhu.c b/drivers/mailbox/arm_mhu.c index 9da236552bd7..b7fbf276eb62 100644 --- a/drivers/mailbox/arm_mhu.c +++ b/drivers/mailbox/arm_mhu.c @@ -113,6 +113,9 @@ static int mhu_probe(struct amba_device *adev, const struct amba_id *id) struct device *dev = &adev->dev; int mhu_reg[MHU_CHANS] = {MHU_LP_OFFSET, MHU_HP_OFFSET, MHU_SEC_OFFSET}; + if (!of_device_is_compatible(dev->of_node, "arm,mhu")) + return -ENODEV; + /* Allocate memory for device */ mhu = devm_kzalloc(dev, sizeof(*mhu), GFP_KERNEL); if (!mhu) From 7002ca237b216d651d032439c45dc664f1f955bb Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 8 Oct 2020 20:14:52 +0100 Subject: [PATCH 131/168] mailbox: arm_mhu: Add ARM MHU doorbell driver The MHU drives the signal using a 32-bit register, with all 32 bits logically ORed together. The MHU provides a set of registers to enable software to set, clear, and check the status of each of the bits of this register independently. The use of 32 bits for each interrupt line enables software to provide more information about the source of the interrupt. For example, each bit of the register can be associated with a type of event that can contribute to raising the interrupt. This patch adds a separate the MHU controller driver for doorbel mode of operation using the extended DT binding to add support the same. Signed-off-by: Sudeep Holla Signed-off-by: Jassi Brar --- drivers/mailbox/Makefile | 2 +- drivers/mailbox/arm_mhu_db.c | 354 +++++++++++++++++++++++++++++++++++ 2 files changed, 355 insertions(+), 1 deletion(-) create mode 100644 drivers/mailbox/arm_mhu_db.c diff --git a/drivers/mailbox/Makefile b/drivers/mailbox/Makefile index 60d224b723a1..2e06e02b2e03 100644 --- a/drivers/mailbox/Makefile +++ b/drivers/mailbox/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_MAILBOX) += mailbox.o obj-$(CONFIG_MAILBOX_TEST) += mailbox-test.o -obj-$(CONFIG_ARM_MHU) += arm_mhu.o +obj-$(CONFIG_ARM_MHU) += arm_mhu.o arm_mhu_db.o obj-$(CONFIG_IMX_MBOX) += imx-mailbox.o diff --git a/drivers/mailbox/arm_mhu_db.c b/drivers/mailbox/arm_mhu_db.c new file mode 100644 index 000000000000..275efe4cca0c --- /dev/null +++ b/drivers/mailbox/arm_mhu_db.c @@ -0,0 +1,354 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2013-2015 Fujitsu Semiconductor Ltd. + * Copyright (C) 2015 Linaro Ltd. + * Based on ARM MHU driver by Jassi Brar + * Copyright (C) 2020 ARM Ltd. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define INTR_STAT_OFS 0x0 +#define INTR_SET_OFS 0x8 +#define INTR_CLR_OFS 0x10 + +#define MHU_LP_OFFSET 0x0 +#define MHU_HP_OFFSET 0x20 +#define MHU_SEC_OFFSET 0x200 +#define TX_REG_OFFSET 0x100 + +#define MHU_CHANS 3 /* Secure, Non-Secure High and Low Priority */ +#define MHU_CHAN_MAX 20 /* Max channels to save on unused RAM */ +#define MHU_NUM_DOORBELLS 32 + +struct mhu_db_link { + unsigned int irq; + void __iomem *tx_reg; + void __iomem *rx_reg; +}; + +struct arm_mhu { + void __iomem *base; + struct mhu_db_link mlink[MHU_CHANS]; + struct mbox_controller mbox; + struct device *dev; +}; + +/** + * ARM MHU Mailbox allocated channel information + * + * @mhu: Pointer to parent mailbox device + * @pchan: Physical channel within which this doorbell resides in + * @doorbell: doorbell number pertaining to this channel + */ +struct mhu_db_channel { + struct arm_mhu *mhu; + unsigned int pchan; + unsigned int doorbell; +}; + +static inline struct mbox_chan * +mhu_db_mbox_to_channel(struct mbox_controller *mbox, unsigned int pchan, + unsigned int doorbell) +{ + int i; + struct mhu_db_channel *chan_info; + + for (i = 0; i < mbox->num_chans; i++) { + chan_info = mbox->chans[i].con_priv; + if (chan_info && chan_info->pchan == pchan && + chan_info->doorbell == doorbell) + return &mbox->chans[i]; + } + + return NULL; +} + +static void mhu_db_mbox_clear_irq(struct mbox_chan *chan) +{ + struct mhu_db_channel *chan_info = chan->con_priv; + void __iomem *base = chan_info->mhu->mlink[chan_info->pchan].rx_reg; + + writel_relaxed(BIT(chan_info->doorbell), base + INTR_CLR_OFS); +} + +static unsigned int mhu_db_mbox_irq_to_pchan_num(struct arm_mhu *mhu, int irq) +{ + unsigned int pchan; + + for (pchan = 0; pchan < MHU_CHANS; pchan++) + if (mhu->mlink[pchan].irq == irq) + break; + return pchan; +} + +static struct mbox_chan * +mhu_db_mbox_irq_to_channel(struct arm_mhu *mhu, unsigned int pchan) +{ + unsigned long bits; + unsigned int doorbell; + struct mbox_chan *chan = NULL; + struct mbox_controller *mbox = &mhu->mbox; + void __iomem *base = mhu->mlink[pchan].rx_reg; + + bits = readl_relaxed(base + INTR_STAT_OFS); + if (!bits) + /* No IRQs fired in specified physical channel */ + return NULL; + + /* An IRQ has fired, find the associated channel */ + for (doorbell = 0; bits; doorbell++) { + if (!test_and_clear_bit(doorbell, &bits)) + continue; + + chan = mhu_db_mbox_to_channel(mbox, pchan, doorbell); + if (chan) + break; + dev_err(mbox->dev, + "Channel not registered: pchan: %d doorbell: %d\n", + pchan, doorbell); + } + + return chan; +} + +static irqreturn_t mhu_db_mbox_rx_handler(int irq, void *data) +{ + struct mbox_chan *chan; + struct arm_mhu *mhu = data; + unsigned int pchan = mhu_db_mbox_irq_to_pchan_num(mhu, irq); + + while (NULL != (chan = mhu_db_mbox_irq_to_channel(mhu, pchan))) { + mbox_chan_received_data(chan, NULL); + mhu_db_mbox_clear_irq(chan); + } + + return IRQ_HANDLED; +} + +static bool mhu_db_last_tx_done(struct mbox_chan *chan) +{ + struct mhu_db_channel *chan_info = chan->con_priv; + void __iomem *base = chan_info->mhu->mlink[chan_info->pchan].tx_reg; + + if (readl_relaxed(base + INTR_STAT_OFS) & BIT(chan_info->doorbell)) + return false; + + return true; +} + +static int mhu_db_send_data(struct mbox_chan *chan, void *data) +{ + struct mhu_db_channel *chan_info = chan->con_priv; + void __iomem *base = chan_info->mhu->mlink[chan_info->pchan].tx_reg; + + /* Send event to co-processor */ + writel_relaxed(BIT(chan_info->doorbell), base + INTR_SET_OFS); + + return 0; +} + +static int mhu_db_startup(struct mbox_chan *chan) +{ + mhu_db_mbox_clear_irq(chan); + return 0; +} + +static void mhu_db_shutdown(struct mbox_chan *chan) +{ + struct mhu_db_channel *chan_info = chan->con_priv; + struct mbox_controller *mbox = &chan_info->mhu->mbox; + int i; + + for (i = 0; i < mbox->num_chans; i++) + if (chan == &mbox->chans[i]) + break; + + if (mbox->num_chans == i) { + dev_warn(mbox->dev, "Request to free non-existent channel\n"); + return; + } + + /* Reset channel */ + mhu_db_mbox_clear_irq(chan); + kfree(chan->con_priv); + chan->con_priv = NULL; +} + +static struct mbox_chan *mhu_db_mbox_xlate(struct mbox_controller *mbox, + const struct of_phandle_args *spec) +{ + struct arm_mhu *mhu = dev_get_drvdata(mbox->dev); + struct mhu_db_channel *chan_info; + struct mbox_chan *chan; + unsigned int pchan = spec->args[0]; + unsigned int doorbell = spec->args[1]; + int i; + + /* Bounds checking */ + if (pchan >= MHU_CHANS || doorbell >= MHU_NUM_DOORBELLS) { + dev_err(mbox->dev, + "Invalid channel requested pchan: %d doorbell: %d\n", + pchan, doorbell); + return ERR_PTR(-EINVAL); + } + + /* Is requested channel free? */ + chan = mhu_db_mbox_to_channel(mbox, pchan, doorbell); + if (chan) { + dev_err(mbox->dev, "Channel in use: pchan: %d doorbell: %d\n", + pchan, doorbell); + return ERR_PTR(-EBUSY); + } + + /* Find the first free slot */ + for (i = 0; i < mbox->num_chans; i++) + if (!mbox->chans[i].con_priv) + break; + + if (mbox->num_chans == i) { + dev_err(mbox->dev, "No free channels left\n"); + return ERR_PTR(-EBUSY); + } + + chan = &mbox->chans[i]; + + chan_info = devm_kzalloc(mbox->dev, sizeof(*chan_info), GFP_KERNEL); + if (!chan_info) + return ERR_PTR(-ENOMEM); + + chan_info->mhu = mhu; + chan_info->pchan = pchan; + chan_info->doorbell = doorbell; + + chan->con_priv = chan_info; + + dev_dbg(mbox->dev, "mbox: created channel phys: %d doorbell: %d\n", + pchan, doorbell); + + return chan; +} + +static const struct mbox_chan_ops mhu_db_ops = { + .send_data = mhu_db_send_data, + .startup = mhu_db_startup, + .shutdown = mhu_db_shutdown, + .last_tx_done = mhu_db_last_tx_done, +}; + +static int mhu_db_probe(struct amba_device *adev, const struct amba_id *id) +{ + u32 cell_count; + int i, err, max_chans; + struct arm_mhu *mhu; + struct mbox_chan *chans; + struct device *dev = &adev->dev; + struct device_node *np = dev->of_node; + int mhu_reg[MHU_CHANS] = { + MHU_LP_OFFSET, MHU_HP_OFFSET, MHU_SEC_OFFSET, + }; + + if (!of_device_is_compatible(np, "arm,mhu-doorbell")) + return -ENODEV; + + err = of_property_read_u32(np, "#mbox-cells", &cell_count); + if (err) { + dev_err(dev, "failed to read #mbox-cells in '%pOF'\n", np); + return err; + } + + if (cell_count == 2) { + max_chans = MHU_CHAN_MAX; + } else { + dev_err(dev, "incorrect value of #mbox-cells in '%pOF'\n", np); + return -EINVAL; + } + + mhu = devm_kzalloc(dev, sizeof(*mhu), GFP_KERNEL); + if (!mhu) + return -ENOMEM; + + mhu->base = devm_ioremap_resource(dev, &adev->res); + if (IS_ERR(mhu->base)) { + dev_err(dev, "ioremap failed\n"); + return PTR_ERR(mhu->base); + } + + chans = devm_kcalloc(dev, max_chans, sizeof(*chans), GFP_KERNEL); + if (!chans) + return -ENOMEM; + + mhu->dev = dev; + mhu->mbox.dev = dev; + mhu->mbox.chans = chans; + mhu->mbox.num_chans = max_chans; + mhu->mbox.txdone_irq = false; + mhu->mbox.txdone_poll = true; + mhu->mbox.txpoll_period = 1; + + mhu->mbox.of_xlate = mhu_db_mbox_xlate; + amba_set_drvdata(adev, mhu); + + mhu->mbox.ops = &mhu_db_ops; + + err = devm_mbox_controller_register(dev, &mhu->mbox); + if (err) { + dev_err(dev, "Failed to register mailboxes %d\n", err); + return err; + } + + for (i = 0; i < MHU_CHANS; i++) { + int irq = mhu->mlink[i].irq = adev->irq[i]; + + if (irq <= 0) { + dev_dbg(dev, "No IRQ found for Channel %d\n", i); + continue; + } + + mhu->mlink[i].rx_reg = mhu->base + mhu_reg[i]; + mhu->mlink[i].tx_reg = mhu->mlink[i].rx_reg + TX_REG_OFFSET; + + err = devm_request_threaded_irq(dev, irq, NULL, + mhu_db_mbox_rx_handler, + IRQF_ONESHOT, "mhu_db_link", mhu); + if (err) { + dev_err(dev, "Can't claim IRQ %d\n", irq); + mbox_controller_unregister(&mhu->mbox); + return err; + } + } + + dev_info(dev, "ARM MHU Doorbell mailbox registered\n"); + return 0; +} + +static struct amba_id mhu_ids[] = { + { + .id = 0x1bb098, + .mask = 0xffffff, + }, + { 0, 0 }, +}; +MODULE_DEVICE_TABLE(amba, mhu_ids); + +static struct amba_driver arm_mhu_db_driver = { + .drv = { + .name = "mhu-doorbell", + }, + .id_table = mhu_ids, + .probe = mhu_db_probe, +}; +module_amba_driver(arm_mhu_db_driver); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("ARM MHU Doorbell Driver"); +MODULE_AUTHOR("Sudeep Holla "); From 558e4c36ec9f2722af4fe8ef84dc812bcdb5c43a Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 27 Aug 2020 09:31:28 +0200 Subject: [PATCH 132/168] maiblox: mediatek: Fix handling of platform_get_irq() error platform_get_irq() returns -ERRNO on error. In such case casting to u32 and comparing to 0 would pass the check. Fixes: 623a6143a845 ("mailbox: mediatek: Add Mediatek CMDQ driver") Signed-off-by: Krzysztof Kozlowski Signed-off-by: Jassi Brar --- drivers/mailbox/mtk-cmdq-mailbox.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/mailbox/mtk-cmdq-mailbox.c b/drivers/mailbox/mtk-cmdq-mailbox.c index 484d4438cd83..5665b6ea8119 100644 --- a/drivers/mailbox/mtk-cmdq-mailbox.c +++ b/drivers/mailbox/mtk-cmdq-mailbox.c @@ -69,7 +69,7 @@ struct cmdq_task { struct cmdq { struct mbox_controller mbox; void __iomem *base; - u32 irq; + int irq; u32 thread_nr; u32 irq_mask; struct cmdq_thread *thread; @@ -525,10 +525,8 @@ static int cmdq_probe(struct platform_device *pdev) } cmdq->irq = platform_get_irq(pdev, 0); - if (!cmdq->irq) { - dev_err(dev, "failed to get irq\n"); - return -EINVAL; - } + if (cmdq->irq < 0) + return cmdq->irq; plat_data = (struct gce_plat *)of_device_get_match_data(dev); if (!plat_data) { From d2585f5164c298aaaed14c2c8d313cbe7bd5b253 Mon Sep 17 00:00:00 2001 From: Vitor Massaru Iha Date: Wed, 29 Jul 2020 14:58:49 -0300 Subject: [PATCH 133/168] lib: kunit: add bitfield test conversion to KUnit This adds the conversion of the runtime tests of test_bitfield, from `lib/test_bitfield.c` to KUnit tests. Code Style Documentation: [0] Signed-off-by: Vitor Massaru Iha Link: [0] https://lore.kernel.org/linux-kselftest/20200620054944.167330-1-davidgow@google.com/T/#u Reviewed-by: Brendan Higgins Signed-off-by: Shuah Khan --- lib/Kconfig.debug | 23 ++++-- lib/Makefile | 2 +- lib/{test_bitfield.c => bitfield_kunit.c} | 96 ++++++++++------------- 3 files changed, 59 insertions(+), 62 deletions(-) rename lib/{test_bitfield.c => bitfield_kunit.c} (66%) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e068c3c7189a..4f09c6505a2e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2037,13 +2037,6 @@ config TEST_BITMAP If unsure, say N. -config TEST_BITFIELD - tristate "Test bitfield functions at runtime" - help - Enable this option to test the bitfield functions at boot. - - If unsure, say N. - config TEST_UUID tristate "Test functions located in the uuid module at runtime" @@ -2193,6 +2186,22 @@ config TEST_SYSCTL If unsure, say N. +config BITFIELD_KUNIT + tristate "KUnit test bitfield functions at runtime" + depends on KUNIT + help + Enable this option to test the bitfield functions at boot. + + KUnit tests run during boot and output the results to the debug log + in TAP format (http://testanything.org/). Only useful for kernel devs + running the KUnit test harness, and not intended for inclusion into a + production build. + + For more information on KUnit and unit tests in general please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N. + config SYSCTL_KUNIT_TEST tristate "KUnit test for sysctl" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/Makefile b/lib/Makefile index e290fc5707ea..d862d41fdc3d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -80,7 +80,6 @@ obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o obj-$(CONFIG_TEST_PRINTF) += test_printf.o obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o obj-$(CONFIG_TEST_STRSCPY) += test_strscpy.o -obj-$(CONFIG_TEST_BITFIELD) += test_bitfield.o obj-$(CONFIG_TEST_UUID) += test_uuid.o obj-$(CONFIG_TEST_XARRAY) += test_xarray.o obj-$(CONFIG_TEST_PARMAN) += test_parman.o @@ -340,6 +339,7 @@ obj-$(CONFIG_OBJAGG) += objagg.o obj-$(CONFIG_PLDMFW) += pldmfw/ # KUnit tests +obj-$(CONFIG_BITFIELD_KUNIT) += bitfield_kunit.o obj-$(CONFIG_LIST_KUNIT_TEST) += list-test.o obj-$(CONFIG_LINEAR_RANGES_TEST) += test_linear_ranges.o obj-$(CONFIG_BITS_TEST) += test_bits.o diff --git a/lib/test_bitfield.c b/lib/bitfield_kunit.c similarity index 66% rename from lib/test_bitfield.c rename to lib/bitfield_kunit.c index 5b8f4108662d..d63a2be5aff8 100644 --- a/lib/test_bitfield.c +++ b/lib/bitfield_kunit.c @@ -5,8 +5,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include -#include +#include #include #define CHECK_ENC_GET_U(tp, v, field, res) do { \ @@ -14,13 +13,11 @@ u##tp _res; \ \ _res = u##tp##_encode_bits(v, field); \ - if (_res != res) { \ - pr_warn("u" #tp "_encode_bits(" #v ", " #field ") is 0x%llx != " #res "\n",\ - (u64)_res); \ - return -EINVAL; \ - } \ - if (u##tp##_get_bits(_res, field) != v) \ - return -EINVAL; \ + KUNIT_ASSERT_FALSE_MSG(context, _res != res, \ + "u" #tp "_encode_bits(" #v ", " #field ") is 0x%llx != " #res "\n", \ + (u64)_res); \ + KUNIT_ASSERT_FALSE(context, \ + u##tp##_get_bits(_res, field) != v); \ } \ } while (0) @@ -29,14 +26,13 @@ __le##tp _res; \ \ _res = le##tp##_encode_bits(v, field); \ - if (_res != cpu_to_le##tp(res)) { \ - pr_warn("le" #tp "_encode_bits(" #v ", " #field ") is 0x%llx != 0x%llx\n",\ - (u64)le##tp##_to_cpu(_res), \ - (u64)(res)); \ - return -EINVAL; \ - } \ - if (le##tp##_get_bits(_res, field) != v) \ - return -EINVAL; \ + KUNIT_ASSERT_FALSE_MSG(context, \ + _res != cpu_to_le##tp(res), \ + "le" #tp "_encode_bits(" #v ", " #field ") is 0x%llx != 0x%llx",\ + (u64)le##tp##_to_cpu(_res), \ + (u64)(res)); \ + KUNIT_ASSERT_FALSE(context, \ + le##tp##_get_bits(_res, field) != v);\ } \ } while (0) @@ -45,14 +41,13 @@ __be##tp _res; \ \ _res = be##tp##_encode_bits(v, field); \ - if (_res != cpu_to_be##tp(res)) { \ - pr_warn("be" #tp "_encode_bits(" #v ", " #field ") is 0x%llx != 0x%llx\n",\ - (u64)be##tp##_to_cpu(_res), \ - (u64)(res)); \ - return -EINVAL; \ - } \ - if (be##tp##_get_bits(_res, field) != v) \ - return -EINVAL; \ + KUNIT_ASSERT_FALSE_MSG(context, \ + _res != cpu_to_be##tp(res), \ + "be" #tp "_encode_bits(" #v ", " #field ") is 0x%llx != 0x%llx", \ + (u64)be##tp##_to_cpu(_res), \ + (u64)(res)); \ + KUNIT_ASSERT_FALSE(context, \ + be##tp##_get_bits(_res, field) != v);\ } \ } while (0) @@ -62,7 +57,7 @@ CHECK_ENC_GET_BE(tp, v, field, res); \ } while (0) -static int test_constants(void) +static void __init test_bitfields_constants(struct kunit *context) { /* * NOTE @@ -95,19 +90,17 @@ static int test_constants(void) CHECK_ENC_GET(64, 7, 0x00f0000000000000ull, 0x0070000000000000ull); CHECK_ENC_GET(64, 14, 0x0f00000000000000ull, 0x0e00000000000000ull); CHECK_ENC_GET(64, 15, 0xf000000000000000ull, 0xf000000000000000ull); - - return 0; } #define CHECK(tp, mask) do { \ u64 v; \ \ for (v = 0; v < 1 << hweight32(mask); v++) \ - if (tp##_encode_bits(v, mask) != v << __ffs64(mask)) \ - return -EINVAL; \ + KUNIT_ASSERT_FALSE(context, \ + tp##_encode_bits(v, mask) != v << __ffs64(mask));\ } while (0) -static int test_variables(void) +static void __init test_bitfields_variables(struct kunit *context) { CHECK(u8, 0x0f); CHECK(u8, 0xf0); @@ -130,39 +123,34 @@ static int test_variables(void) CHECK(u64, 0x000000007f000000ull); CHECK(u64, 0x0000000018000000ull); CHECK(u64, 0x0000001f8000000ull); - - return 0; } -static int __init test_bitfields(void) + +static void __init test_bitfields_compile(struct kunit *context) { - int ret = test_constants(); - - if (ret) { - pr_warn("constant tests failed!\n"); - return ret; - } - - ret = test_variables(); - if (ret) { - pr_warn("variable tests failed!\n"); - return ret; - } - -#ifdef TEST_BITFIELD_COMPILE /* these should fail compilation */ CHECK_ENC_GET(16, 16, 0x0f00, 0x1000); u32_encode_bits(7, 0x06000000); /* this should at least give a warning */ u16_encode_bits(0, 0x60000); -#endif - - pr_info("tests passed\n"); - - return 0; } -module_init(test_bitfields) + +static struct kunit_case __refdata bitfields_test_cases[] = { + KUNIT_CASE(test_bitfields_constants), + KUNIT_CASE(test_bitfields_variables), +#ifdef TEST_BITFIELD_COMPILE + KUNIT_CASE(test_bitfields_compile), +#endif + {} +}; + +static struct kunit_suite bitfields_test_suite = { + .name = "bitfields", + .test_cases = bitfields_test_cases, +}; + +kunit_test_suites(&bitfields_test_suite); MODULE_AUTHOR("Johannes Berg "); MODULE_LICENSE("GPL"); From 82c2d81361ecd142a54e84a9da1e287113314a4f Mon Sep 17 00:00:00 2001 From: Sumera Priyadarsini Date: Thu, 15 Oct 2020 16:21:40 +0530 Subject: [PATCH 134/168] coccinelle: iterators: Add for_each_child.cocci script While iterating over child nodes with the for_each functions, if control is transferred from the middle of the loop, as in the case of a break or return or goto, there is no decrement in the reference counter thus ultimately resulting in a memory leak. Add this script to detect potential memory leaks caused by the absence of of_node_put() before break, goto, or, return statements which transfer control outside the loop. Signed-off-by: Sumera Priyadarsini Signed-off-by: Julia Lawall --- .../coccinelle/iterators/for_each_child.cocci | 358 ++++++++++++++++++ 1 file changed, 358 insertions(+) create mode 100644 scripts/coccinelle/iterators/for_each_child.cocci diff --git a/scripts/coccinelle/iterators/for_each_child.cocci b/scripts/coccinelle/iterators/for_each_child.cocci new file mode 100644 index 000000000000..bc394615948e --- /dev/null +++ b/scripts/coccinelle/iterators/for_each_child.cocci @@ -0,0 +1,358 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Adds missing of_node_put() before return/break/goto statement within a for_each iterator for child nodes. +//# False positives can be due to function calls within the for_each +//# loop that may encapsulate an of_node_put. +/// +// Confidence: High +// Copyright: (C) 2020 Sumera Priyadarsini +// URL: http://coccinelle.lip6.fr +// Options: --no-includes --include-headers + +virtual patch +virtual context +virtual org +virtual report + +@r@ +local idexpression n; +expression e1,e2; +iterator name for_each_node_by_name, for_each_node_by_type, +for_each_compatible_node, for_each_matching_node, +for_each_matching_node_and_match, for_each_child_of_node, +for_each_available_child_of_node, for_each_node_with_property; +iterator i; +statement S; +expression list [n1] es; +@@ + +( +( +for_each_node_by_name(n,e1) S +| +for_each_node_by_type(n,e1) S +| +for_each_compatible_node(n,e1,e2) S +| +for_each_matching_node(n,e1) S +| +for_each_matching_node_and_match(n,e1,e2) S +| +for_each_child_of_node(e1,n) S +| +for_each_available_child_of_node(e1,n) S +| +for_each_node_with_property(n,e1) S +) +& +i(es,n,...) S +) + +@ruleone depends on patch && !context && !org && !report@ + +local idexpression r.n; +iterator r.i,i1; +expression e; +expression list [r.n1] es; +statement S; +@@ + + i(es,n,...) { + ... +( + of_node_put(n); +| + e = n +| + return n; +| + i1(...,n,...) S +| +- return of_node_get(n); ++ return n; +| ++ of_node_put(n); +? return ...; +) + ... when any + } + +@ruletwo depends on patch && !context && !org && !report@ + +local idexpression r.n; +iterator r.i,i1,i2; +expression e,e1; +expression list [r.n1] es; +statement S,S2; +@@ + + i(es,n,...) { + ... +( + of_node_put(n); +| + e = n +| + i1(...,n,...) S +| ++ of_node_put(n); +? break; +) + ... when any + } +... when != n + when strict + when forall +( + n = e1; +| +?i2(...,n,...) S2 +) + +@rulethree depends on patch && !context && !org && !report exists@ + +local idexpression r.n; +iterator r.i,i1,i2; +expression e,e1; +identifier l; +expression list [r.n1] es; +statement S,S2; +@@ + + i(es,n,...) { + ... +( + of_node_put(n); +| + e = n +| + i1(...,n,...) S +| ++ of_node_put(n); +? goto l; +) + ... when any + } +... when exists +l: ... when != n + when strict + when forall +( + n = e1; +| +?i2(...,n,...) S2 +) + +// ---------------------------------------------------------------------------- + +@ruleone_context depends on !patch && (context || org || report) exists@ +statement S; +expression e; +expression list[r.n1] es; +iterator r.i, i1; +local idexpression r.n; +position j0, j1; +@@ + + i@j0(es,n,...) { + ... +( + of_node_put(n); +| + e = n +| + return n; +| + i1(...,n,...) S +| + return @j1 ...; +) + ... when any + } + +@ruleone_disj depends on !patch && (context || org || report)@ +expression list[r.n1] es; +iterator r.i; +local idexpression r.n; +position ruleone_context.j0, ruleone_context.j1; +@@ + +* i@j0(es,n,...) { + ... +*return @j1...; + ... when any + } + +@ruletwo_context depends on !patch && (context || org || report) exists@ +statement S, S2; +expression e, e1; +expression list[r.n1] es; +iterator r.i, i1, i2; +local idexpression r.n; +position j0, j2; +@@ + + i@j0(es,n,...) { + ... +( + of_node_put(n); +| + e = n +| + i1(...,n,...) S +| + break@j2; +) + ... when any + } +... when != n + when strict + when forall +( + n = e1; +| +?i2(...,n,...) S2 +) + +@ruletwo_disj depends on !patch && (context || org || report)@ +statement S2; +expression e1; +expression list[r.n1] es; +iterator r.i, i2; +local idexpression r.n; +position ruletwo_context.j0, ruletwo_context.j2; +@@ + +* i@j0(es,n,...) { + ... +*break @j2; + ... when any + } +... when != n + when strict + when forall +( + n = e1; +| +?i2(...,n,...) S2 +) + +@rulethree_context depends on !patch && (context || org || report) exists@ +identifier l; +statement S,S2; +expression e, e1; +expression list[r.n1] es; +iterator r.i, i1, i2; +local idexpression r.n; +position j0, j3; +@@ + + i@j0(es,n,...) { + ... +( + of_node_put(n); +| + e = n +| + i1(...,n,...) S +| + goto l@j3; +) + ... when any + } +... when exists +l: +... when != n + when strict + when forall +( + n = e1; +| +?i2(...,n,...) S2 +) + +@rulethree_disj depends on !patch && (context || org || report) exists@ +identifier l; +statement S2; +expression e1; +expression list[r.n1] es; +iterator r.i, i2; +local idexpression r.n; +position rulethree_context.j0, rulethree_context.j3; +@@ + +* i@j0(es,n,...) { + ... +*goto l@j3; + ... when any + } +... when exists + l: + ... when != n + when strict + when forall +( + n = e1; +| +?i2(...,n,...) S2 +) + +// ---------------------------------------------------------------------------- + +@script:python ruleone_org depends on org@ +i << r.i; +j0 << ruleone_context.j0; +j1 << ruleone_context. j1; +@@ + +msg = "WARNING: Function \"%s\" should have of_node_put() before return " % (i) +coccilib.org.print_safe_todo(j0[0], msg) +coccilib.org.print_link(j1[0], "") + +@script:python ruletwo_org depends on org@ +i << r.i; +j0 << ruletwo_context.j0; +j2 << ruletwo_context.j2; +@@ + +msg = "WARNING: Function \"%s\" should have of_node_put() before break " % (i) +coccilib.org.print_safe_todo(j0[0], msg) +coccilib.org.print_link(j2[0], "") + +@script:python rulethree_org depends on org@ +i << r.i; +j0 << rulethree_context.j0; +j3 << rulethree_context.j3; +@@ + +msg = "WARNING: Function \"%s\" should have of_node_put() before goto " % (i) +coccilib.org.print_safe_todo(j0[0], msg) +coccilib.org.print_link(j3[0], "") + +// ---------------------------------------------------------------------------- + +@script:python ruleone_report depends on report@ +i << r.i; +j0 << ruleone_context.j0; +j1 << ruleone_context.j1; +@@ + +msg = "WARNING: Function \"%s\" should have of_node_put() before return around line %s." % (i, j1[0].line) +coccilib.report.print_report(j0[0], msg) + +@script:python ruletwo_report depends on report@ +i << r.i; +j0 << ruletwo_context.j0; +j2 << ruletwo_context.j2; +@@ + +msg = "WARNING: Function \"%s\" should have of_node_put() before break around line %s." % (i,j2[0].line) +coccilib.report.print_report(j0[0], msg) + +@script:python rulethree_report depends on report@ +i << r.i; +j0 << rulethree_context.j0; +j3 << rulethree_context.j3; +@@ + +msg = "WARNING: Function \"%s\" should have of_node_put() before goto around lines %s." % (i,j3[0].line) +coccilib.report.print_report(j0[0], msg) From 294a7f1613ee49a608361bd319519561c0ca7e72 Mon Sep 17 00:00:00 2001 From: Vitor Massaru Iha Date: Thu, 15 Oct 2020 09:08:51 -0300 Subject: [PATCH 135/168] lib: kunit: Fix compilation test when using TEST_BIT_FIELD_COMPILE A build condition was missing around a compilation test, this compilation test comes from the original test_bitfield code. And removed unnecessary code for this test. Fixes: d2585f5164c2 ("lib: kunit: add bitfield test conversion to KUnit") Reported-by: Stephen Rothwell Signed-off-by: Vitor Massaru Iha Link: https://lore.kernel.org/linux-next/20201015163056.56fcc835@canb.auug.org.au/ Reviewed-by: Brendan Higgins Signed-off-by: Shuah Khan --- lib/bitfield_kunit.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/bitfield_kunit.c b/lib/bitfield_kunit.c index d63a2be5aff8..1473d8b4bf0f 100644 --- a/lib/bitfield_kunit.c +++ b/lib/bitfield_kunit.c @@ -125,7 +125,7 @@ static void __init test_bitfields_variables(struct kunit *context) CHECK(u64, 0x0000001f8000000ull); } - +#ifdef TEST_BITFIELD_COMPILE static void __init test_bitfields_compile(struct kunit *context) { /* these should fail compilation */ @@ -135,13 +135,11 @@ static void __init test_bitfields_compile(struct kunit *context) /* this should at least give a warning */ u16_encode_bits(0, 0x60000); } +#endif static struct kunit_case __refdata bitfields_test_cases[] = { KUNIT_CASE(test_bitfields_constants), KUNIT_CASE(test_bitfields_variables), -#ifdef TEST_BITFIELD_COMPILE - KUNIT_CASE(test_bitfields_compile), -#endif {} }; From c7dacf5b0f32957b24ef29df1207dc2cd8307743 Mon Sep 17 00:00:00 2001 From: Jassi Brar Date: Fri, 16 Oct 2020 12:20:56 -0500 Subject: [PATCH 136/168] mailbox: avoid timer start from callback If the txdone is done by polling, it is possible for msg_submit() to start the timer while txdone_hrtimer() callback is running. If the timer needs recheduling, it could already be enqueued by the time hrtimer_forward_now() is called, leading hrtimer to loudly complain. WARNING: CPU: 3 PID: 74 at kernel/time/hrtimer.c:932 hrtimer_forward+0xc4/0x110 CPU: 3 PID: 74 Comm: kworker/u8:1 Not tainted 5.9.0-rc2-00236-gd3520067d01c-dirty #5 Hardware name: Libre Computer AML-S805X-AC (DT) Workqueue: events_freezable_power_ thermal_zone_device_check pstate: 20000085 (nzCv daIf -PAN -UAO BTYPE=--) pc : hrtimer_forward+0xc4/0x110 lr : txdone_hrtimer+0xf8/0x118 [...] This can be fixed by not starting the timer from the callback path. Which requires the timer reloading as long as any message is queued on the channel, and not just when current tx is not done yet. Fixes: 0cc67945ea59 ("mailbox: switch to hrtimer for tx_complete polling") Reported-by: Da Xue Reviewed-by: Sudeep Holla Tested-by: Sudeep Holla Acked-by: Jerome Brunet Tested-by: Jerome Brunet Signed-off-by: Jassi Brar --- drivers/mailbox/mailbox.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c index 0b821a5b2db8..3e7d4b20ab34 100644 --- a/drivers/mailbox/mailbox.c +++ b/drivers/mailbox/mailbox.c @@ -82,9 +82,12 @@ static void msg_submit(struct mbox_chan *chan) exit: spin_unlock_irqrestore(&chan->lock, flags); - if (!err && (chan->txdone_method & TXDONE_BY_POLL)) - /* kick start the timer immediately to avoid delays */ - hrtimer_start(&chan->mbox->poll_hrt, 0, HRTIMER_MODE_REL); + /* kick start the timer immediately to avoid delays */ + if (!err && (chan->txdone_method & TXDONE_BY_POLL)) { + /* but only if not already active */ + if (!hrtimer_active(&chan->mbox->poll_hrt)) + hrtimer_start(&chan->mbox->poll_hrt, 0, HRTIMER_MODE_REL); + } } static void tx_tick(struct mbox_chan *chan, int r) @@ -122,11 +125,10 @@ static enum hrtimer_restart txdone_hrtimer(struct hrtimer *hrtimer) struct mbox_chan *chan = &mbox->chans[i]; if (chan->active_req && chan->cl) { + resched = true; txdone = chan->mbox->ops->last_tx_done(chan); if (txdone) tx_tick(chan, 0); - else - resched = true; } } From edc05fe5559e9b79e64cfec3b960f6a913b73493 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 16 Oct 2020 11:54:42 +0300 Subject: [PATCH 137/168] coccinelle: api: add kfree_mismatch script Check that alloc and free types of functions match each other. Signed-off-by: Denis Efremov Signed-off-by: Julia Lawall --- scripts/coccinelle/api/kfree_mismatch.cocci | 228 ++++++++++++++++++++ 1 file changed, 228 insertions(+) create mode 100644 scripts/coccinelle/api/kfree_mismatch.cocci diff --git a/scripts/coccinelle/api/kfree_mismatch.cocci b/scripts/coccinelle/api/kfree_mismatch.cocci new file mode 100644 index 000000000000..d46a9b3eb7b3 --- /dev/null +++ b/scripts/coccinelle/api/kfree_mismatch.cocci @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0-only +/// +/// Check that kvmalloc'ed memory is freed by kfree functions, +/// vmalloc'ed by vfree functions and kvmalloc'ed by kvfree +/// functions. +/// +// Confidence: High +// Copyright: (C) 2020 Denis Efremov ISPRAS +// Options: --no-includes --include-headers +// + +virtual patch +virtual report +virtual org +virtual context + +@alloc@ +expression E, E1; +position kok, vok; +@@ + +( + if (...) { + ... + E = \(kmalloc\|kzalloc\|krealloc\|kcalloc\| + kmalloc_node\|kzalloc_node\|kmalloc_array\| + kmalloc_array_node\|kcalloc_node\)(...)@kok + ... + } else { + ... + E = \(vmalloc\|vzalloc\|vmalloc_user\|vmalloc_node\| + vzalloc_node\|vmalloc_exec\|vmalloc_32\| + vmalloc_32_user\|__vmalloc\|__vmalloc_node_range\| + __vmalloc_node\)(...)@vok + ... + } +| + E = \(kmalloc\|kzalloc\|krealloc\|kcalloc\|kmalloc_node\|kzalloc_node\| + kmalloc_array\|kmalloc_array_node\|kcalloc_node\)(...)@kok + ... when != E = E1 + when any + if (E == NULL) { + ... + E = \(vmalloc\|vzalloc\|vmalloc_user\|vmalloc_node\| + vzalloc_node\|vmalloc_exec\|vmalloc_32\| + vmalloc_32_user\|__vmalloc\|__vmalloc_node_range\| + __vmalloc_node\)(...)@vok + ... + } +) + +@free@ +expression E; +position fok; +@@ + + E = \(kvmalloc\|kvzalloc\|kvcalloc\|kvzalloc_node\|kvmalloc_node\| + kvmalloc_array\)(...) + ... + kvfree(E)@fok + +@vfree depends on !patch@ +expression E; +position a != alloc.kok; +position f != free.fok; +@@ + +* E = \(kmalloc\|kzalloc\|krealloc\|kcalloc\|kmalloc_node\| +* kzalloc_node\|kmalloc_array\|kmalloc_array_node\| +* kcalloc_node\)(...)@a + ... when != if (...) { ... E = \(vmalloc\|vzalloc\|vmalloc_user\|vmalloc_node\|vzalloc_node\|vmalloc_exec\|vmalloc_32\|vmalloc_32_user\|__vmalloc\|__vmalloc_node_range\|__vmalloc_node\)(...); ... } + when != is_vmalloc_addr(E) + when any +* \(vfree\|vfree_atomic\|kvfree\)(E)@f + +@depends on patch exists@ +expression E; +position a != alloc.kok; +position f != free.fok; +@@ + + E = \(kmalloc\|kzalloc\|krealloc\|kcalloc\|kmalloc_node\| + kzalloc_node\|kmalloc_array\|kmalloc_array_node\| + kcalloc_node\)(...)@a + ... when != if (...) { ... E = \(vmalloc\|vzalloc\|vmalloc_user\|vmalloc_node\|vzalloc_node\|vmalloc_exec\|vmalloc_32\|vmalloc_32_user\|__vmalloc\|__vmalloc_node_range\|__vmalloc_node\)(...); ... } + when != is_vmalloc_addr(E) + when any +- \(vfree\|vfree_atomic\|kvfree\)(E)@f ++ kfree(E) + +@kfree depends on !patch@ +expression E; +position a != alloc.vok; +position f != free.fok; +@@ + +* E = \(vmalloc\|vzalloc\|vmalloc_user\|vmalloc_node\|vzalloc_node\| +* vmalloc_exec\|vmalloc_32\|vmalloc_32_user\|__vmalloc\| +* __vmalloc_node_range\|__vmalloc_node\)(...)@a + ... when != is_vmalloc_addr(E) + when any +* \(kfree\|kfree_sensitive\|kvfree\)(E)@f + +@depends on patch exists@ +expression E; +position a != alloc.vok; +position f != free.fok; +@@ + + E = \(vmalloc\|vzalloc\|vmalloc_user\|vmalloc_node\|vzalloc_node\| + vmalloc_exec\|vmalloc_32\|vmalloc_32_user\|__vmalloc\| + __vmalloc_node_range\|__vmalloc_node\)(...)@a + ... when != is_vmalloc_addr(E) + when any +- \(kfree\|kvfree\)(E)@f ++ vfree(E) + +@kvfree depends on !patch@ +expression E; +position a, f; +@@ + +* E = \(kvmalloc\|kvzalloc\|kvcalloc\|kvzalloc_node\|kvmalloc_node\| +* kvmalloc_array\)(...)@a + ... when != is_vmalloc_addr(E) + when any +* \(kfree\|kfree_sensitive\|vfree\|vfree_atomic\)(E)@f + +@depends on patch exists@ +expression E; +@@ + + E = \(kvmalloc\|kvzalloc\|kvcalloc\|kvzalloc_node\|kvmalloc_node\| + kvmalloc_array\)(...) + ... when != is_vmalloc_addr(E) + when any +- \(kfree\|vfree\)(E) ++ kvfree(E) + +@kvfree_switch depends on !patch@ +expression alloc.E; +position f; +@@ + + ... when != is_vmalloc_addr(E) + when any +* \(kfree\|kfree_sensitive\|vfree\|vfree_atomic\)(E)@f + +@depends on patch exists@ +expression alloc.E; +position f; +@@ + + ... when != is_vmalloc_addr(E) + when any +( +- \(kfree\|vfree\)(E)@f ++ kvfree(E) +| +- kfree_sensitive(E)@f ++ kvfree_sensitive(E) +) + +@script: python depends on report@ +a << vfree.a; +f << vfree.f; +@@ + +msg = "WARNING kmalloc is used to allocate this memory at line %s" % (a[0].line) +coccilib.report.print_report(f[0], msg) + +@script: python depends on org@ +a << vfree.a; +f << vfree.f; +@@ + +msg = "WARNING kmalloc is used to allocate this memory at line %s" % (a[0].line) +coccilib.org.print_todo(f[0], msg) + +@script: python depends on report@ +a << kfree.a; +f << kfree.f; +@@ + +msg = "WARNING vmalloc is used to allocate this memory at line %s" % (a[0].line) +coccilib.report.print_report(f[0], msg) + +@script: python depends on org@ +a << kfree.a; +f << kfree.f; +@@ + +msg = "WARNING vmalloc is used to allocate this memory at line %s" % (a[0].line) +coccilib.org.print_todo(f[0], msg) + +@script: python depends on report@ +a << kvfree.a; +f << kvfree.f; +@@ + +msg = "WARNING kvmalloc is used to allocate this memory at line %s" % (a[0].line) +coccilib.report.print_report(f[0], msg) + +@script: python depends on org@ +a << kvfree.a; +f << kvfree.f; +@@ + +msg = "WARNING kvmalloc is used to allocate this memory at line %s" % (a[0].line) +coccilib.org.print_todo(f[0], msg) + +@script: python depends on report@ +ka << alloc.kok; +va << alloc.vok; +f << kvfree_switch.f; +@@ + +msg = "WARNING kmalloc (line %s) && vmalloc (line %s) are used to allocate this memory" % (ka[0].line, va[0].line) +coccilib.report.print_report(f[0], msg) + +@script: python depends on org@ +ka << alloc.kok; +va << alloc.vok; +f << kvfree_switch.f; +@@ + +msg = "WARNING kmalloc (line %s) && vmalloc (line %s) are used to allocate this memory" % (ka[0].line, va[0].line) +coccilib.org.print_todo(f[0], msg) From 7404840d87557c4092bf0272bce5e0354c774bf9 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 17 Oct 2020 16:13:37 -0700 Subject: [PATCH 138/168] ia64: fix build error with !COREDUMP Fix linkage error when CONFIG_BINFMT_ELF is selected but CONFIG_COREDUMP is not: ia64-linux-ld: arch/ia64/kernel/elfcore.o: in function `elf_core_write_extra_phdrs': elfcore.c:(.text+0x172): undefined reference to `dump_emit' ia64-linux-ld: arch/ia64/kernel/elfcore.o: in function `elf_core_write_extra_data': elfcore.c:(.text+0x2b2): undefined reference to `dump_emit' Fixes: 1fcccbac89f5 ("elf coredump: replace ELF_CORE_EXTRA_* macros by functions") Reported-by: kernel test robot Signed-off-by: Krzysztof Kozlowski Signed-off-by: Andrew Morton Cc: Tony Luck Cc: Fenghua Yu Cc: Link: https://lkml.kernel.org/r/20200819064146.12529-1-krzk@kernel.org Signed-off-by: Linus Torvalds --- arch/ia64/kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 81901c5e5426..c89bd5f8cbf8 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -40,7 +40,7 @@ obj-y += esi_stub.o # must be in kernel proper endif obj-$(CONFIG_INTEL_IOMMU) += pci-dma.o -obj-$(CONFIG_BINFMT_ELF) += elfcore.o +obj-$(CONFIG_ELF_CORE) += elfcore.o # fp_emulate() expects f2-f5,f16-f31 to contain the user-level state. CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 From b87d8cefe43c7f22e8aa13919c1dfa2b4b4b4e01 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 17 Oct 2020 16:13:40 -0700 Subject: [PATCH 139/168] mm, memcg: rework remote charging API to support nesting Currently the remote memcg charging API consists of two functions: memalloc_use_memcg() and memalloc_unuse_memcg(), which set and clear the memcg value, which overwrites the memcg of the current task. memalloc_use_memcg(target_memcg); <...> memalloc_unuse_memcg(); It works perfectly for allocations performed from a normal context, however an attempt to call it from an interrupt context or just nest two remote charging blocks will lead to an incorrect accounting. On exit from the inner block the active memcg will be cleared instead of being restored. memalloc_use_memcg(target_memcg); memalloc_use_memcg(target_memcg_2); <...> memalloc_unuse_memcg(); Error: allocation here are charged to the memcg of the current process instead of target_memcg. memalloc_unuse_memcg(); This patch extends the remote charging API by switching to a single function: struct mem_cgroup *set_active_memcg(struct mem_cgroup *memcg), which sets the new value and returns the old one. So a remote charging block will look like: old_memcg = set_active_memcg(target_memcg); <...> set_active_memcg(old_memcg); This patch is heavily based on the patch by Johannes Weiner, which can be found here: https://lkml.org/lkml/2020/5/28/806 . Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Dan Schatzberg Link: https://lkml.kernel.org/r/20200821212056.3769116-1-guro@fb.com Signed-off-by: Linus Torvalds --- fs/buffer.c | 6 +++--- fs/notify/fanotify/fanotify.c | 5 +++-- fs/notify/inotify/inotify_fsnotify.c | 5 +++-- include/linux/sched/mm.h | 30 ++++++++++------------------ mm/memcontrol.c | 6 +++--- 5 files changed, 22 insertions(+), 30 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 5a28a6aa7f16..23f645657488 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -842,13 +842,13 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, struct buffer_head *bh, *head; gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; long offset; - struct mem_cgroup *memcg; + struct mem_cgroup *memcg, *old_memcg; if (retry) gfp |= __GFP_NOFAIL; memcg = get_mem_cgroup_from_page(page); - memalloc_use_memcg(memcg); + old_memcg = set_active_memcg(memcg); head = NULL; offset = PAGE_SIZE; @@ -867,7 +867,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, set_bh_page(bh, page, offset); } out: - memalloc_unuse_memcg(); + set_active_memcg(old_memcg); mem_cgroup_put(memcg); return head; /* diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index c942910a8649..9167884a61ec 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -531,6 +531,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir); const struct path *path = fsnotify_data_path(data, data_type); unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); + struct mem_cgroup *old_memcg; struct inode *child = NULL; bool name_event = false; @@ -580,7 +581,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, gfp |= __GFP_RETRY_MAYFAIL; /* Whoever is interested in the event, pays for the allocation. */ - memalloc_use_memcg(group->memcg); + old_memcg = set_active_memcg(group->memcg); if (fanotify_is_perm_event(mask)) { event = fanotify_alloc_perm_event(path, gfp); @@ -608,7 +609,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, event->pid = get_pid(task_tgid(current)); out: - memalloc_unuse_memcg(); + set_active_memcg(old_memcg); return event; } diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index a65cf8c9f600..9ddcbadc98e2 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -66,6 +66,7 @@ static int inotify_one_event(struct fsnotify_group *group, u32 mask, int ret; int len = 0; int alloc_len = sizeof(struct inotify_event_info); + struct mem_cgroup *old_memcg; if ((inode_mark->mask & FS_EXCL_UNLINK) && path && d_unlinked(path->dentry)) @@ -87,9 +88,9 @@ static int inotify_one_event(struct fsnotify_group *group, u32 mask, * trigger OOM killer in the target monitoring memcg as it may have * security repercussion. */ - memalloc_use_memcg(group->memcg); + old_memcg = set_active_memcg(group->memcg); event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); - memalloc_unuse_memcg(); + set_active_memcg(old_memcg); if (unlikely(!event)) { /* diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 981e34cb1409..1a80fb128e74 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -280,38 +280,28 @@ static inline void memalloc_nocma_restore(unsigned int flags) #ifdef CONFIG_MEMCG /** - * memalloc_use_memcg - Starts the remote memcg charging scope. + * set_active_memcg - Starts the remote memcg charging scope. * @memcg: memcg to charge. * * This function marks the beginning of the remote memcg charging scope. All the * __GFP_ACCOUNT allocations till the end of the scope will be charged to the * given memcg. * - * NOTE: This function is not nesting safe. + * NOTE: This function can nest. Users must save the return value and + * reset the previous value after their own charging scope is over. */ -static inline void memalloc_use_memcg(struct mem_cgroup *memcg) +static inline struct mem_cgroup * +set_active_memcg(struct mem_cgroup *memcg) { - WARN_ON_ONCE(current->active_memcg); + struct mem_cgroup *old = current->active_memcg; current->active_memcg = memcg; -} - -/** - * memalloc_unuse_memcg - Ends the remote memcg charging scope. - * - * This function marks the end of the remote memcg charging scope started by - * memalloc_use_memcg(). - */ -static inline void memalloc_unuse_memcg(void) -{ - current->active_memcg = NULL; + return old; } #else -static inline void memalloc_use_memcg(struct mem_cgroup *memcg) -{ -} - -static inline void memalloc_unuse_memcg(void) +static inline struct mem_cgroup * +set_active_memcg(struct mem_cgroup *memcg) { + return NULL; } #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7f74a158cfa8..4c741248198b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5290,12 +5290,12 @@ static struct cgroup_subsys_state * __ref mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); - struct mem_cgroup *memcg; + struct mem_cgroup *memcg, *old_memcg; long error = -ENOMEM; - memalloc_use_memcg(parent); + old_memcg = set_active_memcg(parent); memcg = mem_cgroup_alloc(); - memalloc_unuse_memcg(); + set_active_memcg(old_memcg); if (IS_ERR(memcg)) return ERR_CAST(memcg); From 279c3393e2c113365c999f16cd096bcf3d34319e Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 17 Oct 2020 16:13:44 -0700 Subject: [PATCH 140/168] mm: kmem: move memcg_kmem_bypass() calls to get_mem/obj_cgroup_from_current() Patch series "mm: kmem: kernel memory accounting in an interrupt context". This patchset implements memcg-based memory accounting of allocations made from an interrupt context. Historically, such allocations were passed unaccounted mostly because charging the memory cgroup of the current process wasn't an option. Also performance reasons were likely a reason too. The remote charging API allows to temporarily overwrite the currently active memory cgroup, so that all memory allocations are accounted towards some specified memory cgroup instead of the memory cgroup of the current process. This patchset extends the remote charging API so that it can be used from an interrupt context. Then it removes the fence that prevented the accounting of allocations made from an interrupt context. It also contains a couple of optimizations/code refactorings. This patchset doesn't directly enable accounting for any specific allocations, but prepares the code base for it. The bpf memory accounting will likely be the first user of it: a typical example is a bpf program parsing an incoming network packet, which allocates an entry in hashmap map to store some information. This patch (of 4): Currently memcg_kmem_bypass() is called before obtaining the current memory/obj cgroup using get_mem/obj_cgroup_from_current(). Moving memcg_kmem_bypass() into get_mem/obj_cgroup_from_current() reduces the number of call sites and allows further code simplifications. Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Link: http://lkml.kernel.org/r/20200827225843.1270629-1-guro@fb.com Link: http://lkml.kernel.org/r/20200827225843.1270629-2-guro@fb.com Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 13 ++++++++----- mm/percpu.c | 3 +-- mm/slab.h | 3 --- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4c741248198b..197b9ddb20f3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1066,6 +1066,9 @@ EXPORT_SYMBOL(get_mem_cgroup_from_page); */ static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) { + if (memcg_kmem_bypass()) + return NULL; + if (unlikely(current->active_memcg)) { struct mem_cgroup *memcg; @@ -2933,6 +2936,9 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg; + if (memcg_kmem_bypass()) + return NULL; + if (unlikely(!current->mm && !current->active_memcg)) return NULL; @@ -3059,19 +3065,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) struct mem_cgroup *memcg; int ret = 0; - if (memcg_kmem_bypass()) - return 0; - memcg = get_mem_cgroup_from_current(); - if (!mem_cgroup_is_root(memcg)) { + if (memcg && !mem_cgroup_is_root(memcg)) { ret = __memcg_kmem_charge(memcg, gfp, 1 << order); if (!ret) { page->mem_cgroup = memcg; __SetPageKmemcg(page); return 0; } + css_put(&memcg->css); } - css_put(&memcg->css); return ret; } diff --git a/mm/percpu.c b/mm/percpu.c index 1ed1a349eab8..66a93f096394 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1584,8 +1584,7 @@ static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, { struct obj_cgroup *objcg; - if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT) || - memcg_kmem_bypass()) + if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT)) return PCPU_CHUNK_ROOT; objcg = get_obj_cgroup_from_current(); diff --git a/mm/slab.h b/mm/slab.h index 06c6587765a3..6d7c6a5056ba 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -280,9 +280,6 @@ static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s, { struct obj_cgroup *objcg; - if (memcg_kmem_bypass()) - return NULL; - objcg = get_obj_cgroup_from_current(); if (!objcg) return NULL; From 67f0286498d7486a9c35f081df1dffe2ffcd02b9 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 17 Oct 2020 16:13:47 -0700 Subject: [PATCH 141/168] mm: kmem: remove redundant checks from get_obj_cgroup_from_current() There are checks for current->mm and current->active_memcg in get_obj_cgroup_from_current(), but these checks are redundant: memcg_kmem_bypass() called just above performs same checks. Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Link: http://lkml.kernel.org/r/20200827225843.1270629-3-guro@fb.com Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 197b9ddb20f3..51b1698bf06c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2939,9 +2939,6 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) if (memcg_kmem_bypass()) return NULL; - if (unlikely(!current->mm && !current->active_memcg)) - return NULL; - rcu_read_lock(); if (unlikely(current->active_memcg)) memcg = rcu_dereference(current->active_memcg); From 37d5985c003daab138a72dd4af9853b396d91c26 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 17 Oct 2020 16:13:50 -0700 Subject: [PATCH 142/168] mm: kmem: prepare remote memcg charging infra for interrupt contexts Remote memcg charging API uses current->active_memcg to store the currently active memory cgroup, which overwrites the memory cgroup of the current process. It works well for normal contexts, but doesn't work for interrupt contexts: indeed, if an interrupt occurs during the execution of a section with an active memcg set, all allocations inside the interrupt will be charged to the active memcg set (given that we'll enable accounting for allocations from an interrupt context). But because the interrupt might have no relation to the active memcg set outside, it's obviously wrong from the accounting prospective. To resolve this problem, let's add a global percpu int_active_memcg variable, which will be used to store an active memory cgroup which will be used from interrupt contexts. set_active_memcg() will transparently use current->active_memcg or int_active_memcg depending on the context. To make the read part simple and transparent for the caller, let's introduce two new functions: - struct mem_cgroup *active_memcg(void), - struct mem_cgroup *get_active_memcg(void). They are returning the active memcg if it's set, hiding all implementation details: where to get it depending on the current context. Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Link: http://lkml.kernel.org/r/20200827225843.1270629-4-guro@fb.com Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 13 +++++++++-- mm/memcontrol.c | 48 ++++++++++++++++++++++++++++------------ 2 files changed, 45 insertions(+), 16 deletions(-) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 1a80fb128e74..d5ece7a9a403 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -279,6 +279,7 @@ static inline void memalloc_nocma_restore(unsigned int flags) #endif #ifdef CONFIG_MEMCG +DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg); /** * set_active_memcg - Starts the remote memcg charging scope. * @memcg: memcg to charge. @@ -293,8 +294,16 @@ static inline void memalloc_nocma_restore(unsigned int flags) static inline struct mem_cgroup * set_active_memcg(struct mem_cgroup *memcg) { - struct mem_cgroup *old = current->active_memcg; - current->active_memcg = memcg; + struct mem_cgroup *old; + + if (in_interrupt()) { + old = this_cpu_read(int_active_memcg); + this_cpu_write(int_active_memcg, memcg); + } else { + old = current->active_memcg; + current->active_memcg = memcg; + } + return old; } #else diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 51b1698bf06c..a3318b66e41e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -73,6 +73,9 @@ EXPORT_SYMBOL(memory_cgrp_subsys); struct mem_cgroup *root_mem_cgroup __read_mostly; +/* Active memory cgroup to use from an interrupt context */ +DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg); + /* Socket memory accounting disabled? */ static bool cgroup_memory_nosocket; @@ -1061,26 +1064,43 @@ struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) } EXPORT_SYMBOL(get_mem_cgroup_from_page); +static __always_inline struct mem_cgroup *active_memcg(void) +{ + if (in_interrupt()) + return this_cpu_read(int_active_memcg); + else + return current->active_memcg; +} + +static __always_inline struct mem_cgroup *get_active_memcg(void) +{ + struct mem_cgroup *memcg; + + rcu_read_lock(); + memcg = active_memcg(); + if (memcg) { + /* current->active_memcg must hold a ref. */ + if (WARN_ON_ONCE(!css_tryget(&memcg->css))) + memcg = root_mem_cgroup; + else + memcg = current->active_memcg; + } + rcu_read_unlock(); + + return memcg; +} + /** - * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg. + * If active memcg is set, do not fallback to current->mm->memcg. */ static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) { if (memcg_kmem_bypass()) return NULL; - if (unlikely(current->active_memcg)) { - struct mem_cgroup *memcg; + if (unlikely(active_memcg())) + return get_active_memcg(); - rcu_read_lock(); - /* current->active_memcg must hold a ref. */ - if (WARN_ON_ONCE(!css_tryget(¤t->active_memcg->css))) - memcg = root_mem_cgroup; - else - memcg = current->active_memcg; - rcu_read_unlock(); - return memcg; - } return get_mem_cgroup_from_mm(current->mm); } @@ -2940,8 +2960,8 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) return NULL; rcu_read_lock(); - if (unlikely(current->active_memcg)) - memcg = rcu_dereference(current->active_memcg); + if (unlikely(active_memcg())) + memcg = active_memcg(); else memcg = mem_cgroup_from_task(current); From 4127c6504f25c4fcff52dc996efda2ef859dd661 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 17 Oct 2020 16:13:53 -0700 Subject: [PATCH 143/168] mm: kmem: enable kernel memcg accounting from interrupt contexts If a memcg to charge can be determined (using remote charging API), there are no reasons to exclude allocations made from an interrupt context from the accounting. Such allocations will pass even if the resulting memcg size will exceed the hard limit, but it will affect the application of the memory pressure and an inability to put the workload under the limit will eventually trigger the OOM. To use active_memcg() helper, memcg_kmem_bypass() is moved back to memcontrol.c. Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Link: http://lkml.kernel.org/r/20200827225843.1270629-5-guro@fb.com Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 12 ------------ mm/memcontrol.c | 13 +++++++++++++ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6ef4a552e09d..e391e3c56de5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1531,18 +1531,6 @@ static inline bool memcg_kmem_enabled(void) return static_branch_likely(&memcg_kmem_enabled_key); } -static inline bool memcg_kmem_bypass(void) -{ - if (in_interrupt()) - return true; - - /* Allow remote memcg charging in kthread contexts. */ - if ((!current->mm || (current->flags & PF_KTHREAD)) && - !current->active_memcg) - return true; - return false; -} - static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a3318b66e41e..3a24e3b619f5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1090,6 +1090,19 @@ static __always_inline struct mem_cgroup *get_active_memcg(void) return memcg; } +static __always_inline bool memcg_kmem_bypass(void) +{ + /* Allow remote memcg charging from any context. */ + if (unlikely(active_memcg())) + return false; + + /* Memcg to charge can't be determined. */ + if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) + return true; + + return false; +} + /** * If active memcg is set, do not fallback to current->mm->memcg. */ From 546087599986be4fe4e39a621cc0828e832caccb Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Sat, 17 Oct 2020 16:13:57 -0700 Subject: [PATCH 144/168] mm/memory-failure: remove a wrapper for alloc_migration_target() There is a well-defined standard migration target callback. Use it directly. Signed-off-by: Joonsoo Kim Signed-off-by: Andrew Morton Acked-by: Vlastimil Babka Cc: Christoph Hellwig Cc: Michal Hocko Cc: Mike Kravetz Cc: Naoya Horiguchi Cc: Roman Gushchin Link: http://lkml.kernel.org/r/1594622517-20681-9-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a2184b721fbf..c0bb186bba62 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1673,16 +1673,6 @@ int unpoison_memory(unsigned long pfn) } EXPORT_SYMBOL(unpoison_memory); -static struct page *new_page(struct page *p, unsigned long private) -{ - struct migration_target_control mtc = { - .nid = page_to_nid(p), - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, - }; - - return alloc_migration_target(p, (unsigned long)&mtc); -} - /* * Safely get reference count of an arbitrary page. * Returns 0 for a free page, -EIO for a zero refcount page @@ -1797,6 +1787,10 @@ static int __soft_offline_page(struct page *page) char const *msg_page[] = {"page", "hugepage"}; bool huge = PageHuge(page); LIST_HEAD(pagelist); + struct migration_target_control mtc = { + .nid = NUMA_NO_NODE, + .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + }; /* * Check PageHWPoison again inside page lock because PageHWPoison @@ -1833,8 +1827,8 @@ static int __soft_offline_page(struct page *page) } if (isolate_page(hpage, &pagelist)) { - ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, - MIGRATE_SYNC, MR_MEMORY_FAILURE); + ret = migrate_pages(&pagelist, alloc_migration_target, NULL, + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (!ret) { bool release = !huge; From 203e6e5ca4eac64c8909debfd64aae3fd62b2a16 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Sat, 17 Oct 2020 16:14:00 -0700 Subject: [PATCH 145/168] mm/memory_hotplug: remove a wrapper for alloc_migration_target() To calculate the correct node to migrate the page for hotplug, we need to check node id of the page. Wrapper for alloc_migration_target() exists for this purpose. However, Vlastimil informs that all migration source pages come from a single node. In this case, we don't need to check the node id for each page and we don't need to re-set the target nodemask for each page by using the wrapper. Set up the migration_target_control once and use it for all pages. Signed-off-by: Joonsoo Kim Signed-off-by: Andrew Morton Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Christoph Hellwig Cc: Mike Kravetz Cc: Naoya Horiguchi Cc: Roman Gushchin Link: http://lkml.kernel.org/r/1594622517-20681-10-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 46 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6f203574ca1d..b44d4c7ba73b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1290,27 +1290,6 @@ found: return 0; } -static struct page *new_node_page(struct page *page, unsigned long private) -{ - nodemask_t nmask = node_states[N_MEMORY]; - struct migration_target_control mtc = { - .nid = page_to_nid(page), - .nmask = &nmask, - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, - }; - - /* - * try to allocate from a different node but reuse this node if there - * are no other online nodes to be used (e.g. we are offlining a part - * of the only existing node) - */ - node_clear(mtc.nid, nmask); - if (nodes_empty(nmask)) - node_set(mtc.nid, nmask); - - return alloc_migration_target(page, (unsigned long)&mtc); -} - static int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) { @@ -1370,9 +1349,28 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) put_page(page); } if (!list_empty(&source)) { - /* Allocate a new page from the nearest neighbor node */ - ret = migrate_pages(&source, new_node_page, NULL, 0, - MIGRATE_SYNC, MR_MEMORY_HOTPLUG); + nodemask_t nmask = node_states[N_MEMORY]; + struct migration_target_control mtc = { + .nmask = &nmask, + .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + }; + + /* + * We have checked that migration range is on a single zone so + * we can use the nid of the first page to all the others. + */ + mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru)); + + /* + * try to allocate from a different node but reuse this node + * if there are no other online nodes to be used (e.g. we are + * offlining a part of the only existing node) + */ + node_clear(mtc.nid, nmask); + if (nodes_empty(nmask)) + node_set(mtc.nid, nmask); + ret = migrate_pages(&source, alloc_migration_target, NULL, + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) { list_for_each_entry(page, &source, lru) { pr_warn("migrating pfn %lx failed ret:%d ", From 4dc200cee1950ac3f9b99f0c8d4a750b62958f81 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 17 Oct 2020 16:14:03 -0700 Subject: [PATCH 146/168] mm/migrate: avoid possible unnecessary process right check in kernel_move_pages() There is no need to check if this process has the right to modify the specified process when they are same. And we could also skip the security hook call if a process is modifying its own pages. Add helper function to handle these. Suggested-by: Matthew Wilcox Signed-off-by: Hongxiang Lou Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Cc: Christopher Lameter Link: https://lkml.kernel.org/r/20200819083331.19012-1-linmiaohe@huawei.com Signed-off-by: Linus Torvalds --- mm/migrate.c | 85 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 35 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 4cf1af88c1dd..5ca5842df5db 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1864,6 +1864,53 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, return nr_pages ? -EFAULT : 0; } +static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes) +{ + struct task_struct *task; + struct mm_struct *mm; + + /* + * There is no need to check if current process has the right to modify + * the specified process when they are same. + */ + if (!pid) { + mmget(current->mm); + *mem_nodes = cpuset_mems_allowed(current); + return current->mm; + } + + /* Find the mm_struct */ + rcu_read_lock(); + task = find_task_by_vpid(pid); + if (!task) { + rcu_read_unlock(); + return ERR_PTR(-ESRCH); + } + get_task_struct(task); + + /* + * Check if this process has the right to modify the specified + * process. Use the regular "ptrace_may_access()" checks. + */ + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { + rcu_read_unlock(); + mm = ERR_PTR(-EPERM); + goto out; + } + rcu_read_unlock(); + + mm = ERR_PTR(security_task_movememory(task)); + if (IS_ERR(mm)) + goto out; + *mem_nodes = cpuset_mems_allowed(task); + mm = get_task_mm(task); +out: + put_task_struct(task); + if (!mm) + mm = ERR_PTR(-EINVAL); + return mm; +} + /* * Move a list of pages in the address space of the currently executing * process. @@ -1873,7 +1920,6 @@ static int kernel_move_pages(pid_t pid, unsigned long nr_pages, const int __user *nodes, int __user *status, int flags) { - struct task_struct *task; struct mm_struct *mm; int err; nodemask_t task_nodes; @@ -1885,36 +1931,9 @@ static int kernel_move_pages(pid_t pid, unsigned long nr_pages, if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; - /* Find the mm_struct */ - rcu_read_lock(); - task = pid ? find_task_by_vpid(pid) : current; - if (!task) { - rcu_read_unlock(); - return -ESRCH; - } - get_task_struct(task); - - /* - * Check if this process has the right to modify the specified - * process. Use the regular "ptrace_may_access()" checks. - */ - if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { - rcu_read_unlock(); - err = -EPERM; - goto out; - } - rcu_read_unlock(); - - err = security_task_movememory(task); - if (err) - goto out; - - task_nodes = cpuset_mems_allowed(task); - mm = get_task_mm(task); - put_task_struct(task); - - if (!mm) - return -EINVAL; + mm = find_mm_struct(pid, &task_nodes); + if (IS_ERR(mm)) + return PTR_ERR(mm); if (nodes) err = do_pages_move(mm, task_nodes, nr_pages, pages, @@ -1924,10 +1943,6 @@ static int kernel_move_pages(pid_t pid, unsigned long nr_pages, mmput(mm); return err; - -out: - put_task_struct(task); - return err; } SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, From 3903b55a6117ee5a8c90108beaf8d921474aa05a Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Sat, 17 Oct 2020 16:14:06 -0700 Subject: [PATCH 147/168] mm/mmap: add inline vma_next() for readability of mmap code There are three places that the next vma is required which uses the same block of code. Replace the block with a function and add comments on what happens in the case where NULL is encountered. Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Link: http://lkml.kernel.org/r/20200818154707.2515169-1-Liam.Howlett@Oracle.com Signed-off-by: Linus Torvalds --- mm/mmap.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index ebb92f5515a1..b0b8e9e94220 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -558,6 +558,23 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr, return 0; } +/* + * vma_next() - Get the next VMA. + * @mm: The mm_struct. + * @vma: The current vma. + * + * If @vma is NULL, return the first vma in the mm. + * + * Returns: The next VMA after @vma. + */ +static inline struct vm_area_struct *vma_next(struct mm_struct *mm, + struct vm_area_struct *vma) +{ + if (!vma) + return mm->mmap; + + return vma->vm_next; +} static unsigned long count_vma_pages_range(struct mm_struct *mm, unsigned long addr, unsigned long end) { @@ -1128,10 +1145,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (vm_flags & VM_SPECIAL) return NULL; - if (prev) - next = prev->vm_next; - else - next = mm->mmap; + next = vma_next(mm, prev); area = next; if (area && area->vm_end == end) /* cases 6, 7, 8 */ next = next->vm_next; @@ -2632,7 +2646,7 @@ static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end) { - struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap; + struct vm_area_struct *next = vma_next(mm, prev); struct mmu_gather tlb; lru_add_drain(); @@ -2831,7 +2845,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, if (error) return error; } - vma = prev ? prev->vm_next : mm->mmap; + vma = vma_next(mm, prev); if (unlikely(uf)) { /* From fb8090b699c3e147e5bef8e0008edc3bddc33fad Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Sat, 17 Oct 2020 16:14:09 -0700 Subject: [PATCH 148/168] mm/mmap: add inline munmap_vma_range() for code readability There are two locations that have a block of code for munmapping a vma range. Change those two locations to use a function and add meaningful comments about what happens to the arguments, which was unclear in the previous code. Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Link: http://lkml.kernel.org/r/20200818154707.2515169-2-Liam.Howlett@Oracle.com Signed-off-by: Linus Torvalds --- mm/mmap.c | 48 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index b0b8e9e94220..d91ecb00d38c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -575,6 +575,33 @@ static inline struct vm_area_struct *vma_next(struct mm_struct *mm, return vma->vm_next; } + +/* + * munmap_vma_range() - munmap VMAs that overlap a range. + * @mm: The mm struct + * @start: The start of the range. + * @len: The length of the range. + * @pprev: pointer to the pointer that will be set to previous vm_area_struct + * @rb_link: the rb_node + * @rb_parent: the parent rb_node + * + * Find all the vm_area_struct that overlap from @start to + * @end and munmap them. Set @pprev to the previous vm_area_struct. + * + * Returns: -ENOMEM on munmap failure or 0 on success. + */ +static inline int +munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len, + struct vm_area_struct **pprev, struct rb_node ***link, + struct rb_node **parent, struct list_head *uf) +{ + + while (find_vma_links(mm, start, start + len, pprev, link, parent)) + if (do_munmap(mm, start, len, uf)) + return -ENOMEM; + + return 0; +} static unsigned long count_vma_pages_range(struct mm_struct *mm, unsigned long addr, unsigned long end) { @@ -1721,13 +1748,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return -ENOMEM; } - /* Clear old maps */ - while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, - &rb_parent)) { - if (do_munmap(mm, addr, len, uf)) - return -ENOMEM; - } - + /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ + if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) + return -ENOMEM; /* * Private writable mapping: check memory availability */ @@ -3063,14 +3086,9 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla if (error) return error; - /* - * Clear old maps. this also does some error checking for us - */ - while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, - &rb_parent)) { - if (do_munmap(mm, addr, len, uf)) - return -ENOMEM; - } + /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ + if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) + return -ENOMEM; /* Check against address space limits *after* clearing old maps... */ if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) From f3964599c22f70c37544c06b6b232c42746b940b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Sat, 17 Oct 2020 16:14:12 -0700 Subject: [PATCH 149/168] mm/gup_benchmark: take the mmap lock around GUP To be safe against concurrent changes to the VMA tree, we must take the mmap lock around GUP operations (excluding the GUP-fast family of operations, which will take the mmap lock by themselves if necessary). This code is only for testing, and it's only reachable by root through debugfs, so this doesn't really have any impact; however, if we want to add lockdep asserts into the GUP path, we need to have clean locking here. Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Reviewed-by: Jason Gunthorpe Reviewed-by: John Hubbard Acked-by: Michel Lespinasse Cc: "Eric W . Biederman" Cc: Mauro Carvalho Chehab Cc: Sakari Ailus Link: https://lkml.kernel.org/r/CAG48ez3SG6ngZLtasxJ6LABpOnqCz5-QHqb0B4k44TQ8F9n6+w@mail.gmail.com Signed-off-by: Linus Torvalds --- mm/gup_benchmark.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 464cae1fa3ea..8b3e5b5cd8fa 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -72,6 +72,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd, int nr; struct page **pages; int ret = 0; + bool needs_mmap_lock = + cmd != GUP_FAST_BENCHMARK && cmd != PIN_FAST_BENCHMARK; if (gup->size > ULONG_MAX) return -EINVAL; @@ -81,6 +83,11 @@ static int __gup_benchmark_ioctl(unsigned int cmd, if (!pages) return -ENOMEM; + if (needs_mmap_lock && mmap_read_lock_killable(current->mm)) { + ret = -EINTR; + goto free_pages; + } + i = 0; nr = gup->nr_pages_per_call; start_time = ktime_get(); @@ -120,9 +127,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd, pages + i, NULL); break; default: - kvfree(pages); ret = -EINVAL; - goto out; + goto unlock; } if (nr <= 0) @@ -150,8 +156,11 @@ static int __gup_benchmark_ioctl(unsigned int cmd, end_time = ktime_get(); gup->put_delta_usec = ktime_us_delta(end_time, start_time); +unlock: + if (needs_mmap_lock) + mmap_read_unlock(current->mm); +free_pages: kvfree(pages); -out: return ret; } From b2767d97f5ff758250cf28684aaa48bbfd34145f Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Sat, 17 Oct 2020 16:14:15 -0700 Subject: [PATCH 150/168] binfmt_elf: take the mmap lock around find_extend_vma() create_elf_tables() runs after setup_new_exec(), so other tasks can already access our new mm and do things like process_madvise() on it. (At the time I'm writing this commit, process_madvise() is not in mainline yet, but has been in akpm's tree for some time.) While I believe that there are currently no APIs that would actually allow another process to mess up our VMA tree (process_madvise() is limited to MADV_COLD and MADV_PAGEOUT, and uring and userfaultfd cannot reach an mm under which no syscalls have been executed yet), this seems like an accident waiting to happen. Let's make sure that we always take the mmap lock around GUP paths as long as another process might be able to see the mm. (Yes, this diff looks suspicious because we drop the lock before doing anything with `vma`, but that's because we actually don't do anything with it apart from the NULL check.) Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Acked-by: Michel Lespinasse Cc: "Eric W . Biederman" Cc: Jason Gunthorpe Cc: John Hubbard Cc: Mauro Carvalho Chehab Cc: Sakari Ailus Link: https://lkml.kernel.org/r/CAG48ez1-PBCdv3y8pn-Ty-b+FmBSLwDuVKFSt8h7wARLy0dF-Q@mail.gmail.com Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e7e9d0cde51a..b6b3d052ca86 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -310,7 +310,10 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, * Grow the stack manually; some architectures have a limit on how * far ahead a user-space access may be in order to grow the stack. */ + if (mmap_read_lock_killable(mm)) + return -EINTR; vma = find_extend_vma(mm, bprm->p); + mmap_read_unlock(mm); if (!vma) return -EFAULT; From 255965309104fc62e3161997b93aea31c2c59941 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Sat, 17 Oct 2020 16:14:47 -0700 Subject: [PATCH 151/168] selftests/vm: 10x speedup for hmm-tests This patch reduces the running time for hmm-tests from about 10+ seconds, to just under 1.0 second, for an approximately 10x speedup. That brings it in line with most of the other tests in selftests/vm, which mostly run in < 1 sec. This is done with a one-line change that simply reduces the number of iterations of several tests, from 256, to 10. Thanks to Ralph Campbell for suggesting changing NTIMES as a way to get the speedup. Suggested-by: Ralph Campbell Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Cc: SeongJae Park Cc: Shuah Khan Link: https://lkml.kernel.org/r/20201003011721.44238-1-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/hmm-tests.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index 0a28a6a29581..c9404ef9698e 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -45,7 +45,7 @@ struct hmm_buffer { #define TWOMEG (1 << 21) #define HMM_BUFFER_SIZE (1024 << 12) #define HMM_PATH_MAX 64 -#define NTIMES 256 +#define NTIMES 10 #define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) From 0726b01e70455f9900ab524117c7b520d197dc8c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Sat, 17 Oct 2020 16:14:50 -0700 Subject: [PATCH 152/168] mm/madvise: pass mm to do_madvise Patch series "introduce memory hinting API for external process", v9. Now, we have MADV_PAGEOUT and MADV_COLD as madvise hinting API. With that, application could give hints to kernel what memory range are preferred to be reclaimed. However, in some platform(e.g., Android), the information required to make the hinting decision is not known to the app. Instead, it is known to a centralized userspace daemon(e.g., ActivityManagerService), and that daemon must be able to initiate reclaim on its own without any app involvement. To solve the concern, this patch introduces new syscall - process_madvise(2). Bascially, it's same with madvise(2) syscall but it has some differences. 1. It needs pidfd of target process to provide the hint 2. It supports only MADV_{COLD|PAGEOUT|MERGEABLE|UNMEREABLE} at this moment. Other hints in madvise will be opened when there are explicit requests from community to prevent unexpected bugs we couldn't support. 3. Only privileged processes can do something for other process's address space. For more detail of the new API, please see "mm: introduce external memory hinting API" description in this patchset. This patch (of 3): In upcoming patches, do_madvise will be called from external process context so we shouldn't asssume "current" is always hinted process's task_struct. Furthermore, we must not access mm_struct via task->mm, but obtain it via access_mm() once (in the following patch) and only use that pointer [1], so pass it to do_madvise() as well. Note the vma->vm_mm pointers are safe, so we can use them further down the call stack. And let's pass current->mm as arguments of do_madvise so it shouldn't change existing behavior but prepare next patch to make review easy. [vbabka@suse.cz: changelog tweak] [minchan@kernel.org: use current->mm for io_uring] Link: http://lkml.kernel.org/r/20200423145215.72666-1-minchan@kernel.org [akpm@linux-foundation.org: fix it for upstream changes] [akpm@linux-foundation.org: whoops] [rdunlap@infradead.org: add missing includes] Signed-off-by: Minchan Kim Signed-off-by: Andrew Morton Reviewed-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Cc: Jens Axboe Cc: Jann Horn Cc: Tim Murray Cc: Daniel Colascione Cc: Sandeep Patil Cc: Sonny Rao Cc: Brian Geffon Cc: Michal Hocko Cc: Johannes Weiner Cc: Shakeel Butt Cc: John Dias Cc: Joel Fernandes Cc: Alexander Duyck Cc: SeongJae Park Cc: Christian Brauner Cc: Kirill Tkhai Cc: Oleksandr Natalenko Cc: SeongJae Park Cc: Christian Brauner Cc: Florian Weimer Cc: Link: https://lkml.kernel.org/r/20200901000633.1920247-1-minchan@kernel.org Link: http://lkml.kernel.org/r/20200622192900.22757-1-minchan@kernel.org Link: http://lkml.kernel.org/r/20200302193630.68771-2-minchan@kernel.org Link: http://lkml.kernel.org/r/20200622192900.22757-2-minchan@kernel.org Link: https://lkml.kernel.org/r/20200901000633.1920247-2-minchan@kernel.org Signed-off-by: Linus Torvalds --- fs/io_uring.c | 2 +- include/linux/mm.h | 2 +- mm/madvise.c | 32 ++++++++++++++++++-------------- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2e1dc354cd08..b58169240c77 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3989,7 +3989,7 @@ static int io_madvise(struct io_kiocb *req, bool force_nonblock) if (force_nonblock) return -EAGAIN; - ret = do_madvise(ma->addr, ma->len, ma->advice); + ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); if (ret < 0) req_set_fail_links(req); io_req_complete(req, ret); diff --git a/include/linux/mm.h b/include/linux/mm.h index 61a2633fcc7f..ef360fe70aaf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2579,7 +2579,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf, bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); -extern int do_madvise(unsigned long start, size_t len_in, int behavior); +extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU extern int __mm_populate(unsigned long addr, unsigned long len, diff --git a/mm/madvise.c b/mm/madvise.c index fd1f448b4e1d..d550ef045288 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -258,6 +258,7 @@ static long madvise_willneed(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { + struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; loff_t offset; @@ -294,10 +295,10 @@ static long madvise_willneed(struct vm_area_struct *vma, get_file(file); offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - mmap_read_unlock(current->mm); + mmap_read_unlock(mm); vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); fput(file); - mmap_read_lock(current->mm); + mmap_read_lock(mm); return 0; } @@ -766,6 +767,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, unsigned long start, unsigned long end, int behavior) { + struct mm_struct *mm = vma->vm_mm; + *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; @@ -773,8 +776,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, if (!userfaultfd_remove(vma, start, end)) { *prev = NULL; /* mmap_lock has been dropped, prev is stale */ - mmap_read_lock(current->mm); - vma = find_vma(current->mm, start); + mmap_read_lock(mm); + vma = find_vma(mm, start); if (!vma) return -ENOMEM; if (start < vma->vm_start) { @@ -828,6 +831,7 @@ static long madvise_remove(struct vm_area_struct *vma, loff_t offset; int error; struct file *f; + struct mm_struct *mm = vma->vm_mm; *prev = NULL; /* tell sys_madvise we drop mmap_lock */ @@ -855,13 +859,13 @@ static long madvise_remove(struct vm_area_struct *vma, get_file(f); if (userfaultfd_remove(vma, start, end)) { /* mmap_lock was not released by userfaultfd_remove() */ - mmap_read_unlock(current->mm); + mmap_read_unlock(mm); } error = vfs_fallocate(f, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, end - start); fput(f); - mmap_read_lock(current->mm); + mmap_read_lock(mm); return error; } @@ -1045,7 +1049,7 @@ madvise_behavior_valid(int behavior) * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. */ -int do_madvise(unsigned long start, size_t len_in, int behavior) +int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { unsigned long end, tmp; struct vm_area_struct *vma, *prev; @@ -1083,10 +1087,10 @@ int do_madvise(unsigned long start, size_t len_in, int behavior) write = madvise_need_mmap_write(behavior); if (write) { - if (mmap_write_lock_killable(current->mm)) + if (mmap_write_lock_killable(mm)) return -EINTR; } else { - mmap_read_lock(current->mm); + mmap_read_lock(mm); } /* @@ -1094,7 +1098,7 @@ int do_madvise(unsigned long start, size_t len_in, int behavior) * ranges, just ignore them, but return -ENOMEM at the end. * - different from the way of handling in mlock etc. */ - vma = find_vma_prev(current->mm, start, &prev); + vma = find_vma_prev(mm, start, &prev); if (vma && start > vma->vm_start) prev = vma; @@ -1131,19 +1135,19 @@ int do_madvise(unsigned long start, size_t len_in, int behavior) if (prev) vma = prev->vm_next; else /* madvise_remove dropped mmap_lock */ - vma = find_vma(current->mm, start); + vma = find_vma(mm, start); } out: blk_finish_plug(&plug); if (write) - mmap_write_unlock(current->mm); + mmap_write_unlock(mm); else - mmap_read_unlock(current->mm); + mmap_read_unlock(mm); return error; } SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { - return do_madvise(start, len_in, behavior); + return do_madvise(current->mm, start, len_in, behavior); } From 1aa92cd31c1c032ddfed27e79d646bbb429e9b52 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Sat, 17 Oct 2020 16:14:54 -0700 Subject: [PATCH 153/168] pid: move pidfd_get_pid() to pid.c process_madvise syscall needs pidfd_get_pid function to translate pidfd to pid so this patch move the function to kernel/pid.c. Suggested-by: Alexander Duyck Signed-off-by: Minchan Kim Signed-off-by: Andrew Morton Reviewed-by: Suren Baghdasaryan Reviewed-by: Alexander Duyck Reviewed-by: Vlastimil Babka Acked-by: Christian Brauner Acked-by: David Rientjes Cc: Jens Axboe Cc: Jann Horn Cc: Brian Geffon Cc: Daniel Colascione Cc: Joel Fernandes Cc: Johannes Weiner Cc: John Dias Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleksandr Natalenko Cc: Sandeep Patil Cc: SeongJae Park Cc: SeongJae Park Cc: Shakeel Butt Cc: Sonny Rao Cc: Tim Murray Cc: Christian Brauner Cc: Florian Weimer Cc: Link: http://lkml.kernel.org/r/20200302193630.68771-5-minchan@kernel.org Link: http://lkml.kernel.org/r/20200622192900.22757-3-minchan@kernel.org Link: https://lkml.kernel.org/r/20200901000633.1920247-3-minchan@kernel.org Signed-off-by: Linus Torvalds --- include/linux/pid.h | 1 + kernel/exit.c | 19 ------------------- kernel/pid.c | 19 +++++++++++++++++++ 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/include/linux/pid.h b/include/linux/pid.h index 176d6cf80e7c..fa10acb8d6a4 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -77,6 +77,7 @@ extern const struct file_operations pidfd_fops; struct file; extern struct pid *pidfd_pid(const struct file *file); +struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); static inline struct pid *get_pid(struct pid *pid) { diff --git a/kernel/exit.c b/kernel/exit.c index 1f51c27bae59..87a2d515de0d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1474,25 +1474,6 @@ end: return retval; } -static struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) -{ - struct fd f; - struct pid *pid; - - f = fdget(fd); - if (!f.file) - return ERR_PTR(-EBADF); - - pid = pidfd_pid(f.file); - if (!IS_ERR(pid)) { - get_pid(pid); - *flags = f.file->f_flags; - } - - fdput(f); - return pid; -} - static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { diff --git a/kernel/pid.c b/kernel/pid.c index 74ddbff1a6ba..a96bc4bf4f86 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -520,6 +520,25 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) return idr_get_next(&ns->idr, &nr); } +struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) +{ + struct fd f; + struct pid *pid; + + f = fdget(fd); + if (!f.file) + return ERR_PTR(-EBADF); + + pid = pidfd_pid(f.file); + if (!IS_ERR(pid)) { + get_pid(pid); + *flags = f.file->f_flags; + } + + fdput(f); + return pid; +} + /** * pidfd_create() - Create a new pid file descriptor. * From ecb8ac8b1f146915aa6b96449b66dd48984caacc Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Sat, 17 Oct 2020 16:14:59 -0700 Subject: [PATCH 154/168] mm/madvise: introduce process_madvise() syscall: an external memory hinting API There is usecase that System Management Software(SMS) want to give a memory hint like MADV_[COLD|PAGEEOUT] to other processes and in the case of Android, it is the ActivityManagerService. The information required to make the reclaim decision is not known to the app. Instead, it is known to the centralized userspace daemon(ActivityManagerService), and that daemon must be able to initiate reclaim on its own without any app involvement. To solve the issue, this patch introduces a new syscall process_madvise(2). It uses pidfd of an external process to give the hint. It also supports vector address range because Android app has thousands of vmas due to zygote so it's totally waste of CPU and power if we should call the syscall one by one for each vma.(With testing 2000-vma syscall vs 1-vector syscall, it showed 15% performance improvement. I think it would be bigger in real practice because the testing ran very cache friendly environment). Another potential use case for the vector range is to amortize the cost ofTLB shootdowns for multiple ranges when using MADV_DONTNEED; this could benefit users like TCP receive zerocopy and malloc implementations. In future, we could find more usecases for other advises so let's make it happens as API since we introduce a new syscall at this moment. With that, existing madvise(2) user could replace it with process_madvise(2) with their own pid if they want to have batch address ranges support feature. ince it could affect other process's address range, only privileged process(PTRACE_MODE_ATTACH_FSCREDS) or something else(e.g., being the same UID) gives it the right to ptrace the process could use it successfully. The flag argument is reserved for future use if we need to extend the API. I think supporting all hints madvise has/will supported/support to process_madvise is rather risky. Because we are not sure all hints make sense from external process and implementation for the hint may rely on the caller being in the current context so it could be error-prone. Thus, I just limited hints as MADV_[COLD|PAGEOUT] in this patch. If someone want to add other hints, we could hear the usecase and review it for each hint. It's safer for maintenance rather than introducing a buggy syscall but hard to fix it later. So finally, the API is as follows, ssize_t process_madvise(int pidfd, const struct iovec *iovec, unsigned long vlen, int advice, unsigned int flags); DESCRIPTION The process_madvise() system call is used to give advice or directions to the kernel about the address ranges from external process as well as local process. It provides the advice to address ranges of process described by iovec and vlen. The goal of such advice is to improve system or application performance. The pidfd selects the process referred to by the PID file descriptor specified in pidfd. (See pidofd_open(2) for further information) The pointer iovec points to an array of iovec structures, defined in as: struct iovec { void *iov_base; /* starting address */ size_t iov_len; /* number of bytes to be advised */ }; The iovec describes address ranges beginning at address(iov_base) and with size length of bytes(iov_len). The vlen represents the number of elements in iovec. The advice is indicated in the advice argument, which is one of the following at this moment if the target process specified by pidfd is external. MADV_COLD MADV_PAGEOUT Permission to provide a hint to external process is governed by a ptrace access mode PTRACE_MODE_ATTACH_FSCREDS check; see ptrace(2). The process_madvise supports every advice madvise(2) has if target process is in same thread group with calling process so user could use process_madvise(2) to extend existing madvise(2) to support vector address ranges. RETURN VALUE On success, process_madvise() returns the number of bytes advised. This return value may be less than the total number of requested bytes, if an error occurred. The caller should check return value to determine whether a partial advice occurred. FAQ: Q.1 - Why does any external entity have better knowledge? Quote from Sandeep "For Android, every application (including the special SystemServer) are forked from Zygote. The reason of course is to share as many libraries and classes between the two as possible to benefit from the preloading during boot. After applications start, (almost) all of the APIs end up calling into this SystemServer process over IPC (binder) and back to the application. In a fully running system, the SystemServer monitors every single process periodically to calculate their PSS / RSS and also decides which process is "important" to the user for interactivity. So, because of how these processes start _and_ the fact that the SystemServer is looping to monitor each process, it does tend to *know* which address range of the application is not used / useful. Besides, we can never rely on applications to clean things up themselves. We've had the "hey app1, the system is low on memory, please trim your memory usage down" notifications for a long time[1]. They rely on applications honoring the broadcasts and very few do. So, if we want to avoid the inevitable killing of the application and restarting it, some way to be able to tell the OS about unimportant memory in these applications will be useful. - ssp Q.2 - How to guarantee the race(i.e., object validation) between when giving a hint from an external process and get the hint from the target process? process_madvise operates on the target process's address space as it exists at the instant that process_madvise is called. If the space target process can run between the time the process_madvise process inspects the target process address space and the time that process_madvise is actually called, process_madvise may operate on memory regions that the calling process does not expect. It's the responsibility of the process calling process_madvise to close this race condition. For example, the calling process can suspend the target process with ptrace, SIGSTOP, or the freezer cgroup so that it doesn't have an opportunity to change its own address space before process_madvise is called. Another option is to operate on memory regions that the caller knows a priori will be unchanged in the target process. Yet another option is to accept the race for certain process_madvise calls after reasoning that mistargeting will do no harm. The suggested API itself does not provide synchronization. It also apply other APIs like move_pages, process_vm_write. The race isn't really a problem though. Why is it so wrong to require that callers do their own synchronization in some manner? Nobody objects to write(2) merely because it's possible for two processes to open the same file and clobber each other's writes --- instead, we tell people to use flock or something. Think about mmap. It never guarantees newly allocated address space is still valid when the user tries to access it because other threads could unmap the memory right before. That's where we need synchronization by using other API or design from userside. It shouldn't be part of API itself. If someone needs more fine-grained synchronization rather than process level, there were two ideas suggested - cookie[2] and anon-fd[3]. Both are applicable via using last reserved argument of the API but I don't think it's necessary right now since we have already ways to prevent the race so don't want to add additional complexity with more fine-grained optimization model. To make the API extend, it reserved an unsigned long as last argument so we could support it in future if someone really needs it. Q.3 - Why doesn't ptrace work? Injecting an madvise in the target process using ptrace would not work for us because such injected madvise would have to be executed by the target process, which means that process would have to be runnable and that creates the risk of the abovementioned race and hinting a wrong VMA. Furthermore, we want to act the hint in caller's context, not the callee's, because the callee is usually limited in cpuset/cgroups or even freezed state so they can't act by themselves quick enough, which causes more thrashing/kill. It doesn't work if the target process are ptraced(e.g., strace, debugger, minidump) because a process can have at most one ptracer. [1] https://developer.android.com/topic/performance/memory" [2] process_getinfo for getting the cookie which is updated whenever vma of process address layout are changed - Daniel Colascione - https://lore.kernel.org/lkml/20190520035254.57579-1-minchan@kernel.org/T/#m7694416fd179b2066a2c62b5b139b14e3894e224 [3] anonymous fd which is used for the object(i.e., address range) validation - Michal Hocko - https://lore.kernel.org/lkml/20200120112722.GY18451@dhcp22.suse.cz/ [minchan@kernel.org: fix process_madvise build break for arm64] Link: http://lkml.kernel.org/r/20200303145756.GA219683@google.com [minchan@kernel.org: fix build error for mips of process_madvise] Link: http://lkml.kernel.org/r/20200508052517.GA197378@google.com [akpm@linux-foundation.org: fix patch ordering issue] [akpm@linux-foundation.org: fix arm64 whoops] [minchan@kernel.org: make process_madvise() vlen arg have type size_t, per Florian] [akpm@linux-foundation.org: fix i386 build] [sfr@canb.auug.org.au: fix syscall numbering] Link: https://lkml.kernel.org/r/20200905142639.49fc3f1a@canb.auug.org.au [sfr@canb.auug.org.au: madvise.c needs compat.h] Link: https://lkml.kernel.org/r/20200908204547.285646b4@canb.auug.org.au [minchan@kernel.org: fix mips build] Link: https://lkml.kernel.org/r/20200909173655.GC2435453@google.com [yuehaibing@huawei.com: remove duplicate header which is included twice] Link: https://lkml.kernel.org/r/20200915121550.30584-1-yuehaibing@huawei.com [minchan@kernel.org: do not use helper functions for process_madvise] Link: https://lkml.kernel.org/r/20200921175539.GB387368@google.com [akpm@linux-foundation.org: pidfd_get_pid() gained an argument] [sfr@canb.auug.org.au: fix up for "iov_iter: transparently handle compat iovecs in import_iovec"] Link: https://lkml.kernel.org/r/20200928212542.468e1fef@canb.auug.org.au Signed-off-by: Minchan Kim Signed-off-by: YueHaibing Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Reviewed-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Cc: Alexander Duyck Cc: Brian Geffon Cc: Christian Brauner Cc: Daniel Colascione Cc: Jann Horn Cc: Jens Axboe Cc: Joel Fernandes Cc: Johannes Weiner Cc: John Dias Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleksandr Natalenko Cc: Sandeep Patil Cc: SeongJae Park Cc: SeongJae Park Cc: Shakeel Butt Cc: Sonny Rao Cc: Tim Murray Cc: Christian Brauner Cc: Florian Weimer Cc: Link: http://lkml.kernel.org/r/20200302193630.68771-3-minchan@kernel.org Link: http://lkml.kernel.org/r/20200508183320.GA125527@google.com Link: http://lkml.kernel.org/r/20200622192900.22757-4-minchan@kernel.org Link: https://lkml.kernel.org/r/20200901000633.1920247-4-minchan@kernel.org Signed-off-by: Linus Torvalds --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 + arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + include/linux/syscalls.h | 2 + include/uapi/asm-generic/unistd.h | 4 +- kernel/sys_ni.c | 1 + mm/madvise.c | 93 ++++++++++++++++++++- 22 files changed, 117 insertions(+), 3 deletions(-) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index ec8bed9e7b75..ee7b01bb7346 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -479,3 +479,4 @@ 547 common openat2 sys_openat2 548 common pidfd_getfd sys_pidfd_getfd 549 common faccessat2 sys_faccessat2 +550 common process_madvise sys_process_madvise diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 171077cbf419..d056a548358e 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -453,3 +453,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 3b859596840d..b3b2019f8d16 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 440 +#define __NR_compat_syscalls 441 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 2a3ad9b9accd..107f08e03b9f 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -887,6 +887,8 @@ __SYSCALL(__NR_openat2, sys_openat2) __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) #define __NR_faccessat2 439 __SYSCALL(__NR_faccessat2, sys_faccessat2) +#define __NR_process_madvise 440 +__SYSCALL(__NR_process_madvise, sys_process_madvise) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 4799c96c325f..b96ed8b8a508 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -360,3 +360,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 81fc799d8392..625fb6d32842 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -439,3 +439,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index b4e263916f41..aae729c95cf9 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -445,3 +445,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index cf72a0206a87..32817c954435 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -378,3 +378,4 @@ 437 n32 openat2 sys_openat2 438 n32 pidfd_getfd sys_pidfd_getfd 439 n32 faccessat2 sys_faccessat2 +440 n32 process_madvise sys_process_madvise diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 557f9954a2b9..9e4ea3c31b1c 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -354,3 +354,4 @@ 437 n64 openat2 sys_openat2 438 n64 pidfd_getfd sys_pidfd_getfd 439 n64 faccessat2 sys_faccessat2 +440 n64 process_madvise sys_process_madvise diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index a17aab5abeb2..29f5f28cf5ce 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -427,3 +427,4 @@ 437 o32 openat2 sys_openat2 438 o32 pidfd_getfd sys_pidfd_getfd 439 o32 faccessat2 sys_faccessat2 +440 o32 process_madvise sys_process_madvise diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index ae3dab371f6f..38c63e5404bc 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -437,3 +437,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 9d7fb4ced290..1275daec7fec 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -529,3 +529,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 1c3b48165e86..28c168000483 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -442,3 +442,4 @@ 437 common openat2 sys_openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise sys_process_madvise diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index ae0a00beea5f..783738448ff5 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -442,3 +442,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 37ec52b34c73..78160260991b 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -485,3 +485,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 9b6931f8d555..0d0667a9fbd7 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -444,3 +444,4 @@ 437 i386 openat2 sys_openat2 438 i386 pidfd_getfd sys_pidfd_getfd 439 i386 faccessat2 sys_faccessat2 +440 i386 process_madvise sys_process_madvise diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 347809649ba2..1f47e24fb65c 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -361,6 +361,7 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 6276e3c2d3fc..b070f272995d 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -410,3 +410,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 06db09875aa4..2eda7678fe1d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -879,6 +879,8 @@ asmlinkage long sys_munlockall(void); asmlinkage long sys_mincore(unsigned long start, size_t len, unsigned char __user * vec); asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); +asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, + size_t vlen, int behavior, unsigned int flags); asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index f2b5d72a46c2..2056318988f7 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -857,9 +857,11 @@ __SYSCALL(__NR_openat2, sys_openat2) __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) #define __NR_faccessat2 439 __SYSCALL(__NR_faccessat2, sys_faccessat2) +#define __NR_process_madvise 440 +__SYSCALL(__NR_process_madvise, sys_process_madvise) #undef __NR_syscalls -#define __NR_syscalls 440 +#define __NR_syscalls 441 /* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c925d1e1777e..f27ac94d5fa7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -280,6 +280,7 @@ COND_SYSCALL(mlockall); COND_SYSCALL(munlockall); COND_SYSCALL(mincore); COND_SYSCALL(madvise); +COND_SYSCALL(process_madvise); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); COND_SYSCALL_COMPAT(mbind); diff --git a/mm/madvise.c b/mm/madvise.c index d550ef045288..416a56b8e757 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -27,7 +29,6 @@ #include #include #include -#include #include @@ -988,6 +989,18 @@ madvise_behavior_valid(int behavior) } } +static bool +process_madvise_behavior_valid(int behavior) +{ + switch (behavior) { + case MADV_COLD: + case MADV_PAGEOUT: + return true; + default: + return false; + } +} + /* * The madvise(2) system call. * @@ -1035,6 +1048,11 @@ madvise_behavior_valid(int behavior) * MADV_DONTDUMP - the application wants to prevent pages in the given range * from being included in its core dump. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. + * MADV_COLD - the application is not expected to use this memory soon, + * deactivate pages in this range so that they can be reclaimed + * easily if memory pressure hanppens. + * MADV_PAGEOUT - the application is not expected to use this memory soon, + * page out the pages in this range immediately. * * return values: * zero - success @@ -1151,3 +1169,76 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { return do_madvise(current->mm, start, len_in, behavior); } + +SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, + size_t, vlen, int, behavior, unsigned int, flags) +{ + ssize_t ret; + struct iovec iovstack[UIO_FASTIOV], iovec; + struct iovec *iov = iovstack; + struct iov_iter iter; + struct pid *pid; + struct task_struct *task; + struct mm_struct *mm; + size_t total_len; + unsigned int f_flags; + + if (flags != 0) { + ret = -EINVAL; + goto out; + } + + ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + if (ret < 0) + goto out; + + pid = pidfd_get_pid(pidfd, &f_flags); + if (IS_ERR(pid)) { + ret = PTR_ERR(pid); + goto free_iov; + } + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) { + ret = -ESRCH; + goto put_pid; + } + + if (task->mm != current->mm && + !process_madvise_behavior_valid(behavior)) { + ret = -EINVAL; + goto release_task; + } + + mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); + if (IS_ERR_OR_NULL(mm)) { + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + goto release_task; + } + + total_len = iov_iter_count(&iter); + + while (iov_iter_count(&iter)) { + iovec = iov_iter_iovec(&iter); + ret = do_madvise(mm, (unsigned long)iovec.iov_base, + iovec.iov_len, behavior); + if (ret < 0) + break; + iov_iter_advance(&iter, iovec.iov_len); + } + + if (ret == 0) + ret = total_len - iov_iter_count(&iter); + + mmput(mm); + return ret; + +release_task: + put_task_struct(task); +put_pid: + put_pid(pid); +free_iov: + kfree(iov); +out: + return ret; +} From fa307474c62186649d02d6ce46493a3013bff675 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 17 Oct 2020 16:15:03 -0700 Subject: [PATCH 155/168] mm: update the documentation for vfree Patch series "remove alloc_vm_area", v4. This series removes alloc_vm_area, which was left over from the big vmalloc interface rework. It is a rather arkane interface, basicaly the equivalent of get_vm_area + actually faulting in all PTEs in the allocated area. It was originally addeds for Xen (which isn't modular to start with), and then grew users in zsmalloc and i915 which seems to mostly qualify as abuses of the interface, especially for i915 as a random driver should not set up PTE bits directly. This patch (of 11): * Document that you can call vfree() on an address returned from vmap() * Remove the note about the minimum size -- the minimum size of a vmalloc allocation is one page * Add a Context: section * Fix capitalisation * Reword the prohibition on calling from NMI context to avoid a double negative Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Peter Zijlstra Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Tvrtko Ursulin Cc: Chris Wilson Cc: Matthew Auld Cc: Rodrigo Vivi Cc: Minchan Kim Cc: Matthew Wilcox Cc: Nitin Gupta Cc: Uladzislau Rezki (Sony) Link: https://lkml.kernel.org/r/20201002122204.1534411-1-hch@lst.de Link: https://lkml.kernel.org/r/20201002122204.1534411-2-hch@lst.de Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 04ac98bf5045..74af03a2a769 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2321,20 +2321,21 @@ static void __vfree(const void *addr) } /** - * vfree - release memory allocated by vmalloc() - * @addr: memory base address + * vfree - Release memory allocated by vmalloc() + * @addr: Memory base address * - * Free the virtually continuous memory area starting at @addr, as - * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is - * NULL, no operation is performed. + * Free the virtually continuous memory area starting at @addr, as obtained + * from one of the vmalloc() family of APIs. This will usually also free the + * physical memory underlying the virtual allocation, but that memory is + * reference counted, so it will not be freed until the last user goes away. * - * Must not be called in NMI context (strictly speaking, only if we don't - * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling - * conventions for vfree() arch-depenedent would be a really bad idea) + * If @addr is NULL, no operation is performed. * + * Context: * May sleep if called *not* from interrupt context. - * - * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) + * Must not be called in NMI context (strictly speaking, it could be + * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling + * conventions for vfree() arch-depenedent would be a really bad idea). */ void vfree(const void *addr) { From b944afc9d64ddf1b6a152c23ff86bf26e1fd430c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 17 Oct 2020 16:15:06 -0700 Subject: [PATCH 156/168] mm: add a VM_MAP_PUT_PAGES flag for vmap Add a flag so that vmap takes ownership of the passed in page array. When vfree is called on such an allocation it will put one reference on each page, and free the page array itself. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Boris Ostrovsky Cc: Chris Wilson Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Juergen Gross Cc: Matthew Auld Cc: "Matthew Wilcox (Oracle)" Cc: Minchan Kim Cc: Nitin Gupta Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Stefano Stabellini Cc: Tvrtko Ursulin Cc: Uladzislau Rezki (Sony) Link: https://lkml.kernel.org/r/20201002122204.1534411-3-hch@lst.de Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 1 + mm/vmalloc.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 0221f852a7e1..b899681e3ff9 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -24,6 +24,7 @@ struct notifier_block; /* in notifier.h */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ #define VM_NO_GUARD 0x00000040 /* don't add guard page */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ +#define VM_MAP_PUT_PAGES 0x00000100 /* put pages and free array in vfree */ /* * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC. diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 74af03a2a769..63b19a571be7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2377,8 +2377,11 @@ EXPORT_SYMBOL(vunmap); * @flags: vm_area->flags * @prot: page protection for the mapping * - * Maps @count pages from @pages into contiguous kernel virtual - * space. + * Maps @count pages from @pages into contiguous kernel virtual space. + * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself + * (which must be kmalloc or vmalloc memory) and one reference per pages in it + * are transferred from the caller to vmap(), and will be freed / dropped when + * vfree() is called on the return value. * * Return: the address of the area or %NULL on failure */ @@ -2404,6 +2407,8 @@ void *vmap(struct page **pages, unsigned int count, return NULL; } + if (flags & VM_MAP_PUT_PAGES) + area->pages = pages; return area->addr; } EXPORT_SYMBOL(vmap); From 3e9a9e256b1e1e6e8f19faf76fa9c37578ae35ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 17 Oct 2020 16:15:10 -0700 Subject: [PATCH 157/168] mm: add a vmap_pfn function Add a proper helper to remap PFNs into kernel virtual space so that drivers don't have to abuse alloc_vm_area and open coded PTE manipulation for it. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Boris Ostrovsky Cc: Chris Wilson Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Juergen Gross Cc: Matthew Auld Cc: "Matthew Wilcox (Oracle)" Cc: Minchan Kim Cc: Nitin Gupta Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Stefano Stabellini Cc: Tvrtko Ursulin Cc: Uladzislau Rezki (Sony) Link: https://lkml.kernel.org/r/20201002122204.1534411-4-hch@lst.de Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 1 + mm/Kconfig | 3 +++ mm/vmalloc.c | 45 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index b899681e3ff9..c77efeac2425 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -122,6 +122,7 @@ extern void vfree_atomic(const void *addr); extern void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot); +void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot); extern void vunmap(const void *addr); extern int remap_vmalloc_range_partial(struct vm_area_struct *vma, diff --git a/mm/Kconfig b/mm/Kconfig index c7f30f8b282b..d42423f884a7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -816,6 +816,9 @@ config DEVICE_PRIVATE memory; i.e., memory that is only accessible from the device (or group of devices). You likely also want to select HMM_MIRROR. +config VMAP_PFN + bool + config FRAME_VECTOR bool diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 63b19a571be7..c6bf73a64db8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2413,6 +2413,51 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); +#ifdef CONFIG_VMAP_PFN +struct vmap_pfn_data { + unsigned long *pfns; + pgprot_t prot; + unsigned int idx; +}; + +static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private) +{ + struct vmap_pfn_data *data = private; + + if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx]))) + return -EINVAL; + *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot)); + return 0; +} + +/** + * vmap_pfn - map an array of PFNs into virtually contiguous space + * @pfns: array of PFNs + * @count: number of pages to map + * @prot: page protection for the mapping + * + * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns + * the start address of the mapping. + */ +void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) +{ + struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) }; + struct vm_struct *area; + + area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP, + __builtin_return_address(0)); + if (!area) + return NULL; + if (apply_to_page_range(&init_mm, (unsigned long)area->addr, + count * PAGE_SIZE, vmap_pfn_apply, &data)) { + free_vm_area(area); + return NULL; + } + return area->addr; +} +EXPORT_SYMBOL_GPL(vmap_pfn); +#endif /* CONFIG_VMAP_PFN */ + static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { From eeb4a05fcef39a720d24846356cf65a07e71d7a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 17 Oct 2020 16:15:14 -0700 Subject: [PATCH 158/168] mm: allow a NULL fn callback in apply_to_page_range Besides calling the callback on each page, apply_to_page_range also has the effect of pre-faulting all PTEs for the range. To support callers that only need the pre-faulting, make the callback optional. Based on a patch from Minchan Kim . Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Boris Ostrovsky Cc: Chris Wilson Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Juergen Gross Cc: Matthew Auld Cc: "Matthew Wilcox (Oracle)" Cc: Nitin Gupta Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Stefano Stabellini Cc: Tvrtko Ursulin Cc: Uladzislau Rezki (Sony) Link: https://lkml.kernel.org/r/20201002122204.1534411-5-hch@lst.de Signed-off-by: Linus Torvalds --- mm/memory.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 589afe45d0b3..c48f8df6e502 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2391,13 +2391,15 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, arch_enter_lazy_mmu_mode(); - do { - if (create || !pte_none(*pte)) { - err = fn(pte++, addr, data); - if (err) - break; - } - } while (addr += PAGE_SIZE, addr != end); + if (fn) { + do { + if (create || !pte_none(*pte)) { + err = fn(pte++, addr, data); + if (err) + break; + } + } while (addr += PAGE_SIZE, addr != end); + } *mask |= PGTBL_PTE_MODIFIED; arch_leave_lazy_mmu_mode(); From d1b6d2e1fe1d9d3ff2c7cb7ce84d463560aa5e33 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 17 Oct 2020 16:15:17 -0700 Subject: [PATCH 159/168] zsmalloc: switch from alloc_vm_area to get_vm_area Just manually pre-fault the PTEs using apply_to_page_range. Co-developed-by: Minchan Kim Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Boris Ostrovsky Cc: Chris Wilson Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Juergen Gross Cc: Matthew Auld Cc: "Matthew Wilcox (Oracle)" Cc: Nitin Gupta Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Stefano Stabellini Cc: Tvrtko Ursulin Cc: Uladzislau Rezki (Sony) Link: https://lkml.kernel.org/r/20201002122204.1534411-6-hch@lst.de Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c36fdff9a371..918c7b019b3d 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1122,10 +1122,16 @@ static inline int __zs_cpu_up(struct mapping_area *area) */ if (area->vm) return 0; - area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL); + area->vm = get_vm_area(PAGE_SIZE * 2, 0); if (!area->vm) return -ENOMEM; - return 0; + + /* + * Populate ptes in advance to avoid pte allocation with GFP_KERNEL + * in non-preemtible context of zs_map_object. + */ + return apply_to_page_range(&init_mm, (unsigned long)area->vm->addr, + PAGE_SIZE * 2, NULL, NULL); } static inline void __zs_cpu_down(struct mapping_area *area) From bfed6708d6c97406d14420f3288ee775c284ff8a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 17 Oct 2020 16:15:21 -0700 Subject: [PATCH 160/168] drm/i915: use vmap in shmem_pin_map shmem_pin_map somewhat awkwardly reimplements vmap using alloc_vm_area and manual pte setup. The only practical difference is that alloc_vm_area prefeaults the vmalloc area PTEs, which doesn't seem to be required here (and could be added to vmap using a flag if actually required). Switch to use vmap, and use vfree to free both the vmalloc mapping and the page array, as well as dropping the references to each page. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Tvrtko Ursulin Cc: Boris Ostrovsky Cc: Chris Wilson Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Juergen Gross Cc: Matthew Auld Cc: "Matthew Wilcox (Oracle)" Cc: Minchan Kim Cc: Nitin Gupta Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Stefano Stabellini Cc: Uladzislau Rezki (Sony) Link: https://lkml.kernel.org/r/20201002122204.1534411-7-hch@lst.de Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/gt/shmem_utils.c | 76 +++++++-------------------- 1 file changed, 18 insertions(+), 58 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/shmem_utils.c b/drivers/gpu/drm/i915/gt/shmem_utils.c index 43c7acbdc79d..f011ea42487e 100644 --- a/drivers/gpu/drm/i915/gt/shmem_utils.c +++ b/drivers/gpu/drm/i915/gt/shmem_utils.c @@ -49,80 +49,40 @@ struct file *shmem_create_from_object(struct drm_i915_gem_object *obj) return file; } -static size_t shmem_npte(struct file *file) -{ - return file->f_mapping->host->i_size >> PAGE_SHIFT; -} - -static void __shmem_unpin_map(struct file *file, void *ptr, size_t n_pte) -{ - unsigned long pfn; - - vunmap(ptr); - - for (pfn = 0; pfn < n_pte; pfn++) { - struct page *page; - - page = shmem_read_mapping_page_gfp(file->f_mapping, pfn, - GFP_KERNEL); - if (!WARN_ON(IS_ERR(page))) { - put_page(page); - put_page(page); - } - } -} - void *shmem_pin_map(struct file *file) { - const size_t n_pte = shmem_npte(file); - pte_t *stack[32], **ptes, **mem; - struct vm_struct *area; - unsigned long pfn; + struct page **pages; + size_t n_pages, i; + void *vaddr; - mem = stack; - if (n_pte > ARRAY_SIZE(stack)) { - mem = kvmalloc_array(n_pte, sizeof(*mem), GFP_KERNEL); - if (!mem) - return NULL; - } - - area = alloc_vm_area(n_pte << PAGE_SHIFT, mem); - if (!area) { - if (mem != stack) - kvfree(mem); + n_pages = file->f_mapping->host->i_size >> PAGE_SHIFT; + pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) return NULL; - } - ptes = mem; - for (pfn = 0; pfn < n_pte; pfn++) { - struct page *page; - - page = shmem_read_mapping_page_gfp(file->f_mapping, pfn, - GFP_KERNEL); - if (IS_ERR(page)) + for (i = 0; i < n_pages; i++) { + pages[i] = shmem_read_mapping_page_gfp(file->f_mapping, i, + GFP_KERNEL); + if (IS_ERR(pages[i])) goto err_page; - - **ptes++ = mk_pte(page, PAGE_KERNEL); } - if (mem != stack) - kvfree(mem); - + vaddr = vmap(pages, n_pages, VM_MAP_PUT_PAGES, PAGE_KERNEL); + if (!vaddr) + goto err_page; mapping_set_unevictable(file->f_mapping); - return area->addr; - + return vaddr; err_page: - if (mem != stack) - kvfree(mem); - - __shmem_unpin_map(file, area->addr, pfn); + while (--i >= 0) + put_page(pages[i]); + kvfree(pages); return NULL; } void shmem_unpin_map(struct file *file, void *ptr) { mapping_clear_unevictable(file->f_mapping); - __shmem_unpin_map(file, ptr, shmem_npte(file)); + vfree(ptr); } static int __shmem_rw(struct file *file, loff_t off, From 46ce3a62b1461d6950c0c353f106761d90a45258 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 17 Oct 2020 16:15:25 -0700 Subject: [PATCH 161/168] drm/i915: stop using kmap in i915_gem_object_map kmap for !PageHighmem is just a convoluted way to say page_address, and kunmap is a no-op in that case. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Tvrtko Ursulin Cc: Boris Ostrovsky Cc: Chris Wilson Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Juergen Gross Cc: Matthew Auld Cc: "Matthew Wilcox (Oracle)" Cc: Minchan Kim Cc: Nitin Gupta Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Stefano Stabellini Cc: Uladzislau Rezki (Sony) Link: https://lkml.kernel.org/r/20201002122204.1534411-8-hch@lst.de Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/gem/i915_gem_pages.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c index d6eeefab3d01..6550c0bc824e 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c @@ -162,8 +162,6 @@ static void unmap_object(struct drm_i915_gem_object *obj, void *ptr) { if (is_vmalloc_addr(ptr)) vunmap(ptr); - else - kunmap(kmap_to_page(ptr)); } struct sg_table * @@ -277,11 +275,10 @@ static void *i915_gem_object_map(struct drm_i915_gem_object *obj, * forever. * * So if the page is beyond the 32b boundary, make an explicit - * vmap. On 64b, this check will be optimised away as we can - * directly kmap any page on the system. + * vmap. */ if (!PageHighMem(page)) - return kmap(page); + return page_address(page); } mem = stack; From 534a6687aaccce56c4801b70c651da311b71d402 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 17 Oct 2020 16:15:28 -0700 Subject: [PATCH 162/168] drm/i915: use vmap in i915_gem_object_map i915_gem_object_map implements fairly low-level vmap functionality in a driver. Split it into two helpers, one for remapping kernel memory which can use vmap, and one for I/O memory that uses vmap_pfn. The only practical difference is that alloc_vm_area prefeaults the vmalloc area PTEs, which doesn't seem to be required here for the kernel memory case (and could be added to vmap using a flag if actually required). Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Tvrtko Ursulin Cc: Boris Ostrovsky Cc: Chris Wilson Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Juergen Gross Cc: Matthew Auld Cc: "Matthew Wilcox (Oracle)" Cc: Minchan Kim Cc: Nitin Gupta Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Stefano Stabellini Cc: Uladzislau Rezki (Sony) Link: https://lkml.kernel.org/r/20201002122204.1534411-9-hch@lst.de Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/Kconfig | 1 + drivers/gpu/drm/i915/gem/i915_gem_pages.c | 131 ++++++++++------------ 2 files changed, 62 insertions(+), 70 deletions(-) diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig index 9afa5c4a6bf0..1e1cb245fca7 100644 --- a/drivers/gpu/drm/i915/Kconfig +++ b/drivers/gpu/drm/i915/Kconfig @@ -25,6 +25,7 @@ config DRM_I915 select CRC32 select SND_HDA_I915 if SND_HDA_CORE select CEC_CORE if CEC_NOTIFIER + select VMAP_PFN help Choose this option if you have a system that has "Intel Graphics Media Accelerator" or "HD Graphics" integrated graphics, diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c index 6550c0bc824e..f60ca6dc911f 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c @@ -232,34 +232,21 @@ unlock: return err; } -static inline pte_t iomap_pte(resource_size_t base, - dma_addr_t offset, - pgprot_t prot) -{ - return pte_mkspecial(pfn_pte((base + offset) >> PAGE_SHIFT, prot)); -} - /* The 'mapping' part of i915_gem_object_pin_map() below */ -static void *i915_gem_object_map(struct drm_i915_gem_object *obj, - enum i915_map_type type) +static void *i915_gem_object_map_page(struct drm_i915_gem_object *obj, + enum i915_map_type type) { - unsigned long n_pte = obj->base.size >> PAGE_SHIFT; - struct sg_table *sgt = obj->mm.pages; - pte_t *stack[32], **mem; - struct vm_struct *area; + unsigned long n_pages = obj->base.size >> PAGE_SHIFT, i; + struct page *stack[32], **pages = stack, *page; + struct sgt_iter iter; pgprot_t pgprot; + void *vaddr; - if (!i915_gem_object_has_struct_page(obj) && type != I915_MAP_WC) - return NULL; - - if (GEM_WARN_ON(type == I915_MAP_WC && - !static_cpu_has(X86_FEATURE_PAT))) - return NULL; - - /* A single page can always be kmapped */ - if (n_pte == 1 && type == I915_MAP_WB) { - struct page *page = sg_page(sgt->sgl); - + switch (type) { + default: + MISSING_CASE(type); + fallthrough; /* to use PAGE_KERNEL anyway */ + case I915_MAP_WB: /* * On 32b, highmem using a finite set of indirect PTE (i.e. * vmap) to provide virtual mappings of the high pages. @@ -277,30 +264,8 @@ static void *i915_gem_object_map(struct drm_i915_gem_object *obj, * So if the page is beyond the 32b boundary, make an explicit * vmap. */ - if (!PageHighMem(page)) - return page_address(page); - } - - mem = stack; - if (n_pte > ARRAY_SIZE(stack)) { - /* Too big for stack -- allocate temporary array instead */ - mem = kvmalloc_array(n_pte, sizeof(*mem), GFP_KERNEL); - if (!mem) - return NULL; - } - - area = alloc_vm_area(obj->base.size, mem); - if (!area) { - if (mem != stack) - kvfree(mem); - return NULL; - } - - switch (type) { - default: - MISSING_CASE(type); - fallthrough; /* to use PAGE_KERNEL anyway */ - case I915_MAP_WB: + if (n_pages == 1 && !PageHighMem(sg_page(obj->mm.pages->sgl))) + return page_address(sg_page(obj->mm.pages->sgl)); pgprot = PAGE_KERNEL; break; case I915_MAP_WC: @@ -308,30 +273,50 @@ static void *i915_gem_object_map(struct drm_i915_gem_object *obj, break; } - if (i915_gem_object_has_struct_page(obj)) { - struct sgt_iter iter; - struct page *page; - pte_t **ptes = mem; - - for_each_sgt_page(page, iter, sgt) - **ptes++ = mk_pte(page, pgprot); - } else { - resource_size_t iomap; - struct sgt_iter iter; - pte_t **ptes = mem; - dma_addr_t addr; - - iomap = obj->mm.region->iomap.base; - iomap -= obj->mm.region->region.start; - - for_each_sgt_daddr(addr, iter, sgt) - **ptes++ = iomap_pte(iomap, addr, pgprot); + if (n_pages > ARRAY_SIZE(stack)) { + /* Too big for stack -- allocate temporary array instead */ + pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return NULL; } - if (mem != stack) - kvfree(mem); + i = 0; + for_each_sgt_page(page, iter, obj->mm.pages) + pages[i++] = page; + vaddr = vmap(pages, n_pages, 0, pgprot); + if (pages != stack) + kvfree(pages); + return vaddr; +} - return area->addr; +static void *i915_gem_object_map_pfn(struct drm_i915_gem_object *obj, + enum i915_map_type type) +{ + resource_size_t iomap = obj->mm.region->iomap.base - + obj->mm.region->region.start; + unsigned long n_pfn = obj->base.size >> PAGE_SHIFT; + unsigned long stack[32], *pfns = stack, i; + struct sgt_iter iter; + dma_addr_t addr; + void *vaddr; + + if (type != I915_MAP_WC) + return NULL; + + if (n_pfn > ARRAY_SIZE(stack)) { + /* Too big for stack -- allocate temporary array instead */ + pfns = kvmalloc_array(n_pfn, sizeof(*pfns), GFP_KERNEL); + if (!pfns) + return NULL; + } + + i = 0; + for_each_sgt_daddr(addr, iter, obj->mm.pages) + pfns[i++] = (iomap + addr) >> PAGE_SHIFT; + vaddr = vmap_pfn(pfns, n_pfn, pgprot_writecombine(PAGE_KERNEL_IO)); + if (pfns != stack) + kvfree(pfns); + return vaddr; } /* get, pin, and map the pages of the object into kernel space */ @@ -383,7 +368,13 @@ void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj, } if (!ptr) { - ptr = i915_gem_object_map(obj, type); + if (GEM_WARN_ON(type == I915_MAP_WC && + !static_cpu_has(X86_FEATURE_PAT))) + ptr = NULL; + else if (i915_gem_object_has_struct_page(obj)) + ptr = i915_gem_object_map_page(obj, type); + else + ptr = i915_gem_object_map_pfn(obj, type); if (!ptr) { err = -ENOMEM; goto err_unpin; From b723caece361029fa4aec83b951db2888d611c2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 17 Oct 2020 16:15:32 -0700 Subject: [PATCH 163/168] xen/xenbus: use apply_to_page_range directly in xenbus_map_ring_pv Replacing alloc_vm_area with get_vm_area_caller + apply_page_range allows to fill put the phys_addr values directly instead of doing another loop over all addresses. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Boris Ostrovsky Cc: Chris Wilson Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Juergen Gross Cc: Matthew Auld Cc: "Matthew Wilcox (Oracle)" Cc: Minchan Kim Cc: Nitin Gupta Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Stefano Stabellini Cc: Tvrtko Ursulin Cc: Uladzislau Rezki (Sony) Link: https://lkml.kernel.org/r/20201002122204.1534411-10-hch@lst.de Signed-off-by: Linus Torvalds --- drivers/xen/xenbus/xenbus_client.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 2690318ad50f..fd80e318b99c 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -73,16 +73,13 @@ struct map_ring_valloc { struct xenbus_map_node *node; /* Why do we need two arrays? See comment of __xenbus_map_ring */ - union { - unsigned long addrs[XENBUS_MAX_RING_GRANTS]; - pte_t *ptes[XENBUS_MAX_RING_GRANTS]; - }; + unsigned long addrs[XENBUS_MAX_RING_GRANTS]; phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; struct gnttab_map_grant_ref map[XENBUS_MAX_RING_GRANTS]; struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS]; - unsigned int idx; /* HVM only. */ + unsigned int idx; }; static DEFINE_SPINLOCK(xenbus_valloc_lock); @@ -686,6 +683,14 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr) EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree); #ifdef CONFIG_XEN_PV +static int map_ring_apply(pte_t *pte, unsigned long addr, void *data) +{ + struct map_ring_valloc *info = data; + + info->phys_addrs[info->idx++] = arbitrary_virt_to_machine(pte).maddr; + return 0; +} + static int xenbus_map_ring_pv(struct xenbus_device *dev, struct map_ring_valloc *info, grant_ref_t *gnt_refs, @@ -694,18 +699,15 @@ static int xenbus_map_ring_pv(struct xenbus_device *dev, { struct xenbus_map_node *node = info->node; struct vm_struct *area; - int err = GNTST_okay; - int i; - bool leaked; + bool leaked = false; + int err = -ENOMEM; - area = alloc_vm_area(XEN_PAGE_SIZE * nr_grefs, info->ptes); + area = get_vm_area(XEN_PAGE_SIZE * nr_grefs, VM_IOREMAP); if (!area) return -ENOMEM; - - for (i = 0; i < nr_grefs; i++) - info->phys_addrs[i] = - arbitrary_virt_to_machine(info->ptes[i]).maddr; - + if (apply_to_page_range(&init_mm, (unsigned long)area->addr, + XEN_PAGE_SIZE * nr_grefs, map_ring_apply, info)) + goto failed; err = __xenbus_map_ring(dev, gnt_refs, nr_grefs, node->handles, info, GNTMAP_host_map | GNTMAP_contains_pte, &leaked); From 5dd63bf1d0a788d1bbd9c94bb07a70133430133e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 17 Oct 2020 16:15:36 -0700 Subject: [PATCH 164/168] x86/xen: open code alloc_vm_area in arch_gnttab_valloc Replace the last call to alloc_vm_area with an open coded version using an iterator in struct gnttab_vm_area instead of the triple indirection magic in alloc_vm_area. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Boris Ostrovsky Cc: Chris Wilson Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Juergen Gross Cc: Matthew Auld Cc: "Matthew Wilcox (Oracle)" Cc: Minchan Kim Cc: Nitin Gupta Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Stefano Stabellini Cc: Tvrtko Ursulin Cc: Uladzislau Rezki (Sony) Link: https://lkml.kernel.org/r/20201002122204.1534411-11-hch@lst.de Signed-off-by: Linus Torvalds --- arch/x86/xen/grant-table.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 4988e19598c8..1e681bf62561 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -25,6 +25,7 @@ static struct gnttab_vm_area { struct vm_struct *area; pte_t **ptes; + int idx; } gnttab_shared_vm_area, gnttab_status_vm_area; int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, @@ -90,19 +91,31 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) } } +static int gnttab_apply(pte_t *pte, unsigned long addr, void *data) +{ + struct gnttab_vm_area *area = data; + + area->ptes[area->idx++] = pte; + return 0; +} + static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames) { area->ptes = kmalloc_array(nr_frames, sizeof(*area->ptes), GFP_KERNEL); if (area->ptes == NULL) return -ENOMEM; - - area->area = alloc_vm_area(PAGE_SIZE * nr_frames, area->ptes); - if (area->area == NULL) { - kfree(area->ptes); - return -ENOMEM; - } - + area->area = get_vm_area(PAGE_SIZE * nr_frames, VM_IOREMAP); + if (!area->area) + goto out_free_ptes; + if (apply_to_page_range(&init_mm, (unsigned long)area->area->addr, + PAGE_SIZE * nr_frames, gnttab_apply, area)) + goto out_free_vm_area; return 0; +out_free_vm_area: + free_vm_area(area->area); +out_free_ptes: + kfree(area->ptes); + return -ENOMEM; } static void arch_gnttab_vfree(struct gnttab_vm_area *area) From 301fa9f2ddf7fb248c188af292c9cc04f8283dff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 17 Oct 2020 16:15:39 -0700 Subject: [PATCH 165/168] mm: remove alloc_vm_area All users are gone now. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Boris Ostrovsky Cc: Chris Wilson Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Juergen Gross Cc: Matthew Auld Cc: "Matthew Wilcox (Oracle)" Cc: Minchan Kim Cc: Nitin Gupta Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Stefano Stabellini Cc: Tvrtko Ursulin Cc: Uladzislau Rezki (Sony) Link: https://lkml.kernel.org/r/20201002122204.1534411-12-hch@lst.de Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 5 +---- mm/nommu.c | 7 ------ mm/vmalloc.c | 48 ----------------------------------------- 3 files changed, 1 insertion(+), 59 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c77efeac2425..938eaf9517e2 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -169,6 +169,7 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const void *caller); +void free_vm_area(struct vm_struct *area); extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); @@ -204,10 +205,6 @@ static inline void set_vm_flush_reset_perms(void *addr) } #endif -/* Allocate/destroy a 'vmalloc' VM area. */ -extern struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes); -extern void free_vm_area(struct vm_struct *area); - /* for /dev/kmem */ extern long vread(char *buf, char *addr, unsigned long count); extern long vwrite(char *buf, char *addr, unsigned long count); diff --git a/mm/nommu.c b/mm/nommu.c index 0df7ca321314..0faf39b32cdb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -354,13 +354,6 @@ void vm_unmap_aliases(void) } EXPORT_SYMBOL_GPL(vm_unmap_aliases); -struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) -{ - BUG(); - return NULL; -} -EXPORT_SYMBOL_GPL(alloc_vm_area); - void free_vm_area(struct vm_struct *area) { BUG(); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c6bf73a64db8..5c61bf85edb1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3083,54 +3083,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range); -static int f(pte_t *pte, unsigned long addr, void *data) -{ - pte_t ***p = data; - - if (p) { - *(*p) = pte; - (*p)++; - } - return 0; -} - -/** - * alloc_vm_area - allocate a range of kernel address space - * @size: size of the area - * @ptes: returns the PTEs for the address space - * - * Returns: NULL on failure, vm_struct on success - * - * This function reserves a range of kernel address space, and - * allocates pagetables to map that range. No actual mappings - * are created. - * - * If @ptes is non-NULL, pointers to the PTEs (in init_mm) - * allocated for the VM area are returned. - */ -struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) -{ - struct vm_struct *area; - - area = get_vm_area_caller(size, VM_IOREMAP, - __builtin_return_address(0)); - if (area == NULL) - return NULL; - - /* - * This ensures that page tables are constructed for this region - * of kernel virtual address space and mapped into init_mm. - */ - if (apply_to_page_range(&init_mm, (unsigned long)area->addr, - size, f, ptes ? &ptes : NULL)) { - free_vm_area(area); - return NULL; - } - - return area; -} -EXPORT_SYMBOL_GPL(alloc_vm_area); - void free_vm_area(struct vm_struct *area) { struct vm_struct *ret; From f255935b976729dbd8ddd079b96ddb6ecb1895bc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 17 Oct 2020 16:15:43 -0700 Subject: [PATCH 166/168] mm: cleanup the gfp_mask handling in __vmalloc_area_node Patch series "two small vmalloc cleanups". This patch (of 2): __vmalloc_area_node currently has four different gfp_t variables to just express this simple logic: - use the passed in mask, plus __GFP_NOWARN and __GFP_HIGHMEM (if suitable) for the underlying page allocation - use just the reclaim flags from the passed in mask plus __GFP_ZERO for allocating the page array Simplify this down to just use the pre-existing nested_gfp as-is for the page array allocation, and just the passed in gfp_mask for the page allocation, after conditionally ORing __GFP_HIGHMEM into it. This also makes the allocation warning a little more correct. Also initialize two variables at the time of declaration while touching this area. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Uladzislau Rezki (Sony) Link: https://lkml.kernel.org/r/20201002124035.1539300-1-hch@lst.de Link: https://lkml.kernel.org/r/20201002124035.1539300-2-hch@lst.de Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5c61bf85edb1..acd11d3b8667 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2461,21 +2461,19 @@ EXPORT_SYMBOL_GPL(vmap_pfn); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { - struct page **pages; - unsigned int nr_pages, array_size, i; const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; - const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ? - 0 : - __GFP_HIGHMEM; + unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; + unsigned int array_size = nr_pages * sizeof(struct page *), i; + struct page **pages; - nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; - array_size = (nr_pages * sizeof(struct page *)); + gfp_mask |= __GFP_NOWARN; + if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) + gfp_mask |= __GFP_HIGHMEM; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, - node, area->caller); + pages = __vmalloc_node(array_size, 1, nested_gfp, node, + area->caller); } else { pages = kmalloc_node(array_size, nested_gfp, node); } @@ -2493,9 +2491,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; if (node == NUMA_NO_NODE) - page = alloc_page(alloc_mask|highmem_mask); + page = alloc_page(gfp_mask); else - page = alloc_pages_node(node, alloc_mask|highmem_mask, 0); + page = alloc_pages_node(node, gfp_mask, 0); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vfree() */ From b71df8de41d2d2cdea6c8d2756cea2d91d517596 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 17 Oct 2020 16:15:46 -0700 Subject: [PATCH 167/168] mm: remove the filename in the top of file comment in vmalloc.c No point in having the filename inside the file. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Uladzislau Rezki (Sony) Link: https://lkml.kernel.org/r/20201002124035.1539300-3-hch@lst.de Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index acd11d3b8667..6ae491a8b210 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * linux/mm/vmalloc.c - * * Copyright (C) 1993 Linus Torvalds * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian , May 2000 From c922781fef43d2ddbdef36a3a281441bb153377b Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Sat, 17 Oct 2020 16:15:49 -0700 Subject: [PATCH 168/168] mm: remove duplicate include statement in mmu.c asm/sections.h is included more than once, Remove the one that isn't necessary. Signed-off-by: Tian Tao Signed-off-by: Andrew Morton Reviewed-by: Mike Rapoport Link: https://lkml.kernel.org/r/1600088607-17327-1-git-send-email-tiantao6@hisilicon.com Signed-off-by: Linus Torvalds --- arch/arm/mm/mmu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 698cc740c6b8..ab69250a86bc 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -17,7 +17,6 @@ #include #include -#include #include #include #include