From 5597613fb3cf0e36d26cfd8fb2a63196da249333 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 24 May 2024 18:30:56 +0200 Subject: [PATCH 1/4] selftests: mptcp: lib: support flaky subtests Some subtests can be unstable, failing once every X runs. Fixing them can take time: there could be an issue in the kernel or in the subtest, and it is then important to do a proper analysis, not to hide real bugs. To avoid creating noises on the different CIs, it is important to have a simple way to mark subtests as flaky, and ignore the errors. This is what this patch introduces: subtests can be marked as flaky by setting MPTCP_LIB_SUBTEST_FLAKY env var to 1, e.g. MPTCP_LIB_SUBTEST_FLAKY=1 The subtest will be executed, and errors (if any) will be ignored. It is still good to run these subtests, as it exercises code, and the results can still be useful for the on-going investigations. Note that the MPTCP CI will continue to track these flaky subtests by setting SELFTESTS_MPTCP_LIB_OVERRIDE_FLAKY env var to 1, and a ticket has to be created before marking subtests as flaky. Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240524-upstream-net-20240524-selftests-mptcp-flaky-v1-1-a352362f3f8e@kernel.org Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/mptcp/mptcp_lib.sh | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index ad2ebda5cb64..6ffa9b7a3260 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -21,6 +21,7 @@ declare -rx MPTCP_LIB_AF_INET6=10 MPTCP_LIB_SUBTESTS=() MPTCP_LIB_SUBTESTS_DUPLICATED=0 +MPTCP_LIB_SUBTEST_FLAKY=0 MPTCP_LIB_TEST_COUNTER=0 MPTCP_LIB_TEST_FORMAT="%02u %-50s" MPTCP_LIB_IP_MPTCP=0 @@ -41,6 +42,16 @@ else readonly MPTCP_LIB_COLOR_RESET= fi +# SELFTESTS_MPTCP_LIB_OVERRIDE_FLAKY env var can be set not to ignore errors +# from subtests marked as flaky +mptcp_lib_override_flaky() { + [ "${SELFTESTS_MPTCP_LIB_OVERRIDE_FLAKY:-}" = 1 ] +} + +mptcp_lib_subtest_is_flaky() { + [ "${MPTCP_LIB_SUBTEST_FLAKY}" = 1 ] && ! mptcp_lib_override_flaky +} + # $1: color, $2: text mptcp_lib_print_color() { echo -e "${MPTCP_LIB_START_PRINT:-}${*}${MPTCP_LIB_COLOR_RESET}" @@ -72,7 +83,16 @@ mptcp_lib_pr_skip() { } mptcp_lib_pr_fail() { - mptcp_lib_print_err "[FAIL]${1:+ ${*}}" + local title cmt + + if mptcp_lib_subtest_is_flaky; then + title="IGNO" + cmt=" (flaky)" + else + title="FAIL" + fi + + mptcp_lib_print_err "[${title}]${cmt}${1:+ ${*}}" } mptcp_lib_pr_info() { @@ -208,7 +228,13 @@ mptcp_lib_result_pass() { # $1: test name mptcp_lib_result_fail() { - __mptcp_lib_result_add "not ok" "${1}" + if mptcp_lib_subtest_is_flaky; then + # It might sound better to use 'not ok # TODO' or 'ok # SKIP', + # but some CIs don't understand 'TODO' and treat SKIP as errors. + __mptcp_lib_result_add "ok" "${1} # IGNORE Flaky" + else + __mptcp_lib_result_add "not ok" "${1}" + fi } # $1: test name From cc73a6577ae64247898269d138dee6b73ff710cc Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 24 May 2024 18:30:57 +0200 Subject: [PATCH 2/4] selftests: mptcp: simult flows: mark 'unbalanced' tests as flaky These tests are flaky since their introduction. This might be less or not visible depending on the CI running the tests, especially if it is also busy doing other tasks in parallel. A first analysis shown that the transfer can be slowed down when there are some re-injections at the MPTCP level. Such re-injections can of course happen, and disturb the transfer, but it looks strange to have them in this lab. That could be caused by the kernel having access to less CPU cycles -- e.g. when other activities are executed in parallel -- or by a misinterpretation on the MPTCP packet scheduler side. While this is being investigated, the tests are marked as flaky not to create noises in other CIs. Fixes: 219d04992b68 ("mptcp: push pending frames when subflow has free space") Link: https://github.com/multipath-tcp/mptcp_net-next/issues/475 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240524-upstream-net-20240524-selftests-mptcp-flaky-v1-2-a352362f3f8e@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/simult_flows.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 4b14b4412166..f74e1c3c126d 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -244,7 +244,7 @@ run_test() do_transfer $small $large $time lret=$? mptcp_lib_result_code "${lret}" "${msg}" - if [ $lret -ne 0 ]; then + if [ $lret -ne 0 ] && ! mptcp_lib_subtest_is_flaky; then ret=$lret [ $bail -eq 0 ] || exit $ret fi @@ -254,7 +254,7 @@ run_test() do_transfer $large $small $time lret=$? mptcp_lib_result_code "${lret}" "${msg}" - if [ $lret -ne 0 ]; then + if [ $lret -ne 0 ] && ! mptcp_lib_subtest_is_flaky; then ret=$lret [ $bail -eq 0 ] || exit $ret fi @@ -290,7 +290,7 @@ run_test 10 10 0 0 "balanced bwidth" run_test 10 10 1 25 "balanced bwidth with unbalanced delay" # we still need some additional infrastructure to pass the following test-cases -run_test 10 3 0 0 "unbalanced bwidth" +MPTCP_LIB_SUBTEST_FLAKY=1 run_test 10 3 0 0 "unbalanced bwidth" run_test 10 3 1 25 "unbalanced bwidth with unbalanced delay" run_test 10 3 25 1 "unbalanced bwidth with opposed, unbalanced delay" From 8c06ac2178a9dee887929232226e35a5cdda1793 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 24 May 2024 18:30:58 +0200 Subject: [PATCH 3/4] selftests: mptcp: join: mark 'fastclose' tests as flaky These tests are flaky since their introduction. This might be less or not visible depending on the CI running the tests, especially if it is also busy doing other tasks in parallel, and if a debug kernel config is being used. It looks like this issue is often present with the NetDev CI. While this is being investigated, the tests are marked as flaky not to create noises on such CIs. Fixes: 01542c9bf9ab ("selftests: mptcp: add fastclose testcase") Link: https://github.com/multipath-tcp/mptcp_net-next/issues/324 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240524-upstream-net-20240524-selftests-mptcp-flaky-v1-3-a352362f3f8e@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index fefa9173bdaa..b869b46823d7 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -261,6 +261,8 @@ reset() TEST_NAME="${1}" + MPTCP_LIB_SUBTEST_FLAKY=0 # reset if modified + if skip_test; then MPTCP_LIB_TEST_COUNTER=$((MPTCP_LIB_TEST_COUNTER+1)) last_test_ignored=1 @@ -448,7 +450,9 @@ reset_with_tcp_filter() # $1: err msg fail_test() { - ret=${KSFT_FAIL} + if ! mptcp_lib_subtest_is_flaky; then + ret=${KSFT_FAIL} + fi if [ ${#} -gt 0 ]; then print_fail "${@}" @@ -3069,6 +3073,7 @@ fullmesh_tests() fastclose_tests() { if reset_check_counter "fastclose test" "MPTcpExtMPFastcloseTx"; then + MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=client \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 @@ -3077,6 +3082,7 @@ fastclose_tests() fi if reset_check_counter "fastclose server test" "MPTcpExtMPFastcloseRx"; then + MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=server \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 0 0 0 1 From 38af56e6668b455f7dd0a8e2d9afe74100068e17 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 24 May 2024 18:30:59 +0200 Subject: [PATCH 4/4] selftests: mptcp: join: mark 'fail' tests as flaky These tests are rarely unstable. It depends on the CI running the tests, especially if it is also busy doing other tasks in parallel, and if a debug kernel config is being used. It looks like this issue is sometimes present with the NetDev CI. While this is being investigated, the tests are marked as flaky not to create noises on such CIs. Fixes: b6e074e171bc ("selftests: mptcp: add infinite map testcase") Link: https://github.com/multipath-tcp/mptcp_net-next/issues/491 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240524-upstream-net-20240524-selftests-mptcp-flaky-v1-4-a352362f3f8e@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index b869b46823d7..2b66c5fa71eb 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3101,6 +3101,7 @@ fail_tests() { # single subflow if reset_with_fail "Infinite map" 1; then + MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=128 \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 +1 +0 1 0 1 "$(pedit_action_pkts)" @@ -3109,6 +3110,7 @@ fail_tests() # multiple subflows if reset_with_fail "MP_FAIL MP_RST" 2; then + MPTCP_LIB_SUBTEST_FLAKY=1 tc -n $ns2 qdisc add dev ns2eth1 root netem rate 1mbit delay 5ms pm_nl_set_limits $ns1 0 1 pm_nl_set_limits $ns2 0 1